aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author Ralf Baechle <ralf@linux-mips.org> 2013-02-21 10:16:55 -0500
committer Ralf Baechle <ralf@linux-mips.org> 2013-02-22 04:07:30 -0500
commit edb15d83a875a1f4b1576188844db5c330c3267d (patch)
tree 74d54eab401b6ccf2a6ad4821227108a8d160f03 /kernel
parent 8bfc245f9ad7bd4e461179e4e7852ef99b8b6144 (diff)
parent a0b1c42951dd06ec83cc1bc2c9788131d9fefcd8 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux into mips-for-linux-next
Conflicts: include/linux/ssb/ssb_driver_gige.h
Also resolves a logical merge conflict in drivers/net/ethernet/broadcom/bgmac.c due to a change of an API.
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c6
-rw-r--r--kernel/async.c167
-rw-r--r--kernel/cgroup.c288
-rw-r--r--kernel/context_tracking.c114
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/cpuset.c884
-rw-r--r--kernel/delayacct.c7
-rw-r--r--kernel/events/core.c5
-rw-r--r--kernel/events/hw_breakpoint.c2
-rw-r--r--kernel/events/uprobes.c466
-rw-r--r--kernel/exit.c10
-rw-r--r--kernel/fork.c6
-rw-r--r--kernel/futex.c1
-rw-r--r--kernel/hrtimer.c38
-rw-r--r--kernel/irq/chip.c30
-rw-r--r--kernel/irq/manage.c3
-rw-r--r--kernel/irq/spurious.c7
-rw-r--r--kernel/irq_work.c150
-rw-r--r--kernel/kmod.c9
-rw-r--r--kernel/kprobes.c31
-rw-r--r--kernel/mutex.c1
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/posix-cpu-timers.c51
-rw-r--r--kernel/posix-timers.c2
-rw-r--r--kernel/power/autosleep.c2
-rw-r--r--kernel/power/main.c29
-rw-r--r--kernel/power/process.c4
-rw-r--r--kernel/power/qos.c9
-rw-r--r--kernel/power/suspend.c69
-rw-r--r--kernel/printk.c36
-rw-r--r--kernel/profile.c24
-rw-r--r--kernel/ptrace.c6
-rw-r--r--kernel/rcu.h7
-rw-r--r--kernel/rcupdate.c60
-rw-r--r--kernel/rcutiny.c8
-rw-r--r--kernel/rcutiny_plugin.h56
-rw-r--r--kernel/rcutorture.c66
-rw-r--r--kernel/rcutree.c260
-rw-r--r--kernel/rcutree.h11
-rw-r--r--kernel/rtmutex-debug.c1
-rw-r--r--kernel/rtmutex-tester.c1
-rw-r--r--kernel/rtmutex.c1
-rw-r--r--kernel/sched/auto_group.c3
-rw-r--r--kernel/sched/core.c73
-rw-r--r--kernel/sched/cpupri.c2
-rw-r--r--kernel/sched/cputime.c314
-rw-r--r--kernel/sched/debug.c7
-rw-r--r--kernel/sched/fair.c27
-rw-r--r--kernel/sched/rt.c26
-rw-r--r--kernel/sched/sched.h2
-rw-r--r--kernel/signal.c12
-rw-r--r--kernel/smpboot.c5
-rw-r--r--kernel/softirq.c23
-rw-r--r--kernel/srcu.c37
-rw-r--r--kernel/stop_machine.c156
-rw-r--r--kernel/sysctl.c8
-rw-r--r--kernel/sysctl_binary.c1
-rw-r--r--kernel/time.c8
-rw-r--r--kernel/time/Kconfig9
-rw-r--r--kernel/time/ntp.c22
-rw-r--r--kernel/time/tick-broadcast.c38
-rw-r--r--kernel/time/tick-sched.c12
-rw-r--r--kernel/time/timekeeping.c45
-rw-r--r--kernel/timeconst.pl6
-rw-r--r--kernel/timer.c2
-rw-r--r--kernel/trace/Kconfig33
-rw-r--r--kernel/trace/blktrace.c2
-rw-r--r--kernel/trace/ftrace.c88
-rw-r--r--kernel/trace/power-traces.c3
-rw-r--r--kernel/trace/ring_buffer.c108
-rw-r--r--kernel/trace/trace.c253
-rw-r--r--kernel/trace/trace.h134
-rw-r--r--kernel/trace/trace_clock.c5
-rw-r--r--kernel/trace/trace_events.c1
-rw-r--r--kernel/trace/trace_functions.c61
-rw-r--r--kernel/trace/trace_functions_graph.c68
-rw-r--r--kernel/trace/trace_probe.h1
-rw-r--r--kernel/trace/trace_sched_wakeup.c2
-rw-r--r--kernel/trace/trace_selftest.c21
-rw-r--r--kernel/trace/trace_syscalls.c18
-rw-r--r--kernel/trace/trace_uprobe.c217
-rw-r--r--kernel/tsacct.c44
-rw-r--r--kernel/watchdog.c1
-rw-r--r--kernel/workqueue.c1530
-rw-r--r--kernel/workqueue_internal.h65
-rw-r--r--kernel/workqueue_sched.h9
86 files changed, 3941 insertions, 2497 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071a06e7..e8b1627ab9c7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -566,6 +566,7 @@ out:
566void acct_collect(long exitcode, int group_dead) 566void acct_collect(long exitcode, int group_dead)
567{ 567{
568 struct pacct_struct *pacct = &current->signal->pacct; 568 struct pacct_struct *pacct = &current->signal->pacct;
569 cputime_t utime, stime;
569 unsigned long vsize = 0; 570 unsigned long vsize = 0;
570 571
571 if (group_dead && current->mm) { 572 if (group_dead && current->mm) {
@@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead)
593 pacct->ac_flag |= ACORE; 594 pacct->ac_flag |= ACORE;
594 if (current->flags & PF_SIGNALED) 595 if (current->flags & PF_SIGNALED)
595 pacct->ac_flag |= AXSIG; 596 pacct->ac_flag |= AXSIG;
596 pacct->ac_utime += current->utime; 597 task_cputime(current, &utime, &stime);
597 pacct->ac_stime += current->stime; 598 pacct->ac_utime += utime;
599 pacct->ac_stime += stime;
598 pacct->ac_minflt += current->min_flt; 600 pacct->ac_minflt += current->min_flt;
599 pacct->ac_majflt += current->maj_flt; 601 pacct->ac_majflt += current->maj_flt;
600 spin_unlock_irq(&current->sighand->siglock); 602 spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index 6f34904a0b53..8ddee2c3e5b0 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,65 +57,52 @@ asynchronous and synchronous parts of the kernel.
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/workqueue.h> 58#include <linux/workqueue.h>
59 59
60#include "workqueue_internal.h"
61
60static async_cookie_t next_cookie = 1; 62static async_cookie_t next_cookie = 1;
61 63
62#define MAX_WORK 32768 64#define MAX_WORK 32768
65#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
63 66
64static LIST_HEAD(async_pending); 67static LIST_HEAD(async_global_pending); /* pending from all registered doms */
65static ASYNC_DOMAIN(async_running); 68static ASYNC_DOMAIN(async_dfl_domain);
66static LIST_HEAD(async_domains);
67static DEFINE_SPINLOCK(async_lock); 69static DEFINE_SPINLOCK(async_lock);
68static DEFINE_MUTEX(async_register_mutex);
69 70
70struct async_entry { 71struct async_entry {
71 struct list_head list; 72 struct list_head domain_list;
73 struct list_head global_list;
72 struct work_struct work; 74 struct work_struct work;
73 async_cookie_t cookie; 75 async_cookie_t cookie;
74 async_func_ptr *func; 76 async_func_ptr *func;
75 void *data; 77 void *data;
76 struct async_domain *running; 78 struct async_domain *domain;
77}; 79};
78 80
79static DECLARE_WAIT_QUEUE_HEAD(async_done); 81static DECLARE_WAIT_QUEUE_HEAD(async_done);
80 82
81static atomic_t entry_count; 83static atomic_t entry_count;
82 84
83 85static async_cookie_t lowest_in_progress(struct async_domain *domain)
84/*
85 * MUST be called with the lock held!
86 */
87static async_cookie_t __lowest_in_progress(struct async_domain *running)
88{ 86{
89 async_cookie_t first_running = next_cookie; /* infinity value */ 87 struct async_entry *first = NULL;
90 async_cookie_t first_pending = next_cookie; /* ditto */ 88 async_cookie_t ret = ASYNC_COOKIE_MAX;
91 struct async_entry *entry; 89 unsigned long flags;
92 90
93 /* 91 spin_lock_irqsave(&async_lock, flags);
94 * Both running and pending lists are sorted but not disjoint.
95 * Take the first cookies from both and return the min.
96 */
97 if (!list_empty(&running->domain)) {
98 entry = list_first_entry(&running->domain, typeof(*entry), list);
99 first_running = entry->cookie;
100 }
101 92
102 list_for_each_entry(entry, &async_pending, list) { 93 if (domain) {
103 if (entry->running == running) { 94 if (!list_empty(&domain->pending))
104 first_pending = entry->cookie; 95 first = list_first_entry(&domain->pending,
105 break; 96 struct async_entry, domain_list);
106 } 97 } else {
98 if (!list_empty(&async_global_pending))
99 first = list_first_entry(&async_global_pending,
100 struct async_entry, global_list);
107 } 101 }
108 102
109 return min(first_running, first_pending); 103 if (first)
110} 104 ret = first->cookie;
111
112static async_cookie_t lowest_in_progress(struct async_domain *running)
113{
114 unsigned long flags;
115 async_cookie_t ret;
116 105
117 spin_lock_irqsave(&async_lock, flags);
118 ret = __lowest_in_progress(running);
119 spin_unlock_irqrestore(&async_lock, flags); 106 spin_unlock_irqrestore(&async_lock, flags);
120 return ret; 107 return ret;
121} 108}
@@ -127,20 +114,10 @@ static void async_run_entry_fn(struct work_struct *work)
127{ 114{
128 struct async_entry *entry = 115 struct async_entry *entry =
129 container_of(work, struct async_entry, work); 116 container_of(work, struct async_entry, work);
130 struct async_entry *pos;
131 unsigned long flags; 117 unsigned long flags;
132 ktime_t uninitialized_var(calltime), delta, rettime; 118 ktime_t uninitialized_var(calltime), delta, rettime;
133 struct async_domain *running = entry->running;
134 119
135 /* 1) move self to the running queue, make sure it stays sorted */ 120 /* 1) run (and print duration) */
136 spin_lock_irqsave(&async_lock, flags);
137 list_for_each_entry_reverse(pos, &running->domain, list)
138 if (entry->cookie < pos->cookie)
139 break;
140 list_move_tail(&entry->list, &pos->list);
141 spin_unlock_irqrestore(&async_lock, flags);
142
143 /* 2) run (and print duration) */
144 if (initcall_debug && system_state == SYSTEM_BOOTING) { 121 if (initcall_debug && system_state == SYSTEM_BOOTING) {
145 printk(KERN_DEBUG "calling %lli_%pF @ %i\n", 122 printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
146 (long long)entry->cookie, 123 (long long)entry->cookie,
@@ -157,23 +134,22 @@ static void async_run_entry_fn(struct work_struct *work)
157 (long long)ktime_to_ns(delta) >> 10); 134 (long long)ktime_to_ns(delta) >> 10);
158 } 135 }
159 136
160 /* 3) remove self from the running queue */ 137 /* 2) remove self from the pending queues */
161 spin_lock_irqsave(&async_lock, flags); 138 spin_lock_irqsave(&async_lock, flags);
162 list_del(&entry->list); 139 list_del_init(&entry->domain_list);
163 if (running->registered && --running->count == 0) 140 list_del_init(&entry->global_list);
164 list_del_init(&running->node);
165 141
166 /* 4) free the entry */ 142 /* 3) free the entry */
167 kfree(entry); 143 kfree(entry);
168 atomic_dec(&entry_count); 144 atomic_dec(&entry_count);
169 145
170 spin_unlock_irqrestore(&async_lock, flags); 146 spin_unlock_irqrestore(&async_lock, flags);
171 147
172 /* 5) wake up any waiters */ 148 /* 4) wake up any waiters */
173 wake_up(&async_done); 149 wake_up(&async_done);
174} 150}
175 151
176static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) 152static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
177{ 153{
178 struct async_entry *entry; 154 struct async_entry *entry;
179 unsigned long flags; 155 unsigned long flags;
@@ -196,16 +172,22 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
196 ptr(data, newcookie); 172 ptr(data, newcookie);
197 return newcookie; 173 return newcookie;
198 } 174 }
175 INIT_LIST_HEAD(&entry->domain_list);
176 INIT_LIST_HEAD(&entry->global_list);
199 INIT_WORK(&entry->work, async_run_entry_fn); 177 INIT_WORK(&entry->work, async_run_entry_fn);
200 entry->func = ptr; 178 entry->func = ptr;
201 entry->data = data; 179 entry->data = data;
202 entry->running = running; 180 entry->domain = domain;
203 181
204 spin_lock_irqsave(&async_lock, flags); 182 spin_lock_irqsave(&async_lock, flags);
183
184 /* allocate cookie and queue */
205 newcookie = entry->cookie = next_cookie++; 185 newcookie = entry->cookie = next_cookie++;
206 list_add_tail(&entry->list, &async_pending); 186
207 if (running->registered && running->count++ == 0) 187 list_add_tail(&entry->domain_list, &domain->pending);
208 list_add_tail(&running->node, &async_domains); 188 if (domain->registered)
189 list_add_tail(&entry->global_list, &async_global_pending);
190
209 atomic_inc(&entry_count); 191 atomic_inc(&entry_count);
210 spin_unlock_irqrestore(&async_lock, flags); 192 spin_unlock_irqrestore(&async_lock, flags);
211 193
@@ -228,7 +210,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
228 */ 210 */
229async_cookie_t async_schedule(async_func_ptr *ptr, void *data) 211async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
230{ 212{
231 return __async_schedule(ptr, data, &async_running); 213 return __async_schedule(ptr, data, &async_dfl_domain);
232} 214}
233EXPORT_SYMBOL_GPL(async_schedule); 215EXPORT_SYMBOL_GPL(async_schedule);
234 216
@@ -236,18 +218,18 @@ EXPORT_SYMBOL_GPL(async_schedule);
236 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain 218 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
237 * @ptr: function to execute asynchronously 219 * @ptr: function to execute asynchronously
238 * @data: data pointer to pass to the function 220 * @data: data pointer to pass to the function
239 * @running: running list for the domain 221 * @domain: the domain
240 * 222 *
241 * Returns an async_cookie_t that may be used for checkpointing later. 223 * Returns an async_cookie_t that may be used for checkpointing later.
242 * @running may be used in the async_synchronize_*_domain() functions 224 * @domain may be used in the async_synchronize_*_domain() functions to
243 * to wait within a certain synchronization domain rather than globally. 225 * wait within a certain synchronization domain rather than globally. A
244 * A synchronization domain is specified via the running queue @running to use. 226 * synchronization domain is specified via @domain. Note: This function
245 * Note: This function may be called from atomic or non-atomic contexts. 227 * may be called from atomic or non-atomic contexts.
246 */ 228 */
247async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, 229async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
248 struct async_domain *running) 230 struct async_domain *domain)
249{ 231{
250 return __async_schedule(ptr, data, running); 232 return __async_schedule(ptr, data, domain);
251} 233}
252EXPORT_SYMBOL_GPL(async_schedule_domain); 234EXPORT_SYMBOL_GPL(async_schedule_domain);
253 235
@@ -258,18 +240,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
258 */ 240 */
259void async_synchronize_full(void) 241void async_synchronize_full(void)
260{ 242{
261 mutex_lock(&async_register_mutex); 243 async_synchronize_full_domain(NULL);
262 do {
263 struct async_domain *domain = NULL;
264
265 spin_lock_irq(&async_lock);
266 if (!list_empty(&async_domains))
267 domain = list_first_entry(&async_domains, typeof(*domain), node);
268 spin_unlock_irq(&async_lock);
269
270 async_synchronize_cookie_domain(next_cookie, domain);
271 } while (!list_empty(&async_domains));
272 mutex_unlock(&async_register_mutex);
273} 244}
274EXPORT_SYMBOL_GPL(async_synchronize_full); 245EXPORT_SYMBOL_GPL(async_synchronize_full);
275 246
@@ -284,51 +255,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full);
284 */ 255 */
285void async_unregister_domain(struct async_domain *domain) 256void async_unregister_domain(struct async_domain *domain)
286{ 257{
287 mutex_lock(&async_register_mutex);
288 spin_lock_irq(&async_lock); 258 spin_lock_irq(&async_lock);
289 WARN_ON(!domain->registered || !list_empty(&domain->node) || 259 WARN_ON(!domain->registered || !list_empty(&domain->pending));
290 !list_empty(&domain->domain));
291 domain->registered = 0; 260 domain->registered = 0;
292 spin_unlock_irq(&async_lock); 261 spin_unlock_irq(&async_lock);
293 mutex_unlock(&async_register_mutex);
294} 262}
295EXPORT_SYMBOL_GPL(async_unregister_domain); 263EXPORT_SYMBOL_GPL(async_unregister_domain);
296 264
297/** 265/**
298 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain 266 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
299 * @domain: running list to synchronize on 267 * @domain: the domain to synchronize
300 * 268 *
301 * This function waits until all asynchronous function calls for the 269 * This function waits until all asynchronous function calls for the
302 * synchronization domain specified by the running list @domain have been done. 270 * synchronization domain specified by @domain have been done.
303 */ 271 */
304void async_synchronize_full_domain(struct async_domain *domain) 272void async_synchronize_full_domain(struct async_domain *domain)
305{ 273{
306 async_synchronize_cookie_domain(next_cookie, domain); 274 async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
307} 275}
308EXPORT_SYMBOL_GPL(async_synchronize_full_domain); 276EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
309 277
310/** 278/**
311 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing 279 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
312 * @cookie: async_cookie_t to use as checkpoint 280 * @cookie: async_cookie_t to use as checkpoint
313 * @running: running list to synchronize on 281 * @domain: the domain to synchronize (%NULL for all registered domains)
314 * 282 *
315 * This function waits until all asynchronous function calls for the 283 * This function waits until all asynchronous function calls for the
316 * synchronization domain specified by running list @running submitted 284 * synchronization domain specified by @domain submitted prior to @cookie
317 * prior to @cookie have been done. 285 * have been done.
318 */ 286 */
319void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) 287void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
320{ 288{
321 ktime_t uninitialized_var(starttime), delta, endtime; 289 ktime_t uninitialized_var(starttime), delta, endtime;
322 290
323 if (!running)
324 return;
325
326 if (initcall_debug && system_state == SYSTEM_BOOTING) { 291 if (initcall_debug && system_state == SYSTEM_BOOTING) {
327 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 292 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
328 starttime = ktime_get(); 293 starttime = ktime_get();
329 } 294 }
330 295
331 wait_event(async_done, lowest_in_progress(running) >= cookie); 296 wait_event(async_done, lowest_in_progress(domain) >= cookie);
332 297
333 if (initcall_debug && system_state == SYSTEM_BOOTING) { 298 if (initcall_debug && system_state == SYSTEM_BOOTING) {
334 endtime = ktime_get(); 299 endtime = ktime_get();
@@ -350,6 +315,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
350 */ 315 */
351void async_synchronize_cookie(async_cookie_t cookie) 316void async_synchronize_cookie(async_cookie_t cookie)
352{ 317{
353 async_synchronize_cookie_domain(cookie, &async_running); 318 async_synchronize_cookie_domain(cookie, &async_dfl_domain);
354} 319}
355EXPORT_SYMBOL_GPL(async_synchronize_cookie); 320EXPORT_SYMBOL_GPL(async_synchronize_cookie);
321
322/**
323 * current_is_async - is %current an async worker task?
324 *
325 * Returns %true if %current is an async worker task.
326 */
327bool current_is_async(void)
328{
329 struct worker *worker = current_wq_worker();
330
331 return worker && worker->current_func == async_run_entry_fn;
332}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798fd..b5c64327e712 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
52#include <linux/module.h> 52#include <linux/module.h>
53#include <linux/delayacct.h> 53#include <linux/delayacct.h>
54#include <linux/cgroupstats.h> 54#include <linux/cgroupstats.h>
55#include <linux/hash.h> 55#include <linux/hashtable.h>
56#include <linux/namei.h> 56#include <linux/namei.h>
57#include <linux/pid_namespace.h> 57#include <linux/pid_namespace.h>
58#include <linux/idr.h> 58#include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
376 * account cgroups in empty hierarchies. 376 * account cgroups in empty hierarchies.
377 */ 377 */
378#define CSS_SET_HASH_BITS 7 378#define CSS_SET_HASH_BITS 7
379#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 379static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
380static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
381 380
382static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) 381static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
383{ 382{
384 int i; 383 int i;
385 int index; 384 unsigned long key = 0UL;
386 unsigned long tmp = 0UL;
387 385
388 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
389 tmp += (unsigned long)css[i]; 387 key += (unsigned long)css[i];
390 tmp = (tmp >> 16) ^ tmp; 388 key = (key >> 16) ^ key;
391 389
392 index = hash_long(tmp, CSS_SET_HASH_BITS); 390 return key;
393
394 return &css_set_table[index];
395} 391}
396 392
397/* We don't maintain the lists running through each css_set to its 393/* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
418 } 414 }
419 415
420 /* This css_set is dead. unlink it and release cgroup refcounts */ 416 /* This css_set is dead. unlink it and release cgroup refcounts */
421 hlist_del(&cg->hlist); 417 hash_del(&cg->hlist);
422 css_set_count--; 418 css_set_count--;
423 419
424 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 420 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
426 struct cgroup *cgrp = link->cgrp; 422 struct cgroup *cgrp = link->cgrp;
427 list_del(&link->cg_link_list); 423 list_del(&link->cg_link_list);
428 list_del(&link->cgrp_link_list); 424 list_del(&link->cgrp_link_list);
425
426 /*
427 * We may not be holding cgroup_mutex, and if cgrp->count is
428 * dropped to 0 the cgroup can be destroyed at any time, hence
429 * rcu_read_lock is used to keep it alive.
430 */
431 rcu_read_lock();
429 if (atomic_dec_and_test(&cgrp->count) && 432 if (atomic_dec_and_test(&cgrp->count) &&
430 notify_on_release(cgrp)) { 433 notify_on_release(cgrp)) {
431 if (taskexit) 434 if (taskexit)
432 set_bit(CGRP_RELEASABLE, &cgrp->flags); 435 set_bit(CGRP_RELEASABLE, &cgrp->flags);
433 check_for_release(cgrp); 436 check_for_release(cgrp);
434 } 437 }
438 rcu_read_unlock();
435 439
436 kfree(link); 440 kfree(link);
437 } 441 }
@@ -550,9 +554,9 @@ static struct css_set *find_existing_css_set(
550{ 554{
551 int i; 555 int i;
552 struct cgroupfs_root *root = cgrp->root; 556 struct cgroupfs_root *root = cgrp->root;
553 struct hlist_head *hhead;
554 struct hlist_node *node; 557 struct hlist_node *node;
555 struct css_set *cg; 558 struct css_set *cg;
559 unsigned long key;
556 560
557 /* 561 /*
558 * Build the set of subsystem state objects that we want to see in the 562 * Build the set of subsystem state objects that we want to see in the
@@ -572,8 +576,8 @@ static struct css_set *find_existing_css_set(
572 } 576 }
573 } 577 }
574 578
575 hhead = css_set_hash(template); 579 key = css_set_hash(template);
576 hlist_for_each_entry(cg, node, hhead, hlist) { 580 hash_for_each_possible(css_set_table, cg, node, hlist, key) {
577 if (!compare_css_sets(cg, oldcg, cgrp, template)) 581 if (!compare_css_sets(cg, oldcg, cgrp, template))
578 continue; 582 continue;
579 583
@@ -657,8 +661,8 @@ static struct css_set *find_css_set(
657 661
658 struct list_head tmp_cg_links; 662 struct list_head tmp_cg_links;
659 663
660 struct hlist_head *hhead;
661 struct cg_cgroup_link *link; 664 struct cg_cgroup_link *link;
665 unsigned long key;
662 666
663 /* First see if we already have a cgroup group that matches 667 /* First see if we already have a cgroup group that matches
664 * the desired set */ 668 * the desired set */
@@ -704,8 +708,8 @@ static struct css_set *find_css_set(
704 css_set_count++; 708 css_set_count++;
705 709
706 /* Add this cgroup group to the hash table */ 710 /* Add this cgroup group to the hash table */
707 hhead = css_set_hash(res->subsys); 711 key = css_set_hash(res->subsys);
708 hlist_add_head(&res->hlist, hhead); 712 hash_add(css_set_table, &res->hlist, key);
709 713
710 write_unlock(&css_set_lock); 714 write_unlock(&css_set_lock);
711 715
@@ -856,47 +860,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
856 return inode; 860 return inode;
857} 861}
858 862
859static void cgroup_diput(struct dentry *dentry, struct inode *inode) 863static void cgroup_free_fn(struct work_struct *work)
860{ 864{
861 /* is dentry a directory ? if so, kfree() associated cgroup */ 865 struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
862 if (S_ISDIR(inode->i_mode)) { 866 struct cgroup_subsys *ss;
863 struct cgroup *cgrp = dentry->d_fsdata;
864 struct cgroup_subsys *ss;
865 BUG_ON(!(cgroup_is_removed(cgrp)));
866 /* It's possible for external users to be holding css
867 * reference counts on a cgroup; css_put() needs to
868 * be able to access the cgroup after decrementing
869 * the reference count in order to know if it needs to
870 * queue the cgroup to be handled by the release
871 * agent */
872 synchronize_rcu();
873 867
874 mutex_lock(&cgroup_mutex); 868 mutex_lock(&cgroup_mutex);
875 /* 869 /*
876 * Release the subsystem state objects. 870 * Release the subsystem state objects.
877 */ 871 */
878 for_each_subsys(cgrp->root, ss) 872 for_each_subsys(cgrp->root, ss)
879 ss->css_free(cgrp); 873 ss->css_free(cgrp);
880 874
881 cgrp->root->number_of_cgroups--; 875 cgrp->root->number_of_cgroups--;
882 mutex_unlock(&cgroup_mutex); 876 mutex_unlock(&cgroup_mutex);
883 877
884 /* 878 /*
885 * Drop the active superblock reference that we took when we 879 * Drop the active superblock reference that we took when we
886 * created the cgroup 880 * created the cgroup
887 */ 881 */
888 deactivate_super(cgrp->root->sb); 882 deactivate_super(cgrp->root->sb);
889 883
890 /* 884 /*
891 * if we're getting rid of the cgroup, refcount should ensure 885 * if we're getting rid of the cgroup, refcount should ensure
892 * that there are no pidlists left. 886 * that there are no pidlists left.
893 */ 887 */
894 BUG_ON(!list_empty(&cgrp->pidlists)); 888 BUG_ON(!list_empty(&cgrp->pidlists));
895 889
896 simple_xattrs_free(&cgrp->xattrs); 890 simple_xattrs_free(&cgrp->xattrs);
897 891
898 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); 892 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
899 kfree_rcu(cgrp, rcu_head); 893 kfree(cgrp);
894}
895
896static void cgroup_free_rcu(struct rcu_head *head)
897{
898 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
899
900 schedule_work(&cgrp->free_work);
901}
902
903static void cgroup_diput(struct dentry *dentry, struct inode *inode)
904{
905 /* is dentry a directory ? if so, kfree() associated cgroup */
906 if (S_ISDIR(inode->i_mode)) {
907 struct cgroup *cgrp = dentry->d_fsdata;
908
909 BUG_ON(!(cgroup_is_removed(cgrp)));
910 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
900 } else { 911 } else {
901 struct cfent *cfe = __d_cfe(dentry); 912 struct cfent *cfe = __d_cfe(dentry);
902 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 913 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -925,13 +936,17 @@ static void remove_dir(struct dentry *d)
925 dput(parent); 936 dput(parent);
926} 937}
927 938
928static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 939static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
929{ 940{
930 struct cfent *cfe; 941 struct cfent *cfe;
931 942
932 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 943 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
933 lockdep_assert_held(&cgroup_mutex); 944 lockdep_assert_held(&cgroup_mutex);
934 945
946 /*
947 * If we're doing cleanup due to failure of cgroup_create(),
948 * the corresponding @cfe may not exist.
949 */
935 list_for_each_entry(cfe, &cgrp->files, node) { 950 list_for_each_entry(cfe, &cgrp->files, node) {
936 struct dentry *d = cfe->dentry; 951 struct dentry *d = cfe->dentry;
937 952
@@ -944,9 +959,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
944 list_del_init(&cfe->node); 959 list_del_init(&cfe->node);
945 dput(d); 960 dput(d);
946 961
947 return 0; 962 break;
948 } 963 }
949 return -ENOENT;
950} 964}
951 965
952/** 966/**
@@ -1083,7 +1097,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1083 } 1097 }
1084 } 1098 }
1085 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1099 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
1086 synchronize_rcu();
1087 1100
1088 return 0; 1101 return 0;
1089} 1102}
@@ -1393,6 +1406,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1393 INIT_LIST_HEAD(&cgrp->allcg_node); 1406 INIT_LIST_HEAD(&cgrp->allcg_node);
1394 INIT_LIST_HEAD(&cgrp->release_list); 1407 INIT_LIST_HEAD(&cgrp->release_list);
1395 INIT_LIST_HEAD(&cgrp->pidlists); 1408 INIT_LIST_HEAD(&cgrp->pidlists);
1409 INIT_WORK(&cgrp->free_work, cgroup_free_fn);
1396 mutex_init(&cgrp->pidlist_mutex); 1410 mutex_init(&cgrp->pidlist_mutex);
1397 INIT_LIST_HEAD(&cgrp->event_list); 1411 INIT_LIST_HEAD(&cgrp->event_list);
1398 spin_lock_init(&cgrp->event_list_lock); 1412 spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1611,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1597 struct cgroupfs_root *existing_root; 1611 struct cgroupfs_root *existing_root;
1598 const struct cred *cred; 1612 const struct cred *cred;
1599 int i; 1613 int i;
1614 struct hlist_node *node;
1615 struct css_set *cg;
1600 1616
1601 BUG_ON(sb->s_root != NULL); 1617 BUG_ON(sb->s_root != NULL);
1602 1618
@@ -1650,14 +1666,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1650 /* Link the top cgroup in this hierarchy into all 1666 /* Link the top cgroup in this hierarchy into all
1651 * the css_set objects */ 1667 * the css_set objects */
1652 write_lock(&css_set_lock); 1668 write_lock(&css_set_lock);
1653 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 1669 hash_for_each(css_set_table, i, node, cg, hlist)
1654 struct hlist_head *hhead = &css_set_table[i]; 1670 link_css_set(&tmp_cg_links, cg, root_cgrp);
1655 struct hlist_node *node;
1656 struct css_set *cg;
1657
1658 hlist_for_each_entry(cg, node, hhead, hlist)
1659 link_css_set(&tmp_cg_links, cg, root_cgrp);
1660 }
1661 write_unlock(&css_set_lock); 1671 write_unlock(&css_set_lock);
1662 1672
1663 free_cg_links(&tmp_cg_links); 1673 free_cg_links(&tmp_cg_links);
@@ -1773,7 +1783,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1773 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), 1783 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
1774 "cgroup_path() called without proper locking"); 1784 "cgroup_path() called without proper locking");
1775 1785
1776 if (!dentry || cgrp == dummytop) { 1786 if (cgrp == dummytop) {
1777 /* 1787 /*
1778 * Inactive subsystems have no dentry for their root 1788 * Inactive subsystems have no dentry for their root
1779 * cgroup 1789 * cgroup
@@ -1982,7 +1992,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1982 ss->attach(cgrp, &tset); 1992 ss->attach(cgrp, &tset);
1983 } 1993 }
1984 1994
1985 synchronize_rcu();
1986out: 1995out:
1987 if (retval) { 1996 if (retval) {
1988 for_each_subsys(root, ss) { 1997 for_each_subsys(root, ss) {
@@ -2151,7 +2160,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2151 /* 2160 /*
2152 * step 5: success! and cleanup 2161 * step 5: success! and cleanup
2153 */ 2162 */
2154 synchronize_rcu();
2155 retval = 0; 2163 retval = 0;
2156out_put_css_set_refs: 2164out_put_css_set_refs:
2157 if (retval) { 2165 if (retval) {
@@ -2769,14 +2777,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2769 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2777 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2770 continue; 2778 continue;
2771 2779
2772 if (is_add) 2780 if (is_add) {
2773 err = cgroup_add_file(cgrp, subsys, cft); 2781 err = cgroup_add_file(cgrp, subsys, cft);
2774 else 2782 if (err)
2775 err = cgroup_rm_file(cgrp, cft); 2783 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2776 if (err) { 2784 cft->name, err);
2777 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2778 is_add ? "add" : "remove", cft->name, err);
2779 ret = err; 2785 ret = err;
2786 } else {
2787 cgroup_rm_file(cgrp, cft);
2780 } 2788 }
2781 } 2789 }
2782 return ret; 2790 return ret;
@@ -3017,6 +3025,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
3017} 3025}
3018EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3026EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3019 3027
3028/**
3029 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
3030 * @pos: cgroup of interest
3031 *
3032 * Return the rightmost descendant of @pos. If there's no descendant,
3033 * @pos is returned. This can be used during pre-order traversal to skip
3034 * subtree of @pos.
3035 */
3036struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3037{
3038 struct cgroup *last, *tmp;
3039
3040 WARN_ON_ONCE(!rcu_read_lock_held());
3041
3042 do {
3043 last = pos;
3044 /* ->prev isn't RCU safe, walk ->next till the end */
3045 pos = NULL;
3046 list_for_each_entry_rcu(tmp, &last->children, sibling)
3047 pos = tmp;
3048 } while (pos);
3049
3050 return last;
3051}
3052EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
3053
3020static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3054static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3021{ 3055{
3022 struct cgroup *last; 3056 struct cgroup *last;
@@ -3752,8 +3786,13 @@ static void cgroup_event_remove(struct work_struct *work)
3752 remove); 3786 remove);
3753 struct cgroup *cgrp = event->cgrp; 3787 struct cgroup *cgrp = event->cgrp;
3754 3788
3789 remove_wait_queue(event->wqh, &event->wait);
3790
3755 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3791 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3756 3792
3793 /* Notify userspace the event is going away. */
3794 eventfd_signal(event->eventfd, 1);
3795
3757 eventfd_ctx_put(event->eventfd); 3796 eventfd_ctx_put(event->eventfd);
3758 kfree(event); 3797 kfree(event);
3759 dput(cgrp->dentry); 3798 dput(cgrp->dentry);
@@ -3773,15 +3812,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3773 unsigned long flags = (unsigned long)key; 3812 unsigned long flags = (unsigned long)key;
3774 3813
3775 if (flags & POLLHUP) { 3814 if (flags & POLLHUP) {
3776 __remove_wait_queue(event->wqh, &event->wait);
3777 spin_lock(&cgrp->event_list_lock);
3778 list_del_init(&event->list);
3779 spin_unlock(&cgrp->event_list_lock);
3780 /* 3815 /*
3781 * We are in atomic context, but cgroup_event_remove() may 3816 * If the event has been detached at cgroup removal, we
3782 * sleep, so we have to call it in workqueue. 3817 * can simply return knowing the other side will clean up
3818 * for us.
3819 *
3820 * We can't race against event freeing since the other
3821 * side will require wqh->lock via remove_wait_queue(),
3822 * which we hold.
3783 */ 3823 */
3784 schedule_work(&event->remove); 3824 spin_lock(&cgrp->event_list_lock);
3825 if (!list_empty(&event->list)) {
3826 list_del_init(&event->list);
3827 /*
3828 * We are in atomic context, but cgroup_event_remove()
3829 * may sleep, so we have to call it in workqueue.
3830 */
3831 schedule_work(&event->remove);
3832 }
3833 spin_unlock(&cgrp->event_list_lock);
3785 } 3834 }
3786 3835
3787 return 0; 3836 return 0;
@@ -3807,6 +3856,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3807 const char *buffer) 3856 const char *buffer)
3808{ 3857{
3809 struct cgroup_event *event = NULL; 3858 struct cgroup_event *event = NULL;
3859 struct cgroup *cgrp_cfile;
3810 unsigned int efd, cfd; 3860 unsigned int efd, cfd;
3811 struct file *efile = NULL; 3861 struct file *efile = NULL;
3812 struct file *cfile = NULL; 3862 struct file *cfile = NULL;
@@ -3862,6 +3912,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3862 goto fail; 3912 goto fail;
3863 } 3913 }
3864 3914
3915 /*
3916 * The file to be monitored must be in the same cgroup as
3917 * cgroup.event_control is.
3918 */
3919 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
3920 if (cgrp_cfile != cgrp) {
3921 ret = -EINVAL;
3922 goto fail;
3923 }
3924
3865 if (!event->cft->register_event || !event->cft->unregister_event) { 3925 if (!event->cft->register_event || !event->cft->unregister_event) {
3866 ret = -EINVAL; 3926 ret = -EINVAL;
3867 goto fail; 3927 goto fail;
@@ -4135,6 +4195,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4135 4195
4136 init_cgroup_housekeeping(cgrp); 4196 init_cgroup_housekeeping(cgrp);
4137 4197
4198 dentry->d_fsdata = cgrp;
4199 cgrp->dentry = dentry;
4200
4138 cgrp->parent = parent; 4201 cgrp->parent = parent;
4139 cgrp->root = parent->root; 4202 cgrp->root = parent->root;
4140 cgrp->top_cgroup = parent->top_cgroup; 4203 cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4235,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4172 lockdep_assert_held(&dentry->d_inode->i_mutex); 4235 lockdep_assert_held(&dentry->d_inode->i_mutex);
4173 4236
4174 /* allocation complete, commit to creation */ 4237 /* allocation complete, commit to creation */
4175 dentry->d_fsdata = cgrp;
4176 cgrp->dentry = dentry;
4177 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 4238 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4178 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4239 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4179 root->number_of_cgroups++; 4240 root->number_of_cgroups++;
@@ -4340,20 +4401,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4340 /* 4401 /*
4341 * Unregister events and notify userspace. 4402 * Unregister events and notify userspace.
4342 * Notify userspace about cgroup removing only after rmdir of cgroup 4403 * Notify userspace about cgroup removing only after rmdir of cgroup
4343 * directory to avoid race between userspace and kernelspace. Use 4404 * directory to avoid race between userspace and kernelspace.
4344 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
4345 * cgroup_event_wake() is called with the wait queue head locked,
4346 * remove_wait_queue() cannot be called while holding event_list_lock.
4347 */ 4405 */
4348 spin_lock(&cgrp->event_list_lock); 4406 spin_lock(&cgrp->event_list_lock);
4349 list_splice_init(&cgrp->event_list, &tmp_list); 4407 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4350 spin_unlock(&cgrp->event_list_lock);
4351 list_for_each_entry_safe(event, tmp, &tmp_list, list) {
4352 list_del_init(&event->list); 4408 list_del_init(&event->list);
4353 remove_wait_queue(event->wqh, &event->wait);
4354 eventfd_signal(event->eventfd, 1);
4355 schedule_work(&event->remove); 4409 schedule_work(&event->remove);
4356 } 4410 }
4411 spin_unlock(&cgrp->event_list_lock);
4357 4412
4358 return 0; 4413 return 0;
4359} 4414}
@@ -4438,6 +4493,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4438{ 4493{
4439 struct cgroup_subsys_state *css; 4494 struct cgroup_subsys_state *css;
4440 int i, ret; 4495 int i, ret;
4496 struct hlist_node *node, *tmp;
4497 struct css_set *cg;
4498 unsigned long key;
4441 4499
4442 /* check name and function validity */ 4500 /* check name and function validity */
4443 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4501 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4561,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4503 * this is all done under the css_set_lock. 4561 * this is all done under the css_set_lock.
4504 */ 4562 */
4505 write_lock(&css_set_lock); 4563 write_lock(&css_set_lock);
4506 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 4564 hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) {
4507 struct css_set *cg; 4565 /* skip entries that we already rehashed */
4508 struct hlist_node *node, *tmp; 4566 if (cg->subsys[ss->subsys_id])
4509 struct hlist_head *bucket = &css_set_table[i], *new_bucket; 4567 continue;
4510 4568 /* remove existing entry */
4511 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { 4569 hash_del(&cg->hlist);
4512 /* skip entries that we already rehashed */ 4570 /* set new value */
4513 if (cg->subsys[ss->subsys_id]) 4571 cg->subsys[ss->subsys_id] = css;
4514 continue; 4572 /* recompute hash and restore entry */
4515 /* remove existing entry */ 4573 key = css_set_hash(cg->subsys);
4516 hlist_del(&cg->hlist); 4574 hash_add(css_set_table, node, key);
4517 /* set new value */
4518 cg->subsys[ss->subsys_id] = css;
4519 /* recompute hash and restore entry */
4520 new_bucket = css_set_hash(cg->subsys);
4521 hlist_add_head(&cg->hlist, new_bucket);
4522 }
4523 } 4575 }
4524 write_unlock(&css_set_lock); 4576 write_unlock(&css_set_lock);
4525 4577
@@ -4551,7 +4603,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4551void cgroup_unload_subsys(struct cgroup_subsys *ss) 4603void cgroup_unload_subsys(struct cgroup_subsys *ss)
4552{ 4604{
4553 struct cg_cgroup_link *link; 4605 struct cg_cgroup_link *link;
4554 struct hlist_head *hhead;
4555 4606
4556 BUG_ON(ss->module == NULL); 4607 BUG_ON(ss->module == NULL);
4557 4608
@@ -4585,11 +4636,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4585 write_lock(&css_set_lock); 4636 write_lock(&css_set_lock);
4586 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4637 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
4587 struct css_set *cg = link->cg; 4638 struct css_set *cg = link->cg;
4639 unsigned long key;
4588 4640
4589 hlist_del(&cg->hlist); 4641 hash_del(&cg->hlist);
4590 cg->subsys[ss->subsys_id] = NULL; 4642 cg->subsys[ss->subsys_id] = NULL;
4591 hhead = css_set_hash(cg->subsys); 4643 key = css_set_hash(cg->subsys);
4592 hlist_add_head(&cg->hlist, hhead); 4644 hash_add(css_set_table, &cg->hlist, key);
4593 } 4645 }
4594 write_unlock(&css_set_lock); 4646 write_unlock(&css_set_lock);
4595 4647
@@ -4631,9 +4683,6 @@ int __init cgroup_init_early(void)
4631 list_add(&init_css_set_link.cg_link_list, 4683 list_add(&init_css_set_link.cg_link_list,
4632 &init_css_set.cg_links); 4684 &init_css_set.cg_links);
4633 4685
4634 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4635 INIT_HLIST_HEAD(&css_set_table[i]);
4636
4637 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4686 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4638 struct cgroup_subsys *ss = subsys[i]; 4687 struct cgroup_subsys *ss = subsys[i];
4639 4688
@@ -4667,7 +4716,7 @@ int __init cgroup_init(void)
4667{ 4716{
4668 int err; 4717 int err;
4669 int i; 4718 int i;
4670 struct hlist_head *hhead; 4719 unsigned long key;
4671 4720
4672 err = bdi_init(&cgroup_backing_dev_info); 4721 err = bdi_init(&cgroup_backing_dev_info);
4673 if (err) 4722 if (err)
@@ -4686,8 +4735,8 @@ int __init cgroup_init(void)
4686 } 4735 }
4687 4736
4688 /* Add init_css_set to the hash table */ 4737 /* Add init_css_set to the hash table */
4689 hhead = css_set_hash(init_css_set.subsys); 4738 key = css_set_hash(init_css_set.subsys);
4690 hlist_add_head(&init_css_set.hlist, hhead); 4739 hash_add(css_set_table, &init_css_set.hlist, key);
4691 BUG_ON(!init_root_id(&rootnode)); 4740 BUG_ON(!init_root_id(&rootnode));
4692 4741
4693 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4742 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5031,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4982 } 5031 }
4983 task_unlock(tsk); 5032 task_unlock(tsk);
4984 5033
4985 if (cg) 5034 put_css_set_taskexit(cg);
4986 put_css_set_taskexit(cg);
4987} 5035}
4988 5036
4989/** 5037/**
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e0e07fd55508..65349f07b878 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,29 +1,41 @@
1/*
2 * Context tracking: Probe on high level context boundaries such as kernel
3 * and userspace. This includes syscalls and exceptions entry/exit.
4 *
5 * This is used by RCU to remove its dependency on the timer tick while a CPU
6 * runs in userspace.
7 *
8 * Started by Frederic Weisbecker:
9 *
10 * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
11 *
12 * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
13 * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
14 *
15 */
16
1#include <linux/context_tracking.h> 17#include <linux/context_tracking.h>
18#include <linux/kvm_host.h>
2#include <linux/rcupdate.h> 19#include <linux/rcupdate.h>
3#include <linux/sched.h> 20#include <linux/sched.h>
4#include <linux/percpu.h>
5#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/export.h>
6 23
7struct context_tracking { 24DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
8 /*
9 * When active is false, hooks are not set to
10 * minimize overhead: TIF flags are cleared
11 * and calls to user_enter/exit are ignored. This
12 * may be further optimized using static keys.
13 */
14 bool active;
15 enum {
16 IN_KERNEL = 0,
17 IN_USER,
18 } state;
19};
20
21static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
22#ifdef CONFIG_CONTEXT_TRACKING_FORCE 25#ifdef CONFIG_CONTEXT_TRACKING_FORCE
23 .active = true, 26 .active = true,
24#endif 27#endif
25}; 28};
26 29
30/**
31 * user_enter - Inform the context tracking that the CPU is going to
32 * enter userspace mode.
33 *
34 * This function must be called right before we switch from the kernel
35 * to userspace, when it's guaranteed the remaining kernel instructions
36 * to execute won't use any RCU read side critical section because this
37 * function sets RCU in extended quiescent state.
38 */
27void user_enter(void) 39void user_enter(void)
28{ 40{
29 unsigned long flags; 41 unsigned long flags;
@@ -39,40 +51,90 @@ void user_enter(void)
39 if (in_interrupt()) 51 if (in_interrupt())
40 return; 52 return;
41 53
54 /* Kernel threads aren't supposed to go to userspace */
42 WARN_ON_ONCE(!current->mm); 55 WARN_ON_ONCE(!current->mm);
43 56
44 local_irq_save(flags); 57 local_irq_save(flags);
45 if (__this_cpu_read(context_tracking.active) && 58 if (__this_cpu_read(context_tracking.active) &&
46 __this_cpu_read(context_tracking.state) != IN_USER) { 59 __this_cpu_read(context_tracking.state) != IN_USER) {
47 __this_cpu_write(context_tracking.state, IN_USER); 60 /*
61 * At this stage, only low level arch entry code remains and
62 * then we'll run in userspace. We can assume there won't be
63 * any RCU read-side critical section until the next call to
64 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
65 * on the tick.
66 */
67 vtime_user_enter(current);
48 rcu_user_enter(); 68 rcu_user_enter();
69 __this_cpu_write(context_tracking.state, IN_USER);
49 } 70 }
50 local_irq_restore(flags); 71 local_irq_restore(flags);
51} 72}
52 73
74
75/**
76 * user_exit - Inform the context tracking that the CPU is
77 * exiting userspace mode and entering the kernel.
78 *
79 * This function must be called after we entered the kernel from userspace
80 * before any use of RCU read side critical section. This potentially includes
81 * any high level kernel code like syscalls, exceptions, signal handling, etc...
82 *
83 * This call supports re-entrancy. This way it can be called from any exception
84 * handler without needing to know if we came from userspace or not.
85 */
53void user_exit(void) 86void user_exit(void)
54{ 87{
55 unsigned long flags; 88 unsigned long flags;
56 89
57 /*
58 * Some contexts may involve an exception occuring in an irq,
59 * leading to that nesting:
60 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
61 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
62 * helpers are enough to protect RCU uses inside the exception. So
63 * just return immediately if we detect we are in an IRQ.
64 */
65 if (in_interrupt()) 90 if (in_interrupt())
66 return; 91 return;
67 92
68 local_irq_save(flags); 93 local_irq_save(flags);
69 if (__this_cpu_read(context_tracking.state) == IN_USER) { 94 if (__this_cpu_read(context_tracking.state) == IN_USER) {
70 __this_cpu_write(context_tracking.state, IN_KERNEL); 95 /*
96 * We are going to run code that may use RCU. Inform
97 * RCU core about that (ie: we may need the tick again).
98 */
71 rcu_user_exit(); 99 rcu_user_exit();
100 vtime_user_exit(current);
101 __this_cpu_write(context_tracking.state, IN_KERNEL);
72 } 102 }
73 local_irq_restore(flags); 103 local_irq_restore(flags);
74} 104}
75 105
106void guest_enter(void)
107{
108 if (vtime_accounting_enabled())
109 vtime_guest_enter(current);
110 else
111 __guest_enter();
112}
113EXPORT_SYMBOL_GPL(guest_enter);
114
115void guest_exit(void)
116{
117 if (vtime_accounting_enabled())
118 vtime_guest_exit(current);
119 else
120 __guest_exit();
121}
122EXPORT_SYMBOL_GPL(guest_exit);
123
124
125/**
126 * context_tracking_task_switch - context switch the syscall callbacks
127 * @prev: the task that is being switched out
128 * @next: the task that is being switched in
129 *
130 * The context tracking uses the syscall slow path to implement its user-kernel
131 * boundary probes on syscalls. This way it doesn't impact the syscall fast
132 * path on CPUs that don't do context tracking.
133 *
134 * But we need to clear the flag on the previous task because it may later
135 * migrate to some CPU that doesn't do the context tracking. As such the TIF
136 * flag may not be desired there.
137 */
76void context_tracking_task_switch(struct task_struct *prev, 138void context_tracking_task_switch(struct task_struct *prev,
77 struct task_struct *next) 139 struct task_struct *next)
78{ 140{
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3046a503242c..b5e4ab2d427e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu)
224static inline void check_for_tasks(int cpu) 224static inline void check_for_tasks(int cpu)
225{ 225{
226 struct task_struct *p; 226 struct task_struct *p;
227 cputime_t utime, stime;
227 228
228 write_lock_irq(&tasklist_lock); 229 write_lock_irq(&tasklist_lock);
229 for_each_process(p) { 230 for_each_process(p) {
231 task_cputime(p, &utime, &stime);
230 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 232 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
231 (p->utime || p->stime)) 233 (utime || stime))
232 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 234 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
233 "(state = %ld, flags = %x)\n", 235 "(state = %ld, flags = %x)\n",
234 p->comm, task_pid_nr(p), cpu, 236 p->comm, task_pid_nr(p), cpu,
@@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param)
254 return err; 256 return err;
255 257
256 cpu_notify(CPU_DYING | param->mod, param->hcpu); 258 cpu_notify(CPU_DYING | param->mod, param->hcpu);
259 /* Park the stopper thread */
260 kthread_park(current);
257 return 0; 261 return 0;
258} 262}
259 263
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63eea6eb8..4f9dfe43ecbd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62 62
63/* 63/*
64 * Workqueue for cpuset related tasks.
65 *
66 * Using kevent workqueue may cause deadlock when memory_migrate
67 * is set. So we create a separate workqueue thread for cpuset.
68 */
69static struct workqueue_struct *cpuset_wq;
70
71/*
72 * Tracks how many cpusets are currently defined in system. 64 * Tracks how many cpusets are currently defined in system.
73 * When there is only one cpuset (the root cpuset) we can 65 * When there is only one cpuset (the root cpuset) we can
74 * short circuit some hooks. 66 * short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
95 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
96 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
97 89
98 struct cpuset *parent; /* my parent */
99
100 struct fmeter fmeter; /* memory_pressure filter */ 90 struct fmeter fmeter; /* memory_pressure filter */
101 91
92 /*
93 * Tasks are being attached to this cpuset. Used to prevent
94 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
95 */
96 int attach_in_progress;
97
102 /* partition number for rebuild_sched_domains() */ 98 /* partition number for rebuild_sched_domains() */
103 int pn; 99 int pn;
104 100
105 /* for custom sched domain */ 101 /* for custom sched domain */
106 int relax_domain_level; 102 int relax_domain_level;
107 103
108 /* used for walking a cpuset hierarchy */ 104 struct work_struct hotplug_work;
109 struct list_head stack_list;
110}; 105};
111 106
112/* Retrieve the cpuset for a cgroup */ 107/* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
123 struct cpuset, css); 118 struct cpuset, css);
124} 119}
125 120
121static inline struct cpuset *parent_cs(const struct cpuset *cs)
122{
123 struct cgroup *pcgrp = cs->css.cgroup->parent;
124
125 if (pcgrp)
126 return cgroup_cs(pcgrp);
127 return NULL;
128}
129
126#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task) 131static inline bool task_has_mempolicy(struct task_struct *task)
128{ 132{
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
138 142
139/* bits in struct cpuset flags field */ 143/* bits in struct cpuset flags field */
140typedef enum { 144typedef enum {
145 CS_ONLINE,
141 CS_CPU_EXCLUSIVE, 146 CS_CPU_EXCLUSIVE,
142 CS_MEM_EXCLUSIVE, 147 CS_MEM_EXCLUSIVE,
143 CS_MEM_HARDWALL, 148 CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
147 CS_SPREAD_SLAB, 152 CS_SPREAD_SLAB,
148} cpuset_flagbits_t; 153} cpuset_flagbits_t;
149 154
150/* the type of hotplug event */
151enum hotplug_event {
152 CPUSET_CPU_OFFLINE,
153 CPUSET_MEM_OFFLINE,
154};
155
156/* convenient tests for these bits */ 155/* convenient tests for these bits */
156static inline bool is_cpuset_online(const struct cpuset *cs)
157{
158 return test_bit(CS_ONLINE, &cs->flags);
159}
160
157static inline int is_cpu_exclusive(const struct cpuset *cs) 161static inline int is_cpu_exclusive(const struct cpuset *cs)
158{ 162{
159 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); 163 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
190} 194}
191 195
192static struct cpuset top_cpuset = { 196static struct cpuset top_cpuset = {
193 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 197 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
198 (1 << CS_MEM_EXCLUSIVE)),
194}; 199};
195 200
201/**
202 * cpuset_for_each_child - traverse online children of a cpuset
203 * @child_cs: loop cursor pointing to the current child
204 * @pos_cgrp: used for iteration
205 * @parent_cs: target cpuset to walk children of
206 *
207 * Walk @child_cs through the online children of @parent_cs. Must be used
208 * with RCU read locked.
209 */
210#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \
211 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
212 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
213
214/**
215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
216 * @des_cs: loop cursor pointing to the current descendant
217 * @pos_cgrp: used for iteration
218 * @root_cs: target cpuset to walk descendants of
219 *
220 * Walk @des_cs through the online descendants of @root_cs. Must be used
221 * with RCU read locked. The caller may modify @pos_cgrp by calling
222 * cgroup_rightmost_descendant() to skip subtree.
223 */
224#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
225 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
226 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
227
196/* 228/*
197 * There are two global mutexes guarding cpuset structures. The first 229 * There are two global mutexes guarding cpuset structures - cpuset_mutex
198 * is the main control groups cgroup_mutex, accessed via 230 * and callback_mutex. The latter may nest inside the former. We also
199 * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific 231 * require taking task_lock() when dereferencing a task's cpuset pointer.
200 * callback_mutex, below. They can nest. It is ok to first take 232 * See "The task_lock() exception", at the end of this comment.
201 * cgroup_mutex, then nest callback_mutex. We also require taking 233 *
202 * task_lock() when dereferencing a task's cpuset pointer. See "The 234 * A task must hold both mutexes to modify cpusets. If a task holds
203 * task_lock() exception", at the end of this comment. 235 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
204 * 236 * is the only task able to also acquire callback_mutex and be able to
205 * A task must hold both mutexes to modify cpusets. If a task 237 * modify cpusets. It can perform various checks on the cpuset structure
206 * holds cgroup_mutex, then it blocks others wanting that mutex, 238 * first, knowing nothing will change. It can also allocate memory while
207 * ensuring that it is the only task able to also acquire callback_mutex 239 * just holding cpuset_mutex. While it is performing these checks, various
208 * and be able to modify cpusets. It can perform various checks on 240 * callback routines can briefly acquire callback_mutex to query cpusets.
209 * the cpuset structure first, knowing nothing will change. It can 241 * Once it is ready to make the changes, it takes callback_mutex, blocking
210 * also allocate memory while just holding cgroup_mutex. While it is 242 * everyone else.
211 * performing these checks, various callback routines can briefly
212 * acquire callback_mutex to query cpusets. Once it is ready to make
213 * the changes, it takes callback_mutex, blocking everyone else.
214 * 243 *
215 * Calls to the kernel memory allocator can not be made while holding 244 * Calls to the kernel memory allocator can not be made while holding
216 * callback_mutex, as that would risk double tripping on callback_mutex 245 * callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {
232 * guidelines for accessing subsystem state in kernel/cgroup.c 261 * guidelines for accessing subsystem state in kernel/cgroup.c
233 */ 262 */
234 263
264static DEFINE_MUTEX(cpuset_mutex);
235static DEFINE_MUTEX(callback_mutex); 265static DEFINE_MUTEX(callback_mutex);
236 266
237/* 267/*
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
246static DEFINE_SPINLOCK(cpuset_buffer_lock); 276static DEFINE_SPINLOCK(cpuset_buffer_lock);
247 277
248/* 278/*
279 * CPU / memory hotplug is handled asynchronously.
280 */
281static struct workqueue_struct *cpuset_propagate_hotplug_wq;
282
283static void cpuset_hotplug_workfn(struct work_struct *work);
284static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
285static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
286
287static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
288
289/*
249 * This is ugly, but preserves the userspace API for existing cpuset 290 * This is ugly, but preserves the userspace API for existing cpuset
250 * users. If someone tries to mount the "cpuset" filesystem, we 291 * users. If someone tries to mount the "cpuset" filesystem, we
251 * silently switch it to mount "cgroup" instead 292 * silently switch it to mount "cgroup" instead
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
289 struct cpumask *pmask) 330 struct cpumask *pmask)
290{ 331{
291 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 332 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
292 cs = cs->parent; 333 cs = parent_cs(cs);
293 if (cs) 334 if (cs)
294 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); 335 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
295 else 336 else
@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 355{
315 while (cs && !nodes_intersects(cs->mems_allowed, 356 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_MEMORY])) 357 node_states[N_MEMORY]))
317 cs = cs->parent; 358 cs = parent_cs(cs);
318 if (cs) 359 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 360 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_MEMORY]); 361 node_states[N_MEMORY]);
@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
326/* 367/*
327 * update task's spread flag if cpuset's page/slab spread flag is set 368 * update task's spread flag if cpuset's page/slab spread flag is set
328 * 369 *
329 * Called with callback_mutex/cgroup_mutex held 370 * Called with callback_mutex/cpuset_mutex held
330 */ 371 */
331static void cpuset_update_task_spread_flag(struct cpuset *cs, 372static void cpuset_update_task_spread_flag(struct cpuset *cs,
332 struct task_struct *tsk) 373 struct task_struct *tsk)
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
346 * 387 *
347 * One cpuset is a subset of another if all its allowed CPUs and 388 * One cpuset is a subset of another if all its allowed CPUs and
348 * Memory Nodes are a subset of the other, and its exclusive flags 389 * Memory Nodes are a subset of the other, and its exclusive flags
349 * are only set if the other's are set. Call holding cgroup_mutex. 390 * are only set if the other's are set. Call holding cpuset_mutex.
350 */ 391 */
351 392
352static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 393static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)
395 * If we replaced the flag and mask values of the current cpuset 436 * If we replaced the flag and mask values of the current cpuset
396 * (cur) with those values in the trial cpuset (trial), would 437 * (cur) with those values in the trial cpuset (trial), would
397 * our various subset and exclusive rules still be valid? Presumes 438 * our various subset and exclusive rules still be valid? Presumes
398 * cgroup_mutex held. 439 * cpuset_mutex held.
399 * 440 *
400 * 'cur' is the address of an actual, in-use cpuset. Operations 441 * 'cur' is the address of an actual, in-use cpuset. Operations
401 * such as list traversal that depend on the actual address of the 442 * such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
412{ 453{
413 struct cgroup *cont; 454 struct cgroup *cont;
414 struct cpuset *c, *par; 455 struct cpuset *c, *par;
456 int ret;
457
458 rcu_read_lock();
415 459
416 /* Each of our child cpusets must be a subset of us */ 460 /* Each of our child cpusets must be a subset of us */
417 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { 461 ret = -EBUSY;
418 if (!is_cpuset_subset(cgroup_cs(cont), trial)) 462 cpuset_for_each_child(c, cont, cur)
419 return -EBUSY; 463 if (!is_cpuset_subset(c, trial))
420 } 464 goto out;
421 465
422 /* Remaining checks don't apply to root cpuset */ 466 /* Remaining checks don't apply to root cpuset */
467 ret = 0;
423 if (cur == &top_cpuset) 468 if (cur == &top_cpuset)
424 return 0; 469 goto out;
425 470
426 par = cur->parent; 471 par = parent_cs(cur);
427 472
428 /* We must be a subset of our parent cpuset */ 473 /* We must be a subset of our parent cpuset */
474 ret = -EACCES;
429 if (!is_cpuset_subset(trial, par)) 475 if (!is_cpuset_subset(trial, par))
430 return -EACCES; 476 goto out;
431 477
432 /* 478 /*
433 * If either I or some sibling (!= me) is exclusive, we can't 479 * If either I or some sibling (!= me) is exclusive, we can't
434 * overlap 480 * overlap
435 */ 481 */
436 list_for_each_entry(cont, &par->css.cgroup->children, sibling) { 482 ret = -EINVAL;
437 c = cgroup_cs(cont); 483 cpuset_for_each_child(c, cont, par) {
438 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 484 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
439 c != cur && 485 c != cur &&
440 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 486 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
441 return -EINVAL; 487 goto out;
442 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 488 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
443 c != cur && 489 c != cur &&
444 nodes_intersects(trial->mems_allowed, c->mems_allowed)) 490 nodes_intersects(trial->mems_allowed, c->mems_allowed))
445 return -EINVAL; 491 goto out;
446 } 492 }
447 493
448 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ 494 /*
449 if (cgroup_task_count(cur->css.cgroup)) { 495 * Cpusets with tasks - existing or newly being attached - can't
450 if (cpumask_empty(trial->cpus_allowed) || 496 * have empty cpus_allowed or mems_allowed.
451 nodes_empty(trial->mems_allowed)) { 497 */
452 return -ENOSPC; 498 ret = -ENOSPC;
453 } 499 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
454 } 500 (cpumask_empty(trial->cpus_allowed) ||
501 nodes_empty(trial->mems_allowed)))
502 goto out;
455 503
456 return 0; 504 ret = 0;
505out:
506 rcu_read_unlock();
507 return ret;
457} 508}
458 509
459#ifdef CONFIG_SMP 510#ifdef CONFIG_SMP
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
474 return; 525 return;
475} 526}
476 527
477static void 528static void update_domain_attr_tree(struct sched_domain_attr *dattr,
478update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) 529 struct cpuset *root_cs)
479{ 530{
480 LIST_HEAD(q); 531 struct cpuset *cp;
481 532 struct cgroup *pos_cgrp;
482 list_add(&c->stack_list, &q);
483 while (!list_empty(&q)) {
484 struct cpuset *cp;
485 struct cgroup *cont;
486 struct cpuset *child;
487
488 cp = list_first_entry(&q, struct cpuset, stack_list);
489 list_del(q.next);
490 533
491 if (cpumask_empty(cp->cpus_allowed)) 534 rcu_read_lock();
535 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
536 /* skip the whole subtree if @cp doesn't have any CPU */
537 if (cpumask_empty(cp->cpus_allowed)) {
538 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
492 continue; 539 continue;
540 }
493 541
494 if (is_sched_load_balance(cp)) 542 if (is_sched_load_balance(cp))
495 update_domain_attr(dattr, cp); 543 update_domain_attr(dattr, cp);
496
497 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
498 child = cgroup_cs(cont);
499 list_add_tail(&child->stack_list, &q);
500 }
501 } 544 }
545 rcu_read_unlock();
502} 546}
503 547
504/* 548/*
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
520 * domains when operating in the severe memory shortage situations 564 * domains when operating in the severe memory shortage situations
521 * that could cause allocation failures below. 565 * that could cause allocation failures below.
522 * 566 *
523 * Must be called with cgroup_lock held. 567 * Must be called with cpuset_mutex held.
524 * 568 *
525 * The three key local variables below are: 569 * The three key local variables below are:
526 * q - a linked-list queue of cpuset pointers, used to implement a 570 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
558static int generate_sched_domains(cpumask_var_t **domains, 602static int generate_sched_domains(cpumask_var_t **domains,
559 struct sched_domain_attr **attributes) 603 struct sched_domain_attr **attributes)
560{ 604{
561 LIST_HEAD(q); /* queue of cpusets to be scanned */
562 struct cpuset *cp; /* scans q */ 605 struct cpuset *cp; /* scans q */
563 struct cpuset **csa; /* array of all cpuset ptrs */ 606 struct cpuset **csa; /* array of all cpuset ptrs */
564 int csn; /* how many cpuset ptrs in csa so far */ 607 int csn; /* how many cpuset ptrs in csa so far */
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
567 struct sched_domain_attr *dattr; /* attributes for custom domains */ 610 struct sched_domain_attr *dattr; /* attributes for custom domains */
568 int ndoms = 0; /* number of sched domains in result */ 611 int ndoms = 0; /* number of sched domains in result */
569 int nslot; /* next empty doms[] struct cpumask slot */ 612 int nslot; /* next empty doms[] struct cpumask slot */
613 struct cgroup *pos_cgrp;
570 614
571 doms = NULL; 615 doms = NULL;
572 dattr = NULL; 616 dattr = NULL;
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
594 goto done; 638 goto done;
595 csn = 0; 639 csn = 0;
596 640
597 list_add(&top_cpuset.stack_list, &q); 641 rcu_read_lock();
598 while (!list_empty(&q)) { 642 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
599 struct cgroup *cont;
600 struct cpuset *child; /* scans child cpusets of cp */
601
602 cp = list_first_entry(&q, struct cpuset, stack_list);
603 list_del(q.next);
604
605 if (cpumask_empty(cp->cpus_allowed))
606 continue;
607
608 /* 643 /*
609 * All child cpusets contain a subset of the parent's cpus, so 644 * Continue traversing beyond @cp iff @cp has some CPUs and
610 * just skip them, and then we call update_domain_attr_tree() 645 * isn't load balancing. The former is obvious. The
611 * to calc relax_domain_level of the corresponding sched 646 * latter: All child cpusets contain a subset of the
612 * domain. 647 * parent's cpus, so just skip them, and then we call
648 * update_domain_attr_tree() to calc relax_domain_level of
649 * the corresponding sched domain.
613 */ 650 */
614 if (is_sched_load_balance(cp)) { 651 if (!cpumask_empty(cp->cpus_allowed) &&
615 csa[csn++] = cp; 652 !is_sched_load_balance(cp))
616 continue; 653 continue;
617 }
618 654
619 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 655 if (is_sched_load_balance(cp))
620 child = cgroup_cs(cont); 656 csa[csn++] = cp;
621 list_add_tail(&child->stack_list, &q); 657
622 } 658 /* skip @cp's subtree */
623 } 659 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
660 }
661 rcu_read_unlock();
624 662
625 for (i = 0; i < csn; i++) 663 for (i = 0; i < csn; i++)
626 csa[i]->pn = i; 664 csa[i]->pn = i;
@@ -725,25 +763,25 @@ done:
725/* 763/*
726 * Rebuild scheduler domains. 764 * Rebuild scheduler domains.
727 * 765 *
728 * Call with neither cgroup_mutex held nor within get_online_cpus(). 766 * If the flag 'sched_load_balance' of any cpuset with non-empty
729 * Takes both cgroup_mutex and get_online_cpus(). 767 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
768 * which has that flag enabled, or if any cpuset with a non-empty
769 * 'cpus' is removed, then call this routine to rebuild the
770 * scheduler's dynamic sched domains.
730 * 771 *
731 * Cannot be directly called from cpuset code handling changes 772 * Call with cpuset_mutex held. Takes get_online_cpus().
732 * to the cpuset pseudo-filesystem, because it cannot be called
733 * from code that already holds cgroup_mutex.
734 */ 773 */
735static void do_rebuild_sched_domains(struct work_struct *unused) 774static void rebuild_sched_domains_locked(void)
736{ 775{
737 struct sched_domain_attr *attr; 776 struct sched_domain_attr *attr;
738 cpumask_var_t *doms; 777 cpumask_var_t *doms;
739 int ndoms; 778 int ndoms;
740 779
780 lockdep_assert_held(&cpuset_mutex);
741 get_online_cpus(); 781 get_online_cpus();
742 782
743 /* Generate domain masks and attrs */ 783 /* Generate domain masks and attrs */
744 cgroup_lock();
745 ndoms = generate_sched_domains(&doms, &attr); 784 ndoms = generate_sched_domains(&doms, &attr);
746 cgroup_unlock();
747 785
748 /* Have scheduler rebuild the domains */ 786 /* Have scheduler rebuild the domains */
749 partition_sched_domains(ndoms, doms, attr); 787 partition_sched_domains(ndoms, doms, attr);
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
751 put_online_cpus(); 789 put_online_cpus();
752} 790}
753#else /* !CONFIG_SMP */ 791#else /* !CONFIG_SMP */
754static void do_rebuild_sched_domains(struct work_struct *unused) 792static void rebuild_sched_domains_locked(void)
755{ 793{
756} 794}
757 795
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
763} 801}
764#endif /* CONFIG_SMP */ 802#endif /* CONFIG_SMP */
765 803
766static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
767
768/*
769 * Rebuild scheduler domains, asynchronously via workqueue.
770 *
771 * If the flag 'sched_load_balance' of any cpuset with non-empty
772 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
773 * which has that flag enabled, or if any cpuset with a non-empty
774 * 'cpus' is removed, then call this routine to rebuild the
775 * scheduler's dynamic sched domains.
776 *
777 * The rebuild_sched_domains() and partition_sched_domains()
778 * routines must nest cgroup_lock() inside get_online_cpus(),
779 * but such cpuset changes as these must nest that locking the
780 * other way, holding cgroup_lock() for much of the code.
781 *
782 * So in order to avoid an ABBA deadlock, the cpuset code handling
783 * these user changes delegates the actual sched domain rebuilding
784 * to a separate workqueue thread, which ends up processing the
785 * above do_rebuild_sched_domains() function.
786 */
787static void async_rebuild_sched_domains(void)
788{
789 queue_work(cpuset_wq, &rebuild_sched_domains_work);
790}
791
792/*
793 * Accomplishes the same scheduler domain rebuild as the above
794 * async_rebuild_sched_domains(), however it directly calls the
795 * rebuild routine synchronously rather than calling it via an
796 * asynchronous work thread.
797 *
798 * This can only be called from code that is not holding
799 * cgroup_mutex (not nested in a cgroup_lock() call.)
800 */
801void rebuild_sched_domains(void) 804void rebuild_sched_domains(void)
802{ 805{
803 do_rebuild_sched_domains(NULL); 806 mutex_lock(&cpuset_mutex);
807 rebuild_sched_domains_locked();
808 mutex_unlock(&cpuset_mutex);
804} 809}
805 810
806/** 811/**
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void)
808 * @tsk: task to test 813 * @tsk: task to test
809 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner 814 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
810 * 815 *
811 * Call with cgroup_mutex held. May take callback_mutex during call. 816 * Call with cpuset_mutex held. May take callback_mutex during call.
812 * Called for each task in a cgroup by cgroup_scan_tasks(). 817 * Called for each task in a cgroup by cgroup_scan_tasks().
813 * Return nonzero if this task's cpus_allowed mask should be changed (in other 818 * Return nonzero if this task's cpus_allowed mask should be changed (in other
814 * words, if its mask is not equal to its cpuset's mask). 819 * words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
829 * cpus_allowed mask needs to be changed. 834 * cpus_allowed mask needs to be changed.
830 * 835 *
831 * We don't need to re-check for the cgroup/cpuset membership, since we're 836 * We don't need to re-check for the cgroup/cpuset membership, since we're
832 * holding cgroup_lock() at this point. 837 * holding cpuset_mutex at this point.
833 */ 838 */
834static void cpuset_change_cpumask(struct task_struct *tsk, 839static void cpuset_change_cpumask(struct task_struct *tsk,
835 struct cgroup_scanner *scan) 840 struct cgroup_scanner *scan)
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
842 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 847 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
843 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 848 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
844 * 849 *
845 * Called with cgroup_mutex held 850 * Called with cpuset_mutex held
846 * 851 *
847 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 852 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
848 * calling callback functions for each. 853 * calling callback functions for each.
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 heap_free(&heap); 925 heap_free(&heap);
921 926
922 if (is_load_balanced) 927 if (is_load_balanced)
923 async_rebuild_sched_domains(); 928 rebuild_sched_domains_locked();
924 return 0; 929 return 0;
925} 930}
926 931
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
932 * Temporarily set task's mems_allowed to target nodes of migration, 937 * Temporarily set task's mems_allowed to target nodes of migration,
933 * so that the migration code can allocate pages on these nodes. 938 * so that the migration code can allocate pages on these nodes.
934 * 939 *
935 * Call holding cgroup_mutex, so current's cpuset won't change 940 * Call holding cpuset_mutex, so current's cpuset won't change
936 * during this call, as manage_mutex holds off any cpuset_attach() 941 * during this call, as manage_mutex holds off any cpuset_attach()
937 * calls. Therefore we don't need to take task_lock around the 942 * calls. Therefore we don't need to take task_lock around the
938 * call to guarantee_online_mems(), as we know no one is changing 943 * call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1007/* 1012/*
1008 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1013 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1009 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1014 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1010 * memory_migrate flag is set. Called with cgroup_mutex held. 1015 * memory_migrate flag is set. Called with cpuset_mutex held.
1011 */ 1016 */
1012static void cpuset_change_nodemask(struct task_struct *p, 1017static void cpuset_change_nodemask(struct task_struct *p,
1013 struct cgroup_scanner *scan) 1018 struct cgroup_scanner *scan)
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1016 struct cpuset *cs; 1021 struct cpuset *cs;
1017 int migrate; 1022 int migrate;
1018 const nodemask_t *oldmem = scan->data; 1023 const nodemask_t *oldmem = scan->data;
1019 static nodemask_t newmems; /* protected by cgroup_mutex */ 1024 static nodemask_t newmems; /* protected by cpuset_mutex */
1020 1025
1021 cs = cgroup_cs(scan->cg); 1026 cs = cgroup_cs(scan->cg);
1022 guarantee_online_mems(cs, &newmems); 1027 guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;
1043 * @oldmem: old mems_allowed of cpuset cs 1048 * @oldmem: old mems_allowed of cpuset cs
1044 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1049 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1045 * 1050 *
1046 * Called with cgroup_mutex held 1051 * Called with cpuset_mutex held
1047 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1052 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1048 * if @heap != NULL. 1053 * if @heap != NULL.
1049 */ 1054 */
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1065 * take while holding tasklist_lock. Forks can happen - the 1070 * take while holding tasklist_lock. Forks can happen - the
1066 * mpol_dup() cpuset_being_rebound check will catch such forks, 1071 * mpol_dup() cpuset_being_rebound check will catch such forks,
1067 * and rebind their vma mempolicies too. Because we still hold 1072 * and rebind their vma mempolicies too. Because we still hold
1068 * the global cgroup_mutex, we know that no other rebind effort 1073 * the global cpuset_mutex, we know that no other rebind effort
1069 * will be contending for the global variable cpuset_being_rebound. 1074 * will be contending for the global variable cpuset_being_rebound.
1070 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1075 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1071 * is idempotent. Also migrate pages in each mm to new nodes. 1076 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1084 * mempolicies and if the cpuset is marked 'memory_migrate', 1089 * mempolicies and if the cpuset is marked 'memory_migrate',
1085 * migrate the tasks pages to the new memory. 1090 * migrate the tasks pages to the new memory.
1086 * 1091 *
1087 * Call with cgroup_mutex held. May take callback_mutex during call. 1092 * Call with cpuset_mutex held. May take callback_mutex during call.
1088 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1093 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1089 * lock each such task's mm->mmap_sem, scan its vma's and rebind 1094 * lock each such task's mm->mmap_sem, scan its vma's and rebind
1090 * their mempolicies to the cpusets new mems_allowed. 1095 * their mempolicies to the cpusets new mems_allowed.
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1168 cs->relax_domain_level = val; 1173 cs->relax_domain_level = val;
1169 if (!cpumask_empty(cs->cpus_allowed) && 1174 if (!cpumask_empty(cs->cpus_allowed) &&
1170 is_sched_load_balance(cs)) 1175 is_sched_load_balance(cs))
1171 async_rebuild_sched_domains(); 1176 rebuild_sched_domains_locked();
1172 } 1177 }
1173 1178
1174 return 0; 1179 return 0;
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1182 * Called by cgroup_scan_tasks() for each task in a cgroup. 1187 * Called by cgroup_scan_tasks() for each task in a cgroup.
1183 * 1188 *
1184 * We don't need to re-check for the cgroup/cpuset membership, since we're 1189 * We don't need to re-check for the cgroup/cpuset membership, since we're
1185 * holding cgroup_lock() at this point. 1190 * holding cpuset_mutex at this point.
1186 */ 1191 */
1187static void cpuset_change_flag(struct task_struct *tsk, 1192static void cpuset_change_flag(struct task_struct *tsk,
1188 struct cgroup_scanner *scan) 1193 struct cgroup_scanner *scan)
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
1195 * @cs: the cpuset in which each task's spread flags needs to be changed 1200 * @cs: the cpuset in which each task's spread flags needs to be changed
1196 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1201 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1197 * 1202 *
1198 * Called with cgroup_mutex held 1203 * Called with cpuset_mutex held
1199 * 1204 *
1200 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1205 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1201 * calling callback functions for each. 1206 * calling callback functions for each.
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1220 * cs: the cpuset to update 1225 * cs: the cpuset to update
1221 * turning_on: whether the flag is being set or cleared 1226 * turning_on: whether the flag is being set or cleared
1222 * 1227 *
1223 * Call with cgroup_mutex held. 1228 * Call with cpuset_mutex held.
1224 */ 1229 */
1225 1230
1226static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1231static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1260 mutex_unlock(&callback_mutex); 1265 mutex_unlock(&callback_mutex);
1261 1266
1262 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1267 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1263 async_rebuild_sched_domains(); 1268 rebuild_sched_domains_locked();
1264 1269
1265 if (spread_flag_changed) 1270 if (spread_flag_changed)
1266 update_tasks_flags(cs, &heap); 1271 update_tasks_flags(cs, &heap);
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)
1368 return val; 1373 return val;
1369} 1374}
1370 1375
1371/* 1376/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1372 * Protected by cgroup_lock. The nodemasks must be stored globally because
1373 * dynamically allocating them is not allowed in can_attach, and they must
1374 * persist until attach.
1375 */
1376static cpumask_var_t cpus_attach;
1377static nodemask_t cpuset_attach_nodemask_from;
1378static nodemask_t cpuset_attach_nodemask_to;
1379
1380/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1381static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1377static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1382{ 1378{
1383 struct cpuset *cs = cgroup_cs(cgrp); 1379 struct cpuset *cs = cgroup_cs(cgrp);
1384 struct task_struct *task; 1380 struct task_struct *task;
1385 int ret; 1381 int ret;
1386 1382
1383 mutex_lock(&cpuset_mutex);
1384
1385 ret = -ENOSPC;
1387 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1386 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1388 return -ENOSPC; 1387 goto out_unlock;
1389 1388
1390 cgroup_taskset_for_each(task, cgrp, tset) { 1389 cgroup_taskset_for_each(task, cgrp, tset) {
1391 /* 1390 /*
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1397 * set_cpus_allowed_ptr() on all attached tasks before 1396 * set_cpus_allowed_ptr() on all attached tasks before
1398 * cpus_allowed may be changed. 1397 * cpus_allowed may be changed.
1399 */ 1398 */
1399 ret = -EINVAL;
1400 if (task->flags & PF_THREAD_BOUND) 1400 if (task->flags & PF_THREAD_BOUND)
1401 return -EINVAL; 1401 goto out_unlock;
1402 if ((ret = security_task_setscheduler(task))) 1402 ret = security_task_setscheduler(task);
1403 return ret; 1403 if (ret)
1404 goto out_unlock;
1404 } 1405 }
1405 1406
1406 /* prepare for attach */ 1407 /*
1407 if (cs == &top_cpuset) 1408 * Mark that an attach is in progress. This makes validate_change() fail
1408 cpumask_copy(cpus_attach, cpu_possible_mask); 1409 * changes which zero cpus/mems_allowed.
1409 else 1410 */
1410 guarantee_online_cpus(cs, cpus_attach); 1411 cs->attach_in_progress++;
1411 1412 ret = 0;
1412 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1413out_unlock:
1414 mutex_unlock(&cpuset_mutex);
1415 return ret;
1416}
1413 1417
1414 return 0; 1418static void cpuset_cancel_attach(struct cgroup *cgrp,
1419 struct cgroup_taskset *tset)
1420{
1421 mutex_lock(&cpuset_mutex);
1422 cgroup_cs(cgrp)->attach_in_progress--;
1423 mutex_unlock(&cpuset_mutex);
1415} 1424}
1416 1425
1426/*
1427 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
1428 * but we can't allocate it dynamically there. Define it global and
1429 * allocate from cpuset_init().
1430 */
1431static cpumask_var_t cpus_attach;
1432
1417static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1433static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1418{ 1434{
1435 /* static bufs protected by cpuset_mutex */
1436 static nodemask_t cpuset_attach_nodemask_from;
1437 static nodemask_t cpuset_attach_nodemask_to;
1419 struct mm_struct *mm; 1438 struct mm_struct *mm;
1420 struct task_struct *task; 1439 struct task_struct *task;
1421 struct task_struct *leader = cgroup_taskset_first(tset); 1440 struct task_struct *leader = cgroup_taskset_first(tset);
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1423 struct cpuset *cs = cgroup_cs(cgrp); 1442 struct cpuset *cs = cgroup_cs(cgrp);
1424 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1443 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1425 1444
1445 mutex_lock(&cpuset_mutex);
1446
1447 /* prepare for attach */
1448 if (cs == &top_cpuset)
1449 cpumask_copy(cpus_attach, cpu_possible_mask);
1450 else
1451 guarantee_online_cpus(cs, cpus_attach);
1452
1453 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1454
1426 cgroup_taskset_for_each(task, cgrp, tset) { 1455 cgroup_taskset_for_each(task, cgrp, tset) {
1427 /* 1456 /*
1428 * can_attach beforehand should guarantee that this doesn't 1457 * can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1448 &cpuset_attach_nodemask_to); 1477 &cpuset_attach_nodemask_to);
1449 mmput(mm); 1478 mmput(mm);
1450 } 1479 }
1480
1481 cs->attach_in_progress--;
1482
1483 /*
1484 * We may have raced with CPU/memory hotunplug. Trigger hotplug
1485 * propagation if @cs doesn't have any CPU or memory. It will move
1486 * the newly added tasks to the nearest parent which can execute.
1487 */
1488 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1489 schedule_cpuset_propagate_hotplug(cs);
1490
1491 mutex_unlock(&cpuset_mutex);
1451} 1492}
1452 1493
1453/* The various types of files and directories in a cpuset file system */ 1494/* The various types of files and directories in a cpuset file system */
@@ -1469,12 +1510,13 @@ typedef enum {
1469 1510
1470static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1511static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1471{ 1512{
1472 int retval = 0;
1473 struct cpuset *cs = cgroup_cs(cgrp); 1513 struct cpuset *cs = cgroup_cs(cgrp);
1474 cpuset_filetype_t type = cft->private; 1514 cpuset_filetype_t type = cft->private;
1515 int retval = -ENODEV;
1475 1516
1476 if (!cgroup_lock_live_group(cgrp)) 1517 mutex_lock(&cpuset_mutex);
1477 return -ENODEV; 1518 if (!is_cpuset_online(cs))
1519 goto out_unlock;
1478 1520
1479 switch (type) { 1521 switch (type) {
1480 case FILE_CPU_EXCLUSIVE: 1522 case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1508 retval = -EINVAL; 1550 retval = -EINVAL;
1509 break; 1551 break;
1510 } 1552 }
1511 cgroup_unlock(); 1553out_unlock:
1554 mutex_unlock(&cpuset_mutex);
1512 return retval; 1555 return retval;
1513} 1556}
1514 1557
1515static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1558static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1516{ 1559{
1517 int retval = 0;
1518 struct cpuset *cs = cgroup_cs(cgrp); 1560 struct cpuset *cs = cgroup_cs(cgrp);
1519 cpuset_filetype_t type = cft->private; 1561 cpuset_filetype_t type = cft->private;
1562 int retval = -ENODEV;
1520 1563
1521 if (!cgroup_lock_live_group(cgrp)) 1564 mutex_lock(&cpuset_mutex);
1522 return -ENODEV; 1565 if (!is_cpuset_online(cs))
1566 goto out_unlock;
1523 1567
1524 switch (type) { 1568 switch (type) {
1525 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1569 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1529 retval = -EINVAL; 1573 retval = -EINVAL;
1530 break; 1574 break;
1531 } 1575 }
1532 cgroup_unlock(); 1576out_unlock:
1577 mutex_unlock(&cpuset_mutex);
1533 return retval; 1578 return retval;
1534} 1579}
1535 1580
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1539static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1584static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1540 const char *buf) 1585 const char *buf)
1541{ 1586{
1542 int retval = 0;
1543 struct cpuset *cs = cgroup_cs(cgrp); 1587 struct cpuset *cs = cgroup_cs(cgrp);
1544 struct cpuset *trialcs; 1588 struct cpuset *trialcs;
1589 int retval = -ENODEV;
1590
1591 /*
1592 * CPU or memory hotunplug may leave @cs w/o any execution
1593 * resources, in which case the hotplug code asynchronously updates
1594 * configuration and transfers all tasks to the nearest ancestor
1595 * which can execute.
1596 *
1597 * As writes to "cpus" or "mems" may restore @cs's execution
1598 * resources, wait for the previously scheduled operations before
1599 * proceeding, so that we don't end up repeatedly removing tasks added
1600 * after execution capability is restored.
1601 *
1602 * Flushing cpuset_hotplug_work is enough to synchronize against
1603 * hotplug handling; however, cpuset_attach() may schedule
1604 * propagation work directly. Flush the workqueue too.
1605 */
1606 flush_work(&cpuset_hotplug_work);
1607 flush_workqueue(cpuset_propagate_hotplug_wq);
1545 1608
1546 if (!cgroup_lock_live_group(cgrp)) 1609 mutex_lock(&cpuset_mutex);
1547 return -ENODEV; 1610 if (!is_cpuset_online(cs))
1611 goto out_unlock;
1548 1612
1549 trialcs = alloc_trial_cpuset(cs); 1613 trialcs = alloc_trial_cpuset(cs);
1550 if (!trialcs) { 1614 if (!trialcs) {
1551 retval = -ENOMEM; 1615 retval = -ENOMEM;
1552 goto out; 1616 goto out_unlock;
1553 } 1617 }
1554 1618
1555 switch (cft->private) { 1619 switch (cft->private) {
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1565 } 1629 }
1566 1630
1567 free_trial_cpuset(trialcs); 1631 free_trial_cpuset(trialcs);
1568out: 1632out_unlock:
1569 cgroup_unlock(); 1633 mutex_unlock(&cpuset_mutex);
1570 return retval; 1634 return retval;
1571} 1635}
1572 1636
@@ -1790,15 +1854,12 @@ static struct cftype files[] = {
1790 1854
1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) 1855static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1792{ 1856{
1793 struct cgroup *parent_cg = cont->parent; 1857 struct cpuset *cs;
1794 struct cgroup *tmp_cg;
1795 struct cpuset *parent, *cs;
1796 1858
1797 if (!parent_cg) 1859 if (!cont->parent)
1798 return &top_cpuset.css; 1860 return &top_cpuset.css;
1799 parent = cgroup_cs(parent_cg);
1800 1861
1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1862 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1802 if (!cs) 1863 if (!cs)
1803 return ERR_PTR(-ENOMEM); 1864 return ERR_PTR(-ENOMEM);
1804 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { 1865 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1806 return ERR_PTR(-ENOMEM); 1867 return ERR_PTR(-ENOMEM);
1807 } 1868 }
1808 1869
1809 cs->flags = 0;
1810 if (is_spread_page(parent))
1811 set_bit(CS_SPREAD_PAGE, &cs->flags);
1812 if (is_spread_slab(parent))
1813 set_bit(CS_SPREAD_SLAB, &cs->flags);
1814 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1870 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1815 cpumask_clear(cs->cpus_allowed); 1871 cpumask_clear(cs->cpus_allowed);
1816 nodes_clear(cs->mems_allowed); 1872 nodes_clear(cs->mems_allowed);
1817 fmeter_init(&cs->fmeter); 1873 fmeter_init(&cs->fmeter);
1874 INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
1818 cs->relax_domain_level = -1; 1875 cs->relax_domain_level = -1;
1819 1876
1820 cs->parent = parent; 1877 return &cs->css;
1878}
1879
1880static int cpuset_css_online(struct cgroup *cgrp)
1881{
1882 struct cpuset *cs = cgroup_cs(cgrp);
1883 struct cpuset *parent = parent_cs(cs);
1884 struct cpuset *tmp_cs;
1885 struct cgroup *pos_cg;
1886
1887 if (!parent)
1888 return 0;
1889
1890 mutex_lock(&cpuset_mutex);
1891
1892 set_bit(CS_ONLINE, &cs->flags);
1893 if (is_spread_page(parent))
1894 set_bit(CS_SPREAD_PAGE, &cs->flags);
1895 if (is_spread_slab(parent))
1896 set_bit(CS_SPREAD_SLAB, &cs->flags);
1897
1821 number_of_cpusets++; 1898 number_of_cpusets++;
1822 1899
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) 1900 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
1824 goto skip_clone; 1901 goto out_unlock;
1825 1902
1826 /* 1903 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 1904 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 1913 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup. 1914 * (and likewise for mems) to the new cgroup.
1838 */ 1915 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { 1916 rcu_read_lock();
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg); 1917 cpuset_for_each_child(tmp_cs, pos_cg, parent) {
1841 1918 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) 1919 rcu_read_unlock();
1843 goto skip_clone; 1920 goto out_unlock;
1921 }
1844 } 1922 }
1923 rcu_read_unlock();
1845 1924
1846 mutex_lock(&callback_mutex); 1925 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed; 1926 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 1927 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex); 1928 mutex_unlock(&callback_mutex);
1850skip_clone: 1929out_unlock:
1851 return &cs->css; 1930 mutex_unlock(&cpuset_mutex);
1931 return 0;
1932}
1933
1934static void cpuset_css_offline(struct cgroup *cgrp)
1935{
1936 struct cpuset *cs = cgroup_cs(cgrp);
1937
1938 mutex_lock(&cpuset_mutex);
1939
1940 if (is_sched_load_balance(cs))
1941 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1942
1943 number_of_cpusets--;
1944 clear_bit(CS_ONLINE, &cs->flags);
1945
1946 mutex_unlock(&cpuset_mutex);
1852} 1947}
1853 1948
1854/* 1949/*
1855 * If the cpuset being removed has its flag 'sched_load_balance' 1950 * If the cpuset being removed has its flag 'sched_load_balance'
1856 * enabled, then simulate turning sched_load_balance off, which 1951 * enabled, then simulate turning sched_load_balance off, which
1857 * will call async_rebuild_sched_domains(). 1952 * will call rebuild_sched_domains_locked().
1858 */ 1953 */
1859 1954
1860static void cpuset_css_free(struct cgroup *cont) 1955static void cpuset_css_free(struct cgroup *cont)
1861{ 1956{
1862 struct cpuset *cs = cgroup_cs(cont); 1957 struct cpuset *cs = cgroup_cs(cont);
1863 1958
1864 if (is_sched_load_balance(cs))
1865 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1866
1867 number_of_cpusets--;
1868 free_cpumask_var(cs->cpus_allowed); 1959 free_cpumask_var(cs->cpus_allowed);
1869 kfree(cs); 1960 kfree(cs);
1870} 1961}
@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont)
1872struct cgroup_subsys cpuset_subsys = { 1963struct cgroup_subsys cpuset_subsys = {
1873 .name = "cpuset", 1964 .name = "cpuset",
1874 .css_alloc = cpuset_css_alloc, 1965 .css_alloc = cpuset_css_alloc,
1966 .css_online = cpuset_css_online,
1967 .css_offline = cpuset_css_offline,
1875 .css_free = cpuset_css_free, 1968 .css_free = cpuset_css_free,
1876 .can_attach = cpuset_can_attach, 1969 .can_attach = cpuset_can_attach,
1970 .cancel_attach = cpuset_cancel_attach,
1877 .attach = cpuset_attach, 1971 .attach = cpuset_attach,
1878 .subsys_id = cpuset_subsys_id, 1972 .subsys_id = cpuset_subsys_id,
1879 .base_cftypes = files, 1973 .base_cftypes = files,
@@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1924{ 2018{
1925 struct cgroup *new_cgroup = scan->data; 2019 struct cgroup *new_cgroup = scan->data;
1926 2020
2021 cgroup_lock();
1927 cgroup_attach_task(new_cgroup, tsk); 2022 cgroup_attach_task(new_cgroup, tsk);
2023 cgroup_unlock();
1928} 2024}
1929 2025
1930/** 2026/**
@@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1932 * @from: cpuset in which the tasks currently reside 2028 * @from: cpuset in which the tasks currently reside
1933 * @to: cpuset to which the tasks will be moved 2029 * @to: cpuset to which the tasks will be moved
1934 * 2030 *
1935 * Called with cgroup_mutex held 2031 * Called with cpuset_mutex held
1936 * callback_mutex must not be held, as cpuset_attach() will take it. 2032 * callback_mutex must not be held, as cpuset_attach() will take it.
1937 * 2033 *
1938 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 2034 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1959 * removing that CPU or node from all cpusets. If this removes the 2055 * removing that CPU or node from all cpusets. If this removes the
1960 * last CPU or node from a cpuset, then move the tasks in the empty 2056 * last CPU or node from a cpuset, then move the tasks in the empty
1961 * cpuset to its next-highest non-empty parent. 2057 * cpuset to its next-highest non-empty parent.
1962 *
1963 * Called with cgroup_mutex held
1964 * callback_mutex must not be held, as cpuset_attach() will take it.
1965 */ 2058 */
1966static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 2059static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1967{ 2060{
1968 struct cpuset *parent; 2061 struct cpuset *parent;
1969 2062
1970 /* 2063 /*
1971 * The cgroup's css_sets list is in use if there are tasks
1972 * in the cpuset; the list is empty if there are none;
1973 * the cs->css.refcnt seems always 0.
1974 */
1975 if (list_empty(&cs->css.cgroup->css_sets))
1976 return;
1977
1978 /*
1979 * Find its next-highest non-empty parent, (top cpuset 2064 * Find its next-highest non-empty parent, (top cpuset
1980 * has online cpus, so can't be empty). 2065 * has online cpus, so can't be empty).
1981 */ 2066 */
1982 parent = cs->parent; 2067 parent = parent_cs(cs);
1983 while (cpumask_empty(parent->cpus_allowed) || 2068 while (cpumask_empty(parent->cpus_allowed) ||
1984 nodes_empty(parent->mems_allowed)) 2069 nodes_empty(parent->mems_allowed))
1985 parent = parent->parent; 2070 parent = parent_cs(parent);
1986 2071
1987 move_member_tasks_to_cpuset(cs, parent); 2072 move_member_tasks_to_cpuset(cs, parent);
1988} 2073}
1989 2074
1990/* 2075/**
1991 * Helper function to traverse cpusets. 2076 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
1992 * It can be used to walk the cpuset tree from top to bottom, completing 2077 * @cs: cpuset of interest
1993 * one layer before dropping down to the next (thus always processing a 2078 *
1994 * node before any of its children). 2079 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2080 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2081 * all its tasks are moved to the nearest ancestor with both resources.
1995 */ 2082 */
1996static struct cpuset *cpuset_next(struct list_head *queue) 2083static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
1997{ 2084{
1998 struct cpuset *cp; 2085 static cpumask_t off_cpus;
1999 struct cpuset *child; /* scans child cpusets of cp */ 2086 static nodemask_t off_mems, tmp_mems;
2000 struct cgroup *cont; 2087 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2088 bool is_empty;
2001 2089
2002 if (list_empty(queue)) 2090 mutex_lock(&cpuset_mutex);
2003 return NULL; 2091
2092 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2093 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2004 2094
2005 cp = list_first_entry(queue, struct cpuset, stack_list); 2095 /* remove offline cpus from @cs */
2006 list_del(queue->next); 2096 if (!cpumask_empty(&off_cpus)) {
2007 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 2097 mutex_lock(&callback_mutex);
2008 child = cgroup_cs(cont); 2098 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2009 list_add_tail(&child->stack_list, queue); 2099 mutex_unlock(&callback_mutex);
2100 update_tasks_cpumask(cs, NULL);
2101 }
2102
2103 /* remove offline mems from @cs */
2104 if (!nodes_empty(off_mems)) {
2105 tmp_mems = cs->mems_allowed;
2106 mutex_lock(&callback_mutex);
2107 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2108 mutex_unlock(&callback_mutex);
2109 update_tasks_nodemask(cs, &tmp_mems, NULL);
2010 } 2110 }
2011 2111
2012 return cp; 2112 is_empty = cpumask_empty(cs->cpus_allowed) ||
2113 nodes_empty(cs->mems_allowed);
2114
2115 mutex_unlock(&cpuset_mutex);
2116
2117 /*
2118 * If @cs became empty, move tasks to the nearest ancestor with
2119 * execution resources. This is a full cgroup operation which will
2120 * also call back into cpuset. Should be done outside any lock.
2121 */
2122 if (is_empty)
2123 remove_tasks_in_empty_cpuset(cs);
2124
2125 /* the following may free @cs, should be the last operation */
2126 css_put(&cs->css);
2013} 2127}
2014 2128
2129/**
2130 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
2131 * @cs: cpuset of interest
2132 *
2133 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
2134 * memory masks according to top_cpuset.
2135 */
2136static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2137{
2138 /*
2139 * Pin @cs. The refcnt will be released when the work item
2140 * finishes executing.
2141 */
2142 if (!css_tryget(&cs->css))
2143 return;
2015 2144
2016/* 2145 /*
2017 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory 2146 * Queue @cs->hotplug_work. If already pending, lose the css ref.
2018 * online/offline) and update the cpusets accordingly. 2147 * cpuset_propagate_hotplug_wq is ordered and propagation will
2019 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such 2148 * happen in the order this function is called.
2020 * cpuset must be moved to a parent cpuset. 2149 */
2150 if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
2151 css_put(&cs->css);
2152}
2153
2154/**
2155 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
2021 * 2156 *
2022 * Called with cgroup_mutex held. We take callback_mutex to modify 2157 * This function is called after either CPU or memory configuration has
2023 * cpus_allowed and mems_allowed. 2158 * changed and updates cpuset accordingly. The top_cpuset is always
2159 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
2160 * order to make cpusets transparent (of no effect) on systems that are
2161 * actively using CPU hotplug but making no active use of cpusets.
2024 * 2162 *
2025 * This walk processes the tree from top to bottom, completing one layer 2163 * Non-root cpusets are only affected by offlining. If any CPUs or memory
2026 * before dropping down to the next. It always processes a node before 2164 * nodes have been taken down, schedule_cpuset_propagate_hotplug() is invoked on all
2027 * any of its children. 2165 * descendants.
2028 * 2166 *
2029 * In the case of memory hot-unplug, it will remove nodes from N_MEMORY 2167 * Note that CPU offlining during suspend is ignored. We don't modify
2030 * if all present pages from a node are offlined. 2168 * cpusets across suspend/resume cycles at all.
2031 */ 2169 */
2032static void 2170static void cpuset_hotplug_workfn(struct work_struct *work)
2033scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2034{ 2171{
2035 LIST_HEAD(queue); 2172 static cpumask_t new_cpus, tmp_cpus;
2036 struct cpuset *cp; /* scans cpusets being updated */ 2173 static nodemask_t new_mems, tmp_mems;
2037 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2174 bool cpus_updated, mems_updated;
2175 bool cpus_offlined, mems_offlined;
2038 2176
2039 list_add_tail((struct list_head *)&root->stack_list, &queue); 2177 mutex_lock(&cpuset_mutex);
2040 2178
2041 switch (event) { 2179 /* fetch the available cpus/mems and find out which changed how */
2042 case CPUSET_CPU_OFFLINE: 2180 cpumask_copy(&new_cpus, cpu_active_mask);
2043 while ((cp = cpuset_next(&queue)) != NULL) { 2181 new_mems = node_states[N_MEMORY];
2044 2182
2045 /* Continue past cpusets with all cpus online */ 2183 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2046 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) 2184 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2047 continue; 2185 &new_cpus);
2048 2186
2049 /* Remove offline cpus from this cpuset. */ 2187 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2050 mutex_lock(&callback_mutex); 2188 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2051 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2189 mems_offlined = !nodes_empty(tmp_mems);
2052 cpu_active_mask);
2053 mutex_unlock(&callback_mutex);
2054 2190
2055 /* Move tasks from the empty cpuset to a parent */ 2191 /* synchronize cpus_allowed to cpu_active_mask */
2056 if (cpumask_empty(cp->cpus_allowed)) 2192 if (cpus_updated) {
2057 remove_tasks_in_empty_cpuset(cp); 2193 mutex_lock(&callback_mutex);
2058 else 2194 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2059 update_tasks_cpumask(cp, NULL); 2195 mutex_unlock(&callback_mutex);
2060 } 2196 /* we don't mess with cpumasks of tasks in top_cpuset */
2061 break; 2197 }
2062 2198
2063 case CPUSET_MEM_OFFLINE: 2199 /* synchronize mems_allowed to N_MEMORY */
2064 while ((cp = cpuset_next(&queue)) != NULL) { 2200 if (mems_updated) {
2201 tmp_mems = top_cpuset.mems_allowed;
2202 mutex_lock(&callback_mutex);
2203 top_cpuset.mems_allowed = new_mems;
2204 mutex_unlock(&callback_mutex);
2205 update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
2206 }
2065 2207
2066 /* Continue past cpusets with all mems online */ 2208 /* if cpus or mems went down, we need to propagate to descendants */
2067 if (nodes_subset(cp->mems_allowed, 2209 if (cpus_offlined || mems_offlined) {
2068 node_states[N_MEMORY])) 2210 struct cpuset *cs;
2069 continue; 2211 struct cgroup *pos_cgrp;
2070 2212
2071 oldmems = cp->mems_allowed; 2213 rcu_read_lock();
2214 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
2215 schedule_cpuset_propagate_hotplug(cs);
2216 rcu_read_unlock();
2217 }
2072 2218
2073 /* Remove offline mems from this cpuset. */ 2219 mutex_unlock(&cpuset_mutex);
2074 mutex_lock(&callback_mutex);
2075 nodes_and(cp->mems_allowed, cp->mems_allowed,
2076 node_states[N_MEMORY]);
2077 mutex_unlock(&callback_mutex);
2078 2220
2079 /* Move tasks from the empty cpuset to a parent */ 2221 /* wait for propagations to finish */
2080 if (nodes_empty(cp->mems_allowed)) 2222 flush_workqueue(cpuset_propagate_hotplug_wq);
2081 remove_tasks_in_empty_cpuset(cp); 2223
2082 else 2224 /* rebuild sched domains if cpus_allowed has changed */
2083 update_tasks_nodemask(cp, &oldmems, NULL); 2225 if (cpus_updated) {
2084 } 2226 struct sched_domain_attr *attr;
2227 cpumask_var_t *doms;
2228 int ndoms;
2229
2230 mutex_lock(&cpuset_mutex);
2231 ndoms = generate_sched_domains(&doms, &attr);
2232 mutex_unlock(&cpuset_mutex);
2233
2234 partition_sched_domains(ndoms, doms, attr);
2085 } 2235 }
2086} 2236}
2087 2237
2088/*
2089 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2090 * period. This is necessary in order to make cpusets transparent
2091 * (of no affect) on systems that are actively using CPU hotplug
2092 * but making no active use of cpusets.
2093 *
2094 * The only exception to this is suspend/resume, where we don't
2095 * modify cpusets at all.
2096 *
2097 * This routine ensures that top_cpuset.cpus_allowed tracks
2098 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2099 *
2100 * Called within get_online_cpus(). Needs to call cgroup_lock()
2101 * before calling generate_sched_domains().
2102 *
2103 * @cpu_online: Indicates whether this is a CPU online event (true) or
2104 * a CPU offline event (false).
2105 */
2106void cpuset_update_active_cpus(bool cpu_online) 2238void cpuset_update_active_cpus(bool cpu_online)
2107{ 2239{
2108 struct sched_domain_attr *attr; 2240 /*
2109 cpumask_var_t *doms; 2241 * We're inside cpu hotplug critical region which usually nests
2110 int ndoms; 2242 * inside cgroup synchronization. Bounce actual hotplug processing
2111 2243 * to a work item to avoid reverse locking order.
2112 cgroup_lock(); 2244 *
2113 mutex_lock(&callback_mutex); 2245 * We still need to do partition_sched_domains() synchronously;
2114 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2246 * otherwise, the scheduler will get confused and put tasks to the
2115 mutex_unlock(&callback_mutex); 2247 * dead CPU. Fall back to the default single domain.
2116 2248 * cpuset_hotplug_workfn() will rebuild it as necessary.
2117 if (!cpu_online) 2249 */
2118 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); 2250 partition_sched_domains(1, NULL, NULL);
2119 2251 schedule_work(&cpuset_hotplug_work);
2120 ndoms = generate_sched_domains(&doms, &attr);
2121 cgroup_unlock();
2122
2123 /* Have scheduler rebuild the domains */
2124 partition_sched_domains(ndoms, doms, attr);
2125} 2252}
2126 2253
2127#ifdef CONFIG_MEMORY_HOTPLUG 2254#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online)
2133static int cpuset_track_online_nodes(struct notifier_block *self, 2260static int cpuset_track_online_nodes(struct notifier_block *self,
2134 unsigned long action, void *arg) 2261 unsigned long action, void *arg)
2135{ 2262{
2136 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2263 schedule_work(&cpuset_hotplug_work);
2137
2138 cgroup_lock();
2139 switch (action) {
2140 case MEM_ONLINE:
2141 oldmems = top_cpuset.mems_allowed;
2142 mutex_lock(&callback_mutex);
2143 top_cpuset.mems_allowed = node_states[N_MEMORY];
2144 mutex_unlock(&callback_mutex);
2145 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2146 break;
2147 case MEM_OFFLINE:
2148 /*
2149 * needn't update top_cpuset.mems_allowed explicitly because
2150 * scan_cpusets_upon_hotplug() will update it.
2151 */
2152 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
2153 break;
2154 default:
2155 break;
2156 }
2157 cgroup_unlock();
2158
2159 return NOTIFY_OK; 2264 return NOTIFY_OK;
2160} 2265}
2161#endif 2266#endif
@@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void)
2173 2278
2174 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2279 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2175 2280
2176 cpuset_wq = create_singlethread_workqueue("cpuset"); 2281 cpuset_propagate_hotplug_wq =
2177 BUG_ON(!cpuset_wq); 2282 alloc_ordered_workqueue("cpuset_hotplug", 0);
2283 BUG_ON(!cpuset_propagate_hotplug_wq);
2178} 2284}
2179 2285
2180/** 2286/**
@@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2273 */ 2379 */
2274static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2380static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2275{ 2381{
2276 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) 2382 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2277 cs = cs->parent; 2383 cs = parent_cs(cs);
2278 return cs; 2384 return cs;
2279} 2385}
2280 2386
@@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2412} 2518}
2413 2519
2414/** 2520/**
2415 * cpuset_unlock - release lock on cpuset changes
2416 *
2417 * Undo the lock taken in a previous cpuset_lock() call.
2418 */
2419
2420void cpuset_unlock(void)
2421{
2422 mutex_unlock(&callback_mutex);
2423}
2424
2425/**
2426 * cpuset_mem_spread_node() - On which node to begin search for a file page 2521 * cpuset_mem_spread_node() - On which node to begin search for a file page
2427 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2522 * cpuset_slab_spread_node() - On which node to begin search for a slab page
2428 * 2523 *
@@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2511 2606
2512 dentry = task_cs(tsk)->css.cgroup->dentry; 2607 dentry = task_cs(tsk)->css.cgroup->dentry;
2513 spin_lock(&cpuset_buffer_lock); 2608 spin_lock(&cpuset_buffer_lock);
2514 snprintf(cpuset_name, CPUSET_NAME_LEN, 2609
2515 dentry ? (const char *)dentry->d_name.name : "/"); 2610 if (!dentry) {
2611 strcpy(cpuset_name, "/");
2612 } else {
2613 spin_lock(&dentry->d_lock);
2614 strlcpy(cpuset_name, (const char *)dentry->d_name.name,
2615 CPUSET_NAME_LEN);
2616 spin_unlock(&dentry->d_lock);
2617 }
2618
2516 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2619 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2517 tsk->mems_allowed); 2620 tsk->mems_allowed);
2518 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2621 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
@@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void)
2560 * - Used for /proc/<pid>/cpuset. 2663 * - Used for /proc/<pid>/cpuset.
2561 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2664 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2562 * doesn't really matter if tsk->cpuset changes after we read it, 2665 * doesn't really matter if tsk->cpuset changes after we read it,
2563 * and we take cgroup_mutex, keeping cpuset_attach() from changing it 2666 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2564 * anyway. 2667 * anyway.
2565 */ 2668 */
2566static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2669static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2582 if (!tsk) 2685 if (!tsk)
2583 goto out_free; 2686 goto out_free;
2584 2687
2585 retval = -EINVAL; 2688 rcu_read_lock();
2586 cgroup_lock();
2587 css = task_subsys_state(tsk, cpuset_subsys_id); 2689 css = task_subsys_state(tsk, cpuset_subsys_id);
2588 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2690 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2691 rcu_read_unlock();
2589 if (retval < 0) 2692 if (retval < 0)
2590 goto out_unlock; 2693 goto out_put_task;
2591 seq_puts(m, buf); 2694 seq_puts(m, buf);
2592 seq_putc(m, '\n'); 2695 seq_putc(m, '\n');
2593out_unlock: 2696out_put_task:
2594 cgroup_unlock();
2595 put_task_struct(tsk); 2697 put_task_struct(tsk);
2596out_free: 2698out_free:
2597 kfree(buf); 2699 kfree(buf);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 418b3f7053aa..d473988c1d0b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
106 unsigned long long t2, t3; 106 unsigned long long t2, t3;
107 unsigned long flags; 107 unsigned long flags;
108 struct timespec ts; 108 struct timespec ts;
109 cputime_t utime, stime, stimescaled, utimescaled;
109 110
110 /* Though tsk->delays accessed later, early exit avoids 111 /* Though tsk->delays accessed later, early exit avoids
111 * unnecessary returning of other data 112 * unnecessary returning of other data
@@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
114 goto done; 115 goto done;
115 116
116 tmp = (s64)d->cpu_run_real_total; 117 tmp = (s64)d->cpu_run_real_total;
117 cputime_to_timespec(tsk->utime + tsk->stime, &ts); 118 task_cputime(tsk, &utime, &stime);
119 cputime_to_timespec(utime + stime, &ts);
118 tmp += timespec_to_ns(&ts); 120 tmp += timespec_to_ns(&ts);
119 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; 121 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
120 122
121 tmp = (s64)d->cpu_scaled_run_real_total; 123 tmp = (s64)d->cpu_scaled_run_real_total;
122 cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); 124 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
125 cputime_to_timespec(utimescaled + stimescaled, &ts);
123 tmp += timespec_to_ns(&ts); 126 tmp += timespec_to_ns(&ts);
124 d->cpu_scaled_run_real_total = 127 d->cpu_scaled_run_real_total =
125 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; 128 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7b6646a8c067..5c75791d7269 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6171,11 +6171,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6171 6171
6172 if (task) { 6172 if (task) {
6173 event->attach_state = PERF_ATTACH_TASK; 6173 event->attach_state = PERF_ATTACH_TASK;
6174
6175 if (attr->type == PERF_TYPE_TRACEPOINT)
6176 event->hw.tp_target = task;
6174#ifdef CONFIG_HAVE_HW_BREAKPOINT 6177#ifdef CONFIG_HAVE_HW_BREAKPOINT
6175 /* 6178 /*
6176 * hw_breakpoint is a bit difficult here.. 6179 * hw_breakpoint is a bit difficult here..
6177 */ 6180 */
6178 if (attr->type == PERF_TYPE_BREAKPOINT) 6181 else if (attr->type == PERF_TYPE_BREAKPOINT)
6179 event->hw.bp_target = task; 6182 event->hw.bp_target = task;
6180#endif 6183#endif
6181 } 6184 }
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index fe8a916507ed..a64f8aeb5c1f 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void)
676 err_alloc: 676 err_alloc:
677 for_each_possible_cpu(err_cpu) { 677 for_each_possible_cpu(err_cpu) {
678 for (i = 0; i < TYPE_MAX; i++) 678 for (i = 0; i < TYPE_MAX; i++)
679 kfree(per_cpu(nr_task_bp_pinned[i], cpu)); 679 kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
680 if (err_cpu == cpu) 680 if (err_cpu == cpu)
681 break; 681 break;
682 } 682 }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index dea7acfbb071..a567c8c7ef31 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
27#include <linux/pagemap.h> /* read_mapping_page */ 27#include <linux/pagemap.h> /* read_mapping_page */
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/export.h>
30#include <linux/rmap.h> /* anon_vma_prepare */ 31#include <linux/rmap.h> /* anon_vma_prepare */
31#include <linux/mmu_notifier.h> /* set_pte_at_notify */ 32#include <linux/mmu_notifier.h> /* set_pte_at_notify */
32#include <linux/swap.h> /* try_to_free_swap */ 33#include <linux/swap.h> /* try_to_free_swap */
@@ -41,58 +42,31 @@
41#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE 42#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
42 43
43static struct rb_root uprobes_tree = RB_ROOT; 44static struct rb_root uprobes_tree = RB_ROOT;
44
45static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
46
47#define UPROBES_HASH_SZ 13
48
49/* 45/*
50 * We need separate register/unregister and mmap/munmap lock hashes because 46 * allows us to skip the uprobe_mmap if there are no uprobe events active
51 * of mmap_sem nesting. 47 * at this time. Probably a fine grained per inode count is better?
52 *
53 * uprobe_register() needs to install probes on (potentially) all processes
54 * and thus needs to acquire multiple mmap_sems (consequtively, not
55 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
56 * for the particular process doing the mmap.
57 *
58 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
59 * because of lock order against i_mmap_mutex. This means there's a hole in
60 * the register vma iteration where a mmap() can happen.
61 *
62 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
63 * install a probe where one is already installed.
64 */ 48 */
49#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
65 50
66/* serialize (un)register */ 51static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
67static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
68
69#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
70 52
53#define UPROBES_HASH_SZ 13
71/* serialize uprobe->pending_list */ 54/* serialize uprobe->pending_list */
72static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 55static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
73#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 56#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
74 57
75static struct percpu_rw_semaphore dup_mmap_sem; 58static struct percpu_rw_semaphore dup_mmap_sem;
76 59
77/*
78 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
79 * events active at this time. Probably a fine grained per inode count is
80 * better?
81 */
82static atomic_t uprobe_events = ATOMIC_INIT(0);
83
84/* Have a copy of original instruction */ 60/* Have a copy of original instruction */
85#define UPROBE_COPY_INSN 0 61#define UPROBE_COPY_INSN 0
86/* Dont run handlers when first register/ last unregister in progress*/
87#define UPROBE_RUN_HANDLER 1
88/* Can skip singlestep */ 62/* Can skip singlestep */
89#define UPROBE_SKIP_SSTEP 2 63#define UPROBE_SKIP_SSTEP 1
90 64
91struct uprobe { 65struct uprobe {
92 struct rb_node rb_node; /* node in the rb tree */ 66 struct rb_node rb_node; /* node in the rb tree */
93 atomic_t ref; 67 atomic_t ref;
68 struct rw_semaphore register_rwsem;
94 struct rw_semaphore consumer_rwsem; 69 struct rw_semaphore consumer_rwsem;
95 struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
96 struct list_head pending_list; 70 struct list_head pending_list;
97 struct uprobe_consumer *consumers; 71 struct uprobe_consumer *consumers;
98 struct inode *inode; /* Also hold a ref to inode */ 72 struct inode *inode; /* Also hold a ref to inode */
@@ -430,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
430 u = __insert_uprobe(uprobe); 404 u = __insert_uprobe(uprobe);
431 spin_unlock(&uprobes_treelock); 405 spin_unlock(&uprobes_treelock);
432 406
433 /* For now assume that the instruction need not be single-stepped */
434 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
435
436 return u; 407 return u;
437} 408}
438 409
@@ -452,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
452 423
453 uprobe->inode = igrab(inode); 424 uprobe->inode = igrab(inode);
454 uprobe->offset = offset; 425 uprobe->offset = offset;
426 init_rwsem(&uprobe->register_rwsem);
455 init_rwsem(&uprobe->consumer_rwsem); 427 init_rwsem(&uprobe->consumer_rwsem);
456 mutex_init(&uprobe->copy_mutex); 428 /* For now assume that the instruction need not be single-stepped */
429 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
457 430
458 /* add to uprobes_tree, sorted on inode:offset */ 431 /* add to uprobes_tree, sorted on inode:offset */
459 cur_uprobe = insert_uprobe(uprobe); 432 cur_uprobe = insert_uprobe(uprobe);
@@ -463,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
463 kfree(uprobe); 436 kfree(uprobe);
464 uprobe = cur_uprobe; 437 uprobe = cur_uprobe;
465 iput(inode); 438 iput(inode);
466 } else {
467 atomic_inc(&uprobe_events);
468 } 439 }
469 440
470 return uprobe; 441 return uprobe;
471} 442}
472 443
473static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) 444static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
474{
475 struct uprobe_consumer *uc;
476
477 if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
478 return;
479
480 down_read(&uprobe->consumer_rwsem);
481 for (uc = uprobe->consumers; uc; uc = uc->next) {
482 if (!uc->filter || uc->filter(uc, current))
483 uc->handler(uc, regs);
484 }
485 up_read(&uprobe->consumer_rwsem);
486}
487
488/* Returns the previous consumer */
489static struct uprobe_consumer *
490consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
491{ 445{
492 down_write(&uprobe->consumer_rwsem); 446 down_write(&uprobe->consumer_rwsem);
493 uc->next = uprobe->consumers; 447 uc->next = uprobe->consumers;
494 uprobe->consumers = uc; 448 uprobe->consumers = uc;
495 up_write(&uprobe->consumer_rwsem); 449 up_write(&uprobe->consumer_rwsem);
496
497 return uc->next;
498} 450}
499 451
500/* 452/*
@@ -588,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
588 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 540 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
589 return ret; 541 return ret;
590 542
591 mutex_lock(&uprobe->copy_mutex); 543 /* TODO: move this into _register, until then we abuse this sem. */
544 down_write(&uprobe->consumer_rwsem);
592 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 545 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
593 goto out; 546 goto out;
594 547
@@ -612,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
612 set_bit(UPROBE_COPY_INSN, &uprobe->flags); 565 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
613 566
614 out: 567 out:
615 mutex_unlock(&uprobe->copy_mutex); 568 up_write(&uprobe->consumer_rwsem);
569
570 return ret;
571}
572
573static inline bool consumer_filter(struct uprobe_consumer *uc,
574 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
575{
576 return !uc->filter || uc->filter(uc, ctx, mm);
577}
578
579static bool filter_chain(struct uprobe *uprobe,
580 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
581{
582 struct uprobe_consumer *uc;
583 bool ret = false;
584
585 down_read(&uprobe->consumer_rwsem);
586 for (uc = uprobe->consumers; uc; uc = uc->next) {
587 ret = consumer_filter(uc, ctx, mm);
588 if (ret)
589 break;
590 }
591 up_read(&uprobe->consumer_rwsem);
616 592
617 return ret; 593 return ret;
618} 594}
@@ -624,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
624 bool first_uprobe; 600 bool first_uprobe;
625 int ret; 601 int ret;
626 602
627 /*
628 * If probe is being deleted, unregister thread could be done with
629 * the vma-rmap-walk through. Adding a probe now can be fatal since
630 * nobody will be able to cleanup. Also we could be from fork or
631 * mremap path, where the probe might have already been inserted.
632 * Hence behave as if probe already existed.
633 */
634 if (!uprobe->consumers)
635 return 0;
636
637 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); 603 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
638 if (ret) 604 if (ret)
639 return ret; 605 return ret;
@@ -658,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
658static int 624static int
659remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) 625remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
660{ 626{
661 /* can happen if uprobe_register() fails */
662 if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
663 return 0;
664
665 set_bit(MMF_RECALC_UPROBES, &mm->flags); 627 set_bit(MMF_RECALC_UPROBES, &mm->flags);
666 return set_orig_insn(&uprobe->arch, mm, vaddr); 628 return set_orig_insn(&uprobe->arch, mm, vaddr);
667} 629}
668 630
631static inline bool uprobe_is_active(struct uprobe *uprobe)
632{
633 return !RB_EMPTY_NODE(&uprobe->rb_node);
634}
669/* 635/*
670 * There could be threads that have already hit the breakpoint. They 636 * There could be threads that have already hit the breakpoint. They
671 * will recheck the current insn and restart if find_uprobe() fails. 637 * will recheck the current insn and restart if find_uprobe() fails.
@@ -673,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
673 */ 639 */
674static void delete_uprobe(struct uprobe *uprobe) 640static void delete_uprobe(struct uprobe *uprobe)
675{ 641{
642 if (WARN_ON(!uprobe_is_active(uprobe)))
643 return;
644
676 spin_lock(&uprobes_treelock); 645 spin_lock(&uprobes_treelock);
677 rb_erase(&uprobe->rb_node, &uprobes_tree); 646 rb_erase(&uprobe->rb_node, &uprobes_tree);
678 spin_unlock(&uprobes_treelock); 647 spin_unlock(&uprobes_treelock);
648 RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
679 iput(uprobe->inode); 649 iput(uprobe->inode);
680 put_uprobe(uprobe); 650 put_uprobe(uprobe);
681 atomic_dec(&uprobe_events);
682} 651}
683 652
684struct map_info { 653struct map_info {
@@ -764,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
764 return curr; 733 return curr;
765} 734}
766 735
767static int register_for_each_vma(struct uprobe *uprobe, bool is_register) 736static int
737register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
768{ 738{
739 bool is_register = !!new;
769 struct map_info *info; 740 struct map_info *info;
770 int err = 0; 741 int err = 0;
771 742
@@ -794,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
794 vaddr_to_offset(vma, info->vaddr) != uprobe->offset) 765 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
795 goto unlock; 766 goto unlock;
796 767
797 if (is_register) 768 if (is_register) {
798 err = install_breakpoint(uprobe, mm, vma, info->vaddr); 769 /* consult only the "caller", new consumer. */
799 else 770 if (consumer_filter(new,
800 err |= remove_breakpoint(uprobe, mm, info->vaddr); 771 UPROBE_FILTER_REGISTER, mm))
772 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
773 } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
774 if (!filter_chain(uprobe,
775 UPROBE_FILTER_UNREGISTER, mm))
776 err |= remove_breakpoint(uprobe, mm, info->vaddr);
777 }
801 778
802 unlock: 779 unlock:
803 up_write(&mm->mmap_sem); 780 up_write(&mm->mmap_sem);
@@ -810,17 +787,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
810 return err; 787 return err;
811} 788}
812 789
813static int __uprobe_register(struct uprobe *uprobe) 790static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
814{ 791{
815 return register_for_each_vma(uprobe, true); 792 consumer_add(uprobe, uc);
793 return register_for_each_vma(uprobe, uc);
816} 794}
817 795
818static void __uprobe_unregister(struct uprobe *uprobe) 796static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
819{ 797{
820 if (!register_for_each_vma(uprobe, false)) 798 int err;
821 delete_uprobe(uprobe); 799
800 if (!consumer_del(uprobe, uc)) /* WARN? */
801 return;
822 802
803 err = register_for_each_vma(uprobe, NULL);
823 /* TODO : cant unregister? schedule a worker thread */ 804 /* TODO : cant unregister? schedule a worker thread */
805 if (!uprobe->consumers && !err)
806 delete_uprobe(uprobe);
824} 807}
825 808
826/* 809/*
@@ -845,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
845 struct uprobe *uprobe; 828 struct uprobe *uprobe;
846 int ret; 829 int ret;
847 830
848 if (!inode || !uc || uc->next) 831 /* Racy, just to catch the obvious mistakes */
849 return -EINVAL;
850
851 if (offset > i_size_read(inode)) 832 if (offset > i_size_read(inode))
852 return -EINVAL; 833 return -EINVAL;
853 834
854 ret = 0; 835 retry:
855 mutex_lock(uprobes_hash(inode));
856 uprobe = alloc_uprobe(inode, offset); 836 uprobe = alloc_uprobe(inode, offset);
857 837 if (!uprobe)
858 if (!uprobe) { 838 return -ENOMEM;
859 ret = -ENOMEM; 839 /*
860 } else if (!consumer_add(uprobe, uc)) { 840 * We can race with uprobe_unregister()->delete_uprobe().
861 ret = __uprobe_register(uprobe); 841 * Check uprobe_is_active() and retry if it is false.
862 if (ret) { 842 */
863 uprobe->consumers = NULL; 843 down_write(&uprobe->register_rwsem);
864 __uprobe_unregister(uprobe); 844 ret = -EAGAIN;
865 } else { 845 if (likely(uprobe_is_active(uprobe))) {
866 set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 846 ret = __uprobe_register(uprobe, uc);
867 } 847 if (ret)
848 __uprobe_unregister(uprobe, uc);
868 } 849 }
850 up_write(&uprobe->register_rwsem);
851 put_uprobe(uprobe);
869 852
870 mutex_unlock(uprobes_hash(inode)); 853 if (unlikely(ret == -EAGAIN))
871 if (uprobe) 854 goto retry;
872 put_uprobe(uprobe); 855 return ret;
856}
857EXPORT_SYMBOL_GPL(uprobe_register);
858
859/*
860 * uprobe_apply - unregister a already registered probe.
861 * @inode: the file in which the probe has to be removed.
862 * @offset: offset from the start of the file.
863 * @uc: consumer which wants to add more or remove some breakpoints
864 * @add: add or remove the breakpoints
865 */
866int uprobe_apply(struct inode *inode, loff_t offset,
867 struct uprobe_consumer *uc, bool add)
868{
869 struct uprobe *uprobe;
870 struct uprobe_consumer *con;
871 int ret = -ENOENT;
872
873 uprobe = find_uprobe(inode, offset);
874 if (!uprobe)
875 return ret;
876
877 down_write(&uprobe->register_rwsem);
878 for (con = uprobe->consumers; con && con != uc ; con = con->next)
879 ;
880 if (con)
881 ret = register_for_each_vma(uprobe, add ? uc : NULL);
882 up_write(&uprobe->register_rwsem);
883 put_uprobe(uprobe);
873 884
874 return ret; 885 return ret;
875} 886}
@@ -884,25 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
884{ 895{
885 struct uprobe *uprobe; 896 struct uprobe *uprobe;
886 897
887 if (!inode || !uc)
888 return;
889
890 uprobe = find_uprobe(inode, offset); 898 uprobe = find_uprobe(inode, offset);
891 if (!uprobe) 899 if (!uprobe)
892 return; 900 return;
893 901
894 mutex_lock(uprobes_hash(inode)); 902 down_write(&uprobe->register_rwsem);
903 __uprobe_unregister(uprobe, uc);
904 up_write(&uprobe->register_rwsem);
905 put_uprobe(uprobe);
906}
907EXPORT_SYMBOL_GPL(uprobe_unregister);
895 908
896 if (consumer_del(uprobe, uc)) { 909static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
897 if (!uprobe->consumers) { 910{
898 __uprobe_unregister(uprobe); 911 struct vm_area_struct *vma;
899 clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 912 int err = 0;
900 } 913
914 down_read(&mm->mmap_sem);
915 for (vma = mm->mmap; vma; vma = vma->vm_next) {
916 unsigned long vaddr;
917 loff_t offset;
918
919 if (!valid_vma(vma, false) ||
920 vma->vm_file->f_mapping->host != uprobe->inode)
921 continue;
922
923 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
924 if (uprobe->offset < offset ||
925 uprobe->offset >= offset + vma->vm_end - vma->vm_start)
926 continue;
927
928 vaddr = offset_to_vaddr(vma, uprobe->offset);
929 err |= remove_breakpoint(uprobe, mm, vaddr);
901 } 930 }
931 up_read(&mm->mmap_sem);
902 932
903 mutex_unlock(uprobes_hash(inode)); 933 return err;
904 if (uprobe)
905 put_uprobe(uprobe);
906} 934}
907 935
908static struct rb_node * 936static struct rb_node *
@@ -979,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
979 struct uprobe *uprobe, *u; 1007 struct uprobe *uprobe, *u;
980 struct inode *inode; 1008 struct inode *inode;
981 1009
982 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) 1010 if (no_uprobe_events() || !valid_vma(vma, true))
983 return 0; 1011 return 0;
984 1012
985 inode = vma->vm_file->f_mapping->host; 1013 inode = vma->vm_file->f_mapping->host;
@@ -988,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma)
988 1016
989 mutex_lock(uprobes_mmap_hash(inode)); 1017 mutex_lock(uprobes_mmap_hash(inode));
990 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); 1018 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
991 1019 /*
1020 * We can race with uprobe_unregister(), this uprobe can be already
1021 * removed. But in this case filter_chain() must return false, all
1022 * consumers have gone away.
1023 */
992 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1024 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
993 if (!fatal_signal_pending(current)) { 1025 if (!fatal_signal_pending(current) &&
1026 filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
994 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); 1027 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
995 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1028 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
996 } 1029 }
@@ -1025,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
1025 */ 1058 */
1026void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1059void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1027{ 1060{
1028 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1061 if (no_uprobe_events() || !valid_vma(vma, false))
1029 return; 1062 return;
1030 1063
1031 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ 1064 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1042,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1042/* Slot allocation for XOL */ 1075/* Slot allocation for XOL */
1043static int xol_add_vma(struct xol_area *area) 1076static int xol_add_vma(struct xol_area *area)
1044{ 1077{
1045 struct mm_struct *mm; 1078 struct mm_struct *mm = current->mm;
1046 int ret; 1079 int ret = -EALREADY;
1047
1048 area->page = alloc_page(GFP_HIGHUSER);
1049 if (!area->page)
1050 return -ENOMEM;
1051
1052 ret = -EALREADY;
1053 mm = current->mm;
1054 1080
1055 down_write(&mm->mmap_sem); 1081 down_write(&mm->mmap_sem);
1056 if (mm->uprobes_state.xol_area) 1082 if (mm->uprobes_state.xol_area)
1057 goto fail; 1083 goto fail;
1058 1084
1059 ret = -ENOMEM; 1085 ret = -ENOMEM;
1060
1061 /* Try to map as high as possible, this is only a hint. */ 1086 /* Try to map as high as possible, this is only a hint. */
1062 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1087 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
1063 if (area->vaddr & ~PAGE_MASK) { 1088 if (area->vaddr & ~PAGE_MASK) {
@@ -1073,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area)
1073 smp_wmb(); /* pairs with get_xol_area() */ 1098 smp_wmb(); /* pairs with get_xol_area() */
1074 mm->uprobes_state.xol_area = area; 1099 mm->uprobes_state.xol_area = area;
1075 ret = 0; 1100 ret = 0;
1076 1101 fail:
1077fail:
1078 up_write(&mm->mmap_sem); 1102 up_write(&mm->mmap_sem);
1079 if (ret)
1080 __free_page(area->page);
1081 1103
1082 return ret; 1104 return ret;
1083} 1105}
1084 1106
1085static struct xol_area *get_xol_area(struct mm_struct *mm)
1086{
1087 struct xol_area *area;
1088
1089 area = mm->uprobes_state.xol_area;
1090 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1091
1092 return area;
1093}
1094
1095/* 1107/*
1096 * xol_alloc_area - Allocate process's xol_area. 1108 * get_xol_area - Allocate process's xol_area if necessary.
1097 * This area will be used for storing instructions for execution out of 1109 * This area will be used for storing instructions for execution out of line.
1098 * line.
1099 * 1110 *
1100 * Returns the allocated area or NULL. 1111 * Returns the allocated area or NULL.
1101 */ 1112 */
1102static struct xol_area *xol_alloc_area(void) 1113static struct xol_area *get_xol_area(void)
1103{ 1114{
1115 struct mm_struct *mm = current->mm;
1104 struct xol_area *area; 1116 struct xol_area *area;
1105 1117
1118 area = mm->uprobes_state.xol_area;
1119 if (area)
1120 goto ret;
1121
1106 area = kzalloc(sizeof(*area), GFP_KERNEL); 1122 area = kzalloc(sizeof(*area), GFP_KERNEL);
1107 if (unlikely(!area)) 1123 if (unlikely(!area))
1108 return NULL; 1124 goto out;
1109 1125
1110 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); 1126 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1111
1112 if (!area->bitmap) 1127 if (!area->bitmap)
1113 goto fail; 1128 goto free_area;
1129
1130 area->page = alloc_page(GFP_HIGHUSER);
1131 if (!area->page)
1132 goto free_bitmap;
1114 1133
1115 init_waitqueue_head(&area->wq); 1134 init_waitqueue_head(&area->wq);
1116 if (!xol_add_vma(area)) 1135 if (!xol_add_vma(area))
1117 return area; 1136 return area;
1118 1137
1119fail: 1138 __free_page(area->page);
1139 free_bitmap:
1120 kfree(area->bitmap); 1140 kfree(area->bitmap);
1141 free_area:
1121 kfree(area); 1142 kfree(area);
1122 1143 out:
1123 return get_xol_area(current->mm); 1144 area = mm->uprobes_state.xol_area;
1145 ret:
1146 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1147 return area;
1124} 1148}
1125 1149
1126/* 1150/*
@@ -1186,33 +1210,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
1186} 1210}
1187 1211
1188/* 1212/*
1189 * xol_get_insn_slot - If was not allocated a slot, then 1213 * xol_get_insn_slot - allocate a slot for xol.
1190 * allocate a slot.
1191 * Returns the allocated slot address or 0. 1214 * Returns the allocated slot address or 0.
1192 */ 1215 */
1193static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) 1216static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1194{ 1217{
1195 struct xol_area *area; 1218 struct xol_area *area;
1196 unsigned long offset; 1219 unsigned long offset;
1220 unsigned long xol_vaddr;
1197 void *vaddr; 1221 void *vaddr;
1198 1222
1199 area = get_xol_area(current->mm); 1223 area = get_xol_area();
1200 if (!area) { 1224 if (!area)
1201 area = xol_alloc_area(); 1225 return 0;
1202 if (!area)
1203 return 0;
1204 }
1205 current->utask->xol_vaddr = xol_take_insn_slot(area);
1206 1226
1207 /* 1227 xol_vaddr = xol_take_insn_slot(area);
1208 * Initialize the slot if xol_vaddr points to valid 1228 if (unlikely(!xol_vaddr))
1209 * instruction slot.
1210 */
1211 if (unlikely(!current->utask->xol_vaddr))
1212 return 0; 1229 return 0;
1213 1230
1214 current->utask->vaddr = slot_addr; 1231 /* Initialize the slot */
1215 offset = current->utask->xol_vaddr & ~PAGE_MASK; 1232 offset = xol_vaddr & ~PAGE_MASK;
1216 vaddr = kmap_atomic(area->page); 1233 vaddr = kmap_atomic(area->page);
1217 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); 1234 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1218 kunmap_atomic(vaddr); 1235 kunmap_atomic(vaddr);
@@ -1222,7 +1239,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot
1222 */ 1239 */
1223 flush_dcache_page(area->page); 1240 flush_dcache_page(area->page);
1224 1241
1225 return current->utask->xol_vaddr; 1242 return xol_vaddr;
1226} 1243}
1227 1244
1228/* 1245/*
@@ -1240,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1240 return; 1257 return;
1241 1258
1242 slot_addr = tsk->utask->xol_vaddr; 1259 slot_addr = tsk->utask->xol_vaddr;
1243 1260 if (unlikely(!slot_addr))
1244 if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
1245 return; 1261 return;
1246 1262
1247 area = tsk->mm->uprobes_state.xol_area; 1263 area = tsk->mm->uprobes_state.xol_area;
@@ -1303,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t)
1303} 1319}
1304 1320
1305/* 1321/*
1306 * Allocate a uprobe_task object for the task. 1322 * Allocate a uprobe_task object for the task if if necessary.
1307 * Called when the thread hits a breakpoint for the first time. 1323 * Called when the thread hits a breakpoint.
1308 * 1324 *
1309 * Returns: 1325 * Returns:
1310 * - pointer to new uprobe_task on success 1326 * - pointer to new uprobe_task on success
1311 * - NULL otherwise 1327 * - NULL otherwise
1312 */ 1328 */
1313static struct uprobe_task *add_utask(void) 1329static struct uprobe_task *get_utask(void)
1314{ 1330{
1315 struct uprobe_task *utask; 1331 if (!current->utask)
1316 1332 current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1317 utask = kzalloc(sizeof *utask, GFP_KERNEL); 1333 return current->utask;
1318 if (unlikely(!utask))
1319 return NULL;
1320
1321 current->utask = utask;
1322 return utask;
1323} 1334}
1324 1335
1325/* Prepare to single-step probed instruction out of line. */ 1336/* Prepare to single-step probed instruction out of line. */
1326static int 1337static int
1327pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) 1338pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
1328{ 1339{
1329 if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) 1340 struct uprobe_task *utask;
1330 return 0; 1341 unsigned long xol_vaddr;
1342 int err;
1343
1344 utask = get_utask();
1345 if (!utask)
1346 return -ENOMEM;
1347
1348 xol_vaddr = xol_get_insn_slot(uprobe);
1349 if (!xol_vaddr)
1350 return -ENOMEM;
1351
1352 utask->xol_vaddr = xol_vaddr;
1353 utask->vaddr = bp_vaddr;
1354
1355 err = arch_uprobe_pre_xol(&uprobe->arch, regs);
1356 if (unlikely(err)) {
1357 xol_free_insn_slot(current);
1358 return err;
1359 }
1331 1360
1332 return -EFAULT; 1361 utask->active_uprobe = uprobe;
1362 utask->state = UTASK_SSTEP;
1363 return 0;
1333} 1364}
1334 1365
1335/* 1366/*
@@ -1391,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1391 * This is not strictly accurate, we can race with 1422 * This is not strictly accurate, we can race with
1392 * uprobe_unregister() and see the already removed 1423 * uprobe_unregister() and see the already removed
1393 * uprobe if delete_uprobe() was not yet called. 1424 * uprobe if delete_uprobe() was not yet called.
1425 * Or this uprobe can be filtered out.
1394 */ 1426 */
1395 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) 1427 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1396 return; 1428 return;
@@ -1452,13 +1484,33 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1452 return uprobe; 1484 return uprobe;
1453} 1485}
1454 1486
1487static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1488{
1489 struct uprobe_consumer *uc;
1490 int remove = UPROBE_HANDLER_REMOVE;
1491
1492 down_read(&uprobe->register_rwsem);
1493 for (uc = uprobe->consumers; uc; uc = uc->next) {
1494 int rc = uc->handler(uc, regs);
1495
1496 WARN(rc & ~UPROBE_HANDLER_MASK,
1497 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1498 remove &= rc;
1499 }
1500
1501 if (remove && uprobe->consumers) {
1502 WARN_ON(!uprobe_is_active(uprobe));
1503 unapply_uprobe(uprobe, current->mm);
1504 }
1505 up_read(&uprobe->register_rwsem);
1506}
1507
1455/* 1508/*
1456 * Run handler and ask thread to singlestep. 1509 * Run handler and ask thread to singlestep.
1457 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1510 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1458 */ 1511 */
1459static void handle_swbp(struct pt_regs *regs) 1512static void handle_swbp(struct pt_regs *regs)
1460{ 1513{
1461 struct uprobe_task *utask;
1462 struct uprobe *uprobe; 1514 struct uprobe *uprobe;
1463 unsigned long bp_vaddr; 1515 unsigned long bp_vaddr;
1464 int uninitialized_var(is_swbp); 1516 int uninitialized_var(is_swbp);
@@ -1483,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs)
1483 } 1535 }
1484 return; 1536 return;
1485 } 1537 }
1538
1539 /* change it in advance for ->handler() and restart */
1540 instruction_pointer_set(regs, bp_vaddr);
1541
1486 /* 1542 /*
1487 * TODO: move copy_insn/etc into _register and remove this hack. 1543 * TODO: move copy_insn/etc into _register and remove this hack.
1488 * After we hit the bp, _unregister + _register can install the 1544 * After we hit the bp, _unregister + _register can install the
@@ -1490,32 +1546,16 @@ static void handle_swbp(struct pt_regs *regs)
1490 */ 1546 */
1491 smp_rmb(); /* pairs with wmb() in install_breakpoint() */ 1547 smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1492 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) 1548 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1493 goto restart; 1549 goto out;
1494
1495 utask = current->utask;
1496 if (!utask) {
1497 utask = add_utask();
1498 /* Cannot allocate; re-execute the instruction. */
1499 if (!utask)
1500 goto restart;
1501 }
1502 1550
1503 handler_chain(uprobe, regs); 1551 handler_chain(uprobe, regs);
1504 if (can_skip_sstep(uprobe, regs)) 1552 if (can_skip_sstep(uprobe, regs))
1505 goto out; 1553 goto out;
1506 1554
1507 if (!pre_ssout(uprobe, regs, bp_vaddr)) { 1555 if (!pre_ssout(uprobe, regs, bp_vaddr))
1508 utask->active_uprobe = uprobe;
1509 utask->state = UTASK_SSTEP;
1510 return; 1556 return;
1511 }
1512 1557
1513restart: 1558 /* can_skip_sstep() succeeded, or restart if can't singlestep */
1514 /*
1515 * cannot singlestep; cannot skip instruction;
1516 * re-execute the instruction.
1517 */
1518 instruction_pointer_set(regs, bp_vaddr);
1519out: 1559out:
1520 put_uprobe(uprobe); 1560 put_uprobe(uprobe);
1521} 1561}
@@ -1609,10 +1649,8 @@ static int __init init_uprobes(void)
1609{ 1649{
1610 int i; 1650 int i;
1611 1651
1612 for (i = 0; i < UPROBES_HASH_SZ; i++) { 1652 for (i = 0; i < UPROBES_HASH_SZ; i++)
1613 mutex_init(&uprobes_mutex[i]);
1614 mutex_init(&uprobes_mmap_mutex[i]); 1653 mutex_init(&uprobes_mmap_mutex[i]);
1615 }
1616 1654
1617 if (percpu_init_rwsem(&dup_mmap_sem)) 1655 if (percpu_init_rwsem(&dup_mmap_sem))
1618 return -ENOMEM; 1656 return -ENOMEM;
diff --git a/kernel/exit.c b/kernel/exit.c
index b4df21937216..7dd20408707c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk)
85 bool group_dead = thread_group_leader(tsk); 85 bool group_dead = thread_group_leader(tsk);
86 struct sighand_struct *sighand; 86 struct sighand_struct *sighand;
87 struct tty_struct *uninitialized_var(tty); 87 struct tty_struct *uninitialized_var(tty);
88 cputime_t utime, stime;
88 89
89 sighand = rcu_dereference_check(tsk->sighand, 90 sighand = rcu_dereference_check(tsk->sighand,
90 lockdep_tasklist_lock_is_held()); 91 lockdep_tasklist_lock_is_held());
@@ -123,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk)
123 * We won't ever get here for the group leader, since it 124 * We won't ever get here for the group leader, since it
124 * will have been the last reference on the signal_struct. 125 * will have been the last reference on the signal_struct.
125 */ 126 */
126 sig->utime += tsk->utime; 127 task_cputime(tsk, &utime, &stime);
127 sig->stime += tsk->stime; 128 sig->utime += utime;
128 sig->gtime += tsk->gtime; 129 sig->stime += stime;
130 sig->gtime += task_gtime(tsk);
129 sig->min_flt += tsk->min_flt; 131 sig->min_flt += tsk->min_flt;
130 sig->maj_flt += tsk->maj_flt; 132 sig->maj_flt += tsk->maj_flt;
131 sig->nvcsw += tsk->nvcsw; 133 sig->nvcsw += tsk->nvcsw;
@@ -1092,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1092 sig = p->signal; 1094 sig = p->signal;
1093 psig->cutime += tgutime + sig->cutime; 1095 psig->cutime += tgutime + sig->cutime;
1094 psig->cstime += tgstime + sig->cstime; 1096 psig->cstime += tgstime + sig->cstime;
1095 psig->cgtime += p->gtime + sig->gtime + sig->cgtime; 1097 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1096 psig->cmin_flt += 1098 psig->cmin_flt +=
1097 p->min_flt + sig->min_flt + sig->cmin_flt; 1099 p->min_flt + sig->min_flt + sig->cmin_flt;
1098 psig->cmaj_flt += 1100 psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index c535f33bbb9c..4133876d8cd2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1233,6 +1233,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1233#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1233#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1234 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1234 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1235#endif 1235#endif
1236#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1237 seqlock_init(&p->vtime_seqlock);
1238 p->vtime_snap = 0;
1239 p->vtime_snap_whence = VTIME_SLEEPING;
1240#endif
1241
1236#if defined(SPLIT_RSS_COUNTING) 1242#if defined(SPLIT_RSS_COUNTING)
1237 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1243 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1238#endif 1244#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index 19eb089ca003..9618b6e9fb36 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -60,6 +60,7 @@
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h> 62#include <linux/ptrace.h>
63#include <linux/sched/rt.h>
63 64
64#include <asm/futex.h> 65#include <asm/futex.h>
65 66
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6db7a5ed52b5..cc47812d3feb 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -44,6 +44,8 @@
44#include <linux/err.h> 44#include <linux/err.h>
45#include <linux/debugobjects.h> 45#include <linux/debugobjects.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h>
47#include <linux/timer.h> 49#include <linux/timer.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
@@ -640,21 +642,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
640 * and expiry check is done in the hrtimer_interrupt or in the softirq. 642 * and expiry check is done in the hrtimer_interrupt or in the softirq.
641 */ 643 */
642static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 644static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
643 struct hrtimer_clock_base *base, 645 struct hrtimer_clock_base *base)
644 int wakeup)
645{ 646{
646 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 647 return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
647 if (wakeup) {
648 raw_spin_unlock(&base->cpu_base->lock);
649 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
650 raw_spin_lock(&base->cpu_base->lock);
651 } else
652 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
653
654 return 1;
655 }
656
657 return 0;
658} 648}
659 649
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) 650static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
@@ -735,8 +725,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }
735static inline void 725static inline void
736hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } 726hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
737static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 727static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
738 struct hrtimer_clock_base *base, 728 struct hrtimer_clock_base *base)
739 int wakeup)
740{ 729{
741 return 0; 730 return 0;
742} 731}
@@ -995,8 +984,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
995 * 984 *
996 * XXX send_remote_softirq() ? 985 * XXX send_remote_softirq() ?
997 */ 986 */
998 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) 987 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
999 hrtimer_enqueue_reprogram(timer, new_base, wakeup); 988 && hrtimer_enqueue_reprogram(timer, new_base)) {
989 if (wakeup) {
990 /*
991 * We need to drop cpu_base->lock to avoid a
992 * lock ordering issue vs. rq->lock.
993 */
994 raw_spin_unlock(&new_base->cpu_base->lock);
995 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
996 local_irq_restore(flags);
997 return ret;
998 } else {
999 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
1000 }
1001 }
1000 1002
1001 unlock_hrtimer_base(timer, &flags); 1003 unlock_hrtimer_base(timer, &flags);
1002 1004
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3aca9f29d30e..cbd97ce0b000 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
90EXPORT_SYMBOL(irq_set_handler_data); 90EXPORT_SYMBOL(irq_set_handler_data);
91 91
92/** 92/**
93 * irq_set_msi_desc - set MSI descriptor data for an irq 93 * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
94 * @irq: Interrupt number 94 * @irq_base: Interrupt number base
95 * @entry: Pointer to MSI descriptor data 95 * @irq_offset: Interrupt number offset
96 * @entry: Pointer to MSI descriptor data
96 * 97 *
97 * Set the MSI descriptor entry for an irq 98 * Set the MSI descriptor entry for an irq at offset
98 */ 99 */
99int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) 100int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
101 struct msi_desc *entry)
100{ 102{
101 unsigned long flags; 103 unsigned long flags;
102 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 104 struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
103 105
104 if (!desc) 106 if (!desc)
105 return -EINVAL; 107 return -EINVAL;
106 desc->irq_data.msi_desc = entry; 108 desc->irq_data.msi_desc = entry;
107 if (entry) 109 if (entry && !irq_offset)
108 entry->irq = irq; 110 entry->irq = irq_base;
109 irq_put_desc_unlock(desc, flags); 111 irq_put_desc_unlock(desc, flags);
110 return 0; 112 return 0;
111} 113}
112 114
113/** 115/**
116 * irq_set_msi_desc - set MSI descriptor data for an irq
117 * @irq: Interrupt number
118 * @entry: Pointer to MSI descriptor data
119 *
120 * Set the MSI descriptor entry for an irq
121 */
122int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
123{
124 return irq_set_msi_desc_off(irq, 0, entry);
125}
126
127/**
114 * irq_set_chip_data - set irq chip data for an irq 128 * irq_set_chip_data - set irq chip data for an irq
115 * @irq: Interrupt number 129 * @irq: Interrupt number
116 * @data: Pointer to chip specific data 130 * @data: Pointer to chip specific data
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e49a288fa479..fa17855ca65a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/sched/rt.h>
19#include <linux/task_work.h> 20#include <linux/task_work.h>
20 21
21#include "internals.h" 22#include "internals.h"
@@ -1524,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
1524out: 1525out:
1525 irq_put_desc_unlock(desc, flags); 1526 irq_put_desc_unlock(desc, flags);
1526} 1527}
1528EXPORT_SYMBOL_GPL(enable_percpu_irq);
1527 1529
1528void disable_percpu_irq(unsigned int irq) 1530void disable_percpu_irq(unsigned int irq)
1529{ 1531{
@@ -1537,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq)
1537 irq_percpu_disable(desc, cpu); 1539 irq_percpu_disable(desc, cpu);
1538 irq_put_desc_unlock(desc, flags); 1540 irq_put_desc_unlock(desc, flags);
1539} 1541}
1542EXPORT_SYMBOL_GPL(disable_percpu_irq);
1540 1543
1541/* 1544/*
1542 * Internal function to unregister a percpu irqaction. 1545 * Internal function to unregister a percpu irqaction.
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd6003c45..7b5f012bde9d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
80 80
81 /* 81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the 82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well. 83 * first.
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || 87 (action->flags & __IRQF_TIMER))
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
90 goto out; 88 goto out;
91 89
92 /* Already running on another processor */ 90 /* Already running on another processor */
@@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
104 do { 102 do {
105 if (handle_irq_event(desc) == IRQ_HANDLED) 103 if (handle_irq_event(desc) == IRQ_HANDLED)
106 ret = IRQ_HANDLED; 104 ret = IRQ_HANDLED;
105 /* Make sure that there is still a valid action */
107 action = desc->action; 106 action = desc->action;
108 } while ((desc->istate & IRQS_PENDING) && action); 107 } while ((desc->istate & IRQS_PENDING) && action);
109 desc->istate &= ~IRQS_POLL_INPROGRESS; 108 desc->istate &= ~IRQS_POLL_INPROGRESS;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 1588e3b2871b..55fcce6065cf 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -12,37 +12,36 @@
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/hardirq.h> 13#include <linux/hardirq.h>
14#include <linux/irqflags.h> 14#include <linux/irqflags.h>
15#include <linux/sched.h>
16#include <linux/tick.h>
17#include <linux/cpu.h>
18#include <linux/notifier.h>
15#include <asm/processor.h> 19#include <asm/processor.h>
16 20
17/*
18 * An entry can be in one of four states:
19 *
20 * free NULL, 0 -> {claimed} : free to be used
21 * claimed NULL, 3 -> {pending} : claimed to be enqueued
22 * pending next, 3 -> {busy} : queued, pending callback
23 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
24 */
25
26#define IRQ_WORK_PENDING 1UL
27#define IRQ_WORK_BUSY 2UL
28#define IRQ_WORK_FLAGS 3UL
29 21
30static DEFINE_PER_CPU(struct llist_head, irq_work_list); 22static DEFINE_PER_CPU(struct llist_head, irq_work_list);
23static DEFINE_PER_CPU(int, irq_work_raised);
31 24
32/* 25/*
33 * Claim the entry so that no one else will poke at it. 26 * Claim the entry so that no one else will poke at it.
34 */ 27 */
35static bool irq_work_claim(struct irq_work *work) 28static bool irq_work_claim(struct irq_work *work)
36{ 29{
37 unsigned long flags, nflags; 30 unsigned long flags, oflags, nflags;
38 31
32 /*
33 * Start with our best wish as a premise but only trust any
34 * flag value after cmpxchg() result.
35 */
36 flags = work->flags & ~IRQ_WORK_PENDING;
39 for (;;) { 37 for (;;) {
40 flags = work->flags;
41 if (flags & IRQ_WORK_PENDING)
42 return false;
43 nflags = flags | IRQ_WORK_FLAGS; 38 nflags = flags | IRQ_WORK_FLAGS;
44 if (cmpxchg(&work->flags, flags, nflags) == flags) 39 oflags = cmpxchg(&work->flags, flags, nflags);
40 if (oflags == flags)
45 break; 41 break;
42 if (oflags & IRQ_WORK_PENDING)
43 return false;
44 flags = oflags;
46 cpu_relax(); 45 cpu_relax();
47 } 46 }
48 47
@@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void)
57} 56}
58 57
59/* 58/*
60 * Queue the entry and raise the IPI if needed. 59 * Enqueue the irq_work @entry unless it's already pending
60 * somewhere.
61 *
62 * Can be re-enqueued while the callback is still in progress.
61 */ 63 */
62static void __irq_work_queue(struct irq_work *work) 64void irq_work_queue(struct irq_work *work)
63{ 65{
64 bool empty; 66 /* Only queue if not already pending */
67 if (!irq_work_claim(work))
68 return;
65 69
70 /* Queue the entry and raise the IPI if needed. */
66 preempt_disable(); 71 preempt_disable();
67 72
68 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 73 llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
69 /* The list was empty, raise self-interrupt to start processing. */ 74
70 if (empty) 75 /*
71 arch_irq_work_raise(); 76 * If the work is not "lazy" or the tick is stopped, raise the irq
77 * work interrupt (if supported by the arch), otherwise, just wait
78 * for the next tick.
79 */
80 if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
81 if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
82 arch_irq_work_raise();
83 }
72 84
73 preempt_enable(); 85 preempt_enable();
74} 86}
87EXPORT_SYMBOL_GPL(irq_work_queue);
75 88
76/* 89bool irq_work_needs_cpu(void)
77 * Enqueue the irq_work @entry, returns true on success, failure when the
78 * @entry was already enqueued by someone else.
79 *
80 * Can be re-enqueued while the callback is still in progress.
81 */
82bool irq_work_queue(struct irq_work *work)
83{ 90{
84 if (!irq_work_claim(work)) { 91 struct llist_head *this_list;
85 /* 92
86 * Already enqueued, can't do! 93 this_list = &__get_cpu_var(irq_work_list);
87 */ 94 if (llist_empty(this_list))
88 return false; 95 return false;
89 }
90 96
91 __irq_work_queue(work); 97 /* All work should have been flushed before going offline */
98 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
99
92 return true; 100 return true;
93} 101}
94EXPORT_SYMBOL_GPL(irq_work_queue);
95 102
96/* 103static void __irq_work_run(void)
97 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
98 * context with local IRQs disabled.
99 */
100void irq_work_run(void)
101{ 104{
105 unsigned long flags;
102 struct irq_work *work; 106 struct irq_work *work;
103 struct llist_head *this_list; 107 struct llist_head *this_list;
104 struct llist_node *llnode; 108 struct llist_node *llnode;
105 109
110
111 /*
112 * Reset the "raised" state right before we check the list because
113 * an NMI may enqueue after we find the list empty from the runner.
114 */
115 __this_cpu_write(irq_work_raised, 0);
116 barrier();
117
106 this_list = &__get_cpu_var(irq_work_list); 118 this_list = &__get_cpu_var(irq_work_list);
107 if (llist_empty(this_list)) 119 if (llist_empty(this_list))
108 return; 120 return;
109 121
110 BUG_ON(!in_irq());
111 BUG_ON(!irqs_disabled()); 122 BUG_ON(!irqs_disabled());
112 123
113 llnode = llist_del_all(this_list); 124 llnode = llist_del_all(this_list);
@@ -119,16 +130,31 @@ void irq_work_run(void)
119 /* 130 /*
120 * Clear the PENDING bit, after this point the @work 131 * Clear the PENDING bit, after this point the @work
121 * can be re-used. 132 * can be re-used.
133 * Make it immediately visible so that other CPUs trying
134 * to claim that work don't rely on us to handle their data
135 * while we are in the middle of the func.
122 */ 136 */
123 work->flags = IRQ_WORK_BUSY; 137 flags = work->flags & ~IRQ_WORK_PENDING;
138 xchg(&work->flags, flags);
139
124 work->func(work); 140 work->func(work);
125 /* 141 /*
126 * Clear the BUSY bit and return to the free state if 142 * Clear the BUSY bit and return to the free state if
127 * no-one else claimed it meanwhile. 143 * no-one else claimed it meanwhile.
128 */ 144 */
129 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); 145 (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
130 } 146 }
131} 147}
148
149/*
150 * Run the irq_work entries on this cpu. Must be run from hardirq
151 * context with local IRQs disabled.
152 */
153void irq_work_run(void)
154{
155 BUG_ON(!in_irq());
156 __irq_work_run();
157}
132EXPORT_SYMBOL_GPL(irq_work_run); 158EXPORT_SYMBOL_GPL(irq_work_run);
133 159
134/* 160/*
@@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work)
143 cpu_relax(); 169 cpu_relax();
144} 170}
145EXPORT_SYMBOL_GPL(irq_work_sync); 171EXPORT_SYMBOL_GPL(irq_work_sync);
172
173#ifdef CONFIG_HOTPLUG_CPU
174static int irq_work_cpu_notify(struct notifier_block *self,
175 unsigned long action, void *hcpu)
176{
177 long cpu = (long)hcpu;
178
179 switch (action) {
180 case CPU_DYING:
181 /* Called from stop_machine */
182 if (WARN_ON_ONCE(cpu != smp_processor_id()))
183 break;
184 __irq_work_run();
185 break;
186 default:
187 break;
188 }
189 return NOTIFY_OK;
190}
191
192static struct notifier_block cpu_notify;
193
194static __init int irq_work_init_cpu_notifier(void)
195{
196 cpu_notify.notifier_call = irq_work_cpu_notify;
197 cpu_notify.priority = 0;
198 register_cpu_notifier(&cpu_notify);
199 return 0;
200}
201device_initcall(irq_work_init_cpu_notifier);
202
203#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0023a87e8de6..56dd34976d7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -38,6 +38,7 @@
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h> 39#include <linux/rwsem.h>
40#include <linux/ptrace.h> 40#include <linux/ptrace.h>
41#include <linux/async.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42 43
43#include <trace/events/module.h> 44#include <trace/events/module.h>
@@ -130,6 +131,14 @@ int __request_module(bool wait, const char *fmt, ...)
130#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 131#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
131 static int kmod_loop_msg; 132 static int kmod_loop_msg;
132 133
134 /*
135 * We don't allow synchronous module loading from async. Module
136 * init may invoke async_synchronize_full() which will end up
137 * waiting for this task which already is waiting for the module
138 * loading to complete, leading to a deadlock.
139 */
140 WARN_ON_ONCE(wait && current_is_async());
141
133 va_start(args, fmt); 142 va_start(args, fmt);
134 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 143 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
135 va_end(args); 144 va_end(args);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396aa409..550294d58a02 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -471,7 +471,6 @@ static LIST_HEAD(unoptimizing_list);
471 471
472static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
475#define OPTIMIZE_DELAY 5 474#define OPTIMIZE_DELAY 5
476 475
477/* 476/*
@@ -552,8 +551,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
552/* Start optimizer after OPTIMIZE_DELAY passed */ 551/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void) 552static __kprobes void kick_kprobe_optimizer(void)
554{ 553{
555 if (!delayed_work_pending(&optimizing_work)) 554 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557} 555}
558 556
559/* Kprobe jump optimizer */ 557/* Kprobe jump optimizer */
@@ -592,16 +590,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
592 /* Step 5: Kick optimizer again if needed */ 590 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) 591 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer(); 592 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598} 593}
599 594
600/* Wait for completing optimization and unoptimization */ 595/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void) 596static __kprobes void wait_for_kprobe_optimizer(void)
602{ 597{
603 if (delayed_work_pending(&optimizing_work)) 598 mutex_lock(&kprobe_mutex);
604 wait_for_completion(&optimizer_comp); 599
600 while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
601 mutex_unlock(&kprobe_mutex);
602
603 /* this will also make optimizing_work execute immmediately */
604 flush_delayed_work(&optimizing_work);
605 /* @optimizing_work might not have been queued yet, relax */
606 cpu_relax();
607
608 mutex_lock(&kprobe_mutex);
609 }
610
611 mutex_unlock(&kprobe_mutex);
605} 612}
606 613
607/* Optimize kprobe if p is ready to be optimized */ 614/* Optimize kprobe if p is ready to be optimized */
@@ -919,7 +926,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
919} 926}
920#endif /* CONFIG_OPTPROBES */ 927#endif /* CONFIG_OPTPROBES */
921 928
922#ifdef KPROBES_CAN_USE_FTRACE 929#ifdef CONFIG_KPROBES_ON_FTRACE
923static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { 930static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
924 .func = kprobe_ftrace_handler, 931 .func = kprobe_ftrace_handler,
925 .flags = FTRACE_OPS_FL_SAVE_REGS, 932 .flags = FTRACE_OPS_FL_SAVE_REGS,
@@ -964,7 +971,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
964 (unsigned long)p->addr, 1, 0); 971 (unsigned long)p->addr, 1, 0);
965 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); 972 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
966} 973}
967#else /* !KPROBES_CAN_USE_FTRACE */ 974#else /* !CONFIG_KPROBES_ON_FTRACE */
968#define prepare_kprobe(p) arch_prepare_kprobe(p) 975#define prepare_kprobe(p) arch_prepare_kprobe(p)
969#define arm_kprobe_ftrace(p) do {} while (0) 976#define arm_kprobe_ftrace(p) do {} while (0)
970#define disarm_kprobe_ftrace(p) do {} while (0) 977#define disarm_kprobe_ftrace(p) do {} while (0)
@@ -1414,12 +1421,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
1414 */ 1421 */
1415 ftrace_addr = ftrace_location((unsigned long)p->addr); 1422 ftrace_addr = ftrace_location((unsigned long)p->addr);
1416 if (ftrace_addr) { 1423 if (ftrace_addr) {
1417#ifdef KPROBES_CAN_USE_FTRACE 1424#ifdef CONFIG_KPROBES_ON_FTRACE
1418 /* Given address is not on the instruction boundary */ 1425 /* Given address is not on the instruction boundary */
1419 if ((unsigned long)p->addr != ftrace_addr) 1426 if ((unsigned long)p->addr != ftrace_addr)
1420 return -EILSEQ; 1427 return -EILSEQ;
1421 p->flags |= KPROBE_FLAG_FTRACE; 1428 p->flags |= KPROBE_FLAG_FTRACE;
1422#else /* !KPROBES_CAN_USE_FTRACE */ 1429#else /* !CONFIG_KPROBES_ON_FTRACE */
1423 return -EINVAL; 1430 return -EINVAL;
1424#endif 1431#endif
1425 } 1432 }
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a307cc9c9526..52f23011b6e0 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,6 +19,7 @@
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/sched/rt.h>
22#include <linux/export.h> 23#include <linux/export.h>
23#include <linux/spinlock.h> 24#include <linux/spinlock.h>
24#include <linux/interrupt.h> 25#include <linux/interrupt.h>
diff --git a/kernel/pid.c b/kernel/pid.c
index de9af600006f..f2c6a6825098 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -331,7 +331,7 @@ out:
331 return pid; 331 return pid;
332 332
333out_unlock: 333out_unlock:
334 spin_unlock(&pidmap_lock); 334 spin_unlock_irq(&pidmap_lock);
335out_free: 335out_free:
336 while (++i <= ns->level) 336 while (++i <= ns->level)
337 free_pidmap(pid->numbers + i); 337 free_pidmap(pid->numbers + i);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index a278cad1d5d6..8fd709c9bb58 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -155,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer,
155 155
156static inline cputime_t prof_ticks(struct task_struct *p) 156static inline cputime_t prof_ticks(struct task_struct *p)
157{ 157{
158 return p->utime + p->stime; 158 cputime_t utime, stime;
159
160 task_cputime(p, &utime, &stime);
161
162 return utime + stime;
159} 163}
160static inline cputime_t virt_ticks(struct task_struct *p) 164static inline cputime_t virt_ticks(struct task_struct *p)
161{ 165{
162 return p->utime; 166 cputime_t utime;
167
168 task_cputime(p, &utime, NULL);
169
170 return utime;
163} 171}
164 172
165static int 173static int
@@ -471,18 +479,23 @@ static void cleanup_timers(struct list_head *head,
471 */ 479 */
472void posix_cpu_timers_exit(struct task_struct *tsk) 480void posix_cpu_timers_exit(struct task_struct *tsk)
473{ 481{
482 cputime_t utime, stime;
483
474 add_device_randomness((const void*) &tsk->se.sum_exec_runtime, 484 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
475 sizeof(unsigned long long)); 485 sizeof(unsigned long long));
486 task_cputime(tsk, &utime, &stime);
476 cleanup_timers(tsk->cpu_timers, 487 cleanup_timers(tsk->cpu_timers,
477 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); 488 utime, stime, tsk->se.sum_exec_runtime);
478 489
479} 490}
480void posix_cpu_timers_exit_group(struct task_struct *tsk) 491void posix_cpu_timers_exit_group(struct task_struct *tsk)
481{ 492{
482 struct signal_struct *const sig = tsk->signal; 493 struct signal_struct *const sig = tsk->signal;
494 cputime_t utime, stime;
483 495
496 task_cputime(tsk, &utime, &stime);
484 cleanup_timers(tsk->signal->cpu_timers, 497 cleanup_timers(tsk->signal->cpu_timers,
485 tsk->utime + sig->utime, tsk->stime + sig->stime, 498 utime + sig->utime, stime + sig->stime,
486 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 499 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
487} 500}
488 501
@@ -1226,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
1226static inline int fastpath_timer_check(struct task_struct *tsk) 1239static inline int fastpath_timer_check(struct task_struct *tsk)
1227{ 1240{
1228 struct signal_struct *sig; 1241 struct signal_struct *sig;
1242 cputime_t utime, stime;
1243
1244 task_cputime(tsk, &utime, &stime);
1229 1245
1230 if (!task_cputime_zero(&tsk->cputime_expires)) { 1246 if (!task_cputime_zero(&tsk->cputime_expires)) {
1231 struct task_cputime task_sample = { 1247 struct task_cputime task_sample = {
1232 .utime = tsk->utime, 1248 .utime = utime,
1233 .stime = tsk->stime, 1249 .stime = stime,
1234 .sum_exec_runtime = tsk->se.sum_exec_runtime 1250 .sum_exec_runtime = tsk->se.sum_exec_runtime
1235 }; 1251 };
1236 1252
@@ -1401,8 +1417,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1401 while (!signal_pending(current)) { 1417 while (!signal_pending(current)) {
1402 if (timer.it.cpu.expires.sched == 0) { 1418 if (timer.it.cpu.expires.sched == 0) {
1403 /* 1419 /*
1404 * Our timer fired and was reset. 1420 * Our timer fired and was reset; the deletion
1421 * below cannot fail.
1405 */ 1422 */
1423 posix_cpu_timer_del(&timer);
1406 spin_unlock_irq(&timer.it_lock); 1424 spin_unlock_irq(&timer.it_lock);
1407 return 0; 1425 return 0;
1408 } 1426 }
@@ -1420,9 +1438,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1420 * We were interrupted by a signal. 1438 * We were interrupted by a signal.
1421 */ 1439 */
1422 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); 1440 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1423 posix_cpu_timer_set(&timer, 0, &zero_it, it); 1441 error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
1442 if (!error) {
1443 /*
1444 * Timer is now unarmed, deletion can not fail.
1445 */
1446 posix_cpu_timer_del(&timer);
1447 }
1424 spin_unlock_irq(&timer.it_lock); 1448 spin_unlock_irq(&timer.it_lock);
1425 1449
1450 while (error == TIMER_RETRY) {
1451 /*
1452 * We need to handle the case when the timer was or is in the
1453 * middle of firing. In other cases we already freed
1454 * resources.
1455 */
1456 spin_lock_irq(&timer.it_lock);
1457 error = posix_cpu_timer_del(&timer);
1458 spin_unlock_irq(&timer.it_lock);
1459 }
1460
1426 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { 1461 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1427 /* 1462 /*
1428 * It actually did fire already. 1463 * It actually did fire already.
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae6b701..10349d5f2ec3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -997,7 +997,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
997 997
998 err = kc->clock_adj(which_clock, &ktx); 998 err = kc->clock_adj(which_clock, &ktx);
999 999
1000 if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) 1000 if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
1001 return -EFAULT; 1001 return -EFAULT;
1002 1002
1003 return err; 1003 return err;
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index ca304046d9e2..c6422ffeda9a 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend);
66 66
67void queue_up_suspend_work(void) 67void queue_up_suspend_work(void)
68{ 68{
69 if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) 69 if (autosleep_state > PM_SUSPEND_ON)
70 queue_work(autosleep_wq, &suspend_work); 70 queue_work(autosleep_wq, &suspend_work);
71} 71}
72 72
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c16f9167de1..d77663bfedeb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
313static suspend_state_t decode_state(const char *buf, size_t n) 313static suspend_state_t decode_state(const char *buf, size_t n)
314{ 314{
315#ifdef CONFIG_SUSPEND 315#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_STANDBY; 316 suspend_state_t state = PM_SUSPEND_MIN;
317 const char * const *s; 317 const char * const *s;
318#endif 318#endif
319 char *p; 319 char *p;
@@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match);
553 553
554#endif /* CONFIG_PM_TRACE */ 554#endif /* CONFIG_PM_TRACE */
555 555
556#ifdef CONFIG_FREEZER
557static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
558 struct kobj_attribute *attr, char *buf)
559{
560 return sprintf(buf, "%u\n", freeze_timeout_msecs);
561}
562
563static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
564 struct kobj_attribute *attr,
565 const char *buf, size_t n)
566{
567 unsigned long val;
568
569 if (kstrtoul(buf, 10, &val))
570 return -EINVAL;
571
572 freeze_timeout_msecs = val;
573 return n;
574}
575
576power_attr(pm_freeze_timeout);
577
578 #endif /* CONFIG_FREEZER */
579
556static struct attribute * g[] = { 580static struct attribute * g[] = {
557 &state_attr.attr, 581 &state_attr.attr,
558#ifdef CONFIG_PM_TRACE 582#ifdef CONFIG_PM_TRACE
@@ -576,6 +600,9 @@ static struct attribute * g[] = {
576 &pm_print_times_attr.attr, 600 &pm_print_times_attr.attr,
577#endif 601#endif
578#endif 602#endif
603#ifdef CONFIG_FREEZER
604 &pm_freeze_timeout_attr.attr,
605#endif
579 NULL, 606 NULL,
580}; 607};
581 608
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d5a258b60c6f..98088e0e71e8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -21,7 +21,7 @@
21/* 21/*
22 * Timeout for stopping processes 22 * Timeout for stopping processes
23 */ 23 */
24#define TIMEOUT (20 * HZ) 24unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
25 25
26static int try_to_freeze_tasks(bool user_only) 26static int try_to_freeze_tasks(bool user_only)
27{ 27{
@@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only)
36 36
37 do_gettimeofday(&start); 37 do_gettimeofday(&start);
38 38
39 end_time = jiffies + TIMEOUT; 39 end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
40 40
41 if (!user_only) 41 if (!user_only)
42 freeze_workqueues_begin(); 42 freeze_workqueues_begin();
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9322ff7eaad6..587dddeebf15 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
359 return; 359 return;
360 } 360 }
361 361
362 if (delayed_work_pending(&req->work)) 362 cancel_delayed_work_sync(&req->work);
363 cancel_delayed_work_sync(&req->work);
364 363
365 if (new_value != req->node.prio) 364 if (new_value != req->node.prio)
366 pm_qos_update_target( 365 pm_qos_update_target(
@@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
386 "%s called for unknown object.", __func__)) 385 "%s called for unknown object.", __func__))
387 return; 386 return;
388 387
389 if (delayed_work_pending(&req->work)) 388 cancel_delayed_work_sync(&req->work);
390 cancel_delayed_work_sync(&req->work);
391 389
392 if (new_value != req->node.prio) 390 if (new_value != req->node.prio)
393 pm_qos_update_target( 391 pm_qos_update_target(
@@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
416 return; 414 return;
417 } 415 }
418 416
419 if (delayed_work_pending(&req->work)) 417 cancel_delayed_work_sync(&req->work);
420 cancel_delayed_work_sync(&req->work);
421 418
422 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 419 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
423 &req->node, PM_QOS_REMOVE_REQ, 420 &req->node, PM_QOS_REMOVE_REQ,
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..d4feda084a3a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,12 +30,38 @@
30#include "power.h" 30#include "power.h"
31 31
32const char *const pm_states[PM_SUSPEND_MAX] = { 32const char *const pm_states[PM_SUSPEND_MAX] = {
33 [PM_SUSPEND_FREEZE] = "freeze",
33 [PM_SUSPEND_STANDBY] = "standby", 34 [PM_SUSPEND_STANDBY] = "standby",
34 [PM_SUSPEND_MEM] = "mem", 35 [PM_SUSPEND_MEM] = "mem",
35}; 36};
36 37
37static const struct platform_suspend_ops *suspend_ops; 38static const struct platform_suspend_ops *suspend_ops;
38 39
40static bool need_suspend_ops(suspend_state_t state)
41{
42 return !!(state > PM_SUSPEND_FREEZE);
43}
44
45static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
46static bool suspend_freeze_wake;
47
48static void freeze_begin(void)
49{
50 suspend_freeze_wake = false;
51}
52
53static void freeze_enter(void)
54{
55 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
56}
57
58void freeze_wake(void)
59{
60 suspend_freeze_wake = true;
61 wake_up(&suspend_freeze_wait_head);
62}
63EXPORT_SYMBOL_GPL(freeze_wake);
64
39/** 65/**
40 * suspend_set_ops - Set the global suspend method table. 66 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Suspend operations to use. 67 * @ops: Suspend operations to use.
@@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
50 76
51bool valid_state(suspend_state_t state) 77bool valid_state(suspend_state_t state)
52{ 78{
79 if (state == PM_SUSPEND_FREEZE)
80 return true;
53 /* 81 /*
54 * All states need lowlevel support and need to be valid to the lowlevel 82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need lowlevel
83 * support and need to be valid to the lowlevel
55 * implementation, no valid callback implies that none are valid. 84 * implementation, no valid callback implies that none are valid.
56 */ 85 */
57 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); 86 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
@@ -89,11 +118,11 @@ static int suspend_test(int level)
89 * hibernation). Run suspend notifiers, allocate the "suspend" console and 118 * hibernation). Run suspend notifiers, allocate the "suspend" console and
90 * freeze processes. 119 * freeze processes.
91 */ 120 */
92static int suspend_prepare(void) 121static int suspend_prepare(suspend_state_t state)
93{ 122{
94 int error; 123 int error;
95 124
96 if (!suspend_ops || !suspend_ops->enter) 125 if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
97 return -EPERM; 126 return -EPERM;
98 127
99 pm_prepare_console(); 128 pm_prepare_console();
@@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
137{ 166{
138 int error; 167 int error;
139 168
140 if (suspend_ops->prepare) { 169 if (need_suspend_ops(state) && suspend_ops->prepare) {
141 error = suspend_ops->prepare(); 170 error = suspend_ops->prepare();
142 if (error) 171 if (error)
143 goto Platform_finish; 172 goto Platform_finish;
@@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
149 goto Platform_finish; 178 goto Platform_finish;
150 } 179 }
151 180
152 if (suspend_ops->prepare_late) { 181 if (need_suspend_ops(state) && suspend_ops->prepare_late) {
153 error = suspend_ops->prepare_late(); 182 error = suspend_ops->prepare_late();
154 if (error) 183 if (error)
155 goto Platform_wake; 184 goto Platform_wake;
156 } 185 }
157 186
187 /*
188 * PM_SUSPEND_FREEZE equals
189 * frozen processes + suspended devices + idle processors.
190 * Thus we should invoke freeze_enter() soon after
191 * all the devices are suspended.
192 */
193 if (state == PM_SUSPEND_FREEZE) {
194 freeze_enter();
195 goto Platform_wake;
196 }
197
158 if (suspend_test(TEST_PLATFORM)) 198 if (suspend_test(TEST_PLATFORM))
159 goto Platform_wake; 199 goto Platform_wake;
160 200
@@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
182 enable_nonboot_cpus(); 222 enable_nonboot_cpus();
183 223
184 Platform_wake: 224 Platform_wake:
185 if (suspend_ops->wake) 225 if (need_suspend_ops(state) && suspend_ops->wake)
186 suspend_ops->wake(); 226 suspend_ops->wake();
187 227
188 dpm_resume_start(PMSG_RESUME); 228 dpm_resume_start(PMSG_RESUME);
189 229
190 Platform_finish: 230 Platform_finish:
191 if (suspend_ops->finish) 231 if (need_suspend_ops(state) && suspend_ops->finish)
192 suspend_ops->finish(); 232 suspend_ops->finish();
193 233
194 return error; 234 return error;
@@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state)
203 int error; 243 int error;
204 bool wakeup = false; 244 bool wakeup = false;
205 245
206 if (!suspend_ops) 246 if (need_suspend_ops(state) && !suspend_ops)
207 return -ENOSYS; 247 return -ENOSYS;
208 248
209 trace_machine_suspend(state); 249 trace_machine_suspend(state);
210 if (suspend_ops->begin) { 250 if (need_suspend_ops(state) && suspend_ops->begin) {
211 error = suspend_ops->begin(state); 251 error = suspend_ops->begin(state);
212 if (error) 252 if (error)
213 goto Close; 253 goto Close;
@@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state)
226 266
227 do { 267 do {
228 error = suspend_enter(state, &wakeup); 268 error = suspend_enter(state, &wakeup);
229 } while (!error && !wakeup 269 } while (!error && !wakeup && need_suspend_ops(state)
230 && suspend_ops->suspend_again && suspend_ops->suspend_again()); 270 && suspend_ops->suspend_again && suspend_ops->suspend_again());
231 271
232 Resume_devices: 272 Resume_devices:
@@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state)
236 ftrace_start(); 276 ftrace_start();
237 resume_console(); 277 resume_console();
238 Close: 278 Close:
239 if (suspend_ops->end) 279 if (need_suspend_ops(state) && suspend_ops->end)
240 suspend_ops->end(); 280 suspend_ops->end();
241 trace_machine_suspend(PWR_EVENT_EXIT); 281 trace_machine_suspend(PWR_EVENT_EXIT);
242 return error; 282 return error;
243 283
244 Recover_platform: 284 Recover_platform:
245 if (suspend_ops->recover) 285 if (need_suspend_ops(state) && suspend_ops->recover)
246 suspend_ops->recover(); 286 suspend_ops->recover();
247 goto Resume_devices; 287 goto Resume_devices;
248} 288}
@@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state)
278 if (!mutex_trylock(&pm_mutex)) 318 if (!mutex_trylock(&pm_mutex))
279 return -EBUSY; 319 return -EBUSY;
280 320
321 if (state == PM_SUSPEND_FREEZE)
322 freeze_begin();
323
281 printk(KERN_INFO "PM: Syncing filesystems ... "); 324 printk(KERN_INFO "PM: Syncing filesystems ... ");
282 sys_sync(); 325 sys_sync();
283 printk("done.\n"); 326 printk("done.\n");
284 327
285 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 328 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
286 error = suspend_prepare(); 329 error = suspend_prepare(state);
287 if (error) 330 if (error)
288 goto Unlock; 331 goto Unlock;
289 332
diff --git a/kernel/printk.c b/kernel/printk.c
index 267ce780abe8..f24633afa46a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -42,6 +42,7 @@
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h> 44#include <linux/poll.h>
45#include <linux/irq_work.h>
45 46
46#include <asm/uaccess.h> 47#include <asm/uaccess.h>
47 48
@@ -1959,30 +1960,32 @@ int is_console_locked(void)
1959static DEFINE_PER_CPU(int, printk_pending); 1960static DEFINE_PER_CPU(int, printk_pending);
1960static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); 1961static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1961 1962
1962void printk_tick(void) 1963static void wake_up_klogd_work_func(struct irq_work *irq_work)
1963{ 1964{
1964 if (__this_cpu_read(printk_pending)) { 1965 int pending = __this_cpu_xchg(printk_pending, 0);
1965 int pending = __this_cpu_xchg(printk_pending, 0); 1966
1966 if (pending & PRINTK_PENDING_SCHED) { 1967 if (pending & PRINTK_PENDING_SCHED) {
1967 char *buf = __get_cpu_var(printk_sched_buf); 1968 char *buf = __get_cpu_var(printk_sched_buf);
1968 printk(KERN_WARNING "[sched_delayed] %s", buf); 1969 printk(KERN_WARNING "[sched_delayed] %s", buf);
1969 }
1970 if (pending & PRINTK_PENDING_WAKEUP)
1971 wake_up_interruptible(&log_wait);
1972 } 1970 }
1973}
1974 1971
1975int printk_needs_cpu(int cpu) 1972 if (pending & PRINTK_PENDING_WAKEUP)
1976{ 1973 wake_up_interruptible(&log_wait);
1977 if (cpu_is_offline(cpu))
1978 printk_tick();
1979 return __this_cpu_read(printk_pending);
1980} 1974}
1981 1975
1976static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
1977 .func = wake_up_klogd_work_func,
1978 .flags = IRQ_WORK_LAZY,
1979};
1980
1982void wake_up_klogd(void) 1981void wake_up_klogd(void)
1983{ 1982{
1984 if (waitqueue_active(&log_wait)) 1983 preempt_disable();
1984 if (waitqueue_active(&log_wait)) {
1985 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1985 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1986 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
1987 }
1988 preempt_enable();
1986} 1989}
1987 1990
1988static void console_cont_flush(char *text, size_t size) 1991static void console_cont_flush(char *text, size_t size)
@@ -2462,6 +2465,7 @@ int printk_sched(const char *fmt, ...)
2462 va_end(args); 2465 va_end(args);
2463 2466
2464 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); 2467 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
2468 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2465 local_irq_restore(flags); 2469 local_irq_restore(flags);
2466 2470
2467 return r; 2471 return r;
diff --git a/kernel/profile.c b/kernel/profile.c
index 1f391819c42f..dc3384ee874e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -37,9 +37,6 @@ struct profile_hit {
37#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) 37#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
38#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) 38#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
39 39
40/* Oprofile timer tick hook */
41static int (*timer_hook)(struct pt_regs *) __read_mostly;
42
43static atomic_t *prof_buffer; 40static atomic_t *prof_buffer;
44static unsigned long prof_len, prof_shift; 41static unsigned long prof_len, prof_shift;
45 42
@@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
208} 205}
209EXPORT_SYMBOL_GPL(profile_event_unregister); 206EXPORT_SYMBOL_GPL(profile_event_unregister);
210 207
211int register_timer_hook(int (*hook)(struct pt_regs *))
212{
213 if (timer_hook)
214 return -EBUSY;
215 timer_hook = hook;
216 return 0;
217}
218EXPORT_SYMBOL_GPL(register_timer_hook);
219
220void unregister_timer_hook(int (*hook)(struct pt_regs *))
221{
222 WARN_ON(hook != timer_hook);
223 timer_hook = NULL;
224 /* make sure all CPUs see the NULL hook */
225 synchronize_sched(); /* Allow ongoing interrupts to complete. */
226}
227EXPORT_SYMBOL_GPL(unregister_timer_hook);
228
229
230#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
231/* 209/*
232 * Each cpu has a pair of open-addressed hashtables for pending 210 * Each cpu has a pair of open-addressed hashtables for pending
@@ -436,8 +414,6 @@ void profile_tick(int type)
436{ 414{
437 struct pt_regs *regs = get_irq_regs(); 415 struct pt_regs *regs = get_irq_regs();
438 416
439 if (type == CPU_PROFILING && timer_hook)
440 timer_hook(regs);
441 if (!user_mode(regs) && prof_cpu_mask != NULL && 417 if (!user_mode(regs) && prof_cpu_mask != NULL &&
442 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) 418 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
443 profile_hit(type, (void *)profile_pc(regs)); 419 profile_hit(type, (void *)profile_pc(regs));
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6cbeaae4406d..acbd28424d81 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -712,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
712 kiov->iov_len, kiov->iov_base); 712 kiov->iov_len, kiov->iov_base);
713} 713}
714 714
715/*
716 * This is declared in linux/regset.h and defined in machine-dependent
717 * code. We put the export here, near the primary machine-neutral use,
718 * to ensure no machine forgets it.
719 */
720EXPORT_SYMBOL_GPL(task_user_regset_view);
715#endif 721#endif
716 722
717int ptrace_request(struct task_struct *child, long request, 723int ptrace_request(struct task_struct *child, long request,
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 20dfba576c2b..7f8e7590e3e5 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
111 111
112extern int rcu_expedited; 112extern int rcu_expedited;
113 113
114#ifdef CONFIG_RCU_STALL_COMMON
115
116extern int rcu_cpu_stall_suppress;
117int rcu_jiffies_till_stall_check(void);
118
119#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
120
114#endif /* __LINUX_RCU_H */ 121#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a2cf76177b44..48ab70384a4c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -404,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
404#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 404#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
405 405
406#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 406#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
407void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) 407void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp,
408 unsigned long secs,
409 unsigned long c_old, unsigned long c)
408{ 410{
409 trace_rcu_torture_read(rcutorturename, rhp); 411 trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
410} 412}
411EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); 413EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
412#else 414#else
413#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) 415#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
416 do { } while (0)
414#endif 417#endif
418
419#ifdef CONFIG_RCU_STALL_COMMON
420
421#ifdef CONFIG_PROVE_RCU
422#define RCU_STALL_DELAY_DELTA (5 * HZ)
423#else
424#define RCU_STALL_DELAY_DELTA 0
425#endif
426
427int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
428int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
429
430module_param(rcu_cpu_stall_suppress, int, 0644);
431module_param(rcu_cpu_stall_timeout, int, 0644);
432
433int rcu_jiffies_till_stall_check(void)
434{
435 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
436
437 /*
438 * Limit check must be consistent with the Kconfig limits
439 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
440 */
441 if (till_stall_check < 3) {
442 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
443 till_stall_check = 3;
444 } else if (till_stall_check > 300) {
445 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
446 till_stall_check = 300;
447 }
448 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
449}
450
451static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
452{
453 rcu_cpu_stall_suppress = 1;
454 return NOTIFY_DONE;
455}
456
457static struct notifier_block rcu_panic_block = {
458 .notifier_call = rcu_panic,
459};
460
461static int __init check_cpu_stall_init(void)
462{
463 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
464 return 0;
465}
466early_initcall(check_cpu_stall_init);
467
468#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e7dce58f9c2a..a0714a51b6d7 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 52 struct rcu_ctrlblk *rcp);
53 53
54#include "rcutiny_plugin.h"
55
56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 54static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 55
56#include "rcutiny_plugin.h"
57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long newval) 59static void rcu_idle_enter_common(long long newval)
60{ 60{
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
193 * interrupts don't count, we must be running at the first interrupt 193 * interrupts don't count, we must be running at the first interrupt
194 * level. 194 * level.
195 */ 195 */
196int rcu_is_cpu_rrupt_from_idle(void) 196static int rcu_is_cpu_rrupt_from_idle(void)
197{ 197{
198 return rcu_dynticks_nesting <= 1; 198 return rcu_dynticks_nesting <= 1;
199} 199}
@@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void)
205 */ 205 */
206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
207{ 207{
208 reset_cpu_stall_ticks(rcp);
208 if (rcp->rcucblist != NULL && 209 if (rcp->rcucblist != NULL &&
209 rcp->donetail != rcp->curtail) { 210 rcp->donetail != rcp->curtail) {
210 rcp->donetail = rcp->curtail; 211 rcp->donetail = rcp->curtail;
@@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu)
251 */ 252 */
252void rcu_check_callbacks(int cpu, int user) 253void rcu_check_callbacks(int cpu, int user)
253{ 254{
255 check_cpu_stalls();
254 if (user || rcu_is_cpu_rrupt_from_idle()) 256 if (user || rcu_is_cpu_rrupt_from_idle())
255 rcu_sched_qs(cpu); 257 rcu_sched_qs(cpu);
256 else if (!in_softirq()) 258 else if (!in_softirq())
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f85016a2309b..8a233002faeb 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -33,6 +33,9 @@ struct rcu_ctrlblk {
33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
34 struct rcu_head **curtail; /* ->next pointer of last CB. */ 34 struct rcu_head **curtail; /* ->next pointer of last CB. */
35 RCU_TRACE(long qlen); /* Number of pending CBs. */ 35 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
36 RCU_TRACE(char *name); /* Name of RCU type. */ 39 RCU_TRACE(char *name); /* Name of RCU type. */
37}; 40};
38 41
@@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly;
54EXPORT_SYMBOL_GPL(rcu_scheduler_active); 57EXPORT_SYMBOL_GPL(rcu_scheduler_active);
55#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 58#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
56 59
60#ifdef CONFIG_RCU_TRACE
61
62static void check_cpu_stall(struct rcu_ctrlblk *rcp)
63{
64 unsigned long j;
65 unsigned long js;
66
67 if (rcu_cpu_stall_suppress)
68 return;
69 rcp->ticks_this_gp++;
70 j = jiffies;
71 js = rcp->jiffies_stall;
72 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
73 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
74 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
75 jiffies - rcp->gp_start, rcp->qlen);
76 dump_stack();
77 }
78 if (*rcp->curtail && ULONG_CMP_GE(j, js))
79 rcp->jiffies_stall = jiffies +
80 3 * rcu_jiffies_till_stall_check() + 3;
81 else if (ULONG_CMP_GE(j, js))
82 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
83}
84
85static void check_cpu_stall_preempt(void);
86
87#endif /* #ifdef CONFIG_RCU_TRACE */
88
89static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
90{
91#ifdef CONFIG_RCU_TRACE
92 rcp->ticks_this_gp = 0;
93 rcp->gp_start = jiffies;
94 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
95#endif /* #ifdef CONFIG_RCU_TRACE */
96}
97
98static void check_cpu_stalls(void)
99{
100 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
101 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
102 RCU_TRACE(check_cpu_stall_preempt());
103}
104
57#ifdef CONFIG_TINY_PREEMPT_RCU 105#ifdef CONFIG_TINY_PREEMPT_RCU
58 106
59#include <linux/delay.h> 107#include <linux/delay.h>
@@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void)
448 /* Official start of GP. */ 496 /* Official start of GP. */
449 rcu_preempt_ctrlblk.gpnum++; 497 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); 498 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
499 reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
451 500
452 /* Any blocked RCU readers block new GP. */ 501 /* Any blocked RCU readers block new GP. */
453 if (rcu_preempt_blocked_readers_any()) 502 if (rcu_preempt_blocked_readers_any())
@@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney");
1054MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); 1103MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1055MODULE_LICENSE("GPL"); 1104MODULE_LICENSE("GPL");
1056 1105
1106static void check_cpu_stall_preempt(void)
1107{
1108#ifdef CONFIG_TINY_PREEMPT_RCU
1109 check_cpu_stall(&rcu_preempt_ctrlblk.rcb);
1110#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */
1111}
1112
1057#endif /* #ifdef CONFIG_RCU_TRACE */ 1113#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 31dea01c85fd..e1f3a8c96724 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -46,6 +46,7 @@
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/trace_clock.h>
49#include <asm/byteorder.h> 50#include <asm/byteorder.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
@@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
207#define rcu_can_boost() 0 208#define rcu_can_boost() 0
208#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 209#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
209 210
211#ifdef CONFIG_RCU_TRACE
212static u64 notrace rcu_trace_clock_local(void)
213{
214 u64 ts = trace_clock_local();
215 unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
216 return ts;
217}
218#else /* #ifdef CONFIG_RCU_TRACE */
219static u64 notrace rcu_trace_clock_local(void)
220{
221 return 0ULL;
222}
223#endif /* #else #ifdef CONFIG_RCU_TRACE */
224
210static unsigned long shutdown_time; /* jiffies to system shutdown. */ 225static unsigned long shutdown_time; /* jiffies to system shutdown. */
211static unsigned long boost_starttime; /* jiffies of next boost test start. */ 226static unsigned long boost_starttime; /* jiffies of next boost test start. */
212DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 227DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -845,7 +860,7 @@ static int rcu_torture_boost(void *arg)
845 /* Wait for the next test interval. */ 860 /* Wait for the next test interval. */
846 oldstarttime = boost_starttime; 861 oldstarttime = boost_starttime;
847 while (ULONG_CMP_LT(jiffies, oldstarttime)) { 862 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
848 schedule_timeout_uninterruptible(1); 863 schedule_timeout_interruptible(oldstarttime - jiffies);
849 rcu_stutter_wait("rcu_torture_boost"); 864 rcu_stutter_wait("rcu_torture_boost");
850 if (kthread_should_stop() || 865 if (kthread_should_stop() ||
851 fullstop != FULLSTOP_DONTSTOP) 866 fullstop != FULLSTOP_DONTSTOP)
@@ -1028,7 +1043,6 @@ void rcutorture_trace_dump(void)
1028 return; 1043 return;
1029 if (atomic_xchg(&beenhere, 1) != 0) 1044 if (atomic_xchg(&beenhere, 1) != 0)
1030 return; 1045 return;
1031 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
1032 ftrace_dump(DUMP_ALL); 1046 ftrace_dump(DUMP_ALL);
1033} 1047}
1034 1048
@@ -1042,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused)
1042{ 1056{
1043 int idx; 1057 int idx;
1044 int completed; 1058 int completed;
1059 int completed_end;
1045 static DEFINE_RCU_RANDOM(rand); 1060 static DEFINE_RCU_RANDOM(rand);
1046 static DEFINE_SPINLOCK(rand_lock); 1061 static DEFINE_SPINLOCK(rand_lock);
1047 struct rcu_torture *p; 1062 struct rcu_torture *p;
1048 int pipe_count; 1063 int pipe_count;
1064 unsigned long long ts;
1049 1065
1050 idx = cur_ops->readlock(); 1066 idx = cur_ops->readlock();
1051 completed = cur_ops->completed(); 1067 completed = cur_ops->completed();
1068 ts = rcu_trace_clock_local();
1052 p = rcu_dereference_check(rcu_torture_current, 1069 p = rcu_dereference_check(rcu_torture_current,
1053 rcu_read_lock_bh_held() || 1070 rcu_read_lock_bh_held() ||
1054 rcu_read_lock_sched_held() || 1071 rcu_read_lock_sched_held() ||
@@ -1058,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused)
1058 cur_ops->readunlock(idx); 1075 cur_ops->readunlock(idx);
1059 return; 1076 return;
1060 } 1077 }
1061 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1062 if (p->rtort_mbtest == 0) 1078 if (p->rtort_mbtest == 0)
1063 atomic_inc(&n_rcu_torture_mberror); 1079 atomic_inc(&n_rcu_torture_mberror);
1064 spin_lock(&rand_lock); 1080 spin_lock(&rand_lock);
@@ -1071,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused)
1071 /* Should not happen, but... */ 1087 /* Should not happen, but... */
1072 pipe_count = RCU_TORTURE_PIPE_LEN; 1088 pipe_count = RCU_TORTURE_PIPE_LEN;
1073 } 1089 }
1074 if (pipe_count > 1) 1090 completed_end = cur_ops->completed();
1091 if (pipe_count > 1) {
1092 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
1093 completed, completed_end);
1075 rcutorture_trace_dump(); 1094 rcutorture_trace_dump();
1095 }
1076 __this_cpu_inc(rcu_torture_count[pipe_count]); 1096 __this_cpu_inc(rcu_torture_count[pipe_count]);
1077 completed = cur_ops->completed() - completed; 1097 completed = completed_end - completed;
1078 if (completed > RCU_TORTURE_PIPE_LEN) { 1098 if (completed > RCU_TORTURE_PIPE_LEN) {
1079 /* Should not happen, but... */ 1099 /* Should not happen, but... */
1080 completed = RCU_TORTURE_PIPE_LEN; 1100 completed = RCU_TORTURE_PIPE_LEN;
@@ -1094,11 +1114,13 @@ static int
1094rcu_torture_reader(void *arg) 1114rcu_torture_reader(void *arg)
1095{ 1115{
1096 int completed; 1116 int completed;
1117 int completed_end;
1097 int idx; 1118 int idx;
1098 DEFINE_RCU_RANDOM(rand); 1119 DEFINE_RCU_RANDOM(rand);
1099 struct rcu_torture *p; 1120 struct rcu_torture *p;
1100 int pipe_count; 1121 int pipe_count;
1101 struct timer_list t; 1122 struct timer_list t;
1123 unsigned long long ts;
1102 1124
1103 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 1125 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
1104 set_user_nice(current, 19); 1126 set_user_nice(current, 19);
@@ -1112,6 +1134,7 @@ rcu_torture_reader(void *arg)
1112 } 1134 }
1113 idx = cur_ops->readlock(); 1135 idx = cur_ops->readlock();
1114 completed = cur_ops->completed(); 1136 completed = cur_ops->completed();
1137 ts = rcu_trace_clock_local();
1115 p = rcu_dereference_check(rcu_torture_current, 1138 p = rcu_dereference_check(rcu_torture_current,
1116 rcu_read_lock_bh_held() || 1139 rcu_read_lock_bh_held() ||
1117 rcu_read_lock_sched_held() || 1140 rcu_read_lock_sched_held() ||
@@ -1122,7 +1145,6 @@ rcu_torture_reader(void *arg)
1122 schedule_timeout_interruptible(HZ); 1145 schedule_timeout_interruptible(HZ);
1123 continue; 1146 continue;
1124 } 1147 }
1125 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1126 if (p->rtort_mbtest == 0) 1148 if (p->rtort_mbtest == 0)
1127 atomic_inc(&n_rcu_torture_mberror); 1149 atomic_inc(&n_rcu_torture_mberror);
1128 cur_ops->read_delay(&rand); 1150 cur_ops->read_delay(&rand);
@@ -1132,10 +1154,14 @@ rcu_torture_reader(void *arg)
1132 /* Should not happen, but... */ 1154 /* Should not happen, but... */
1133 pipe_count = RCU_TORTURE_PIPE_LEN; 1155 pipe_count = RCU_TORTURE_PIPE_LEN;
1134 } 1156 }
1135 if (pipe_count > 1) 1157 completed_end = cur_ops->completed();
1158 if (pipe_count > 1) {
1159 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
1160 ts, completed, completed_end);
1136 rcutorture_trace_dump(); 1161 rcutorture_trace_dump();
1162 }
1137 __this_cpu_inc(rcu_torture_count[pipe_count]); 1163 __this_cpu_inc(rcu_torture_count[pipe_count]);
1138 completed = cur_ops->completed() - completed; 1164 completed = completed_end - completed;
1139 if (completed > RCU_TORTURE_PIPE_LEN) { 1165 if (completed > RCU_TORTURE_PIPE_LEN) {
1140 /* Should not happen, but... */ 1166 /* Should not happen, but... */
1141 completed = RCU_TORTURE_PIPE_LEN; 1167 completed = RCU_TORTURE_PIPE_LEN;
@@ -1301,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void)
1301 set_cpus_allowed_ptr(reader_tasks[i], 1327 set_cpus_allowed_ptr(reader_tasks[i],
1302 shuffle_tmp_mask); 1328 shuffle_tmp_mask);
1303 } 1329 }
1304
1305 if (fakewriter_tasks) { 1330 if (fakewriter_tasks) {
1306 for (i = 0; i < nfakewriters; i++) 1331 for (i = 0; i < nfakewriters; i++)
1307 if (fakewriter_tasks[i]) 1332 if (fakewriter_tasks[i])
1308 set_cpus_allowed_ptr(fakewriter_tasks[i], 1333 set_cpus_allowed_ptr(fakewriter_tasks[i],
1309 shuffle_tmp_mask); 1334 shuffle_tmp_mask);
1310 } 1335 }
1311
1312 if (writer_task) 1336 if (writer_task)
1313 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); 1337 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
1314
1315 if (stats_task) 1338 if (stats_task)
1316 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); 1339 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
1340 if (stutter_task)
1341 set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
1342 if (fqs_task)
1343 set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
1344 if (shutdown_task)
1345 set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
1346#ifdef CONFIG_HOTPLUG_CPU
1347 if (onoff_task)
1348 set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
1349#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1350 if (stall_task)
1351 set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
1352 if (barrier_cbs_tasks)
1353 for (i = 0; i < n_barrier_cbs; i++)
1354 if (barrier_cbs_tasks[i])
1355 set_cpus_allowed_ptr(barrier_cbs_tasks[i],
1356 shuffle_tmp_mask);
1357 if (barrier_task)
1358 set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
1317 1359
1318 if (rcu_idle_cpu == -1) 1360 if (rcu_idle_cpu == -1)
1319 rcu_idle_cpu = num_online_cpus() - 1; 1361 rcu_idle_cpu = num_online_cpus() - 1;
@@ -1749,7 +1791,7 @@ static int rcu_torture_barrier_init(void)
1749 barrier_cbs_wq = 1791 barrier_cbs_wq =
1750 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), 1792 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1751 GFP_KERNEL); 1793 GFP_KERNEL);
1752 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) 1794 if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
1753 return -ENOMEM; 1795 return -ENOMEM;
1754 for (i = 0; i < n_barrier_cbs; i++) { 1796 for (i = 0; i < n_barrier_cbs; i++) {
1755 init_waitqueue_head(&barrier_cbs_wq[i]); 1797 init_waitqueue_head(&barrier_cbs_wq[i]);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e441b77b614e..5b8ad827fd86 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
105 * The rcu_scheduler_active variable transitions from zero to one just 105 * The rcu_scheduler_active variable transitions from zero to one just
106 * before the first task is spawned. So when this variable is zero, RCU 106 * before the first task is spawned. So when this variable is zero, RCU
107 * can assume that there is but one task, allowing RCU to (for example) 107 * can assume that there is but one task, allowing RCU to (for example)
108 * optimized synchronize_sched() to a simple barrier(). When this variable 108 * optimize synchronize_sched() to a simple barrier(). When this variable
109 * is one, RCU must actually do all the hard work required to detect real 109 * is one, RCU must actually do all the hard work required to detect real
110 * grace periods. This variable is also used to suppress boot-time false 110 * grace periods. This variable is also used to suppress boot-time false
111 * positives from lockdep-RCU error checking. 111 * positives from lockdep-RCU error checking.
@@ -217,12 +217,6 @@ module_param(blimit, long, 0444);
217module_param(qhimark, long, 0444); 217module_param(qhimark, long, 0444);
218module_param(qlowmark, long, 0444); 218module_param(qlowmark, long, 0444);
219 219
220int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
221int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
222
223module_param(rcu_cpu_stall_suppress, int, 0644);
224module_param(rcu_cpu_stall_timeout, int, 0644);
225
226static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; 220static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
227static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; 221static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
228 222
@@ -305,17 +299,27 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
305} 299}
306 300
307/* 301/*
308 * Does the current CPU require a yet-as-unscheduled grace period? 302 * Does the current CPU require a not-yet-started grace period?
303 * The caller must have disabled interrupts to prevent races with
304 * normal callback registry.
309 */ 305 */
310static int 306static int
311cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 307cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
312{ 308{
313 struct rcu_head **ntp; 309 int i;
314 310
315 ntp = rdp->nxttail[RCU_DONE_TAIL + 311 if (rcu_gp_in_progress(rsp))
316 (ACCESS_ONCE(rsp->completed) != rdp->completed)]; 312 return 0; /* No, a grace period is already in progress. */
317 return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && 313 if (!rdp->nxttail[RCU_NEXT_TAIL])
318 !rcu_gp_in_progress(rsp); 314 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
316 return 1; /* Yes, this CPU has newly registered callbacks. */
317 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
318 if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
319 ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
320 rdp->nxtcompleted[i]))
321 return 1; /* Yes, CBs for future grace period. */
322 return 0; /* No grace period needed. */
319} 323}
320 324
321/* 325/*
@@ -336,7 +340,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
336static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 340static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
337 bool user) 341 bool user)
338{ 342{
339 trace_rcu_dyntick("Start", oldval, 0); 343 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
340 if (!user && !is_idle_task(current)) { 344 if (!user && !is_idle_task(current)) {
341 struct task_struct *idle = idle_task(smp_processor_id()); 345 struct task_struct *idle = idle_task(smp_processor_id());
342 346
@@ -727,7 +731,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
727 * interrupt from idle, return true. The caller must have at least 731 * interrupt from idle, return true. The caller must have at least
728 * disabled preemption. 732 * disabled preemption.
729 */ 733 */
730int rcu_is_cpu_rrupt_from_idle(void) 734static int rcu_is_cpu_rrupt_from_idle(void)
731{ 735{
732 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 736 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
733} 737}
@@ -793,28 +797,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
793 return 0; 797 return 0;
794} 798}
795 799
796static int jiffies_till_stall_check(void)
797{
798 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
799
800 /*
801 * Limit check must be consistent with the Kconfig limits
802 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
803 */
804 if (till_stall_check < 3) {
805 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
806 till_stall_check = 3;
807 } else if (till_stall_check > 300) {
808 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
809 till_stall_check = 300;
810 }
811 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
812}
813
814static void record_gp_stall_check_time(struct rcu_state *rsp) 800static void record_gp_stall_check_time(struct rcu_state *rsp)
815{ 801{
816 rsp->gp_start = jiffies; 802 rsp->gp_start = jiffies;
817 rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); 803 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
818} 804}
819 805
820/* 806/*
@@ -857,7 +843,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
857 raw_spin_unlock_irqrestore(&rnp->lock, flags); 843 raw_spin_unlock_irqrestore(&rnp->lock, flags);
858 return; 844 return;
859 } 845 }
860 rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; 846 rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
861 raw_spin_unlock_irqrestore(&rnp->lock, flags); 847 raw_spin_unlock_irqrestore(&rnp->lock, flags);
862 848
863 /* 849 /*
@@ -935,7 +921,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
935 raw_spin_lock_irqsave(&rnp->lock, flags); 921 raw_spin_lock_irqsave(&rnp->lock, flags);
936 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 922 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
937 rsp->jiffies_stall = jiffies + 923 rsp->jiffies_stall = jiffies +
938 3 * jiffies_till_stall_check() + 3; 924 3 * rcu_jiffies_till_stall_check() + 3;
939 raw_spin_unlock_irqrestore(&rnp->lock, flags); 925 raw_spin_unlock_irqrestore(&rnp->lock, flags);
940 926
941 set_need_resched(); /* kick ourselves to get things going. */ 927 set_need_resched(); /* kick ourselves to get things going. */
@@ -966,12 +952,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
966 } 952 }
967} 953}
968 954
969static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
970{
971 rcu_cpu_stall_suppress = 1;
972 return NOTIFY_DONE;
973}
974
975/** 955/**
976 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period 956 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
977 * 957 *
@@ -989,15 +969,6 @@ void rcu_cpu_stall_reset(void)
989 rsp->jiffies_stall = jiffies + ULONG_MAX / 2; 969 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
990} 970}
991 971
992static struct notifier_block rcu_panic_block = {
993 .notifier_call = rcu_panic,
994};
995
996static void __init check_cpu_stall_init(void)
997{
998 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
999}
1000
1001/* 972/*
1002 * Update CPU-local rcu_data state to record the newly noticed grace period. 973 * Update CPU-local rcu_data state to record the newly noticed grace period.
1003 * This is used both when we started the grace period and when we notice 974 * This is used both when we started the grace period and when we notice
@@ -1071,6 +1042,145 @@ static void init_callback_list(struct rcu_data *rdp)
1071} 1042}
1072 1043
1073/* 1044/*
1045 * Determine the value that ->completed will have at the end of the
1046 * next subsequent grace period. This is used to tag callbacks so that
1047 * a CPU can invoke callbacks in a timely fashion even if that CPU has
1048 * been dyntick-idle for an extended period with callbacks under the
1049 * influence of RCU_FAST_NO_HZ.
1050 *
1051 * The caller must hold rnp->lock with interrupts disabled.
1052 */
1053static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1054 struct rcu_node *rnp)
1055{
1056 /*
1057 * If RCU is idle, we just wait for the next grace period.
1058 * But we can only be sure that RCU is idle if we are looking
1059 * at the root rcu_node structure -- otherwise, a new grace
1060 * period might have started, but just not yet gotten around
1061 * to initializing the current non-root rcu_node structure.
1062 */
1063 if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
1064 return rnp->completed + 1;
1065
1066 /*
1067 * Otherwise, wait for a possible partial grace period and
1068 * then the subsequent full grace period.
1069 */
1070 return rnp->completed + 2;
1071}
1072
1073/*
1074 * If there is room, assign a ->completed number to any callbacks on
1075 * this CPU that have not already been assigned. Also accelerate any
1076 * callbacks that were previously assigned a ->completed number that has
1077 * since proven to be too conservative, which can happen if callbacks get
1078 * assigned a ->completed number while RCU is idle, but with reference to
1079 * a non-root rcu_node structure. This function is idempotent, so it does
1080 * not hurt to call it repeatedly.
1081 *
1082 * The caller must hold rnp->lock with interrupts disabled.
1083 */
1084static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1085 struct rcu_data *rdp)
1086{
1087 unsigned long c;
1088 int i;
1089
1090 /* If the CPU has no callbacks, nothing to do. */
1091 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1092 return;
1093
1094 /*
1095 * Starting from the sublist containing the callbacks most
1096 * recently assigned a ->completed number and working down, find the
1097 * first sublist that is not assignable to an upcoming grace period.
1098 * Such a sublist has something in it (first two tests) and has
1099 * a ->completed number assigned that will complete sooner than
1100 * the ->completed number for newly arrived callbacks (last test).
1101 *
1102 * The key point is that any later sublist can be assigned the
1103 * same ->completed number as the newly arrived callbacks, which
1104 * means that the callbacks in any of these later sublists can be
1105 * grouped into a single sublist, whether or not they have already
1106 * been assigned a ->completed number.
1107 */
1108 c = rcu_cbs_completed(rsp, rnp);
1109 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
1110 if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
1111 !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
1112 break;
1113
1114 /*
1115 * If there is no sublist for unassigned callbacks, leave.
1116 * At the same time, advance "i" one sublist, so that "i" will
1117 * index the sublist into which all the remaining callbacks should
1118 * be grouped.
1119 */
1120 if (++i >= RCU_NEXT_TAIL)
1121 return;
1122
1123 /*
1124 * Assign all subsequent callbacks' ->completed number to the next
1125 * full grace period and group them all in the sublist initially
1126 * indexed by "i".
1127 */
1128 for (; i <= RCU_NEXT_TAIL; i++) {
1129 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 rdp->nxtcompleted[i] = c;
1131 }
1132
1133 /* Trace depending on how much we were able to accelerate. */
1134 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1135 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
1136 else
1137 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
1138}
1139
1140/*
1141 * Move any callbacks whose grace period has completed to the
1142 * RCU_DONE_TAIL sublist, then compact the remaining sublists and
1143 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
1144 * sublist. This function is idempotent, so it does not hurt to
1145 * invoke it repeatedly. As long as it is not invoked -too- often...
1146 *
1147 * The caller must hold rnp->lock with interrupts disabled.
1148 */
1149static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1150 struct rcu_data *rdp)
1151{
1152 int i, j;
1153
1154 /* If the CPU has no callbacks, nothing to do. */
1155 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1156 return;
1157
1158 /*
1159 * Find all callbacks whose ->completed numbers indicate that they
1160 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1161 */
1162 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
1163 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
1164 break;
1165 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
1166 }
1167 /* Clean up any sublist tail pointers that were misordered above. */
1168 for (j = RCU_WAIT_TAIL; j < i; j++)
1169 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
1170
1171 /* Copy down callbacks to fill in empty sublists. */
1172 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
1173 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
1174 break;
1175 rdp->nxttail[j] = rdp->nxttail[i];
1176 rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
1177 }
1178
1179 /* Classify any remaining callbacks. */
1180 rcu_accelerate_cbs(rsp, rnp, rdp);
1181}
1182
1183/*
1074 * Advance this CPU's callbacks, but only if the current grace period 1184 * Advance this CPU's callbacks, but only if the current grace period
1075 * has ended. This may be called only from the CPU to whom the rdp 1185 * has ended. This may be called only from the CPU to whom the rdp
1076 * belongs. In addition, the corresponding leaf rcu_node structure's 1186 * belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1080,12 +1190,15 @@ static void
1080__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 1190__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1081{ 1191{
1082 /* Did another grace period end? */ 1192 /* Did another grace period end? */
1083 if (rdp->completed != rnp->completed) { 1193 if (rdp->completed == rnp->completed) {
1084 1194
1085 /* Advance callbacks. No harm if list empty. */ 1195 /* No, so just accelerate recent callbacks. */
1086 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; 1196 rcu_accelerate_cbs(rsp, rnp, rdp);
1087 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; 1197
1088 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1198 } else {
1199
1200 /* Advance callbacks. */
1201 rcu_advance_cbs(rsp, rnp, rdp);
1089 1202
1090 /* Remember that we saw this grace-period completion. */ 1203 /* Remember that we saw this grace-period completion. */
1091 rdp->completed = rnp->completed; 1204 rdp->completed = rnp->completed;
@@ -1392,17 +1505,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1392 /* 1505 /*
1393 * Because there is no grace period in progress right now, 1506 * Because there is no grace period in progress right now,
1394 * any callbacks we have up to this point will be satisfied 1507 * any callbacks we have up to this point will be satisfied
1395 * by the next grace period. So promote all callbacks to be 1508 * by the next grace period. So this is a good place to
1396 * handled after the end of the next grace period. If the 1509 * assign a grace period number to recently posted callbacks.
1397 * CPU is not yet aware of the end of the previous grace period,
1398 * we need to allow for the callback advancement that will
1399 * occur when it does become aware. Deadlock prevents us from
1400 * making it aware at this point: We cannot acquire a leaf
1401 * rcu_node ->lock while holding the root rcu_node ->lock.
1402 */ 1510 */
1403 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1511 rcu_accelerate_cbs(rsp, rnp, rdp);
1404 if (rdp->completed == rsp->completed)
1405 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1406 1512
1407 rsp->gp_flags = RCU_GP_FLAG_INIT; 1513 rsp->gp_flags = RCU_GP_FLAG_INIT;
1408 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ 1514 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
@@ -1527,7 +1633,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1527 * This GP can't end until cpu checks in, so all of our 1633 * This GP can't end until cpu checks in, so all of our
1528 * callbacks can be processed during the next GP. 1634 * callbacks can be processed during the next GP.
1529 */ 1635 */
1530 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1636 rcu_accelerate_cbs(rsp, rnp, rdp);
1531 1637
1532 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 1638 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
1533 } 1639 }
@@ -1779,7 +1885,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1779 long bl, count, count_lazy; 1885 long bl, count, count_lazy;
1780 int i; 1886 int i;
1781 1887
1782 /* If no callbacks are ready, just return.*/ 1888 /* If no callbacks are ready, just return. */
1783 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1889 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1784 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 1890 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1785 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), 1891 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
@@ -2008,19 +2114,19 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2008 2114
2009 WARN_ON_ONCE(rdp->beenonline == 0); 2115 WARN_ON_ONCE(rdp->beenonline == 0);
2010 2116
2011 /* 2117 /* Handle the end of a grace period that some other CPU ended. */
2012 * Advance callbacks in response to end of earlier grace
2013 * period that some other CPU ended.
2014 */
2015 rcu_process_gp_end(rsp, rdp); 2118 rcu_process_gp_end(rsp, rdp);
2016 2119
2017 /* Update RCU state based on any recent quiescent states. */ 2120 /* Update RCU state based on any recent quiescent states. */
2018 rcu_check_quiescent_state(rsp, rdp); 2121 rcu_check_quiescent_state(rsp, rdp);
2019 2122
2020 /* Does this CPU require a not-yet-started grace period? */ 2123 /* Does this CPU require a not-yet-started grace period? */
2124 local_irq_save(flags);
2021 if (cpu_needs_another_gp(rsp, rdp)) { 2125 if (cpu_needs_another_gp(rsp, rdp)) {
2022 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 2126 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2023 rcu_start_gp(rsp, flags); /* releases above lock */ 2127 rcu_start_gp(rsp, flags); /* releases above lock */
2128 } else {
2129 local_irq_restore(flags);
2024 } 2130 }
2025 2131
2026 /* If there are callbacks ready, invoke them. */ 2132 /* If there are callbacks ready, invoke them. */
@@ -2719,9 +2825,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2719 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2825 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2720 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 2826 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2721 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2827 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2722#ifdef CONFIG_RCU_USER_QS
2723 WARN_ON_ONCE(rdp->dynticks->in_user);
2724#endif
2725 rdp->cpu = cpu; 2828 rdp->cpu = cpu;
2726 rdp->rsp = rsp; 2829 rdp->rsp = rsp;
2727 rcu_boot_init_nocb_percpu_data(rdp); 2830 rcu_boot_init_nocb_percpu_data(rdp);
@@ -2938,6 +3041,10 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2938 3041
2939 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 3042 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
2940 3043
3044 /* Silence gcc 4.8 warning about array index out of range. */
3045 if (rcu_num_lvls > RCU_NUM_LVLS)
3046 panic("rcu_init_one: rcu_num_lvls overflow");
3047
2941 /* Initialize the level-tracking arrays. */ 3048 /* Initialize the level-tracking arrays. */
2942 3049
2943 for (i = 0; i < rcu_num_lvls; i++) 3050 for (i = 0; i < rcu_num_lvls; i++)
@@ -3074,7 +3181,6 @@ void __init rcu_init(void)
3074 cpu_notifier(rcu_cpu_notify, 0); 3181 cpu_notifier(rcu_cpu_notify, 0);
3075 for_each_online_cpu(cpu) 3182 for_each_online_cpu(cpu)
3076 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3183 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3077 check_cpu_stall_init();
3078} 3184}
3079 3185
3080#include "rcutree_plugin.h" 3186#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4b69291b093d..c896b5045d9d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -102,10 +102,6 @@ struct rcu_dynticks {
102 /* idle-period nonlazy_posted snapshot. */ 102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105#ifdef CONFIG_RCU_USER_QS
106 bool ignore_user_qs; /* Treat userspace as extended QS or not */
107 bool in_user; /* Is the CPU in userland from RCU POV? */
108#endif
109}; 105};
110 106
111/* RCU's kthread states for tracing. */ 107/* RCU's kthread states for tracing. */
@@ -282,6 +278,8 @@ struct rcu_data {
282 */ 278 */
283 struct rcu_head *nxtlist; 279 struct rcu_head *nxtlist;
284 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 280 struct rcu_head **nxttail[RCU_NEXT_SIZE];
281 unsigned long nxtcompleted[RCU_NEXT_SIZE];
282 /* grace periods for sublists. */
285 long qlen_lazy; /* # of lazy queued callbacks */ 283 long qlen_lazy; /* # of lazy queued callbacks */
286 long qlen; /* # of queued callbacks, incl lazy */ 284 long qlen; /* # of queued callbacks, incl lazy */
287 long qlen_last_fqs_check; 285 long qlen_last_fqs_check;
@@ -343,11 +341,6 @@ struct rcu_data {
343 341
344#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 342#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
345 343
346#ifdef CONFIG_PROVE_RCU
347#define RCU_STALL_DELAY_DELTA (5 * HZ)
348#else
349#define RCU_STALL_DELAY_DELTA 0
350#endif
351#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 344#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
352 /* to take at least one */ 345 /* to take at least one */
353 /* scheduling clock irq */ 346 /* scheduling clock irq */
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 16502d3a71c8..13b243a323fa 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -17,6 +17,7 @@
17 * See rt.c in preempt-rt for proper credits and further information 17 * See rt.c in preempt-rt for proper credits and further information
18 */ 18 */
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sched/rt.h>
20#include <linux/delay.h> 21#include <linux/delay.h>
21#include <linux/export.h> 22#include <linux/export.h>
22#include <linux/spinlock.h> 23#include <linux/spinlock.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 98ec49475460..7890b10084a7 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/sched/rt.h>
13#include <linux/spinlock.h> 14#include <linux/spinlock.h>
14#include <linux/timer.h> 15#include <linux/timer.h>
15#include <linux/freezer.h> 16#include <linux/freezer.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a242e691c993..1e09308bf2a1 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/sched/rt.h>
16#include <linux/timer.h> 17#include <linux/timer.h>
17 18
18#include "rtmutex_common.h" 19#include "rtmutex_common.h"
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..64de5f8b0c9e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
35 ag->tg->rt_se = NULL; 35 ag->tg->rt_se = NULL;
36 ag->tg->rt_rq = NULL; 36 ag->tg->rt_rq = NULL;
37#endif 37#endif
38 sched_offline_group(ag->tg);
38 sched_destroy_group(ag->tg); 39 sched_destroy_group(ag->tg);
39} 40}
40 41
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
76 if (IS_ERR(tg)) 77 if (IS_ERR(tg))
77 goto out_free; 78 goto out_free;
78 79
80 sched_online_group(tg, &root_task_group);
81
79 kref_init(&ag->kref); 82 kref_init(&ag->kref);
80 init_rwsem(&ag->lock); 83 init_rwsem(&ag->lock);
81 ag->id = atomic_inc_return(&autogroup_seq_nr); 84 ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26058d0bebba..3a673a3b0c6b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,7 +83,7 @@
83#endif 83#endif
84 84
85#include "sched.h" 85#include "sched.h"
86#include "../workqueue_sched.h" 86#include "../workqueue_internal.h"
87#include "../smpboot.h" 87#include "../smpboot.h"
88 88
89#define CREATE_TRACE_POINTS 89#define CREATE_TRACE_POINTS
@@ -4371,7 +4371,7 @@ bool __sched yield_to(struct task_struct *p, bool preempt)
4371 struct task_struct *curr = current; 4371 struct task_struct *curr = current;
4372 struct rq *rq, *p_rq; 4372 struct rq *rq, *p_rq;
4373 unsigned long flags; 4373 unsigned long flags;
4374 bool yielded = 0; 4374 int yielded = 0;
4375 4375
4376 local_irq_save(flags); 4376 local_irq_save(flags);
4377 rq = this_rq(); 4377 rq = this_rq();
@@ -4667,6 +4667,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4667 */ 4667 */
4668 idle->sched_class = &idle_sched_class; 4668 idle->sched_class = &idle_sched_class;
4669 ftrace_graph_init_idle_task(idle, cpu); 4669 ftrace_graph_init_idle_task(idle, cpu);
4670 vtime_init_idle(idle);
4670#if defined(CONFIG_SMP) 4671#if defined(CONFIG_SMP)
4671 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4672 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4672#endif 4673#endif
@@ -7160,7 +7161,6 @@ static void free_sched_group(struct task_group *tg)
7160struct task_group *sched_create_group(struct task_group *parent) 7161struct task_group *sched_create_group(struct task_group *parent)
7161{ 7162{
7162 struct task_group *tg; 7163 struct task_group *tg;
7163 unsigned long flags;
7164 7164
7165 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7165 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7166 if (!tg) 7166 if (!tg)
@@ -7172,6 +7172,17 @@ struct task_group *sched_create_group(struct task_group *parent)
7172 if (!alloc_rt_sched_group(tg, parent)) 7172 if (!alloc_rt_sched_group(tg, parent))
7173 goto err; 7173 goto err;
7174 7174
7175 return tg;
7176
7177err:
7178 free_sched_group(tg);
7179 return ERR_PTR(-ENOMEM);
7180}
7181
7182void sched_online_group(struct task_group *tg, struct task_group *parent)
7183{
7184 unsigned long flags;
7185
7175 spin_lock_irqsave(&task_group_lock, flags); 7186 spin_lock_irqsave(&task_group_lock, flags);
7176 list_add_rcu(&tg->list, &task_groups); 7187 list_add_rcu(&tg->list, &task_groups);
7177 7188
@@ -7181,12 +7192,6 @@ struct task_group *sched_create_group(struct task_group *parent)
7181 INIT_LIST_HEAD(&tg->children); 7192 INIT_LIST_HEAD(&tg->children);
7182 list_add_rcu(&tg->siblings, &parent->children); 7193 list_add_rcu(&tg->siblings, &parent->children);
7183 spin_unlock_irqrestore(&task_group_lock, flags); 7194 spin_unlock_irqrestore(&task_group_lock, flags);
7184
7185 return tg;
7186
7187err:
7188 free_sched_group(tg);
7189 return ERR_PTR(-ENOMEM);
7190} 7195}
7191 7196
7192/* rcu callback to free various structures associated with a task group */ 7197/* rcu callback to free various structures associated with a task group */
@@ -7199,6 +7204,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
7199/* Destroy runqueue etc associated with a task group */ 7204/* Destroy runqueue etc associated with a task group */
7200void sched_destroy_group(struct task_group *tg) 7205void sched_destroy_group(struct task_group *tg)
7201{ 7206{
7207 /* wait for possible concurrent references to cfs_rqs to complete */
7208 call_rcu(&tg->rcu, free_sched_group_rcu);
7209}
7210
7211void sched_offline_group(struct task_group *tg)
7212{
7202 unsigned long flags; 7213 unsigned long flags;
7203 int i; 7214 int i;
7204 7215
@@ -7210,9 +7221,6 @@ void sched_destroy_group(struct task_group *tg)
7210 list_del_rcu(&tg->list); 7221 list_del_rcu(&tg->list);
7211 list_del_rcu(&tg->siblings); 7222 list_del_rcu(&tg->siblings);
7212 spin_unlock_irqrestore(&task_group_lock, flags); 7223 spin_unlock_irqrestore(&task_group_lock, flags);
7213
7214 /* wait for possible concurrent references to cfs_rqs complete */
7215 call_rcu(&tg->rcu, free_sched_group_rcu);
7216} 7224}
7217 7225
7218/* change task's runqueue when it moves between groups. 7226/* change task's runqueue when it moves between groups.
@@ -7508,6 +7516,25 @@ static int sched_rt_global_constraints(void)
7508} 7516}
7509#endif /* CONFIG_RT_GROUP_SCHED */ 7517#endif /* CONFIG_RT_GROUP_SCHED */
7510 7518
7519int sched_rr_handler(struct ctl_table *table, int write,
7520 void __user *buffer, size_t *lenp,
7521 loff_t *ppos)
7522{
7523 int ret;
7524 static DEFINE_MUTEX(mutex);
7525
7526 mutex_lock(&mutex);
7527 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7528 /* make sure that internally we keep jiffies */
7529 /* also, writing zero resets timeslice to default */
7530 if (!ret && write) {
7531 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7532 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7533 }
7534 mutex_unlock(&mutex);
7535 return ret;
7536}
7537
7511int sched_rt_handler(struct ctl_table *table, int write, 7538int sched_rt_handler(struct ctl_table *table, int write,
7512 void __user *buffer, size_t *lenp, 7539 void __user *buffer, size_t *lenp,
7513 loff_t *ppos) 7540 loff_t *ppos)
@@ -7564,6 +7591,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7564 return &tg->css; 7591 return &tg->css;
7565} 7592}
7566 7593
7594static int cpu_cgroup_css_online(struct cgroup *cgrp)
7595{
7596 struct task_group *tg = cgroup_tg(cgrp);
7597 struct task_group *parent;
7598
7599 if (!cgrp->parent)
7600 return 0;
7601
7602 parent = cgroup_tg(cgrp->parent);
7603 sched_online_group(tg, parent);
7604 return 0;
7605}
7606
7567static void cpu_cgroup_css_free(struct cgroup *cgrp) 7607static void cpu_cgroup_css_free(struct cgroup *cgrp)
7568{ 7608{
7569 struct task_group *tg = cgroup_tg(cgrp); 7609 struct task_group *tg = cgroup_tg(cgrp);
@@ -7571,6 +7611,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
7571 sched_destroy_group(tg); 7611 sched_destroy_group(tg);
7572} 7612}
7573 7613
7614static void cpu_cgroup_css_offline(struct cgroup *cgrp)
7615{
7616 struct task_group *tg = cgroup_tg(cgrp);
7617
7618 sched_offline_group(tg);
7619}
7620
7574static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7621static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7575 struct cgroup_taskset *tset) 7622 struct cgroup_taskset *tset)
7576{ 7623{
@@ -7926,6 +7973,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7926 .name = "cpu", 7973 .name = "cpu",
7927 .css_alloc = cpu_cgroup_css_alloc, 7974 .css_alloc = cpu_cgroup_css_alloc,
7928 .css_free = cpu_cgroup_css_free, 7975 .css_free = cpu_cgroup_css_free,
7976 .css_online = cpu_cgroup_css_online,
7977 .css_offline = cpu_cgroup_css_offline,
7929 .can_attach = cpu_cgroup_can_attach, 7978 .can_attach = cpu_cgroup_can_attach,
7930 .attach = cpu_cgroup_attach, 7979 .attach = cpu_cgroup_attach,
7931 .exit = cpu_cgroup_exit, 7980 .exit = cpu_cgroup_exit,
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 23aa789c53ee..1095e878a46f 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,8 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/sched.h>
32#include <linux/sched/rt.h>
31#include "cpupri.h" 33#include "cpupri.h"
32 34
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 35/* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 293b202fcf79..9857329ed280 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,7 @@
3#include <linux/tsacct_kern.h> 3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h> 4#include <linux/kernel_stat.h>
5#include <linux/static_key.h> 5#include <linux/static_key.h>
6#include <linux/context_tracking.h>
6#include "sched.h" 7#include "sched.h"
7 8
8 9
@@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
163 task_group_account_field(p, index, (__force u64) cputime); 164 task_group_account_field(p, index, (__force u64) cputime);
164 165
165 /* Account for user time used */ 166 /* Account for user time used */
166 acct_update_integrals(p); 167 acct_account_cputime(p);
167} 168}
168 169
169/* 170/*
@@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
213 task_group_account_field(p, index, (__force u64) cputime); 214 task_group_account_field(p, index, (__force u64) cputime);
214 215
215 /* Account for system time used */ 216 /* Account for system time used */
216 acct_update_integrals(p); 217 acct_account_cputime(p);
217} 218}
218 219
219/* 220/*
@@ -295,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void)
295void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 296void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
296{ 297{
297 struct signal_struct *sig = tsk->signal; 298 struct signal_struct *sig = tsk->signal;
299 cputime_t utime, stime;
298 struct task_struct *t; 300 struct task_struct *t;
299 301
300 times->utime = sig->utime; 302 times->utime = sig->utime;
@@ -308,16 +310,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
308 310
309 t = tsk; 311 t = tsk;
310 do { 312 do {
311 times->utime += t->utime; 313 task_cputime(tsk, &utime, &stime);
312 times->stime += t->stime; 314 times->utime += utime;
315 times->stime += stime;
313 times->sum_exec_runtime += task_sched_runtime(t); 316 times->sum_exec_runtime += task_sched_runtime(t);
314 } while_each_thread(tsk, t); 317 } while_each_thread(tsk, t);
315out: 318out:
316 rcu_read_unlock(); 319 rcu_read_unlock();
317} 320}
318 321
319#ifndef CONFIG_VIRT_CPU_ACCOUNTING
320
321#ifdef CONFIG_IRQ_TIME_ACCOUNTING 322#ifdef CONFIG_IRQ_TIME_ACCOUNTING
322/* 323/*
323 * Account a tick to a process and cpustat 324 * Account a tick to a process and cpustat
@@ -382,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks)
382 irqtime_account_process_tick(current, 0, rq); 383 irqtime_account_process_tick(current, 0, rq);
383} 384}
384#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 385#else /* CONFIG_IRQ_TIME_ACCOUNTING */
385static void irqtime_account_idle_ticks(int ticks) {} 386static inline void irqtime_account_idle_ticks(int ticks) {}
386static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 387static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
387 struct rq *rq) {} 388 struct rq *rq) {}
388#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 389#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
389 390
391#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
390/* 392/*
391 * Account a single tick of cpu time. 393 * Account a single tick of cpu time.
392 * @p: the process that the cpu time gets accounted to 394 * @p: the process that the cpu time gets accounted to
@@ -397,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
397 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 399 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
398 struct rq *rq = this_rq(); 400 struct rq *rq = this_rq();
399 401
402 if (vtime_accounting_enabled())
403 return;
404
400 if (sched_clock_irqtime) { 405 if (sched_clock_irqtime) {
401 irqtime_account_process_tick(p, user_tick, rq); 406 irqtime_account_process_tick(p, user_tick, rq);
402 return; 407 return;
@@ -438,8 +443,7 @@ void account_idle_ticks(unsigned long ticks)
438 443
439 account_idle_time(jiffies_to_cputime(ticks)); 444 account_idle_time(jiffies_to_cputime(ticks));
440} 445}
441 446#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
442#endif
443 447
444/* 448/*
445 * Use precise platform statistics if available: 449 * Use precise platform statistics if available:
@@ -461,25 +465,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
461 *st = cputime.stime; 465 *st = cputime.stime;
462} 466}
463 467
464void vtime_account_system_irqsafe(struct task_struct *tsk)
465{
466 unsigned long flags;
467
468 local_irq_save(flags);
469 vtime_account_system(tsk);
470 local_irq_restore(flags);
471}
472EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
473
474#ifndef __ARCH_HAS_VTIME_TASK_SWITCH 468#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
475void vtime_task_switch(struct task_struct *prev) 469void vtime_task_switch(struct task_struct *prev)
476{ 470{
471 if (!vtime_accounting_enabled())
472 return;
473
477 if (is_idle_task(prev)) 474 if (is_idle_task(prev))
478 vtime_account_idle(prev); 475 vtime_account_idle(prev);
479 else 476 else
480 vtime_account_system(prev); 477 vtime_account_system(prev);
481 478
479#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
482 vtime_account_user(prev); 480 vtime_account_user(prev);
481#endif
483 arch_vtime_task_switch(prev); 482 arch_vtime_task_switch(prev);
484} 483}
485#endif 484#endif
@@ -493,27 +492,40 @@ void vtime_task_switch(struct task_struct *prev)
493 * vtime_account(). 492 * vtime_account().
494 */ 493 */
495#ifndef __ARCH_HAS_VTIME_ACCOUNT 494#ifndef __ARCH_HAS_VTIME_ACCOUNT
496void vtime_account(struct task_struct *tsk) 495void vtime_account_irq_enter(struct task_struct *tsk)
497{ 496{
498 if (in_interrupt() || !is_idle_task(tsk)) 497 if (!vtime_accounting_enabled())
499 vtime_account_system(tsk); 498 return;
500 else 499
501 vtime_account_idle(tsk); 500 if (!in_interrupt()) {
501 /*
502 * If we interrupted user, context_tracking_in_user()
503 * is 1 because the context tracking don't hook
504 * on irq entry/exit. This way we know if
505 * we need to flush user time on kernel entry.
506 */
507 if (context_tracking_in_user()) {
508 vtime_account_user(tsk);
509 return;
510 }
511
512 if (is_idle_task(tsk)) {
513 vtime_account_idle(tsk);
514 return;
515 }
516 }
517 vtime_account_system(tsk);
502} 518}
503EXPORT_SYMBOL_GPL(vtime_account); 519EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
504#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 520#endif /* __ARCH_HAS_VTIME_ACCOUNT */
505 521
506#else 522#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
507
508#ifndef nsecs_to_cputime
509# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
510#endif
511 523
512static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) 524static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
513{ 525{
514 u64 temp = (__force u64) rtime; 526 u64 temp = (__force u64) rtime;
515 527
516 temp *= (__force u64) utime; 528 temp *= (__force u64) stime;
517 529
518 if (sizeof(cputime_t) == 4) 530 if (sizeof(cputime_t) == 4)
519 temp = div_u64(temp, (__force u32) total); 531 temp = div_u64(temp, (__force u32) total);
@@ -531,10 +543,10 @@ static void cputime_adjust(struct task_cputime *curr,
531 struct cputime *prev, 543 struct cputime *prev,
532 cputime_t *ut, cputime_t *st) 544 cputime_t *ut, cputime_t *st)
533{ 545{
534 cputime_t rtime, utime, total; 546 cputime_t rtime, stime, total;
535 547
536 utime = curr->utime; 548 stime = curr->stime;
537 total = utime + curr->stime; 549 total = stime + curr->utime;
538 550
539 /* 551 /*
540 * Tick based cputime accounting depend on random scheduling 552 * Tick based cputime accounting depend on random scheduling
@@ -549,17 +561,17 @@ static void cputime_adjust(struct task_cputime *curr,
549 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 561 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
550 562
551 if (total) 563 if (total)
552 utime = scale_utime(utime, rtime, total); 564 stime = scale_stime(stime, rtime, total);
553 else 565 else
554 utime = rtime; 566 stime = rtime;
555 567
556 /* 568 /*
557 * If the tick based count grows faster than the scheduler one, 569 * If the tick based count grows faster than the scheduler one,
558 * the result of the scaling may go backward. 570 * the result of the scaling may go backward.
559 * Let's enforce monotonicity. 571 * Let's enforce monotonicity.
560 */ 572 */
561 prev->utime = max(prev->utime, utime); 573 prev->stime = max(prev->stime, stime);
562 prev->stime = max(prev->stime, rtime - prev->utime); 574 prev->utime = max(prev->utime, rtime - prev->stime);
563 575
564 *ut = prev->utime; 576 *ut = prev->utime;
565 *st = prev->stime; 577 *st = prev->stime;
@@ -568,11 +580,10 @@ static void cputime_adjust(struct task_cputime *curr,
568void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 580void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
569{ 581{
570 struct task_cputime cputime = { 582 struct task_cputime cputime = {
571 .utime = p->utime,
572 .stime = p->stime,
573 .sum_exec_runtime = p->se.sum_exec_runtime, 583 .sum_exec_runtime = p->se.sum_exec_runtime,
574 }; 584 };
575 585
586 task_cputime(p, &cputime.utime, &cputime.stime);
576 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 587 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
577} 588}
578 589
@@ -586,4 +597,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
586 thread_group_cputime(p, &cputime); 597 thread_group_cputime(p, &cputime);
587 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
588} 599}
589#endif 600#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
601
602#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
603static unsigned long long vtime_delta(struct task_struct *tsk)
604{
605 unsigned long long clock;
606
607 clock = sched_clock();
608 if (clock < tsk->vtime_snap)
609 return 0;
610
611 return clock - tsk->vtime_snap;
612}
613
614static cputime_t get_vtime_delta(struct task_struct *tsk)
615{
616 unsigned long long delta = vtime_delta(tsk);
617
618 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
619 tsk->vtime_snap += delta;
620
621 /* CHECKME: always safe to convert nsecs to cputime? */
622 return nsecs_to_cputime(delta);
623}
624
625static void __vtime_account_system(struct task_struct *tsk)
626{
627 cputime_t delta_cpu = get_vtime_delta(tsk);
628
629 account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
630}
631
632void vtime_account_system(struct task_struct *tsk)
633{
634 if (!vtime_accounting_enabled())
635 return;
636
637 write_seqlock(&tsk->vtime_seqlock);
638 __vtime_account_system(tsk);
639 write_sequnlock(&tsk->vtime_seqlock);
640}
641
642void vtime_account_irq_exit(struct task_struct *tsk)
643{
644 if (!vtime_accounting_enabled())
645 return;
646
647 write_seqlock(&tsk->vtime_seqlock);
648 if (context_tracking_in_user())
649 tsk->vtime_snap_whence = VTIME_USER;
650 __vtime_account_system(tsk);
651 write_sequnlock(&tsk->vtime_seqlock);
652}
653
654void vtime_account_user(struct task_struct *tsk)
655{
656 cputime_t delta_cpu;
657
658 if (!vtime_accounting_enabled())
659 return;
660
661 delta_cpu = get_vtime_delta(tsk);
662
663 write_seqlock(&tsk->vtime_seqlock);
664 tsk->vtime_snap_whence = VTIME_SYS;
665 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
666 write_sequnlock(&tsk->vtime_seqlock);
667}
668
669void vtime_user_enter(struct task_struct *tsk)
670{
671 if (!vtime_accounting_enabled())
672 return;
673
674 write_seqlock(&tsk->vtime_seqlock);
675 tsk->vtime_snap_whence = VTIME_USER;
676 __vtime_account_system(tsk);
677 write_sequnlock(&tsk->vtime_seqlock);
678}
679
680void vtime_guest_enter(struct task_struct *tsk)
681{
682 write_seqlock(&tsk->vtime_seqlock);
683 __vtime_account_system(tsk);
684 current->flags |= PF_VCPU;
685 write_sequnlock(&tsk->vtime_seqlock);
686}
687
688void vtime_guest_exit(struct task_struct *tsk)
689{
690 write_seqlock(&tsk->vtime_seqlock);
691 __vtime_account_system(tsk);
692 current->flags &= ~PF_VCPU;
693 write_sequnlock(&tsk->vtime_seqlock);
694}
695
696void vtime_account_idle(struct task_struct *tsk)
697{
698 cputime_t delta_cpu = get_vtime_delta(tsk);
699
700 account_idle_time(delta_cpu);
701}
702
703bool vtime_accounting_enabled(void)
704{
705 return context_tracking_active();
706}
707
708void arch_vtime_task_switch(struct task_struct *prev)
709{
710 write_seqlock(&prev->vtime_seqlock);
711 prev->vtime_snap_whence = VTIME_SLEEPING;
712 write_sequnlock(&prev->vtime_seqlock);
713
714 write_seqlock(&current->vtime_seqlock);
715 current->vtime_snap_whence = VTIME_SYS;
716 current->vtime_snap = sched_clock();
717 write_sequnlock(&current->vtime_seqlock);
718}
719
720void vtime_init_idle(struct task_struct *t)
721{
722 unsigned long flags;
723
724 write_seqlock_irqsave(&t->vtime_seqlock, flags);
725 t->vtime_snap_whence = VTIME_SYS;
726 t->vtime_snap = sched_clock();
727 write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
728}
729
730cputime_t task_gtime(struct task_struct *t)
731{
732 unsigned int seq;
733 cputime_t gtime;
734
735 do {
736 seq = read_seqbegin(&t->vtime_seqlock);
737
738 gtime = t->gtime;
739 if (t->flags & PF_VCPU)
740 gtime += vtime_delta(t);
741
742 } while (read_seqretry(&t->vtime_seqlock, seq));
743
744 return gtime;
745}
746
747/*
748 * Fetch cputime raw values from fields of task_struct and
749 * add up the pending nohz execution time since the last
750 * cputime snapshot.
751 */
752static void
753fetch_task_cputime(struct task_struct *t,
754 cputime_t *u_dst, cputime_t *s_dst,
755 cputime_t *u_src, cputime_t *s_src,
756 cputime_t *udelta, cputime_t *sdelta)
757{
758 unsigned int seq;
759 unsigned long long delta;
760
761 do {
762 *udelta = 0;
763 *sdelta = 0;
764
765 seq = read_seqbegin(&t->vtime_seqlock);
766
767 if (u_dst)
768 *u_dst = *u_src;
769 if (s_dst)
770 *s_dst = *s_src;
771
772 /* Task is sleeping, nothing to add */
773 if (t->vtime_snap_whence == VTIME_SLEEPING ||
774 is_idle_task(t))
775 continue;
776
777 delta = vtime_delta(t);
778
779 /*
780 * Task runs either in user or kernel space, add pending nohz time to
781 * the right place.
782 */
783 if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
784 *udelta = delta;
785 } else {
786 if (t->vtime_snap_whence == VTIME_SYS)
787 *sdelta = delta;
788 }
789 } while (read_seqretry(&t->vtime_seqlock, seq));
790}
791
792
793void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
794{
795 cputime_t udelta, sdelta;
796
797 fetch_task_cputime(t, utime, stime, &t->utime,
798 &t->stime, &udelta, &sdelta);
799 if (utime)
800 *utime += udelta;
801 if (stime)
802 *stime += sdelta;
803}
804
805void task_cputime_scaled(struct task_struct *t,
806 cputime_t *utimescaled, cputime_t *stimescaled)
807{
808 cputime_t udelta, sdelta;
809
810 fetch_task_cputime(t, utimescaled, stimescaled,
811 &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
812 if (utimescaled)
813 *utimescaled += cputime_to_scaled(udelta);
814 if (stimescaled)
815 *stimescaled += cputime_to_scaled(sdelta);
816}
817#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7ae4c4c5420e..557e7b53b323 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
110 if (autogroup_path(tg, group_path, PATH_MAX)) 110 if (autogroup_path(tg, group_path, PATH_MAX))
111 return group_path; 111 return group_path;
112 112
113 /*
114 * May be NULL if the underlying cgroup isn't fully-created yet
115 */
116 if (!tg->css.cgroup) {
117 group_path[0] = '\0';
118 return group_path;
119 }
120 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 113 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
121 return group_path; 114 return group_path;
122} 115}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81fa53643409..7a33e5986fc5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1680,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1680 } 1680 }
1681 1681
1682 /* ensure we never gain time by being placed backwards. */ 1682 /* ensure we never gain time by being placed backwards. */
1683 vruntime = max_vruntime(se->vruntime, vruntime); 1683 se->vruntime = max_vruntime(se->vruntime, vruntime);
1684
1685 se->vruntime = vruntime;
1686} 1684}
1687 1685
1688static void check_enqueue_throttle(struct cfs_rq *cfs_rq); 1686static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -3254,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3254 */ 3252 */
3255static int select_idle_sibling(struct task_struct *p, int target) 3253static int select_idle_sibling(struct task_struct *p, int target)
3256{ 3254{
3257 int cpu = smp_processor_id();
3258 int prev_cpu = task_cpu(p);
3259 struct sched_domain *sd; 3255 struct sched_domain *sd;
3260 struct sched_group *sg; 3256 struct sched_group *sg;
3261 int i; 3257 int i = task_cpu(p);
3262 3258
3263 /* 3259 if (idle_cpu(target))
3264 * If the task is going to be woken-up on this cpu and if it is 3260 return target;
3265 * already idle, then it is the right target.
3266 */
3267 if (target == cpu && idle_cpu(cpu))
3268 return cpu;
3269 3261
3270 /* 3262 /*
3271 * If the task is going to be woken-up on the cpu where it previously 3263 * If the prevous cpu is cache affine and idle, don't be stupid.
3272 * ran and if it is currently idle, then it the right target.
3273 */ 3264 */
3274 if (target == prev_cpu && idle_cpu(prev_cpu)) 3265 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3275 return prev_cpu; 3266 return i;
3276 3267
3277 /* 3268 /*
3278 * Otherwise, iterate the domains and find an elegible idle cpu. 3269 * Otherwise, iterate the domains and find an elegible idle cpu.
@@ -3286,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
3286 goto next; 3277 goto next;
3287 3278
3288 for_each_cpu(i, sched_group_cpus(sg)) { 3279 for_each_cpu(i, sched_group_cpus(sg)) {
3289 if (!idle_cpu(i)) 3280 if (i == target || !idle_cpu(i))
3290 goto next; 3281 goto next;
3291 } 3282 }
3292 3283
@@ -6101,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
6101 * idle runqueue: 6092 * idle runqueue:
6102 */ 6093 */
6103 if (rq->cfs.load.weight) 6094 if (rq->cfs.load.weight)
6104 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 6095 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
6105 6096
6106 return rr_interval; 6097 return rr_interval;
6107} 6098}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4f02b2847357..127a2c4cf4ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
7 7
8#include <linux/slab.h> 8#include <linux/slab.h>
9 9
10int sched_rr_timeslice = RR_TIMESLICE;
11
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 12static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11 13
12struct rt_bandwidth def_rt_bandwidth; 14struct rt_bandwidth def_rt_bandwidth;
@@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq)
925 return; 927 return;
926 928
927 delta_exec = rq->clock_task - curr->se.exec_start; 929 delta_exec = rq->clock_task - curr->se.exec_start;
928 if (unlikely((s64)delta_exec < 0)) 930 if (unlikely((s64)delta_exec <= 0))
929 delta_exec = 0; 931 return;
930 932
931 schedstat_set(curr->se.statistics.exec_max, 933 schedstat_set(curr->se.statistics.exec_max,
932 max(curr->se.statistics.exec_max, delta_exec)); 934 max(curr->se.statistics.exec_max, delta_exec));
@@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1427static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1429static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1428{ 1430{
1429 if (!task_running(rq, p) && 1431 if (!task_running(rq, p) &&
1430 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1432 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1431 (p->nr_cpus_allowed > 1))
1432 return 1; 1433 return 1;
1433 return 0; 1434 return 0;
1434} 1435}
@@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1889 * we may need to handle the pulling of RT tasks 1890 * we may need to handle the pulling of RT tasks
1890 * now. 1891 * now.
1891 */ 1892 */
1892 if (p->on_rq && !rq->rt.rt_nr_running) 1893 if (!p->on_rq || rq->rt.rt_nr_running)
1893 pull_rt_task(rq); 1894 return;
1895
1896 if (pull_rt_task(rq))
1897 resched_task(rq->curr);
1894} 1898}
1895 1899
1896void init_sched_rt_class(void) 1900void init_sched_rt_class(void)
@@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1985 if (soft != RLIM_INFINITY) { 1989 if (soft != RLIM_INFINITY) {
1986 unsigned long next; 1990 unsigned long next;
1987 1991
1988 p->rt.timeout++; 1992 if (p->rt.watchdog_stamp != jiffies) {
1993 p->rt.timeout++;
1994 p->rt.watchdog_stamp = jiffies;
1995 }
1996
1989 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 1997 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1990 if (p->rt.timeout > next) 1998 if (p->rt.timeout > next)
1991 p->cputime_expires.sched_exp = p->se.sum_exec_runtime; 1999 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
@@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2010 if (--p->rt.time_slice) 2018 if (--p->rt.time_slice)
2011 return; 2019 return;
2012 2020
2013 p->rt.time_slice = RR_TIMESLICE; 2021 p->rt.time_slice = sched_rr_timeslice;
2014 2022
2015 /* 2023 /*
2016 * Requeue to the end of queue if we (and all of our ancestors) are the 2024 * Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2041 * Time slice is 0 for SCHED_FIFO tasks 2049 * Time slice is 0 for SCHED_FIFO tasks
2042 */ 2050 */
2043 if (task->policy == SCHED_RR) 2051 if (task->policy == SCHED_RR)
2044 return RR_TIMESLICE; 2052 return sched_rr_timeslice;
2045 else 2053 else
2046 return 0; 2054 return 0;
2047} 2055}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc886441436a..cc03cfdf469f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
1 1
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h>
3#include <linux/mutex.h> 5#include <linux/mutex.h>
4#include <linux/spinlock.h> 6#include <linux/spinlock.h>
5#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 3d09cf6cde75..7f82adbad480 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1632,6 +1632,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1632 unsigned long flags; 1632 unsigned long flags;
1633 struct sighand_struct *psig; 1633 struct sighand_struct *psig;
1634 bool autoreap = false; 1634 bool autoreap = false;
1635 cputime_t utime, stime;
1635 1636
1636 BUG_ON(sig == -1); 1637 BUG_ON(sig == -1);
1637 1638
@@ -1669,8 +1670,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1669 task_uid(tsk)); 1670 task_uid(tsk));
1670 rcu_read_unlock(); 1671 rcu_read_unlock();
1671 1672
1672 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); 1673 task_cputime(tsk, &utime, &stime);
1673 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); 1674 info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
1675 info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
1674 1676
1675 info.si_status = tsk->exit_code & 0x7f; 1677 info.si_status = tsk->exit_code & 0x7f;
1676 if (tsk->exit_code & 0x80) 1678 if (tsk->exit_code & 0x80)
@@ -1734,6 +1736,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1734 unsigned long flags; 1736 unsigned long flags;
1735 struct task_struct *parent; 1737 struct task_struct *parent;
1736 struct sighand_struct *sighand; 1738 struct sighand_struct *sighand;
1739 cputime_t utime, stime;
1737 1740
1738 if (for_ptracer) { 1741 if (for_ptracer) {
1739 parent = tsk->parent; 1742 parent = tsk->parent;
@@ -1752,8 +1755,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1752 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1755 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
1753 rcu_read_unlock(); 1756 rcu_read_unlock();
1754 1757
1755 info.si_utime = cputime_to_clock_t(tsk->utime); 1758 task_cputime(tsk, &utime, &stime);
1756 info.si_stime = cputime_to_clock_t(tsk->stime); 1759 info.si_utime = cputime_to_clock_t(utime);
1760 info.si_stime = cputime_to_clock_t(stime);
1757 1761
1758 info.si_code = why; 1762 info.si_code = why;
1759 switch (why) { 1763 switch (why) {
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d6c5fc054242..d4abac261779 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -183,9 +183,10 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
183 kfree(td); 183 kfree(td);
184 return PTR_ERR(tsk); 184 return PTR_ERR(tsk);
185 } 185 }
186
187 get_task_struct(tsk); 186 get_task_struct(tsk);
188 *per_cpu_ptr(ht->store, cpu) = tsk; 187 *per_cpu_ptr(ht->store, cpu) = tsk;
188 if (ht->create)
189 ht->create(cpu);
189 return 0; 190 return 0;
190} 191}
191 192
@@ -225,7 +226,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
225{ 226{
226 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 227 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
227 228
228 if (tsk) 229 if (tsk && !ht->selfparking)
229 kthread_park(tsk); 230 kthread_park(tsk);
230} 231}
231 232
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ed567babe789..b4d252fd195b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip)
195EXPORT_SYMBOL(local_bh_enable_ip); 195EXPORT_SYMBOL(local_bh_enable_ip);
196 196
197/* 197/*
198 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 198 * We restart softirq processing for at most 2 ms,
199 * and we fall back to softirqd after that. 199 * and if need_resched() is not set.
200 * 200 *
201 * This number has been established via experimentation. 201 * These limits have been established via experimentation.
202 * The two things to balance is latency against fairness - 202 * The two things to balance is latency against fairness -
203 * we want to handle softirqs as soon as possible, but they 203 * we want to handle softirqs as soon as possible, but they
204 * should not be able to lock up the box. 204 * should not be able to lock up the box.
205 */ 205 */
206#define MAX_SOFTIRQ_RESTART 10 206#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
207 207
208asmlinkage void __do_softirq(void) 208asmlinkage void __do_softirq(void)
209{ 209{
210 struct softirq_action *h; 210 struct softirq_action *h;
211 __u32 pending; 211 __u32 pending;
212 int max_restart = MAX_SOFTIRQ_RESTART; 212 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
213 int cpu; 213 int cpu;
214 unsigned long old_flags = current->flags; 214 unsigned long old_flags = current->flags;
215 215
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
221 current->flags &= ~PF_MEMALLOC; 221 current->flags &= ~PF_MEMALLOC;
222 222
223 pending = local_softirq_pending(); 223 pending = local_softirq_pending();
224 vtime_account_irq_enter(current); 224 account_irq_enter_time(current);
225 225
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 226 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 227 SOFTIRQ_OFFSET);
@@ -264,15 +264,16 @@ restart:
264 local_irq_disable(); 264 local_irq_disable();
265 265
266 pending = local_softirq_pending(); 266 pending = local_softirq_pending();
267 if (pending && --max_restart) 267 if (pending) {
268 goto restart; 268 if (time_before(jiffies, end) && !need_resched())
269 goto restart;
269 270
270 if (pending)
271 wakeup_softirqd(); 271 wakeup_softirqd();
272 }
272 273
273 lockdep_softirq_exit(); 274 lockdep_softirq_exit();
274 275
275 vtime_account_irq_exit(current); 276 account_irq_exit_time(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 277 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 278 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 279}
@@ -341,7 +342,7 @@ static inline void invoke_softirq(void)
341 */ 342 */
342void irq_exit(void) 343void irq_exit(void)
343{ 344{
344 vtime_account_irq_exit(current); 345 account_irq_exit_time(current);
345 trace_hardirq_exit(); 346 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 347 sub_preempt_count(IRQ_EXIT_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 348 if (!in_interrupt() && local_softirq_pending())
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2b859828cdc3..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -282,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
282 */ 282 */
283void cleanup_srcu_struct(struct srcu_struct *sp) 283void cleanup_srcu_struct(struct srcu_struct *sp)
284{ 284{
285 int sum; 285 if (WARN_ON(srcu_readers_active(sp)))
286 286 return; /* Leakage unless caller handles error. */
287 sum = srcu_readers_active(sp);
288 WARN_ON(sum); /* Leakage unless caller handles error. */
289 if (sum != 0)
290 return;
291 free_percpu(sp->per_cpu_ref); 287 free_percpu(sp->per_cpu_ref);
292 sp->per_cpu_ref = NULL; 288 sp->per_cpu_ref = NULL;
293} 289}
@@ -302,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
302{ 298{
303 int idx; 299 int idx;
304 300
301 idx = ACCESS_ONCE(sp->completed) & 0x1;
305 preempt_disable(); 302 preempt_disable();
306 idx = rcu_dereference_index_check(sp->completed,
307 rcu_read_lock_sched_held()) & 0x1;
308 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; 303 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
309 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 304 smp_mb(); /* B */ /* Avoid leaking the critical section. */
310 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; 305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
@@ -321,10 +316,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
321 */ 316 */
322void __srcu_read_unlock(struct srcu_struct *sp, int idx) 317void __srcu_read_unlock(struct srcu_struct *sp, int idx)
323{ 318{
324 preempt_disable();
325 smp_mb(); /* C */ /* Avoid leaking the critical section. */ 319 smp_mb(); /* C */ /* Avoid leaking the critical section. */
326 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; 320 this_cpu_dec(sp->per_cpu_ref->c[idx]);
327 preempt_enable();
328} 321}
329EXPORT_SYMBOL_GPL(__srcu_read_unlock); 322EXPORT_SYMBOL_GPL(__srcu_read_unlock);
330 323
@@ -423,6 +416,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
423 !lock_is_held(&rcu_sched_lock_map), 416 !lock_is_held(&rcu_sched_lock_map),
424 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 417 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
425 418
419 might_sleep();
426 init_completion(&rcu.completion); 420 init_completion(&rcu.completion);
427 421
428 head->next = NULL; 422 head->next = NULL;
@@ -455,10 +449,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
455 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 449 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
456 * @sp: srcu_struct with which to synchronize. 450 * @sp: srcu_struct with which to synchronize.
457 * 451 *
458 * Flip the completed counter, and wait for the old count to drain to zero. 452 * Wait for the count to drain to zero of both indexes. To avoid the
459 * As with classic RCU, the updater must use some separate means of 453 * possible starvation of synchronize_srcu(), it waits for the count of
460 * synchronizing concurrent updates. Can block; must be called from 454 * the index=((->completed & 1) ^ 1) to drain to zero at first,
461 * process context. 455 * and then flip the completed and wait for the count of the other index.
456 *
457 * Can block; must be called from process context.
462 * 458 *
463 * Note that it is illegal to call synchronize_srcu() from the corresponding 459 * Note that it is illegal to call synchronize_srcu() from the corresponding
464 * SRCU read-side critical section; doing so will result in deadlock. 460 * SRCU read-side critical section; doing so will result in deadlock.
@@ -480,12 +476,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
480 * Wait for an SRCU grace period to elapse, but be more aggressive about 476 * Wait for an SRCU grace period to elapse, but be more aggressive about
481 * spinning rather than blocking when waiting. 477 * spinning rather than blocking when waiting.
482 * 478 *
483 * Note that it is illegal to call this function while holding any lock 479 * Note that it is also illegal to call synchronize_srcu_expedited()
484 * that is acquired by a CPU-hotplug notifier. It is also illegal to call 480 * from the corresponding SRCU read-side critical section;
485 * synchronize_srcu_expedited() from the corresponding SRCU read-side 481 * doing so will result in deadlock. However, it is perfectly legal
486 * critical section; doing so will result in deadlock. However, it is 482 * to call synchronize_srcu_expedited() on one srcu_struct from some
487 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 483 * other srcu_struct's read-side critical section, as long as
488 * from some other srcu_struct's read-side critical section, as long as
489 * the resulting graph of srcu_structs is acyclic. 484 * the resulting graph of srcu_structs is acyclic.
490 */ 485 */
491void synchronize_srcu_expedited(struct srcu_struct *sp) 486void synchronize_srcu_expedited(struct srcu_struct *sp)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2f194e965715..95d178c62d5a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -18,7 +18,7 @@
18#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23 23
24/* 24/*
@@ -37,10 +37,10 @@ struct cpu_stopper {
37 spinlock_t lock; 37 spinlock_t lock;
38 bool enabled; /* is this stopper enabled? */ 38 bool enabled; /* is this stopper enabled? */
39 struct list_head works; /* list of pending works */ 39 struct list_head works; /* list of pending works */
40 struct task_struct *thread; /* stopper thread */
41}; 40};
42 41
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 42static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44static bool stop_machine_initialized = false; 44static bool stop_machine_initialized = false;
45 45
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
@@ -62,16 +62,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
62} 62}
63 63
64/* queue @work to @stopper. if offline, @work is completed immediately */ 64/* queue @work to @stopper. if offline, @work is completed immediately */
65static void cpu_stop_queue_work(struct cpu_stopper *stopper, 65static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
66 struct cpu_stop_work *work)
67{ 66{
67 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
68 struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
69
68 unsigned long flags; 70 unsigned long flags;
69 71
70 spin_lock_irqsave(&stopper->lock, flags); 72 spin_lock_irqsave(&stopper->lock, flags);
71 73
72 if (stopper->enabled) { 74 if (stopper->enabled) {
73 list_add_tail(&work->list, &stopper->works); 75 list_add_tail(&work->list, &stopper->works);
74 wake_up_process(stopper->thread); 76 wake_up_process(p);
75 } else 77 } else
76 cpu_stop_signal_done(work->done, false); 78 cpu_stop_signal_done(work->done, false);
77 79
@@ -108,7 +110,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
108 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; 110 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
109 111
110 cpu_stop_init_done(&done, 1); 112 cpu_stop_init_done(&done, 1);
111 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); 113 cpu_stop_queue_work(cpu, &work);
112 wait_for_completion(&done.completion); 114 wait_for_completion(&done.completion);
113 return done.executed ? done.ret : -ENOENT; 115 return done.executed ? done.ret : -ENOENT;
114} 116}
@@ -130,7 +132,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
130 struct cpu_stop_work *work_buf) 132 struct cpu_stop_work *work_buf)
131{ 133{
132 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; 134 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
133 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); 135 cpu_stop_queue_work(cpu, work_buf);
134} 136}
135 137
136/* static data for stop_cpus */ 138/* static data for stop_cpus */
@@ -159,8 +161,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
159 */ 161 */
160 preempt_disable(); 162 preempt_disable();
161 for_each_cpu(cpu, cpumask) 163 for_each_cpu(cpu, cpumask)
162 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), 164 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
163 &per_cpu(stop_cpus_work, cpu));
164 preempt_enable(); 165 preempt_enable();
165} 166}
166 167
@@ -244,20 +245,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
244 return ret; 245 return ret;
245} 246}
246 247
247static int cpu_stopper_thread(void *data) 248static int cpu_stop_should_run(unsigned int cpu)
249{
250 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
251 unsigned long flags;
252 int run;
253
254 spin_lock_irqsave(&stopper->lock, flags);
255 run = !list_empty(&stopper->works);
256 spin_unlock_irqrestore(&stopper->lock, flags);
257 return run;
258}
259
260static void cpu_stopper_thread(unsigned int cpu)
248{ 261{
249 struct cpu_stopper *stopper = data; 262 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
250 struct cpu_stop_work *work; 263 struct cpu_stop_work *work;
251 int ret; 264 int ret;
252 265
253repeat: 266repeat:
254 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
255
256 if (kthread_should_stop()) {
257 __set_current_state(TASK_RUNNING);
258 return 0;
259 }
260
261 work = NULL; 267 work = NULL;
262 spin_lock_irq(&stopper->lock); 268 spin_lock_irq(&stopper->lock);
263 if (!list_empty(&stopper->works)) { 269 if (!list_empty(&stopper->works)) {
@@ -273,8 +279,6 @@ repeat:
273 struct cpu_stop_done *done = work->done; 279 struct cpu_stop_done *done = work->done;
274 char ksym_buf[KSYM_NAME_LEN] __maybe_unused; 280 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
275 281
276 __set_current_state(TASK_RUNNING);
277
278 /* cpu stop callbacks are not allowed to sleep */ 282 /* cpu stop callbacks are not allowed to sleep */
279 preempt_disable(); 283 preempt_disable();
280 284
@@ -290,88 +294,55 @@ repeat:
290 ksym_buf), arg); 294 ksym_buf), arg);
291 295
292 cpu_stop_signal_done(done, true); 296 cpu_stop_signal_done(done, true);
293 } else 297 goto repeat;
294 schedule(); 298 }
295
296 goto repeat;
297} 299}
298 300
299extern void sched_set_stop_task(int cpu, struct task_struct *stop); 301extern void sched_set_stop_task(int cpu, struct task_struct *stop);
300 302
301/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 303static void cpu_stop_create(unsigned int cpu)
302static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 304{
303 unsigned long action, void *hcpu) 305 sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
306}
307
308static void cpu_stop_park(unsigned int cpu)
304{ 309{
305 unsigned int cpu = (unsigned long)hcpu;
306 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 310 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
307 struct task_struct *p; 311 struct cpu_stop_work *work;
308 312 unsigned long flags;
309 switch (action & ~CPU_TASKS_FROZEN) {
310 case CPU_UP_PREPARE:
311 BUG_ON(stopper->thread || stopper->enabled ||
312 !list_empty(&stopper->works));
313 p = kthread_create_on_node(cpu_stopper_thread,
314 stopper,
315 cpu_to_node(cpu),
316 "migration/%d", cpu);
317 if (IS_ERR(p))
318 return notifier_from_errno(PTR_ERR(p));
319 get_task_struct(p);
320 kthread_bind(p, cpu);
321 sched_set_stop_task(cpu, p);
322 stopper->thread = p;
323 break;
324
325 case CPU_ONLINE:
326 /* strictly unnecessary, as first user will wake it */
327 wake_up_process(stopper->thread);
328 /* mark enabled */
329 spin_lock_irq(&stopper->lock);
330 stopper->enabled = true;
331 spin_unlock_irq(&stopper->lock);
332 break;
333
334#ifdef CONFIG_HOTPLUG_CPU
335 case CPU_UP_CANCELED:
336 case CPU_POST_DEAD:
337 {
338 struct cpu_stop_work *work;
339
340 sched_set_stop_task(cpu, NULL);
341 /* kill the stopper */
342 kthread_stop(stopper->thread);
343 /* drain remaining works */
344 spin_lock_irq(&stopper->lock);
345 list_for_each_entry(work, &stopper->works, list)
346 cpu_stop_signal_done(work->done, false);
347 stopper->enabled = false;
348 spin_unlock_irq(&stopper->lock);
349 /* release the stopper */
350 put_task_struct(stopper->thread);
351 stopper->thread = NULL;
352 break;
353 }
354#endif
355 }
356 313
357 return NOTIFY_OK; 314 /* drain remaining works */
315 spin_lock_irqsave(&stopper->lock, flags);
316 list_for_each_entry(work, &stopper->works, list)
317 cpu_stop_signal_done(work->done, false);
318 stopper->enabled = false;
319 spin_unlock_irqrestore(&stopper->lock, flags);
358} 320}
359 321
360/* 322static void cpu_stop_unpark(unsigned int cpu)
361 * Give it a higher priority so that cpu stopper is available to other 323{
362 * cpu notifiers. It currently shares the same priority as sched 324 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
363 * migration_notifier. 325
364 */ 326 spin_lock_irq(&stopper->lock);
365static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { 327 stopper->enabled = true;
366 .notifier_call = cpu_stop_cpu_callback, 328 spin_unlock_irq(&stopper->lock);
367 .priority = 10, 329}
330
331static struct smp_hotplug_thread cpu_stop_threads = {
332 .store = &cpu_stopper_task,
333 .thread_should_run = cpu_stop_should_run,
334 .thread_fn = cpu_stopper_thread,
335 .thread_comm = "migration/%u",
336 .create = cpu_stop_create,
337 .setup = cpu_stop_unpark,
338 .park = cpu_stop_park,
339 .unpark = cpu_stop_unpark,
340 .selfparking = true,
368}; 341};
369 342
370static int __init cpu_stop_init(void) 343static int __init cpu_stop_init(void)
371{ 344{
372 void *bcpu = (void *)(long)smp_processor_id();
373 unsigned int cpu; 345 unsigned int cpu;
374 int err;
375 346
376 for_each_possible_cpu(cpu) { 347 for_each_possible_cpu(cpu) {
377 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 348 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
@@ -380,15 +351,8 @@ static int __init cpu_stop_init(void)
380 INIT_LIST_HEAD(&stopper->works); 351 INIT_LIST_HEAD(&stopper->works);
381 } 352 }
382 353
383 /* start one for the boot cpu */ 354 BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
384 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
385 bcpu);
386 BUG_ON(err != NOTIFY_OK);
387 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
388 register_cpu_notifier(&cpu_stop_cpu_notifier);
389
390 stop_machine_initialized = true; 355 stop_machine_initialized = true;
391
392 return 0; 356 return 0;
393} 357}
394early_initcall(cpu_stop_init); 358early_initcall(cpu_stop_init);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c88878db491e..4fc9be955c71 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
61#include <linux/kmod.h> 61#include <linux/kmod.h>
62#include <linux/capability.h> 62#include <linux/capability.h>
63#include <linux/binfmts.h> 63#include <linux/binfmts.h>
64#include <linux/sched/sysctl.h>
64 65
65#include <asm/uaccess.h> 66#include <asm/uaccess.h>
66#include <asm/processor.h> 67#include <asm/processor.h>
@@ -403,6 +404,13 @@ static struct ctl_table kern_table[] = {
403 .mode = 0644, 404 .mode = 0644,
404 .proc_handler = sched_rt_handler, 405 .proc_handler = sched_rt_handler,
405 }, 406 },
407 {
408 .procname = "sched_rr_timeslice_ms",
409 .data = &sched_rr_timeslice,
410 .maxlen = sizeof(int),
411 .mode = 0644,
412 .proc_handler = sched_rr_handler,
413 },
406#ifdef CONFIG_SCHED_AUTOGROUP 414#ifdef CONFIG_SCHED_AUTOGROUP
407 { 415 {
408 .procname = "sched_autogroup_enabled", 416 .procname = "sched_autogroup_enabled",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 5a6384450501..b669ca1fa103 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, 387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, 388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, 389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
390 { CTL_INT, NET_TCP_ABC, "tcp_abc" },
391 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, 390 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
392 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, 391 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
393 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, 392 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
diff --git a/kernel/time.c b/kernel/time.c
index d226c6a3fd28..c2a27dd93142 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -115,6 +115,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
115} 115}
116 116
117/* 117/*
118 * Indicates if there is an offset between the system clock and the hardware
119 * clock/persistent clock/rtc.
120 */
121int persistent_clock_is_local;
122
123/*
118 * Adjust the time obtained from the CMOS to be UTC time instead of 124 * Adjust the time obtained from the CMOS to be UTC time instead of
119 * local time. 125 * local time.
120 * 126 *
@@ -135,6 +141,8 @@ static inline void warp_clock(void)
135 struct timespec adjust; 141 struct timespec adjust;
136 142
137 adjust = current_kernel_time(); 143 adjust = current_kernel_time();
144 if (sys_tz.tz_minuteswest != 0)
145 persistent_clock_is_local = 1;
138 adjust.tv_sec += sys_tz.tz_minuteswest * 60; 146 adjust.tv_sec += sys_tz.tz_minuteswest * 60;
139 do_settimeofday(&adjust); 147 do_settimeofday(&adjust);
140} 148}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8601f0db1261..24510d84efd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
12config ARCH_CLOCKSOURCE_DATA 12config ARCH_CLOCKSOURCE_DATA
13 bool 13 bool
14 14
15# Platforms has a persistent clock
16config ALWAYS_USE_PERSISTENT_CLOCK
17 bool
18 default n
19
15# Timekeeping vsyscall support 20# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL 21config GENERIC_TIME_VSYSCALL
17 bool 22 bool
@@ -38,6 +43,10 @@ config GENERIC_CLOCKEVENTS_BUILD
38 default y 43 default y
39 depends on GENERIC_CLOCKEVENTS 44 depends on GENERIC_CLOCKEVENTS
40 45
46# Architecture can handle broadcast in a driver-agnostic way
47config ARCH_HAS_TICK_BROADCAST
48 bool
49
41# Clockevents broadcasting infrastructure 50# Clockevents broadcasting infrastructure
42config GENERIC_CLOCKEVENTS_BROADCAST 51config GENERIC_CLOCKEVENTS_BROADCAST
43 bool 52 bool
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 24174b4d669b..b10a42bb0165 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,6 +15,7 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rtc.h>
18 19
19#include "tick-internal.h" 20#include "tick-internal.h"
20 21
@@ -483,8 +484,7 @@ out:
483 return leap; 484 return leap;
484} 485}
485 486
486#ifdef CONFIG_GENERIC_CMOS_UPDATE 487#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
487
488static void sync_cmos_clock(struct work_struct *work); 488static void sync_cmos_clock(struct work_struct *work);
489 489
490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -510,14 +510,26 @@ static void sync_cmos_clock(struct work_struct *work)
510 } 510 }
511 511
512 getnstimeofday(&now); 512 getnstimeofday(&now);
513 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 513 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
514 fail = update_persistent_clock(now); 514 struct timespec adjust = now;
515
516 fail = -ENODEV;
517 if (persistent_clock_is_local)
518 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
519#ifdef CONFIG_GENERIC_CMOS_UPDATE
520 fail = update_persistent_clock(adjust);
521#endif
522#ifdef CONFIG_RTC_SYSTOHC
523 if (fail == -ENODEV)
524 fail = rtc_set_ntp_time(adjust);
525#endif
526 }
515 527
516 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); 528 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
517 if (next.tv_nsec <= 0) 529 if (next.tv_nsec <= 0)
518 next.tv_nsec += NSEC_PER_SEC; 530 next.tv_nsec += NSEC_PER_SEC;
519 531
520 if (!fail) 532 if (!fail || fail == -ENODEV)
521 next.tv_sec = 659; 533 next.tv_sec = 659;
522 else 534 else
523 next.tv_sec = 0; 535 next.tv_sec = 0;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f113755695e2..2fb8cb88df8d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,6 +18,7 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h>
21 22
22#include "tick-internal.h" 23#include "tick-internal.h"
23 24
@@ -86,6 +87,22 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
86 return (dev && tick_broadcast_device.evtdev == dev); 87 return (dev && tick_broadcast_device.evtdev == dev);
87} 88}
88 89
90static void err_broadcast(const struct cpumask *mask)
91{
92 pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
93}
94
95static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
96{
97 if (!dev->broadcast)
98 dev->broadcast = tick_broadcast;
99 if (!dev->broadcast) {
100 pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
101 dev->name);
102 dev->broadcast = err_broadcast;
103 }
104}
105
89/* 106/*
90 * Check, if the device is disfunctional and a place holder, which 107 * Check, if the device is disfunctional and a place holder, which
91 * needs to be handled by the broadcast device. 108 * needs to be handled by the broadcast device.
@@ -105,6 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
105 */ 122 */
106 if (!tick_device_is_functional(dev)) { 123 if (!tick_device_is_functional(dev)) {
107 dev->event_handler = tick_handle_periodic; 124 dev->event_handler = tick_handle_periodic;
125 tick_device_setup_broadcast_func(dev);
108 cpumask_set_cpu(cpu, tick_get_broadcast_mask()); 126 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
109 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 127 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
110 ret = 1; 128 ret = 1;
@@ -116,15 +134,33 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
116 */ 134 */
117 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 135 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
118 int cpu = smp_processor_id(); 136 int cpu = smp_processor_id();
119
120 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 137 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
121 tick_broadcast_clear_oneshot(cpu); 138 tick_broadcast_clear_oneshot(cpu);
139 } else {
140 tick_device_setup_broadcast_func(dev);
122 } 141 }
123 } 142 }
124 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 143 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
125 return ret; 144 return ret;
126} 145}
127 146
147#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
148int tick_receive_broadcast(void)
149{
150 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
151 struct clock_event_device *evt = td->evtdev;
152
153 if (!evt)
154 return -ENODEV;
155
156 if (!evt->event_handler)
157 return -EINVAL;
158
159 evt->event_handler(evt);
160 return 0;
161}
162#endif
163
128/* 164/*
129 * Broadcast the event to the cpus, which are set in the mask (mangled). 165 * Broadcast the event to the cpus, which are set in the mask (mangled).
130 */ 166 */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d58e552d9fd1..314b9ee07edf 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/irq_work.h>
23 24
24#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
25 26
@@ -28,7 +29,7 @@
28/* 29/*
29 * Per cpu nohz control structure 30 * Per cpu nohz control structure
30 */ 31 */
31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 32DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
32 33
33/* 34/*
34 * The time, when the last jiffy update happened. Protected by jiffies_lock. 35 * The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -331,8 +332,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
331 time_delta = timekeeping_max_deferment(); 332 time_delta = timekeeping_max_deferment();
332 } while (read_seqretry(&jiffies_lock, seq)); 333 } while (read_seqretry(&jiffies_lock, seq));
333 334
334 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || 335 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
335 arch_needs_cpu(cpu)) { 336 arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
336 next_jiffies = last_jiffies + 1; 337 next_jiffies = last_jiffies + 1;
337 delta_jiffies = 1; 338 delta_jiffies = 1;
338 } else { 339 } else {
@@ -631,8 +632,11 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
631 632
632static void tick_nohz_account_idle_ticks(struct tick_sched *ts) 633static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
633{ 634{
634#ifndef CONFIG_VIRT_CPU_ACCOUNTING 635#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
635 unsigned long ticks; 636 unsigned long ticks;
637
638 if (vtime_accounting_enabled())
639 return;
636 /* 640 /*
637 * We stopped the tick in idle. Update process times would miss the 641 * We stopped the tick in idle. Update process times would miss the
638 * time we slept as update_process_times does only a 1 tick 642 * time we slept as update_process_times does only a 1 tick
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cbc6acb0db3f..1e35515a875e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -29,6 +29,9 @@ static struct timekeeper timekeeper;
29/* flag for if timekeeping is suspended */ 29/* flag for if timekeeping is suspended */
30int __read_mostly timekeeping_suspended; 30int __read_mostly timekeeping_suspended;
31 31
32/* Flag for if there is a persistent clock on this platform */
33bool __read_mostly persistent_clock_exist = false;
34
32static inline void tk_normalize_xtime(struct timekeeper *tk) 35static inline void tk_normalize_xtime(struct timekeeper *tk)
33{ 36{
34 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { 37 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
@@ -264,19 +267,18 @@ static void timekeeping_forward_now(struct timekeeper *tk)
264} 267}
265 268
266/** 269/**
267 * getnstimeofday - Returns the time of day in a timespec 270 * __getnstimeofday - Returns the time of day in a timespec.
268 * @ts: pointer to the timespec to be set 271 * @ts: pointer to the timespec to be set
269 * 272 *
270 * Returns the time of day in a timespec. 273 * Updates the time of day in the timespec.
274 * Returns 0 on success, or -ve when suspended (timespec will be undefined).
271 */ 275 */
272void getnstimeofday(struct timespec *ts) 276int __getnstimeofday(struct timespec *ts)
273{ 277{
274 struct timekeeper *tk = &timekeeper; 278 struct timekeeper *tk = &timekeeper;
275 unsigned long seq; 279 unsigned long seq;
276 s64 nsecs = 0; 280 s64 nsecs = 0;
277 281
278 WARN_ON(timekeeping_suspended);
279
280 do { 282 do {
281 seq = read_seqbegin(&tk->lock); 283 seq = read_seqbegin(&tk->lock);
282 284
@@ -287,6 +289,26 @@ void getnstimeofday(struct timespec *ts)
287 289
288 ts->tv_nsec = 0; 290 ts->tv_nsec = 0;
289 timespec_add_ns(ts, nsecs); 291 timespec_add_ns(ts, nsecs);
292
293 /*
294 * Do not bail out early, in case there were callers still using
295 * the value, even in the face of the WARN_ON.
296 */
297 if (unlikely(timekeeping_suspended))
298 return -EAGAIN;
299 return 0;
300}
301EXPORT_SYMBOL(__getnstimeofday);
302
303/**
304 * getnstimeofday - Returns the time of day in a timespec.
305 * @ts: pointer to the timespec to be set
306 *
307 * Returns the time of day in a timespec (WARN if suspended).
308 */
309void getnstimeofday(struct timespec *ts)
310{
311 WARN_ON(__getnstimeofday(ts));
290} 312}
291EXPORT_SYMBOL(getnstimeofday); 313EXPORT_SYMBOL(getnstimeofday);
292 314
@@ -640,12 +662,14 @@ void __init timekeeping_init(void)
640 struct timespec now, boot, tmp; 662 struct timespec now, boot, tmp;
641 663
642 read_persistent_clock(&now); 664 read_persistent_clock(&now);
665
643 if (!timespec_valid_strict(&now)) { 666 if (!timespec_valid_strict(&now)) {
644 pr_warn("WARNING: Persistent clock returned invalid value!\n" 667 pr_warn("WARNING: Persistent clock returned invalid value!\n"
645 " Check your CMOS/BIOS settings.\n"); 668 " Check your CMOS/BIOS settings.\n");
646 now.tv_sec = 0; 669 now.tv_sec = 0;
647 now.tv_nsec = 0; 670 now.tv_nsec = 0;
648 } 671 } else if (now.tv_sec || now.tv_nsec)
672 persistent_clock_exist = true;
649 673
650 read_boot_clock(&boot); 674 read_boot_clock(&boot);
651 if (!timespec_valid_strict(&boot)) { 675 if (!timespec_valid_strict(&boot)) {
@@ -718,11 +742,12 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
718{ 742{
719 struct timekeeper *tk = &timekeeper; 743 struct timekeeper *tk = &timekeeper;
720 unsigned long flags; 744 unsigned long flags;
721 struct timespec ts;
722 745
723 /* Make sure we don't set the clock twice */ 746 /*
724 read_persistent_clock(&ts); 747 * Make sure we don't set the clock twice, as timekeeping_resume()
725 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 748 * already did it
749 */
750 if (has_persistent_clock())
726 return; 751 return;
727 752
728 write_seqlock_irqsave(&tk->lock, flags); 753 write_seqlock_irqsave(&tk->lock, flags);
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index eb51d76e058a..3f42652a6a37 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -369,10 +369,8 @@ if ($hz eq '--can') {
369 die "Usage: $0 HZ\n"; 369 die "Usage: $0 HZ\n";
370 } 370 }
371 371
372 @val = @{$canned_values{$hz}}; 372 $cv = $canned_values{$hz};
373 if (!defined(@val)) { 373 @val = defined($cv) ? @$cv : compute_values($hz);
374 @val = compute_values($hz);
375 }
376 output($hz, @val); 374 output($hz, @val);
377} 375}
378exit 0; 376exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 367d00858482..dbf7a78a1ef1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/irq_work.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/sched/sysctl.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
43 44
44#include <asm/uaccess.h> 45#include <asm/uaccess.h>
@@ -1351,7 +1352,6 @@ void update_process_times(int user_tick)
1351 account_process_tick(p, user_tick); 1352 account_process_tick(p, user_tick);
1352 run_local_timers(); 1353 run_local_timers();
1353 rcu_check_callbacks(cpu, user_tick); 1354 rcu_check_callbacks(cpu, user_tick);
1354 printk_tick();
1355#ifdef CONFIG_IRQ_WORK 1355#ifdef CONFIG_IRQ_WORK
1356 if (in_irq()) 1356 if (in_irq())
1357 irq_work_run(); 1357 irq_work_run();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d89335a485f..192473b22799 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE
39 help 39 help
40 See Documentation/trace/ftrace-design.txt 40 See Documentation/trace/ftrace-design.txt
41 41
42config HAVE_DYNAMIC_FTRACE_WITH_REGS
43 bool
44
42config HAVE_FTRACE_MCOUNT_RECORD 45config HAVE_FTRACE_MCOUNT_RECORD
43 bool 46 bool
44 help 47 help
@@ -78,21 +81,6 @@ config EVENT_TRACING
78 select CONTEXT_SWITCH_TRACER 81 select CONTEXT_SWITCH_TRACER
79 bool 82 bool
80 83
81config EVENT_POWER_TRACING_DEPRECATED
82 depends on EVENT_TRACING
83 bool "Deprecated power event trace API, to be removed"
84 default y
85 help
86 Provides old power event types:
87 C-state/idle accounting events:
88 power:power_start
89 power:power_end
90 and old cpufreq accounting event:
91 power:power_frequency
92 This is for userspace compatibility
93 and will vanish after 5 kernel iterations,
94 namely 3.1.
95
96config CONTEXT_SWITCH_TRACER 84config CONTEXT_SWITCH_TRACER
97 bool 85 bool
98 86
@@ -250,6 +238,16 @@ config FTRACE_SYSCALLS
250 help 238 help
251 Basic tracer to catch the syscall entry and exit events. 239 Basic tracer to catch the syscall entry and exit events.
252 240
241config TRACER_SNAPSHOT
242 bool "Create a snapshot trace buffer"
243 select TRACER_MAX_TRACE
244 help
245 Allow tracing users to take snapshot of the current buffer using the
246 ftrace interface, e.g.:
247
248 echo 1 > /sys/kernel/debug/tracing/snapshot
249 cat snapshot
250
253config TRACE_BRANCH_PROFILING 251config TRACE_BRANCH_PROFILING
254 bool 252 bool
255 select GENERIC_TRACER 253 select GENERIC_TRACER
@@ -434,6 +432,11 @@ config DYNAMIC_FTRACE
434 were made. If so, it runs stop_machine (stops all CPUS) 432 were made. If so, it runs stop_machine (stops all CPUS)
435 and modifies the code to jump over the call to ftrace. 433 and modifies the code to jump over the call to ftrace.
436 434
435config DYNAMIC_FTRACE_WITH_REGS
436 def_bool y
437 depends on DYNAMIC_FTRACE
438 depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
439
437config FUNCTION_PROFILER 440config FUNCTION_PROFILER
438 bool "Kernel function profiler" 441 bool "Kernel function profiler"
439 depends on FUNCTION_TRACER 442 depends on FUNCTION_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c0bd0308741c..71259e2b6b61 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
147 return; 147 return;
148 148
149 local_irq_save(flags); 149 local_irq_save(flags);
150 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 150 buf = this_cpu_ptr(bt->msg_data);
151 va_start(args, fmt); 151 va_start(args, fmt);
152 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); 152 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
153 va_end(args); 153 va_end(args);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 41473b4ad7a4..ce8c3d68292f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) 111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
112#endif 112#endif
113 113
114/*
115 * Traverse the ftrace_global_list, invoking all entries. The reason that we
116 * can use rcu_dereference_raw() is that elements removed from this list
117 * are simply leaked, so there is no need to interact with a grace-period
118 * mechanism. The rcu_dereference_raw() calls are needed to handle
119 * concurrent insertions into the ftrace_global_list.
120 *
121 * Silly Alpha and silly pointer-speculation compiler optimizations!
122 */
123#define do_for_each_ftrace_op(op, list) \
124 op = rcu_dereference_raw(list); \
125 do
126
127/*
128 * Optimized for just a single item in the list (as that is the normal case).
129 */
130#define while_for_each_ftrace_op(op) \
131 while (likely(op = rcu_dereference_raw((op)->next)) && \
132 unlikely((op) != &ftrace_list_end))
133
114/** 134/**
115 * ftrace_nr_registered_ops - return number of ops registered 135 * ftrace_nr_registered_ops - return number of ops registered
116 * 136 *
@@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void)
132 return cnt; 152 return cnt;
133} 153}
134 154
135/*
136 * Traverse the ftrace_global_list, invoking all entries. The reason that we
137 * can use rcu_dereference_raw() is that elements removed from this list
138 * are simply leaked, so there is no need to interact with a grace-period
139 * mechanism. The rcu_dereference_raw() calls are needed to handle
140 * concurrent insertions into the ftrace_global_list.
141 *
142 * Silly Alpha and silly pointer-speculation compiler optimizations!
143 */
144static void 155static void
145ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, 156ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
146 struct ftrace_ops *op, struct pt_regs *regs) 157 struct ftrace_ops *op, struct pt_regs *regs)
147{ 158{
148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) 159 int bit;
160
161 bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
162 if (bit < 0)
149 return; 163 return;
150 164
151 trace_recursion_set(TRACE_GLOBAL_BIT); 165 do_for_each_ftrace_op(op, ftrace_global_list) {
152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
153 while (op != &ftrace_list_end) {
154 op->func(ip, parent_ip, op, regs); 166 op->func(ip, parent_ip, op, regs);
155 op = rcu_dereference_raw(op->next); /*see above*/ 167 } while_for_each_ftrace_op(op);
156 }; 168
157 trace_recursion_clear(TRACE_GLOBAL_BIT); 169 trace_clear_recursion(bit);
158} 170}
159 171
160static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 172static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
@@ -221,10 +233,24 @@ static void update_global_ops(void)
221 * registered callers. 233 * registered callers.
222 */ 234 */
223 if (ftrace_global_list == &ftrace_list_end || 235 if (ftrace_global_list == &ftrace_list_end ||
224 ftrace_global_list->next == &ftrace_list_end) 236 ftrace_global_list->next == &ftrace_list_end) {
225 func = ftrace_global_list->func; 237 func = ftrace_global_list->func;
226 else 238 /*
239 * As we are calling the function directly.
240 * If it does not have recursion protection,
241 * the function_trace_op needs to be updated
242 * accordingly.
243 */
244 if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)
245 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
246 else
247 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
248 } else {
227 func = ftrace_global_list_func; 249 func = ftrace_global_list_func;
250 /* The list has its own recursion protection. */
251 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
252 }
253
228 254
229 /* If we filter on pids, update to use the pid function */ 255 /* If we filter on pids, update to use the pid function */
230 if (!list_empty(&ftrace_pids)) { 256 if (!list_empty(&ftrace_pids)) {
@@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) 363 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
338 return -EINVAL; 364 return -EINVAL;
339 365
340#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS 366#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
341 /* 367 /*
342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 368 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. 369 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
@@ -4090,14 +4116,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4090 */ 4116 */
4091 preempt_disable_notrace(); 4117 preempt_disable_notrace();
4092 trace_recursion_set(TRACE_CONTROL_BIT); 4118 trace_recursion_set(TRACE_CONTROL_BIT);
4093 op = rcu_dereference_raw(ftrace_control_list); 4119 do_for_each_ftrace_op(op, ftrace_control_list) {
4094 while (op != &ftrace_list_end) {
4095 if (!ftrace_function_local_disabled(op) && 4120 if (!ftrace_function_local_disabled(op) &&
4096 ftrace_ops_test(op, ip)) 4121 ftrace_ops_test(op, ip))
4097 op->func(ip, parent_ip, op, regs); 4122 op->func(ip, parent_ip, op, regs);
4098 4123 } while_for_each_ftrace_op(op);
4099 op = rcu_dereference_raw(op->next);
4100 };
4101 trace_recursion_clear(TRACE_CONTROL_BIT); 4124 trace_recursion_clear(TRACE_CONTROL_BIT);
4102 preempt_enable_notrace(); 4125 preempt_enable_notrace();
4103} 4126}
@@ -4112,27 +4135,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4112 struct ftrace_ops *ignored, struct pt_regs *regs) 4135 struct ftrace_ops *ignored, struct pt_regs *regs)
4113{ 4136{
4114 struct ftrace_ops *op; 4137 struct ftrace_ops *op;
4138 int bit;
4115 4139
4116 if (function_trace_stop) 4140 if (function_trace_stop)
4117 return; 4141 return;
4118 4142
4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) 4143 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
4144 if (bit < 0)
4120 return; 4145 return;
4121 4146
4122 trace_recursion_set(TRACE_INTERNAL_BIT);
4123 /* 4147 /*
4124 * Some of the ops may be dynamically allocated, 4148 * Some of the ops may be dynamically allocated,
4125 * they must be freed after a synchronize_sched(). 4149 * they must be freed after a synchronize_sched().
4126 */ 4150 */
4127 preempt_disable_notrace(); 4151 preempt_disable_notrace();
4128 op = rcu_dereference_raw(ftrace_ops_list); 4152 do_for_each_ftrace_op(op, ftrace_ops_list) {
4129 while (op != &ftrace_list_end) {
4130 if (ftrace_ops_test(op, ip)) 4153 if (ftrace_ops_test(op, ip))
4131 op->func(ip, parent_ip, op, regs); 4154 op->func(ip, parent_ip, op, regs);
4132 op = rcu_dereference_raw(op->next); 4155 } while_for_each_ftrace_op(op);
4133 };
4134 preempt_enable_notrace(); 4156 preempt_enable_notrace();
4135 trace_recursion_clear(TRACE_INTERNAL_BIT); 4157 trace_clear_recursion(bit);
4136} 4158}
4137 4159
4138/* 4160/*
@@ -4143,8 +4165,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4143 * Archs are to support both the regs and ftrace_ops at the same time. 4165 * Archs are to support both the regs and ftrace_ops at the same time.
4144 * If they support ftrace_ops, it is assumed they support regs. 4166 * If they support ftrace_ops, it is assumed they support regs.
4145 * If call backs want to use regs, they must either check for regs 4167 * If call backs want to use regs, they must either check for regs
4146 * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. 4168 * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
4147 * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. 4169 * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
4148 * An architecture can pass partial regs with ftrace_ops and still 4170 * An architecture can pass partial regs with ftrace_ops and still
4149 * set the ARCH_SUPPORT_FTARCE_OPS. 4171 * set the ARCH_SUPPORT_FTARCE_OPS.
4150 */ 4172 */
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index f55fcf61b223..1c71382b283d 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,8 +13,5 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); 16EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
20 17
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ce8514feedcd..7244acde77b0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,8 +3,10 @@
3 * 3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */ 5 */
6#include <linux/ftrace_event.h>
6#include <linux/ring_buffer.h> 7#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 8#include <linux/trace_clock.h>
9#include <linux/trace_seq.h>
8#include <linux/spinlock.h> 10#include <linux/spinlock.h>
9#include <linux/debugfs.h> 11#include <linux/debugfs.h>
10#include <linux/uaccess.h> 12#include <linux/uaccess.h>
@@ -21,7 +23,6 @@
21#include <linux/fs.h> 23#include <linux/fs.h>
22 24
23#include <asm/local.h> 25#include <asm/local.h>
24#include "trace.h"
25 26
26static void update_pages_handler(struct work_struct *work); 27static void update_pages_handler(struct work_struct *work);
27 28
@@ -2432,41 +2433,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2432 2433
2433#ifdef CONFIG_TRACING 2434#ifdef CONFIG_TRACING
2434 2435
2435#define TRACE_RECURSIVE_DEPTH 16 2436/*
2437 * The lock and unlock are done within a preempt disable section.
2438 * The current_context per_cpu variable can only be modified
2439 * by the current task between lock and unlock. But it can
2440 * be modified more than once via an interrupt. To pass this
2441 * information from the lock to the unlock without having to
2442 * access the 'in_interrupt()' functions again (which do show
2443 * a bit of overhead in something as critical as function tracing,
2444 * we use a bitmask trick.
2445 *
2446 * bit 0 = NMI context
2447 * bit 1 = IRQ context
2448 * bit 2 = SoftIRQ context
2449 * bit 3 = normal context.
2450 *
2451 * This works because this is the order of contexts that can
2452 * preempt other contexts. A SoftIRQ never preempts an IRQ
2453 * context.
2454 *
2455 * When the context is determined, the corresponding bit is
2456 * checked and set (if it was set, then a recursion of that context
2457 * happened).
2458 *
2459 * On unlock, we need to clear this bit. To do so, just subtract
2460 * 1 from the current_context and AND it to itself.
2461 *
2462 * (binary)
2463 * 101 - 1 = 100
2464 * 101 & 100 = 100 (clearing bit zero)
2465 *
2466 * 1010 - 1 = 1001
2467 * 1010 & 1001 = 1000 (clearing bit 1)
2468 *
2469 * The least significant bit can be cleared this way, and it
2470 * just so happens that it is the same bit corresponding to
2471 * the current context.
2472 */
2473static DEFINE_PER_CPU(unsigned int, current_context);
2436 2474
2437/* Keep this code out of the fast path cache */ 2475static __always_inline int trace_recursive_lock(void)
2438static noinline void trace_recursive_fail(void)
2439{ 2476{
2440 /* Disable all tracing before we do anything else */ 2477 unsigned int val = this_cpu_read(current_context);
2441 tracing_off_permanent(); 2478 int bit;
2442
2443 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2444 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2445 trace_recursion_buffer(),
2446 hardirq_count() >> HARDIRQ_SHIFT,
2447 softirq_count() >> SOFTIRQ_SHIFT,
2448 in_nmi());
2449
2450 WARN_ON_ONCE(1);
2451}
2452 2479
2453static inline int trace_recursive_lock(void) 2480 if (in_interrupt()) {
2454{ 2481 if (in_nmi())
2455 trace_recursion_inc(); 2482 bit = 0;
2483 else if (in_irq())
2484 bit = 1;
2485 else
2486 bit = 2;
2487 } else
2488 bit = 3;
2456 2489
2457 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) 2490 if (unlikely(val & (1 << bit)))
2458 return 0; 2491 return 1;
2459 2492
2460 trace_recursive_fail(); 2493 val |= (1 << bit);
2494 this_cpu_write(current_context, val);
2461 2495
2462 return -1; 2496 return 0;
2463} 2497}
2464 2498
2465static inline void trace_recursive_unlock(void) 2499static __always_inline void trace_recursive_unlock(void)
2466{ 2500{
2467 WARN_ON_ONCE(!trace_recursion_buffer()); 2501 unsigned int val = this_cpu_read(current_context);
2468 2502
2469 trace_recursion_dec(); 2503 val--;
2504 val &= this_cpu_read(current_context);
2505 this_cpu_write(current_context, val);
2470} 2506}
2471 2507
2472#else 2508#else
@@ -3067,6 +3103,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
3067EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); 3103EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
3068 3104
3069/** 3105/**
3106 * ring_buffer_read_events_cpu - get the number of events successfully read
3107 * @buffer: The ring buffer
3108 * @cpu: The per CPU buffer to get the number of events read
3109 */
3110unsigned long
3111ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
3112{
3113 struct ring_buffer_per_cpu *cpu_buffer;
3114
3115 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3116 return 0;
3117
3118 cpu_buffer = buffer->buffers[cpu];
3119 return cpu_buffer->read;
3120}
3121EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
3122
3123/**
3070 * ring_buffer_entries - get the number of entries in a buffer 3124 * ring_buffer_entries - get the number of entries in a buffer
3071 * @buffer: The ring buffer 3125 * @buffer: The ring buffer
3072 * 3126 *
@@ -3425,7 +3479,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3425 /* check for end of page padding */ 3479 /* check for end of page padding */
3426 if ((iter->head >= rb_page_size(iter->head_page)) && 3480 if ((iter->head >= rb_page_size(iter->head_page)) &&
3427 (iter->head_page != cpu_buffer->commit_page)) 3481 (iter->head_page != cpu_buffer->commit_page))
3428 rb_advance_iter(iter); 3482 rb_inc_iter(iter);
3429} 3483}
3430 3484
3431static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) 3485static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3c13e46d7d24..c2e2c2310374 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
39#include <linux/poll.h> 39#include <linux/poll.h>
40#include <linux/nmi.h> 40#include <linux/nmi.h>
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/sched/rt.h>
42 43
43#include "trace.h" 44#include "trace.h"
44#include "trace_output.h" 45#include "trace_output.h"
@@ -249,7 +250,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
249static struct tracer *trace_types __read_mostly; 250static struct tracer *trace_types __read_mostly;
250 251
251/* current_trace points to the tracer that is currently active */ 252/* current_trace points to the tracer that is currently active */
252static struct tracer *current_trace __read_mostly; 253static struct tracer *current_trace __read_mostly = &nop_trace;
253 254
254/* 255/*
255 * trace_types_lock is used to protect the trace_types list. 256 * trace_types_lock is used to protect the trace_types list.
@@ -709,10 +710,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
709 return; 710 return;
710 711
711 WARN_ON_ONCE(!irqs_disabled()); 712 WARN_ON_ONCE(!irqs_disabled());
712 if (!current_trace->use_max_tr) { 713
713 WARN_ON_ONCE(1); 714 if (!current_trace->allocated_snapshot) {
715 /* Only the nop tracer should hit this when disabling */
716 WARN_ON_ONCE(current_trace != &nop_trace);
714 return; 717 return;
715 } 718 }
719
716 arch_spin_lock(&ftrace_max_lock); 720 arch_spin_lock(&ftrace_max_lock);
717 721
718 tr->buffer = max_tr.buffer; 722 tr->buffer = max_tr.buffer;
@@ -739,10 +743,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
739 return; 743 return;
740 744
741 WARN_ON_ONCE(!irqs_disabled()); 745 WARN_ON_ONCE(!irqs_disabled());
742 if (!current_trace->use_max_tr) { 746 if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
743 WARN_ON_ONCE(1);
744 return; 747 return;
745 }
746 748
747 arch_spin_lock(&ftrace_max_lock); 749 arch_spin_lock(&ftrace_max_lock);
748 750
@@ -862,10 +864,13 @@ int register_tracer(struct tracer *type)
862 864
863 current_trace = type; 865 current_trace = type;
864 866
865 /* If we expanded the buffers, make sure the max is expanded too */ 867 if (type->use_max_tr) {
866 if (ring_buffer_expanded && type->use_max_tr) 868 /* If we expanded the buffers, make sure the max is expanded too */
867 ring_buffer_resize(max_tr.buffer, trace_buf_size, 869 if (ring_buffer_expanded)
868 RING_BUFFER_ALL_CPUS); 870 ring_buffer_resize(max_tr.buffer, trace_buf_size,
871 RING_BUFFER_ALL_CPUS);
872 type->allocated_snapshot = true;
873 }
869 874
870 /* the test is responsible for initializing and enabling */ 875 /* the test is responsible for initializing and enabling */
871 pr_info("Testing tracer %s: ", type->name); 876 pr_info("Testing tracer %s: ", type->name);
@@ -881,10 +886,14 @@ int register_tracer(struct tracer *type)
881 /* Only reset on passing, to avoid touching corrupted buffers */ 886 /* Only reset on passing, to avoid touching corrupted buffers */
882 tracing_reset_online_cpus(tr); 887 tracing_reset_online_cpus(tr);
883 888
884 /* Shrink the max buffer again */ 889 if (type->use_max_tr) {
885 if (ring_buffer_expanded && type->use_max_tr) 890 type->allocated_snapshot = false;
886 ring_buffer_resize(max_tr.buffer, 1, 891
887 RING_BUFFER_ALL_CPUS); 892 /* Shrink the max buffer again */
893 if (ring_buffer_expanded)
894 ring_buffer_resize(max_tr.buffer, 1,
895 RING_BUFFER_ALL_CPUS);
896 }
888 897
889 printk(KERN_CONT "PASSED\n"); 898 printk(KERN_CONT "PASSED\n");
890 } 899 }
@@ -922,6 +931,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
922{ 931{
923 struct ring_buffer *buffer = tr->buffer; 932 struct ring_buffer *buffer = tr->buffer;
924 933
934 if (!buffer)
935 return;
936
925 ring_buffer_record_disable(buffer); 937 ring_buffer_record_disable(buffer);
926 938
927 /* Make sure all commits have finished */ 939 /* Make sure all commits have finished */
@@ -936,6 +948,9 @@ void tracing_reset_online_cpus(struct trace_array *tr)
936 struct ring_buffer *buffer = tr->buffer; 948 struct ring_buffer *buffer = tr->buffer;
937 int cpu; 949 int cpu;
938 950
951 if (!buffer)
952 return;
953
939 ring_buffer_record_disable(buffer); 954 ring_buffer_record_disable(buffer);
940 955
941 /* Make sure all commits have finished */ 956 /* Make sure all commits have finished */
@@ -1167,7 +1182,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1167 1182
1168 entry->preempt_count = pc & 0xff; 1183 entry->preempt_count = pc & 0xff;
1169 entry->pid = (tsk) ? tsk->pid : 0; 1184 entry->pid = (tsk) ? tsk->pid : 0;
1170 entry->padding = 0;
1171 entry->flags = 1185 entry->flags =
1172#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1186#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1173 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1187 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1335,7 +1349,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1335 */ 1349 */
1336 preempt_disable_notrace(); 1350 preempt_disable_notrace();
1337 1351
1338 use_stack = ++__get_cpu_var(ftrace_stack_reserve); 1352 use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
1339 /* 1353 /*
1340 * We don't need any atomic variables, just a barrier. 1354 * We don't need any atomic variables, just a barrier.
1341 * If an interrupt comes in, we don't care, because it would 1355 * If an interrupt comes in, we don't care, because it would
@@ -1389,7 +1403,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1389 out: 1403 out:
1390 /* Again, don't let gcc optimize things here */ 1404 /* Again, don't let gcc optimize things here */
1391 barrier(); 1405 barrier();
1392 __get_cpu_var(ftrace_stack_reserve)--; 1406 __this_cpu_dec(ftrace_stack_reserve);
1393 preempt_enable_notrace(); 1407 preempt_enable_notrace();
1394 1408
1395} 1409}
@@ -1517,7 +1531,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer;
1517static char *get_trace_buf(void) 1531static char *get_trace_buf(void)
1518{ 1532{
1519 struct trace_buffer_struct *percpu_buffer; 1533 struct trace_buffer_struct *percpu_buffer;
1520 struct trace_buffer_struct *buffer;
1521 1534
1522 /* 1535 /*
1523 * If we have allocated per cpu buffers, then we do not 1536 * If we have allocated per cpu buffers, then we do not
@@ -1535,9 +1548,7 @@ static char *get_trace_buf(void)
1535 if (!percpu_buffer) 1548 if (!percpu_buffer)
1536 return NULL; 1549 return NULL;
1537 1550
1538 buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); 1551 return this_cpu_ptr(&percpu_buffer->buffer[0]);
1539
1540 return buffer->buffer;
1541} 1552}
1542 1553
1543static int alloc_percpu_trace_buffer(void) 1554static int alloc_percpu_trace_buffer(void)
@@ -1942,21 +1953,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1942static void *s_start(struct seq_file *m, loff_t *pos) 1953static void *s_start(struct seq_file *m, loff_t *pos)
1943{ 1954{
1944 struct trace_iterator *iter = m->private; 1955 struct trace_iterator *iter = m->private;
1945 static struct tracer *old_tracer;
1946 int cpu_file = iter->cpu_file; 1956 int cpu_file = iter->cpu_file;
1947 void *p = NULL; 1957 void *p = NULL;
1948 loff_t l = 0; 1958 loff_t l = 0;
1949 int cpu; 1959 int cpu;
1950 1960
1951 /* copy the tracer to avoid using a global lock all around */ 1961 /*
1962 * copy the tracer to avoid using a global lock all around.
1963 * iter->trace is a copy of current_trace, the pointer to the
1964 * name may be used instead of a strcmp(), as iter->trace->name
1965 * will point to the same string as current_trace->name.
1966 */
1952 mutex_lock(&trace_types_lock); 1967 mutex_lock(&trace_types_lock);
1953 if (unlikely(old_tracer != current_trace && current_trace)) { 1968 if (unlikely(current_trace && iter->trace->name != current_trace->name))
1954 old_tracer = current_trace;
1955 *iter->trace = *current_trace; 1969 *iter->trace = *current_trace;
1956 }
1957 mutex_unlock(&trace_types_lock); 1970 mutex_unlock(&trace_types_lock);
1958 1971
1959 atomic_inc(&trace_record_cmdline_disabled); 1972 if (iter->snapshot && iter->trace->use_max_tr)
1973 return ERR_PTR(-EBUSY);
1974
1975 if (!iter->snapshot)
1976 atomic_inc(&trace_record_cmdline_disabled);
1960 1977
1961 if (*pos != iter->pos) { 1978 if (*pos != iter->pos) {
1962 iter->ent = NULL; 1979 iter->ent = NULL;
@@ -1995,7 +2012,11 @@ static void s_stop(struct seq_file *m, void *p)
1995{ 2012{
1996 struct trace_iterator *iter = m->private; 2013 struct trace_iterator *iter = m->private;
1997 2014
1998 atomic_dec(&trace_record_cmdline_disabled); 2015 if (iter->snapshot && iter->trace->use_max_tr)
2016 return;
2017
2018 if (!iter->snapshot)
2019 atomic_dec(&trace_record_cmdline_disabled);
1999 trace_access_unlock(iter->cpu_file); 2020 trace_access_unlock(iter->cpu_file);
2000 trace_event_read_unlock(); 2021 trace_event_read_unlock();
2001} 2022}
@@ -2080,8 +2101,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2080 unsigned long total; 2101 unsigned long total;
2081 const char *name = "preemption"; 2102 const char *name = "preemption";
2082 2103
2083 if (type) 2104 name = type->name;
2084 name = type->name;
2085 2105
2086 get_total_entries(tr, &total, &entries); 2106 get_total_entries(tr, &total, &entries);
2087 2107
@@ -2430,7 +2450,7 @@ static const struct seq_operations tracer_seq_ops = {
2430}; 2450};
2431 2451
2432static struct trace_iterator * 2452static struct trace_iterator *
2433__tracing_open(struct inode *inode, struct file *file) 2453__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2434{ 2454{
2435 long cpu_file = (long) inode->i_private; 2455 long cpu_file = (long) inode->i_private;
2436 struct trace_iterator *iter; 2456 struct trace_iterator *iter;
@@ -2457,16 +2477,16 @@ __tracing_open(struct inode *inode, struct file *file)
2457 if (!iter->trace) 2477 if (!iter->trace)
2458 goto fail; 2478 goto fail;
2459 2479
2460 if (current_trace) 2480 *iter->trace = *current_trace;
2461 *iter->trace = *current_trace;
2462 2481
2463 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) 2482 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
2464 goto fail; 2483 goto fail;
2465 2484
2466 if (current_trace && current_trace->print_max) 2485 if (current_trace->print_max || snapshot)
2467 iter->tr = &max_tr; 2486 iter->tr = &max_tr;
2468 else 2487 else
2469 iter->tr = &global_trace; 2488 iter->tr = &global_trace;
2489 iter->snapshot = snapshot;
2470 iter->pos = -1; 2490 iter->pos = -1;
2471 mutex_init(&iter->mutex); 2491 mutex_init(&iter->mutex);
2472 iter->cpu_file = cpu_file; 2492 iter->cpu_file = cpu_file;
@@ -2483,8 +2503,9 @@ __tracing_open(struct inode *inode, struct file *file)
2483 if (trace_clocks[trace_clock_id].in_ns) 2503 if (trace_clocks[trace_clock_id].in_ns)
2484 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 2504 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2485 2505
2486 /* stop the trace while dumping */ 2506 /* stop the trace while dumping if we are not opening "snapshot" */
2487 tracing_stop(); 2507 if (!iter->snapshot)
2508 tracing_stop();
2488 2509
2489 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2510 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2490 for_each_tracing_cpu(cpu) { 2511 for_each_tracing_cpu(cpu) {
@@ -2547,8 +2568,9 @@ static int tracing_release(struct inode *inode, struct file *file)
2547 if (iter->trace && iter->trace->close) 2568 if (iter->trace && iter->trace->close)
2548 iter->trace->close(iter); 2569 iter->trace->close(iter);
2549 2570
2550 /* reenable tracing if it was previously enabled */ 2571 if (!iter->snapshot)
2551 tracing_start(); 2572 /* reenable tracing if it was previously enabled */
2573 tracing_start();
2552 mutex_unlock(&trace_types_lock); 2574 mutex_unlock(&trace_types_lock);
2553 2575
2554 mutex_destroy(&iter->mutex); 2576 mutex_destroy(&iter->mutex);
@@ -2576,7 +2598,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2576 } 2598 }
2577 2599
2578 if (file->f_mode & FMODE_READ) { 2600 if (file->f_mode & FMODE_READ) {
2579 iter = __tracing_open(inode, file); 2601 iter = __tracing_open(inode, file, false);
2580 if (IS_ERR(iter)) 2602 if (IS_ERR(iter))
2581 ret = PTR_ERR(iter); 2603 ret = PTR_ERR(iter);
2582 else if (trace_flags & TRACE_ITER_LATENCY_FMT) 2604 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
@@ -3014,10 +3036,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
3014 int r; 3036 int r;
3015 3037
3016 mutex_lock(&trace_types_lock); 3038 mutex_lock(&trace_types_lock);
3017 if (current_trace) 3039 r = sprintf(buf, "%s\n", current_trace->name);
3018 r = sprintf(buf, "%s\n", current_trace->name);
3019 else
3020 r = sprintf(buf, "\n");
3021 mutex_unlock(&trace_types_lock); 3040 mutex_unlock(&trace_types_lock);
3022 3041
3023 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3042 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3183,6 +3202,7 @@ static int tracing_set_tracer(const char *buf)
3183 static struct trace_option_dentry *topts; 3202 static struct trace_option_dentry *topts;
3184 struct trace_array *tr = &global_trace; 3203 struct trace_array *tr = &global_trace;
3185 struct tracer *t; 3204 struct tracer *t;
3205 bool had_max_tr;
3186 int ret = 0; 3206 int ret = 0;
3187 3207
3188 mutex_lock(&trace_types_lock); 3208 mutex_lock(&trace_types_lock);
@@ -3207,9 +3227,21 @@ static int tracing_set_tracer(const char *buf)
3207 goto out; 3227 goto out;
3208 3228
3209 trace_branch_disable(); 3229 trace_branch_disable();
3210 if (current_trace && current_trace->reset) 3230 if (current_trace->reset)
3211 current_trace->reset(tr); 3231 current_trace->reset(tr);
3212 if (current_trace && current_trace->use_max_tr) { 3232
3233 had_max_tr = current_trace->allocated_snapshot;
3234 current_trace = &nop_trace;
3235
3236 if (had_max_tr && !t->use_max_tr) {
3237 /*
3238 * We need to make sure that the update_max_tr sees that
3239 * current_trace changed to nop_trace to keep it from
3240 * swapping the buffers after we resize it.
3241 * The update_max_tr is called from interrupts disabled
3242 * so a synchronized_sched() is sufficient.
3243 */
3244 synchronize_sched();
3213 /* 3245 /*
3214 * We don't free the ring buffer. instead, resize it because 3246 * We don't free the ring buffer. instead, resize it because
3215 * The max_tr ring buffer has some state (e.g. ring->clock) and 3247 * The max_tr ring buffer has some state (e.g. ring->clock) and
@@ -3217,18 +3249,19 @@ static int tracing_set_tracer(const char *buf)
3217 */ 3249 */
3218 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); 3250 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3219 set_buffer_entries(&max_tr, 1); 3251 set_buffer_entries(&max_tr, 1);
3252 tracing_reset_online_cpus(&max_tr);
3253 current_trace->allocated_snapshot = false;
3220 } 3254 }
3221 destroy_trace_option_files(topts); 3255 destroy_trace_option_files(topts);
3222 3256
3223 current_trace = &nop_trace;
3224
3225 topts = create_trace_option_files(t); 3257 topts = create_trace_option_files(t);
3226 if (t->use_max_tr) { 3258 if (t->use_max_tr && !had_max_tr) {
3227 /* we need to make per cpu buffer sizes equivalent */ 3259 /* we need to make per cpu buffer sizes equivalent */
3228 ret = resize_buffer_duplicate_size(&max_tr, &global_trace, 3260 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3229 RING_BUFFER_ALL_CPUS); 3261 RING_BUFFER_ALL_CPUS);
3230 if (ret < 0) 3262 if (ret < 0)
3231 goto out; 3263 goto out;
3264 t->allocated_snapshot = true;
3232 } 3265 }
3233 3266
3234 if (t->init) { 3267 if (t->init) {
@@ -3336,8 +3369,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3336 ret = -ENOMEM; 3369 ret = -ENOMEM;
3337 goto fail; 3370 goto fail;
3338 } 3371 }
3339 if (current_trace) 3372 *iter->trace = *current_trace;
3340 *iter->trace = *current_trace;
3341 3373
3342 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 3374 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3343 ret = -ENOMEM; 3375 ret = -ENOMEM;
@@ -3477,7 +3509,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3477 size_t cnt, loff_t *ppos) 3509 size_t cnt, loff_t *ppos)
3478{ 3510{
3479 struct trace_iterator *iter = filp->private_data; 3511 struct trace_iterator *iter = filp->private_data;
3480 static struct tracer *old_tracer;
3481 ssize_t sret; 3512 ssize_t sret;
3482 3513
3483 /* return any leftover data */ 3514 /* return any leftover data */
@@ -3489,10 +3520,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3489 3520
3490 /* copy the tracer to avoid using a global lock all around */ 3521 /* copy the tracer to avoid using a global lock all around */
3491 mutex_lock(&trace_types_lock); 3522 mutex_lock(&trace_types_lock);
3492 if (unlikely(old_tracer != current_trace && current_trace)) { 3523 if (unlikely(iter->trace->name != current_trace->name))
3493 old_tracer = current_trace;
3494 *iter->trace = *current_trace; 3524 *iter->trace = *current_trace;
3495 }
3496 mutex_unlock(&trace_types_lock); 3525 mutex_unlock(&trace_types_lock);
3497 3526
3498 /* 3527 /*
@@ -3648,7 +3677,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3648 .ops = &tracing_pipe_buf_ops, 3677 .ops = &tracing_pipe_buf_ops,
3649 .spd_release = tracing_spd_release_pipe, 3678 .spd_release = tracing_spd_release_pipe,
3650 }; 3679 };
3651 static struct tracer *old_tracer;
3652 ssize_t ret; 3680 ssize_t ret;
3653 size_t rem; 3681 size_t rem;
3654 unsigned int i; 3682 unsigned int i;
@@ -3658,10 +3686,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3658 3686
3659 /* copy the tracer to avoid using a global lock all around */ 3687 /* copy the tracer to avoid using a global lock all around */
3660 mutex_lock(&trace_types_lock); 3688 mutex_lock(&trace_types_lock);
3661 if (unlikely(old_tracer != current_trace && current_trace)) { 3689 if (unlikely(iter->trace->name != current_trace->name))
3662 old_tracer = current_trace;
3663 *iter->trace = *current_trace; 3690 *iter->trace = *current_trace;
3664 }
3665 mutex_unlock(&trace_types_lock); 3691 mutex_unlock(&trace_types_lock);
3666 3692
3667 mutex_lock(&iter->mutex); 3693 mutex_lock(&iter->mutex);
@@ -4037,8 +4063,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4037 * Reset the buffer so that it doesn't have incomparable timestamps. 4063 * Reset the buffer so that it doesn't have incomparable timestamps.
4038 */ 4064 */
4039 tracing_reset_online_cpus(&global_trace); 4065 tracing_reset_online_cpus(&global_trace);
4040 if (max_tr.buffer) 4066 tracing_reset_online_cpus(&max_tr);
4041 tracing_reset_online_cpus(&max_tr);
4042 4067
4043 mutex_unlock(&trace_types_lock); 4068 mutex_unlock(&trace_types_lock);
4044 4069
@@ -4054,6 +4079,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
4054 return single_open(file, tracing_clock_show, NULL); 4079 return single_open(file, tracing_clock_show, NULL);
4055} 4080}
4056 4081
4082#ifdef CONFIG_TRACER_SNAPSHOT
4083static int tracing_snapshot_open(struct inode *inode, struct file *file)
4084{
4085 struct trace_iterator *iter;
4086 int ret = 0;
4087
4088 if (file->f_mode & FMODE_READ) {
4089 iter = __tracing_open(inode, file, true);
4090 if (IS_ERR(iter))
4091 ret = PTR_ERR(iter);
4092 }
4093 return ret;
4094}
4095
4096static ssize_t
4097tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4098 loff_t *ppos)
4099{
4100 unsigned long val;
4101 int ret;
4102
4103 ret = tracing_update_buffers();
4104 if (ret < 0)
4105 return ret;
4106
4107 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4108 if (ret)
4109 return ret;
4110
4111 mutex_lock(&trace_types_lock);
4112
4113 if (current_trace->use_max_tr) {
4114 ret = -EBUSY;
4115 goto out;
4116 }
4117
4118 switch (val) {
4119 case 0:
4120 if (current_trace->allocated_snapshot) {
4121 /* free spare buffer */
4122 ring_buffer_resize(max_tr.buffer, 1,
4123 RING_BUFFER_ALL_CPUS);
4124 set_buffer_entries(&max_tr, 1);
4125 tracing_reset_online_cpus(&max_tr);
4126 current_trace->allocated_snapshot = false;
4127 }
4128 break;
4129 case 1:
4130 if (!current_trace->allocated_snapshot) {
4131 /* allocate spare buffer */
4132 ret = resize_buffer_duplicate_size(&max_tr,
4133 &global_trace, RING_BUFFER_ALL_CPUS);
4134 if (ret < 0)
4135 break;
4136 current_trace->allocated_snapshot = true;
4137 }
4138
4139 local_irq_disable();
4140 /* Now, we're going to swap */
4141 update_max_tr(&global_trace, current, smp_processor_id());
4142 local_irq_enable();
4143 break;
4144 default:
4145 if (current_trace->allocated_snapshot)
4146 tracing_reset_online_cpus(&max_tr);
4147 else
4148 ret = -EINVAL;
4149 break;
4150 }
4151
4152 if (ret >= 0) {
4153 *ppos += cnt;
4154 ret = cnt;
4155 }
4156out:
4157 mutex_unlock(&trace_types_lock);
4158 return ret;
4159}
4160#endif /* CONFIG_TRACER_SNAPSHOT */
4161
4162
4057static const struct file_operations tracing_max_lat_fops = { 4163static const struct file_operations tracing_max_lat_fops = {
4058 .open = tracing_open_generic, 4164 .open = tracing_open_generic,
4059 .read = tracing_max_lat_read, 4165 .read = tracing_max_lat_read,
@@ -4110,6 +4216,16 @@ static const struct file_operations trace_clock_fops = {
4110 .write = tracing_clock_write, 4216 .write = tracing_clock_write,
4111}; 4217};
4112 4218
4219#ifdef CONFIG_TRACER_SNAPSHOT
4220static const struct file_operations snapshot_fops = {
4221 .open = tracing_snapshot_open,
4222 .read = seq_read,
4223 .write = tracing_snapshot_write,
4224 .llseek = tracing_seek,
4225 .release = tracing_release,
4226};
4227#endif /* CONFIG_TRACER_SNAPSHOT */
4228
4113struct ftrace_buffer_info { 4229struct ftrace_buffer_info {
4114 struct trace_array *tr; 4230 struct trace_array *tr;
4115 void *spare; 4231 void *spare;
@@ -4414,6 +4530,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4414 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); 4530 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
4415 trace_seq_printf(s, "dropped events: %ld\n", cnt); 4531 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4416 4532
4533 cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
4534 trace_seq_printf(s, "read events: %ld\n", cnt);
4535
4417 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4536 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4418 4537
4419 kfree(s); 4538 kfree(s);
@@ -4490,7 +4609,7 @@ struct dentry *tracing_init_dentry(void)
4490 4609
4491static struct dentry *d_percpu; 4610static struct dentry *d_percpu;
4492 4611
4493struct dentry *tracing_dentry_percpu(void) 4612static struct dentry *tracing_dentry_percpu(void)
4494{ 4613{
4495 static int once; 4614 static int once;
4496 struct dentry *d_tracer; 4615 struct dentry *d_tracer;
@@ -4906,6 +5025,11 @@ static __init int tracer_init_debugfs(void)
4906 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 5025 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4907#endif 5026#endif
4908 5027
5028#ifdef CONFIG_TRACER_SNAPSHOT
5029 trace_create_file("snapshot", 0644, d_tracer,
5030 (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
5031#endif
5032
4909 create_trace_options_dir(); 5033 create_trace_options_dir();
4910 5034
4911 for_each_tracing_cpu(cpu) 5035 for_each_tracing_cpu(cpu)
@@ -5014,6 +5138,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5014 if (disable_tracing) 5138 if (disable_tracing)
5015 ftrace_kill(); 5139 ftrace_kill();
5016 5140
5141 /* Simulate the iterator */
5017 trace_init_global_iter(&iter); 5142 trace_init_global_iter(&iter);
5018 5143
5019 for_each_tracing_cpu(cpu) { 5144 for_each_tracing_cpu(cpu) {
@@ -5025,10 +5150,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5025 /* don't look at user memory in panic mode */ 5150 /* don't look at user memory in panic mode */
5026 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 5151 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
5027 5152
5028 /* Simulate the iterator */
5029 iter.tr = &global_trace;
5030 iter.trace = current_trace;
5031
5032 switch (oops_dump_mode) { 5153 switch (oops_dump_mode) {
5033 case DUMP_ALL: 5154 case DUMP_ALL:
5034 iter.cpu_file = TRACE_PIPE_ALL_CPU; 5155 iter.cpu_file = TRACE_PIPE_ALL_CPU;
@@ -5173,7 +5294,7 @@ __init static int tracer_alloc_buffers(void)
5173 init_irq_work(&trace_work_wakeup, trace_wake_up); 5294 init_irq_work(&trace_work_wakeup, trace_wake_up);
5174 5295
5175 register_tracer(&nop_trace); 5296 register_tracer(&nop_trace);
5176 current_trace = &nop_trace; 5297
5177 /* All seems OK, enable tracing */ 5298 /* All seems OK, enable tracing */
5178 tracing_disabled = 0; 5299 tracing_disabled = 0;
5179 5300
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c75d7988902c..57d7e5397d56 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -287,20 +287,62 @@ struct tracer {
287 struct tracer_flags *flags; 287 struct tracer_flags *flags;
288 bool print_max; 288 bool print_max;
289 bool use_max_tr; 289 bool use_max_tr;
290 bool allocated_snapshot;
290}; 291};
291 292
292 293
293/* Only current can touch trace_recursion */ 294/* Only current can touch trace_recursion */
294#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
295#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
296 295
297/* Ring buffer has the 10 LSB bits to count */ 296/*
298#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) 297 * For function tracing recursion:
299 298 * The order of these bits is important.
300/* for function tracing recursion */ 299 *
301#define TRACE_INTERNAL_BIT (1<<11) 300 * When function tracing occurs, the following steps are made:
302#define TRACE_GLOBAL_BIT (1<<12) 301 * If arch does not support a ftrace feature:
303#define TRACE_CONTROL_BIT (1<<13) 302 * call internal function (uses INTERNAL bits) which calls...
303 * If callback is registered to the "global" list, the list
304 * function is called and recursion checks the GLOBAL bits.
305 * then this function calls...
306 * The function callback, which can use the FTRACE bits to
307 * check for recursion.
308 *
309 * Now if the arch does not support a feature, and it calls
310 * the global list function which calls the ftrace callback
311 * all three of these steps will do a recursion protection.
312 * There's no reason to do one if the previous caller already
313 * did. The recursion that we are protecting against will
314 * go through the same steps again.
315 *
316 * To prevent the multiple recursion checks, if a recursion
317 * bit is set that is higher than the MAX bit of the current
318 * check, then we know that the check was made by the previous
319 * caller, and we can skip the current check.
320 */
321enum {
322 TRACE_BUFFER_BIT,
323 TRACE_BUFFER_NMI_BIT,
324 TRACE_BUFFER_IRQ_BIT,
325 TRACE_BUFFER_SIRQ_BIT,
326
327 /* Start of function recursion bits */
328 TRACE_FTRACE_BIT,
329 TRACE_FTRACE_NMI_BIT,
330 TRACE_FTRACE_IRQ_BIT,
331 TRACE_FTRACE_SIRQ_BIT,
332
333 /* GLOBAL_BITs must be greater than FTRACE_BITs */
334 TRACE_GLOBAL_BIT,
335 TRACE_GLOBAL_NMI_BIT,
336 TRACE_GLOBAL_IRQ_BIT,
337 TRACE_GLOBAL_SIRQ_BIT,
338
339 /* INTERNAL_BITs must be greater than GLOBAL_BITs */
340 TRACE_INTERNAL_BIT,
341 TRACE_INTERNAL_NMI_BIT,
342 TRACE_INTERNAL_IRQ_BIT,
343 TRACE_INTERNAL_SIRQ_BIT,
344
345 TRACE_CONTROL_BIT,
304 346
305/* 347/*
306 * Abuse of the trace_recursion. 348 * Abuse of the trace_recursion.
@@ -309,11 +351,77 @@ struct tracer {
309 * was called in irq context but we have irq tracing off. Since this 351 * was called in irq context but we have irq tracing off. Since this
310 * can only be modified by current, we can reuse trace_recursion. 352 * can only be modified by current, we can reuse trace_recursion.
311 */ 353 */
312#define TRACE_IRQ_BIT (1<<13) 354 TRACE_IRQ_BIT,
355};
356
357#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
358#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
359#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
360
361#define TRACE_CONTEXT_BITS 4
362
363#define TRACE_FTRACE_START TRACE_FTRACE_BIT
364#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
365
366#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
367#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
368
369#define TRACE_LIST_START TRACE_INTERNAL_BIT
370#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
371
372#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
373
374static __always_inline int trace_get_context_bit(void)
375{
376 int bit;
313 377
314#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) 378 if (in_interrupt()) {
315#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) 379 if (in_nmi())
316#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) 380 bit = 0;
381
382 else if (in_irq())
383 bit = 1;
384 else
385 bit = 2;
386 } else
387 bit = 3;
388
389 return bit;
390}
391
392static __always_inline int trace_test_and_set_recursion(int start, int max)
393{
394 unsigned int val = current->trace_recursion;
395 int bit;
396
397 /* A previous recursion check was made */
398 if ((val & TRACE_CONTEXT_MASK) > max)
399 return 0;
400
401 bit = trace_get_context_bit() + start;
402 if (unlikely(val & (1 << bit)))
403 return -1;
404
405 val |= 1 << bit;
406 current->trace_recursion = val;
407 barrier();
408
409 return bit;
410}
411
412static __always_inline void trace_clear_recursion(int bit)
413{
414 unsigned int val = current->trace_recursion;
415
416 if (!bit)
417 return;
418
419 bit = 1 << bit;
420 val &= ~bit;
421
422 barrier();
423 current->trace_recursion = val;
424}
317 425
318#define TRACE_PIPE_ALL_CPU -1 426#define TRACE_PIPE_ALL_CPU -1
319 427
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cbb..aa8f5f48dae6 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -21,8 +21,6 @@
21#include <linux/ktime.h> 21#include <linux/ktime.h>
22#include <linux/trace_clock.h> 22#include <linux/trace_clock.h>
23 23
24#include "trace.h"
25
26/* 24/*
27 * trace_clock_local(): the simplest and least coherent tracing clock. 25 * trace_clock_local(): the simplest and least coherent tracing clock.
28 * 26 *
@@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void)
44 42
45 return clock; 43 return clock;
46} 44}
45EXPORT_SYMBOL_GPL(trace_clock_local);
47 46
48/* 47/*
49 * trace_clock(): 'between' trace clock. Not completely serialized, 48 * trace_clock(): 'between' trace clock. Not completely serialized,
@@ -86,7 +85,7 @@ u64 notrace trace_clock_global(void)
86 local_irq_save(flags); 85 local_irq_save(flags);
87 86
88 this_cpu = raw_smp_processor_id(); 87 this_cpu = raw_smp_processor_id();
89 now = cpu_clock(this_cpu); 88 now = sched_clock_cpu(this_cpu);
90 /* 89 /*
91 * If in an NMI context then dont risk lockups and return the 90 * If in an NMI context then dont risk lockups and return the
92 * cpu_clock() time: 91 * cpu_clock() time:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 880073d0b946..57e9b284250c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, padding);
120 119
121 return ret; 120 return ret;
122} 121}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8e3ad8082ab7..601152523326 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr)
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(tr);
48} 48}
49 49
50static void
51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
52 struct ftrace_ops *op, struct pt_regs *pt_regs)
53{
54 struct trace_array *tr = func_trace;
55 struct trace_array_cpu *data;
56 unsigned long flags;
57 long disabled;
58 int cpu;
59 int pc;
60
61 if (unlikely(!ftrace_function_enabled))
62 return;
63
64 pc = preempt_count();
65 preempt_disable_notrace();
66 local_save_flags(flags);
67 cpu = raw_smp_processor_id();
68 data = tr->data[cpu];
69 disabled = atomic_inc_return(&data->disabled);
70
71 if (likely(disabled == 1))
72 trace_function(tr, ip, parent_ip, flags, pc);
73
74 atomic_dec(&data->disabled);
75 preempt_enable_notrace();
76}
77
78/* Our option */ 50/* Our option */
79enum { 51enum {
80 TRACE_FUNC_OPT_STACK = 0x1, 52 TRACE_FUNC_OPT_STACK = 0x1,
@@ -85,34 +57,34 @@ static struct tracer_flags func_flags;
85static void 57static void
86function_trace_call(unsigned long ip, unsigned long parent_ip, 58function_trace_call(unsigned long ip, unsigned long parent_ip,
87 struct ftrace_ops *op, struct pt_regs *pt_regs) 59 struct ftrace_ops *op, struct pt_regs *pt_regs)
88
89{ 60{
90 struct trace_array *tr = func_trace; 61 struct trace_array *tr = func_trace;
91 struct trace_array_cpu *data; 62 struct trace_array_cpu *data;
92 unsigned long flags; 63 unsigned long flags;
93 long disabled; 64 int bit;
94 int cpu; 65 int cpu;
95 int pc; 66 int pc;
96 67
97 if (unlikely(!ftrace_function_enabled)) 68 if (unlikely(!ftrace_function_enabled))
98 return; 69 return;
99 70
100 /* 71 pc = preempt_count();
101 * Need to use raw, since this must be called before the 72 preempt_disable_notrace();
102 * recursive protection is performed.
103 */
104 local_irq_save(flags);
105 cpu = raw_smp_processor_id();
106 data = tr->data[cpu];
107 disabled = atomic_inc_return(&data->disabled);
108 73
109 if (likely(disabled == 1)) { 74 bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
110 pc = preempt_count(); 75 if (bit < 0)
76 goto out;
77
78 cpu = smp_processor_id();
79 data = tr->data[cpu];
80 if (!atomic_read(&data->disabled)) {
81 local_save_flags(flags);
111 trace_function(tr, ip, parent_ip, flags, pc); 82 trace_function(tr, ip, parent_ip, flags, pc);
112 } 83 }
84 trace_clear_recursion(bit);
113 85
114 atomic_dec(&data->disabled); 86 out:
115 local_irq_restore(flags); 87 preempt_enable_notrace();
116} 88}
117 89
118static void 90static void
@@ -185,11 +157,6 @@ static void tracing_start_function_trace(void)
185{ 157{
186 ftrace_function_enabled = 0; 158 ftrace_function_enabled = 0;
187 159
188 if (trace_flags & TRACE_ITER_PREEMPTONLY)
189 trace_ops.func = function_trace_call_preempt_only;
190 else
191 trace_ops.func = function_trace_call;
192
193 if (func_flags.val & TRACE_FUNC_OPT_STACK) 160 if (func_flags.val & TRACE_FUNC_OPT_STACK)
194 register_ftrace_function(&trace_stack_ops); 161 register_ftrace_function(&trace_stack_ops);
195 else 162 else
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4edb4b74eb7e..39ada66389cc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -47,6 +47,8 @@ struct fgraph_data {
47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40 48#define TRACE_GRAPH_PRINT_IRQS 0x40
49 49
50static unsigned int max_depth;
51
50static struct tracer_opt trace_opts[] = { 52static struct tracer_opt trace_opts[] = {
51 /* Display overruns? (for self-debug purpose) */ 53 /* Display overruns? (for self-debug purpose) */
52 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, 54 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
@@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
189 191
190 ftrace_pop_return_trace(&trace, &ret, frame_pointer); 192 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
191 trace.rettime = trace_clock_local(); 193 trace.rettime = trace_clock_local();
192 ftrace_graph_return(&trace);
193 barrier(); 194 barrier();
194 current->curr_ret_stack--; 195 current->curr_ret_stack--;
195 196
197 /*
198 * The trace should run after decrementing the ret counter
199 * in case an interrupt were to come in. We don't want to
200 * lose the interrupt if max_depth is set.
201 */
202 ftrace_graph_return(&trace);
203
196 if (unlikely(!ret)) { 204 if (unlikely(!ret)) {
197 ftrace_graph_stop(); 205 ftrace_graph_stop();
198 WARN_ON(1); 206 WARN_ON(1);
@@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
250 return 0; 258 return 0;
251 259
252 /* trace it when it is-nested-in or is a function enabled. */ 260 /* trace it when it is-nested-in or is a function enabled. */
253 if (!(trace->depth || ftrace_graph_addr(trace->func)) || 261 if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
254 ftrace_graph_ignore_irqs()) 262 ftrace_graph_ignore_irqs()) ||
263 (max_depth && trace->depth >= max_depth))
255 return 0; 264 return 0;
256 265
257 local_irq_save(flags); 266 local_irq_save(flags);
@@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = {
1457#endif 1466#endif
1458}; 1467};
1459 1468
1469
1470static ssize_t
1471graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
1472 loff_t *ppos)
1473{
1474 unsigned long val;
1475 int ret;
1476
1477 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
1478 if (ret)
1479 return ret;
1480
1481 max_depth = val;
1482
1483 *ppos += cnt;
1484
1485 return cnt;
1486}
1487
1488static ssize_t
1489graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
1490 loff_t *ppos)
1491{
1492 char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
1493 int n;
1494
1495 n = sprintf(buf, "%d\n", max_depth);
1496
1497 return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
1498}
1499
1500static const struct file_operations graph_depth_fops = {
1501 .open = tracing_open_generic,
1502 .write = graph_depth_write,
1503 .read = graph_depth_read,
1504 .llseek = generic_file_llseek,
1505};
1506
1507static __init int init_graph_debugfs(void)
1508{
1509 struct dentry *d_tracer;
1510
1511 d_tracer = tracing_init_dentry();
1512 if (!d_tracer)
1513 return 0;
1514
1515 trace_create_file("max_graph_depth", 0644, d_tracer,
1516 NULL, &graph_depth_fops);
1517
1518 return 0;
1519}
1520fs_initcall(init_graph_debugfs);
1521
1460static __init int init_graph_trace(void) 1522static __init int init_graph_trace(void)
1461{ 1523{
1462 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1524 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 933708677814..5c7e09d10d74 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -66,7 +66,6 @@
66#define TP_FLAG_TRACE 1 66#define TP_FLAG_TRACE 1
67#define TP_FLAG_PROFILE 2 67#define TP_FLAG_PROFILE 2
68#define TP_FLAG_REGISTERED 4 68#define TP_FLAG_REGISTERED 4
69#define TP_FLAG_UPROBE 8
70 69
71 70
72/* data_rloc: data relative location, compatible with u32 */ 71/* data_rloc: data relative location, compatible with u32 */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9fe45fcefca0..75aa97fbe1a1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,8 +15,8 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/sched/rt.h>
18#include <trace/events/sched.h> 19#include <trace/events/sched.h>
19
20#include "trace.h" 20#include "trace.h"
21 21
22static struct trace_array *wakeup_trace; 22static struct trace_array *wakeup_trace;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 47623169a815..51c819c12c29 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip,
415 * The ftrace infrastructure should provide the recursion 415 * The ftrace infrastructure should provide the recursion
416 * protection. If not, this will crash the kernel! 416 * protection. If not, this will crash the kernel!
417 */ 417 */
418 trace_selftest_recursion_cnt++; 418 if (trace_selftest_recursion_cnt++ > 10)
419 return;
419 DYN_FTRACE_TEST_NAME(); 420 DYN_FTRACE_TEST_NAME();
420} 421}
421 422
@@ -452,7 +453,6 @@ trace_selftest_function_recursion(void)
452 char *func_name; 453 char *func_name;
453 int len; 454 int len;
454 int ret; 455 int ret;
455 int cnt;
456 456
457 /* The previous test PASSED */ 457 /* The previous test PASSED */
458 pr_cont("PASSED\n"); 458 pr_cont("PASSED\n");
@@ -510,19 +510,10 @@ trace_selftest_function_recursion(void)
510 510
511 unregister_ftrace_function(&test_recsafe_probe); 511 unregister_ftrace_function(&test_recsafe_probe);
512 512
513 /*
514 * If arch supports all ftrace features, and no other task
515 * was on the list, we should be fine.
516 */
517 if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
518 cnt = 2; /* Should have recursed */
519 else
520 cnt = 1;
521
522 ret = -1; 513 ret = -1;
523 if (trace_selftest_recursion_cnt != cnt) { 514 if (trace_selftest_recursion_cnt != 2) {
524 pr_cont("*callback not called expected %d times (%d)* ", 515 pr_cont("*callback not called expected 2 times (%d)* ",
525 cnt, trace_selftest_recursion_cnt); 516 trace_selftest_recursion_cnt);
526 goto out; 517 goto out;
527 } 518 }
528 519
@@ -568,7 +559,7 @@ trace_selftest_function_regs(void)
568 int ret; 559 int ret;
569 int supported = 0; 560 int supported = 0;
570 561
571#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS 562#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
572 supported = 1; 563 supported = 1;
573#endif 564#endif
574 565
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7609dd6714c2..5329e13e74a1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -77,7 +77,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
77 return syscalls_metadata[nr]; 77 return syscalls_metadata[nr];
78} 78}
79 79
80enum print_line_t 80static enum print_line_t
81print_syscall_enter(struct trace_iterator *iter, int flags, 81print_syscall_enter(struct trace_iterator *iter, int flags,
82 struct trace_event *event) 82 struct trace_event *event)
83{ 83{
@@ -130,7 +130,7 @@ end:
130 return TRACE_TYPE_HANDLED; 130 return TRACE_TYPE_HANDLED;
131} 131}
132 132
133enum print_line_t 133static enum print_line_t
134print_syscall_exit(struct trace_iterator *iter, int flags, 134print_syscall_exit(struct trace_iterator *iter, int flags,
135 struct trace_event *event) 135 struct trace_event *event)
136{ 136{
@@ -270,7 +270,7 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
270 return ret; 270 return ret;
271} 271}
272 272
273void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 273static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
274{ 274{
275 struct syscall_trace_enter *entry; 275 struct syscall_trace_enter *entry;
276 struct syscall_metadata *sys_data; 276 struct syscall_metadata *sys_data;
@@ -305,7 +305,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
305 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 305 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
306} 306}
307 307
308void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 308static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
309{ 309{
310 struct syscall_trace_exit *entry; 310 struct syscall_trace_exit *entry;
311 struct syscall_metadata *sys_data; 311 struct syscall_metadata *sys_data;
@@ -337,7 +337,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
337 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 337 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
338} 338}
339 339
340int reg_event_syscall_enter(struct ftrace_event_call *call) 340static int reg_event_syscall_enter(struct ftrace_event_call *call)
341{ 341{
342 int ret = 0; 342 int ret = 0;
343 int num; 343 int num;
@@ -356,7 +356,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
356 return ret; 356 return ret;
357} 357}
358 358
359void unreg_event_syscall_enter(struct ftrace_event_call *call) 359static void unreg_event_syscall_enter(struct ftrace_event_call *call)
360{ 360{
361 int num; 361 int num;
362 362
@@ -371,7 +371,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
371 mutex_unlock(&syscall_trace_lock); 371 mutex_unlock(&syscall_trace_lock);
372} 372}
373 373
374int reg_event_syscall_exit(struct ftrace_event_call *call) 374static int reg_event_syscall_exit(struct ftrace_event_call *call)
375{ 375{
376 int ret = 0; 376 int ret = 0;
377 int num; 377 int num;
@@ -390,7 +390,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
390 return ret; 390 return ret;
391} 391}
392 392
393void unreg_event_syscall_exit(struct ftrace_event_call *call) 393static void unreg_event_syscall_exit(struct ftrace_event_call *call)
394{ 394{
395 int num; 395 int num;
396 396
@@ -459,7 +459,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
459 return (unsigned long)sys_call_table[nr]; 459 return (unsigned long)sys_call_table[nr];
460} 460}
461 461
462int __init init_ftrace_syscalls(void) 462static int __init init_ftrace_syscalls(void)
463{ 463{
464 struct syscall_metadata *meta; 464 struct syscall_metadata *meta;
465 unsigned long addr; 465 unsigned long addr;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c86e6d4f67fb..8dad2a92dee9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,20 +28,21 @@
28 28
29#define UPROBE_EVENT_SYSTEM "uprobes" 29#define UPROBE_EVENT_SYSTEM "uprobes"
30 30
/*
 * Per-trace_uprobe record of which perf events are attached to the probe.
 * uprobe_perf_open()/uprobe_perf_close() update it under the write lock;
 * uprobe_perf_filter() consults it under the read lock.
 */
31struct trace_uprobe_filter {
32 rwlock_t rwlock; /* guards nr_systemwide and perf_events */
33 int nr_systemwide; /* count of attached events that have no hw.tp_target */
34 struct list_head perf_events; /* events with a hw.tp_target, linked via hw.tp_list */
35};
36
31/* 37/*
32 * uprobe event core functions 38 * uprobe event core functions
33 */ 39 */
34struct trace_uprobe;
35struct uprobe_trace_consumer {
36 struct uprobe_consumer cons;
37 struct trace_uprobe *tu;
38};
39
40struct trace_uprobe { 40struct trace_uprobe {
41 struct list_head list; 41 struct list_head list;
42 struct ftrace_event_class class; 42 struct ftrace_event_class class;
43 struct ftrace_event_call call; 43 struct ftrace_event_call call;
44 struct uprobe_trace_consumer *consumer; 44 struct trace_uprobe_filter filter;
45 struct uprobe_consumer consumer;
45 struct inode *inode; 46 struct inode *inode;
46 char *filename; 47 char *filename;
47 unsigned long offset; 48 unsigned long offset;
@@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list);
64 65
65static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 66static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
66 67
68static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
69{
70 rwlock_init(&filter->rwlock);
71 filter->nr_systemwide = 0;
72 INIT_LIST_HEAD(&filter->perf_events);
73}
74
75static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
76{
77 return !filter->nr_systemwide && list_empty(&filter->perf_events);
78}
79
67/* 80/*
68 * Allocate new trace_uprobe and initialize it (including uprobes). 81 * Allocate new trace_uprobe and initialize it (including uprobes).
69 */ 82 */
@@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
92 goto error; 105 goto error;
93 106
94 INIT_LIST_HEAD(&tu->list); 107 INIT_LIST_HEAD(&tu->list);
108 tu->consumer.handler = uprobe_dispatcher;
109 init_trace_uprobe_filter(&tu->filter);
95 return tu; 110 return tu;
96 111
97error: 112error:
@@ -253,12 +268,18 @@ static int create_trace_uprobe(int argc, char **argv)
253 if (ret) 268 if (ret)
254 goto fail_address_parse; 269 goto fail_address_parse;
255 270
271 inode = igrab(path.dentry->d_inode);
272 path_put(&path);
273
274 if (!inode || !S_ISREG(inode->i_mode)) {
275 ret = -EINVAL;
276 goto fail_address_parse;
277 }
278
256 ret = kstrtoul(arg, 0, &offset); 279 ret = kstrtoul(arg, 0, &offset);
257 if (ret) 280 if (ret)
258 goto fail_address_parse; 281 goto fail_address_parse;
259 282
260 inode = igrab(path.dentry->d_inode);
261
262 argc -= 2; 283 argc -= 2;
263 argv += 2; 284 argv += 2;
264 285
@@ -356,7 +377,7 @@ fail_address_parse:
356 if (inode) 377 if (inode)
357 iput(inode); 378 iput(inode);
358 379
359 pr_info("Failed to parse address.\n"); 380 pr_info("Failed to parse address or file.\n");
360 381
361 return ret; 382 return ret;
362} 383}
@@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
465}; 486};
466 487
467/* uprobe handler */ 488/* uprobe handler */
468static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 489static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
469{ 490{
470 struct uprobe_trace_entry_head *entry; 491 struct uprobe_trace_entry_head *entry;
471 struct ring_buffer_event *event; 492 struct ring_buffer_event *event;
@@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
475 unsigned long irq_flags; 496 unsigned long irq_flags;
476 struct ftrace_event_call *call = &tu->call; 497 struct ftrace_event_call *call = &tu->call;
477 498
478 tu->nhit++;
479
480 local_save_flags(irq_flags); 499 local_save_flags(irq_flags);
481 pc = preempt_count(); 500 pc = preempt_count();
482 501
@@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
485 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 504 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
486 size, irq_flags, pc); 505 size, irq_flags, pc);
487 if (!event) 506 if (!event)
488 return; 507 return 0;
489 508
490 entry = ring_buffer_event_data(event); 509 entry = ring_buffer_event_data(event);
491 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 510 entry->ip = instruction_pointer(task_pt_regs(current));
492 data = (u8 *)&entry[1]; 511 data = (u8 *)&entry[1];
493 for (i = 0; i < tu->nr_args; i++) 512 for (i = 0; i < tu->nr_args; i++)
494 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 513 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
495 514
496 if (!filter_current_check_discard(buffer, call, entry, event)) 515 if (!filter_current_check_discard(buffer, call, entry, event))
497 trace_buffer_unlock_commit(buffer, event, irq_flags, pc); 516 trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
517
518 return 0;
498} 519}
499 520
500/* Event entry printers */ 521/* Event entry printers */
@@ -533,42 +554,43 @@ partial:
533 return TRACE_TYPE_PARTIAL_LINE; 554 return TRACE_TYPE_PARTIAL_LINE;
534} 555}
535 556
536static int probe_event_enable(struct trace_uprobe *tu, int flag) 557static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
537{ 558{
538 struct uprobe_trace_consumer *utc; 559 return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
539 int ret = 0; 560}
540 561
541 if (!tu->inode || tu->consumer) 562typedef bool (*filter_func_t)(struct uprobe_consumer *self,
542 return -EINTR; 563 enum uprobe_filter_ctx ctx,
564 struct mm_struct *mm);
543 565
544 utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); 566static int
545 if (!utc) 567probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
568{
569 int ret = 0;
570
571 if (is_trace_uprobe_enabled(tu))
546 return -EINTR; 572 return -EINTR;
547 573
548 utc->cons.handler = uprobe_dispatcher; 574 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
549 utc->cons.filter = NULL;
550 ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
551 if (ret) {
552 kfree(utc);
553 return ret;
554 }
555 575
556 tu->flags |= flag; 576 tu->flags |= flag;
557 utc->tu = tu; 577 tu->consumer.filter = filter;
558 tu->consumer = utc; 578 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
579 if (ret)
580 tu->flags &= ~flag;
559 581
560 return 0; 582 return ret;
561} 583}
562 584
563static void probe_event_disable(struct trace_uprobe *tu, int flag) 585static void probe_event_disable(struct trace_uprobe *tu, int flag)
564{ 586{
565 if (!tu->inode || !tu->consumer) 587 if (!is_trace_uprobe_enabled(tu))
566 return; 588 return;
567 589
568 uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); 590 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
591
592 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
569 tu->flags &= ~flag; 593 tu->flags &= ~flag;
570 kfree(tu->consumer);
571 tu->consumer = NULL;
572} 594}
573 595
574static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 596static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
642} 664}
643 665
644#ifdef CONFIG_PERF_EVENTS 666#ifdef CONFIG_PERF_EVENTS
667static bool
668__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
669{
670 struct perf_event *event;
671
672 if (filter->nr_systemwide)
673 return true;
674
675 list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
676 if (event->hw.tp_target->mm == mm)
677 return true;
678 }
679
680 return false;
681}
682
683static inline bool
684uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
685{
686 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
687}
688
689static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
690{
691 bool done;
692
693 write_lock(&tu->filter.rwlock);
694 if (event->hw.tp_target) {
695 /*
696 * event->parent != NULL means copy_process(), we can avoid
697 * uprobe_apply(). current->mm must be probed and we can rely
698 * on dup_mmap() which preserves the already installed bp's.
699 *
700 * attr.enable_on_exec means that exec/mmap will install the
701 * breakpoints we need.
702 */
703 done = tu->filter.nr_systemwide ||
704 event->parent || event->attr.enable_on_exec ||
705 uprobe_filter_event(tu, event);
706 list_add(&event->hw.tp_list, &tu->filter.perf_events);
707 } else {
708 done = tu->filter.nr_systemwide;
709 tu->filter.nr_systemwide++;
710 }
711 write_unlock(&tu->filter.rwlock);
712
713 if (!done)
714 uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
715
716 return 0;
717}
718
719static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
720{
721 bool done;
722
723 write_lock(&tu->filter.rwlock);
724 if (event->hw.tp_target) {
725 list_del(&event->hw.tp_list);
726 done = tu->filter.nr_systemwide ||
727 (event->hw.tp_target->flags & PF_EXITING) ||
728 uprobe_filter_event(tu, event);
729 } else {
730 tu->filter.nr_systemwide--;
731 done = tu->filter.nr_systemwide;
732 }
733 write_unlock(&tu->filter.rwlock);
734
735 if (!done)
736 uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
737
738 return 0;
739}
740
741static bool uprobe_perf_filter(struct uprobe_consumer *uc,
742 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
743{
744 struct trace_uprobe *tu;
745 int ret;
746
747 tu = container_of(uc, struct trace_uprobe, consumer);
748 read_lock(&tu->filter.rwlock);
749 ret = __uprobe_perf_filter(&tu->filter, mm);
750 read_unlock(&tu->filter.rwlock);
751
752 return ret;
753}
754
645/* uprobe profile handler */ 755/* uprobe profile handler */
646static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 756static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
647{ 757{
648 struct ftrace_event_call *call = &tu->call; 758 struct ftrace_event_call *call = &tu->call;
649 struct uprobe_trace_entry_head *entry; 759 struct uprobe_trace_entry_head *entry;
@@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
652 int size, __size, i; 762 int size, __size, i;
653 int rctx; 763 int rctx;
654 764
765 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
766 return UPROBE_HANDLER_REMOVE;
767
655 __size = sizeof(*entry) + tu->size; 768 __size = sizeof(*entry) + tu->size;
656 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 769 size = ALIGN(__size + sizeof(u32), sizeof(u64));
657 size -= sizeof(u32); 770 size -= sizeof(u32);
658 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 771 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
659 return; 772 return 0;
660 773
661 preempt_disable(); 774 preempt_disable();
662 775
@@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
664 if (!entry) 777 if (!entry)
665 goto out; 778 goto out;
666 779
667 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 780 entry->ip = instruction_pointer(task_pt_regs(current));
668 data = (u8 *)&entry[1]; 781 data = (u8 *)&entry[1];
669 for (i = 0; i < tu->nr_args; i++) 782 for (i = 0; i < tu->nr_args; i++)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 783 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
674 787
675 out: 788 out:
676 preempt_enable(); 789 preempt_enable();
790 return 0;
677} 791}
678#endif /* CONFIG_PERF_EVENTS */ 792#endif /* CONFIG_PERF_EVENTS */
679 793
@@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
684 798
685 switch (type) { 799 switch (type) {
686 case TRACE_REG_REGISTER: 800 case TRACE_REG_REGISTER:
687 return probe_event_enable(tu, TP_FLAG_TRACE); 801 return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
688 802
689 case TRACE_REG_UNREGISTER: 803 case TRACE_REG_UNREGISTER:
690 probe_event_disable(tu, TP_FLAG_TRACE); 804 probe_event_disable(tu, TP_FLAG_TRACE);
@@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
692 806
693#ifdef CONFIG_PERF_EVENTS 807#ifdef CONFIG_PERF_EVENTS
694 case TRACE_REG_PERF_REGISTER: 808 case TRACE_REG_PERF_REGISTER:
695 return probe_event_enable(tu, TP_FLAG_PROFILE); 809 return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
696 810
697 case TRACE_REG_PERF_UNREGISTER: 811 case TRACE_REG_PERF_UNREGISTER:
698 probe_event_disable(tu, TP_FLAG_PROFILE); 812 probe_event_disable(tu, TP_FLAG_PROFILE);
699 return 0; 813 return 0;
814
815 case TRACE_REG_PERF_OPEN:
816 return uprobe_perf_open(tu, data);
817
818 case TRACE_REG_PERF_CLOSE:
819 return uprobe_perf_close(tu, data);
820
700#endif 821#endif
701 default: 822 default:
702 return 0; 823 return 0;
@@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
706 827
707static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) 828static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
708{ 829{
709 struct uprobe_trace_consumer *utc;
710 struct trace_uprobe *tu; 830 struct trace_uprobe *tu;
831 int ret = 0;
711 832
712 utc = container_of(con, struct uprobe_trace_consumer, cons); 833 tu = container_of(con, struct trace_uprobe, consumer);
713 tu = utc->tu; 834 tu->nhit++;
714 if (!tu || tu->consumer != utc)
715 return 0;
716 835
717 if (tu->flags & TP_FLAG_TRACE) 836 if (tu->flags & TP_FLAG_TRACE)
718 uprobe_trace_func(tu, regs); 837 ret |= uprobe_trace_func(tu, regs);
719 838
720#ifdef CONFIG_PERF_EVENTS 839#ifdef CONFIG_PERF_EVENTS
721 if (tu->flags & TP_FLAG_PROFILE) 840 if (tu->flags & TP_FLAG_PROFILE)
722 uprobe_perf_func(tu, regs); 841 ret |= uprobe_perf_func(tu, regs);
723#endif 842#endif
724 return 0; 843 return ret;
725} 844}
726 845
727static struct trace_event_functions uprobe_funcs = { 846static struct trace_event_functions uprobe_funcs = {
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 625df0b44690..a1dd9a1b1327 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
32{ 32{
33 const struct cred *tcred; 33 const struct cred *tcred;
34 struct timespec uptime, ts; 34 struct timespec uptime, ts;
35 cputime_t utime, stime, utimescaled, stimescaled;
35 u64 ac_etime; 36 u64 ac_etime;
36 37
37 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); 38 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns,
65 stats->ac_ppid = pid_alive(tsk) ? 66 stats->ac_ppid = pid_alive(tsk) ?
66 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; 67 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
67 rcu_read_unlock(); 68 rcu_read_unlock();
68 stats->ac_utime = cputime_to_usecs(tsk->utime); 69
69 stats->ac_stime = cputime_to_usecs(tsk->stime); 70 task_cputime(tsk, &utime, &stime);
70 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); 71 stats->ac_utime = cputime_to_usecs(utime);
71 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); 72 stats->ac_stime = cputime_to_usecs(stime);
73
74 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
75 stats->ac_utimescaled = cputime_to_usecs(utimescaled);
76 stats->ac_stimescaled = cputime_to_usecs(stimescaled);
77
72 stats->ac_minflt = tsk->min_flt; 78 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 79 stats->ac_majflt = tsk->maj_flt;
74 80
@@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
115#undef KB 121#undef KB
116#undef MB 122#undef MB
117 123
118/** 124static void __acct_update_integrals(struct task_struct *tsk,
119 * acct_update_integrals - update mm integral fields in task_struct 125 cputime_t utime, cputime_t stime)
120 * @tsk: task_struct for accounting
121 */
122void acct_update_integrals(struct task_struct *tsk)
123{ 126{
124 if (likely(tsk->mm)) { 127 if (likely(tsk->mm)) {
125 cputime_t time, dtime; 128 cputime_t time, dtime;
@@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk)
128 u64 delta; 131 u64 delta;
129 132
130 local_irq_save(flags); 133 local_irq_save(flags);
131 time = tsk->stime + tsk->utime; 134 time = stime + utime;
132 dtime = time - tsk->acct_timexpd; 135 dtime = time - tsk->acct_timexpd;
133 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 136 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
134 delta = value.tv_sec; 137 delta = value.tv_sec;
@@ -145,6 +148,27 @@ void acct_update_integrals(struct task_struct *tsk)
145} 148}
146 149
147/** 150/**
151 * acct_update_integrals - update mm integral fields in task_struct
152 * @tsk: task_struct for accounting
153 */
154void acct_update_integrals(struct task_struct *tsk)
155{
156 cputime_t utime, stime;
157
158 task_cputime(tsk, &utime, &stime);
159 __acct_update_integrals(tsk, utime, stime);
160}
161
162/**
163 * acct_account_cputime - update mm integral after cputime update
164 * @tsk: task_struct for accounting
165 */
166void acct_account_cputime(struct task_struct *tsk)
167{
168 __acct_update_integrals(tsk, tsk->utime, tsk->stime);
169}
170
171/**
148 * acct_clear_integrals - clear the mm integral fields in task_struct 172 * acct_clear_integrals - clear the mm integral fields in task_struct
149 * @tsk: task_struct whose accounting fields are cleared 173 * @tsk: task_struct whose accounting fields are cleared
150 */ 174 */
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 75a2ab3d0b02..27689422aa92 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,7 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/sysctl.h> 24#include <linux/sysctl.h>
25#include <linux/smpboot.h> 25#include <linux/smpboot.h>
26#include <linux/sched/rt.h>
26 27
27#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
28#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fbc6576a83c3..f4feacad3812 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,32 +41,31 @@
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/hashtable.h>
44 45
45#include "workqueue_sched.h" 46#include "workqueue_internal.h"
46 47
47enum { 48enum {
48 /* 49 /*
49 * global_cwq flags 50 * worker_pool flags
50 * 51 *
51 * A bound gcwq is either associated or disassociated with its CPU. 52 * A bound pool is either associated or disassociated with its CPU.
52 * While associated (!DISASSOCIATED), all workers are bound to the 53 * While associated (!DISASSOCIATED), all workers are bound to the
53 * CPU and none has %WORKER_UNBOUND set and concurrency management 54 * CPU and none has %WORKER_UNBOUND set and concurrency management
54 * is in effect. 55 * is in effect.
55 * 56 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have 57 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may 58 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one. 59 * be executing on any CPU. The pool behaves as an unbound one.
59 * 60 *
60 * Note that DISASSOCIATED can be flipped only while holding 61 * Note that DISASSOCIATED can be flipped only while holding
61 * assoc_mutex of all pools on the gcwq to avoid changing binding 62 * assoc_mutex to avoid changing binding state while
62 * state while create_worker() is in progress. 63 * create_worker() is in progress.
63 */ 64 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 65 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
69 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ 66 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
67 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
68 POOL_FREEZING = 1 << 3, /* freeze in progress */
70 69
71 /* worker flags */ 70 /* worker flags */
72 WORKER_STARTED = 1 << 0, /* started */ 71 WORKER_STARTED = 1 << 0, /* started */
@@ -79,11 +78,9 @@ enum {
79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | 78 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
80 WORKER_CPU_INTENSIVE, 79 WORKER_CPU_INTENSIVE,
81 80
82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ 81 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
83 82
84 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 83 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
85 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
86 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
87 84
88 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 85 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
89 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ 86 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
@@ -111,48 +108,24 @@ enum {
111 * P: Preemption protected. Disabling preemption is enough and should 108 * P: Preemption protected. Disabling preemption is enough and should
112 * only be modified and accessed from the local cpu. 109 * only be modified and accessed from the local cpu.
113 * 110 *
114 * L: gcwq->lock protected. Access with gcwq->lock held. 111 * L: pool->lock protected. Access with pool->lock held.
115 * 112 *
116 * X: During normal operation, modification requires gcwq->lock and 113 * X: During normal operation, modification requires pool->lock and should
117 * should be done only from local cpu. Either disabling preemption 114 * be done only from local cpu. Either disabling preemption on local
118 * on local cpu or grabbing gcwq->lock is enough for read access. 115 * cpu or grabbing pool->lock is enough for read access. If
119 * If GCWQ_DISASSOCIATED is set, it's identical to L. 116 * POOL_DISASSOCIATED is set, it's identical to L.
120 * 117 *
121 * F: wq->flush_mutex protected. 118 * F: wq->flush_mutex protected.
122 * 119 *
123 * W: workqueue_lock protected. 120 * W: workqueue_lock protected.
124 */ 121 */
125 122
126struct global_cwq; 123/* struct worker is defined in workqueue_internal.h */
127struct worker_pool;
128
129/*
130 * The poor guys doing the actual heavy lifting. All on-duty workers
131 * are either serving the manager role, on idle list or on busy hash.
132 */
133struct worker {
134 /* on idle list while idle, on busy hash table while busy */
135 union {
136 struct list_head entry; /* L: while idle */
137 struct hlist_node hentry; /* L: while busy */
138 };
139
140 struct work_struct *current_work; /* L: work being processed */
141 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
142 struct list_head scheduled; /* L: scheduled works */
143 struct task_struct *task; /* I: worker task */
144 struct worker_pool *pool; /* I: the associated pool */
145 /* 64 bytes boundary on 64bit, 32 on 32bit */
146 unsigned long last_active; /* L: last active timestamp */
147 unsigned int flags; /* X: flags */
148 int id; /* I: worker id */
149
150 /* for rebinding worker to CPU */
151 struct work_struct rebind_work; /* L: for busy worker */
152};
153 124
154struct worker_pool { 125struct worker_pool {
155 struct global_cwq *gcwq; /* I: the owning gcwq */ 126 spinlock_t lock; /* the pool lock */
127 unsigned int cpu; /* I: the associated cpu */
128 int id; /* I: pool ID */
156 unsigned int flags; /* X: flags */ 129 unsigned int flags; /* X: flags */
157 130
158 struct list_head worklist; /* L: list of pending works */ 131 struct list_head worklist; /* L: list of pending works */
@@ -165,34 +138,28 @@ struct worker_pool {
165 struct timer_list idle_timer; /* L: worker idle timeout */ 138 struct timer_list idle_timer; /* L: worker idle timeout */
166 struct timer_list mayday_timer; /* L: SOS timer for workers */ 139 struct timer_list mayday_timer; /* L: SOS timer for workers */
167 140
168 struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ 141 /* workers are chained either in busy_hash or idle_list */
169 struct ida worker_ida; /* L: for worker IDs */ 142 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
170};
171
172/*
173 * Global per-cpu workqueue. There's one and only one for each cpu
174 * and all works are queued and processed here regardless of their
175 * target workqueues.
176 */
177struct global_cwq {
178 spinlock_t lock; /* the gcwq lock */
179 unsigned int cpu; /* I: the associated cpu */
180 unsigned int flags; /* L: GCWQ_* flags */
181
182 /* workers are chained either in busy_hash or pool idle_list */
183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
184 /* L: hash of busy workers */ 143 /* L: hash of busy workers */
185 144
186 struct worker_pool pools[NR_WORKER_POOLS]; 145 struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */
187 /* normal and highpri pools */ 146 struct ida worker_ida; /* L: for worker IDs */
147
148 /*
149 * The current concurrency level. As it's likely to be accessed
150 * from other CPUs during try_to_wake_up(), put it in a separate
151 * cacheline.
152 */
153 atomic_t nr_running ____cacheline_aligned_in_smp;
188} ____cacheline_aligned_in_smp; 154} ____cacheline_aligned_in_smp;
189 155
190/* 156/*
191 * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of 157 * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
192 * work_struct->data are used for flags and thus cwqs need to be 158 * of work_struct->data are used for flags and the remaining high bits
193 * aligned at two's power of the number of flag bits. 159 * point to the pwq; thus, pwqs need to be aligned at two's power of the
160 * number of flag bits.
194 */ 161 */
195struct cpu_workqueue_struct { 162struct pool_workqueue {
196 struct worker_pool *pool; /* I: the associated pool */ 163 struct worker_pool *pool; /* I: the associated pool */
197 struct workqueue_struct *wq; /* I: the owning workqueue */ 164 struct workqueue_struct *wq; /* I: the owning workqueue */
198 int work_color; /* L: current color */ 165 int work_color; /* L: current color */
@@ -241,16 +208,16 @@ typedef unsigned long mayday_mask_t;
241struct workqueue_struct { 208struct workqueue_struct {
242 unsigned int flags; /* W: WQ_* flags */ 209 unsigned int flags; /* W: WQ_* flags */
243 union { 210 union {
244 struct cpu_workqueue_struct __percpu *pcpu; 211 struct pool_workqueue __percpu *pcpu;
245 struct cpu_workqueue_struct *single; 212 struct pool_workqueue *single;
246 unsigned long v; 213 unsigned long v;
247 } cpu_wq; /* I: cwq's */ 214 } pool_wq; /* I: pwq's */
248 struct list_head list; /* W: list of all workqueues */ 215 struct list_head list; /* W: list of all workqueues */
249 216
250 struct mutex flush_mutex; /* protects wq flushing */ 217 struct mutex flush_mutex; /* protects wq flushing */
251 int work_color; /* F: current work color */ 218 int work_color; /* F: current work color */
252 int flush_color; /* F: current flush color */ 219 int flush_color; /* F: current flush color */
253 atomic_t nr_cwqs_to_flush; /* flush in progress */ 220 atomic_t nr_pwqs_to_flush; /* flush in progress */
254 struct wq_flusher *first_flusher; /* F: first flusher */ 221 struct wq_flusher *first_flusher; /* F: first flusher */
255 struct list_head flusher_queue; /* F: flush waiters */ 222 struct list_head flusher_queue; /* F: flush waiters */
256 struct list_head flusher_overflow; /* F: flush overflow list */ 223 struct list_head flusher_overflow; /* F: flush overflow list */
@@ -259,7 +226,7 @@ struct workqueue_struct {
259 struct worker *rescuer; /* I: rescue worker */ 226 struct worker *rescuer; /* I: rescue worker */
260 227
261 int nr_drainers; /* W: drain in progress */ 228 int nr_drainers; /* W: drain in progress */
262 int saved_max_active; /* W: saved cwq max_active */ 229 int saved_max_active; /* W: saved pwq max_active */
263#ifdef CONFIG_LOCKDEP 230#ifdef CONFIG_LOCKDEP
264 struct lockdep_map lockdep_map; 231 struct lockdep_map lockdep_map;
265#endif 232#endif
@@ -280,16 +247,15 @@ EXPORT_SYMBOL_GPL(system_freezable_wq);
280#define CREATE_TRACE_POINTS 247#define CREATE_TRACE_POINTS
281#include <trace/events/workqueue.h> 248#include <trace/events/workqueue.h>
282 249
283#define for_each_worker_pool(pool, gcwq) \ 250#define for_each_std_worker_pool(pool, cpu) \
284 for ((pool) = &(gcwq)->pools[0]; \ 251 for ((pool) = &std_worker_pools(cpu)[0]; \
285 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) 252 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
286 253
287#define for_each_busy_worker(worker, i, pos, gcwq) \ 254#define for_each_busy_worker(worker, i, pos, pool) \
288 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 255 hash_for_each(pool->busy_hash, i, pos, worker, hentry)
289 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
290 256
291static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, 257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
292 unsigned int sw) 258 unsigned int sw)
293{ 259{
294 if (cpu < nr_cpu_ids) { 260 if (cpu < nr_cpu_ids) {
295 if (sw & 1) { 261 if (sw & 1) {
@@ -300,42 +266,42 @@ static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
300 if (sw & 2) 266 if (sw & 2)
301 return WORK_CPU_UNBOUND; 267 return WORK_CPU_UNBOUND;
302 } 268 }
303 return WORK_CPU_NONE; 269 return WORK_CPU_END;
304} 270}
305 271
306static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, 272static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask,
307 struct workqueue_struct *wq) 273 struct workqueue_struct *wq)
308{ 274{
309 return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); 275 return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
310} 276}
311 277
312/* 278/*
313 * CPU iterators 279 * CPU iterators
314 * 280 *
315 * An extra gcwq is defined for an invalid cpu number 281 * An extra cpu number is defined using an invalid cpu number
316 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any 282 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
317 * specific CPU. The following iterators are similar to 283 * specific CPU. The following iterators are similar to for_each_*_cpu()
318 * for_each_*_cpu() iterators but also considers the unbound gcwq. 284 * iterators but also considers the unbound CPU.
319 * 285 *
320 * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND 286 * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND
321 * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND 287 * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND
322 * for_each_cwq_cpu() : possible CPUs for bound workqueues, 288 * for_each_pwq_cpu() : possible CPUs for bound workqueues,
323 * WORK_CPU_UNBOUND for unbound workqueues 289 * WORK_CPU_UNBOUND for unbound workqueues
324 */ 290 */
325#define for_each_gcwq_cpu(cpu) \ 291#define for_each_wq_cpu(cpu) \
326 for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ 292 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \
327 (cpu) < WORK_CPU_NONE; \ 293 (cpu) < WORK_CPU_END; \
328 (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) 294 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
329 295
330#define for_each_online_gcwq_cpu(cpu) \ 296#define for_each_online_wq_cpu(cpu) \
331 for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ 297 for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \
332 (cpu) < WORK_CPU_NONE; \ 298 (cpu) < WORK_CPU_END; \
333 (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) 299 (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
334 300
335#define for_each_cwq_cpu(cpu, wq) \ 301#define for_each_pwq_cpu(cpu, wq) \
336 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ 302 for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \
337 (cpu) < WORK_CPU_NONE; \ 303 (cpu) < WORK_CPU_END; \
338 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) 304 (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq)))
339 305
340#ifdef CONFIG_DEBUG_OBJECTS_WORK 306#ifdef CONFIG_DEBUG_OBJECTS_WORK
341 307
@@ -459,57 +425,69 @@ static LIST_HEAD(workqueues);
459static bool workqueue_freezing; /* W: have wqs started freezing? */ 425static bool workqueue_freezing; /* W: have wqs started freezing? */
460 426
461/* 427/*
462 * The almighty global cpu workqueues. nr_running is the only field 428 * The CPU and unbound standard worker pools. The unbound ones have
463 * which is expected to be used frequently by other cpus via 429 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
464 * try_to_wake_up(). Put it in a separate cacheline.
465 */ 430 */
466static DEFINE_PER_CPU(struct global_cwq, global_cwq); 431static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
467static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); 432 cpu_std_worker_pools);
433static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
468 434
469/* 435/* idr of all pools */
470 * Global cpu workqueue and nr_running counter for unbound gcwq. The 436static DEFINE_MUTEX(worker_pool_idr_mutex);
471 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its 437static DEFINE_IDR(worker_pool_idr);
472 * workers have WORKER_UNBOUND set.
473 */
474static struct global_cwq unbound_global_cwq;
475static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
476 [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
477};
478 438
479static int worker_thread(void *__worker); 439static int worker_thread(void *__worker);
480 440
481static int worker_pool_pri(struct worker_pool *pool) 441static struct worker_pool *std_worker_pools(int cpu)
482{ 442{
483 return pool - pool->gcwq->pools; 443 if (cpu != WORK_CPU_UNBOUND)
444 return per_cpu(cpu_std_worker_pools, cpu);
445 else
446 return unbound_std_worker_pools;
484} 447}
485 448
486static struct global_cwq *get_gcwq(unsigned int cpu) 449static int std_worker_pool_pri(struct worker_pool *pool)
487{ 450{
488 if (cpu != WORK_CPU_UNBOUND) 451 return pool - std_worker_pools(pool->cpu);
489 return &per_cpu(global_cwq, cpu);
490 else
491 return &unbound_global_cwq;
492} 452}
493 453
494static atomic_t *get_pool_nr_running(struct worker_pool *pool) 454/* allocate ID and assign it to @pool */
455static int worker_pool_assign_id(struct worker_pool *pool)
495{ 456{
496 int cpu = pool->gcwq->cpu; 457 int ret;
497 int idx = worker_pool_pri(pool);
498 458
499 if (cpu != WORK_CPU_UNBOUND) 459 mutex_lock(&worker_pool_idr_mutex);
500 return &per_cpu(pool_nr_running, cpu)[idx]; 460 idr_pre_get(&worker_pool_idr, GFP_KERNEL);
501 else 461 ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
502 return &unbound_pool_nr_running[idx]; 462 mutex_unlock(&worker_pool_idr_mutex);
463
464 return ret;
503} 465}
504 466
505static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, 467/*
506 struct workqueue_struct *wq) 468 * Lookup worker_pool by id. The idr currently is built during boot and
469 * never modified. Don't worry about locking for now.
470 */
471static struct worker_pool *worker_pool_by_id(int pool_id)
472{
473 return idr_find(&worker_pool_idr, pool_id);
474}
475
476static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
477{
478 struct worker_pool *pools = std_worker_pools(cpu);
479
480 return &pools[highpri];
481}
482
483static struct pool_workqueue *get_pwq(unsigned int cpu,
484 struct workqueue_struct *wq)
507{ 485{
508 if (!(wq->flags & WQ_UNBOUND)) { 486 if (!(wq->flags & WQ_UNBOUND)) {
509 if (likely(cpu < nr_cpu_ids)) 487 if (likely(cpu < nr_cpu_ids))
510 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); 488 return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
511 } else if (likely(cpu == WORK_CPU_UNBOUND)) 489 } else if (likely(cpu == WORK_CPU_UNBOUND))
512 return wq->cpu_wq.single; 490 return wq->pool_wq.single;
513 return NULL; 491 return NULL;
514} 492}
515 493
@@ -530,19 +508,19 @@ static int work_next_color(int color)
530} 508}
531 509
532/* 510/*
533 * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data 511 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
534 * contain the pointer to the queued cwq. Once execution starts, the flag 512 * contain the pointer to the queued pwq. Once execution starts, the flag
535 * is cleared and the high bits contain OFFQ flags and CPU number. 513 * is cleared and the high bits contain OFFQ flags and pool ID.
536 * 514 *
537 * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() 515 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
538 * and clear_work_data() can be used to set the cwq, cpu or clear 516 * and clear_work_data() can be used to set the pwq, pool or clear
539 * work->data. These functions should only be called while the work is 517 * work->data. These functions should only be called while the work is
540 * owned - ie. while the PENDING bit is set. 518 * owned - ie. while the PENDING bit is set.
541 * 519 *
542 * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to 520 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
543 * a work. gcwq is available once the work has been queued anywhere after 521 * corresponding to a work. Pool is available once the work has been
544 * initialization until it is sync canceled. cwq is available only while 522 * queued anywhere after initialization until it is sync canceled. pwq is
545 * the work item is queued. 523 * available only while the work item is queued.
546 * 524 *
547 * %WORK_OFFQ_CANCELING is used to mark a work item which is being 525 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
548 * canceled. While being canceled, a work item may have its PENDING set 526 * canceled. While being canceled, a work item may have its PENDING set
@@ -556,16 +534,22 @@ static inline void set_work_data(struct work_struct *work, unsigned long data,
556 atomic_long_set(&work->data, data | flags | work_static(work)); 534 atomic_long_set(&work->data, data | flags | work_static(work));
557} 535}
558 536
559static void set_work_cwq(struct work_struct *work, 537static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
560 struct cpu_workqueue_struct *cwq,
561 unsigned long extra_flags) 538 unsigned long extra_flags)
562{ 539{
563 set_work_data(work, (unsigned long)cwq, 540 set_work_data(work, (unsigned long)pwq,
564 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); 541 WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
565} 542}
566 543
567static void set_work_cpu_and_clear_pending(struct work_struct *work, 544static void set_work_pool_and_keep_pending(struct work_struct *work,
568 unsigned int cpu) 545 int pool_id)
546{
547 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
548 WORK_STRUCT_PENDING);
549}
550
551static void set_work_pool_and_clear_pending(struct work_struct *work,
552 int pool_id)
569{ 553{
570 /* 554 /*
571 * The following wmb is paired with the implied mb in 555 * The following wmb is paired with the implied mb in
@@ -574,67 +558,92 @@ static void set_work_cpu_and_clear_pending(struct work_struct *work,
574 * owner. 558 * owner.
575 */ 559 */
576 smp_wmb(); 560 smp_wmb();
577 set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); 561 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
578} 562}
579 563
580static void clear_work_data(struct work_struct *work) 564static void clear_work_data(struct work_struct *work)
581{ 565{
582 smp_wmb(); /* see set_work_cpu_and_clear_pending() */ 566 smp_wmb(); /* see set_work_pool_and_clear_pending() */
583 set_work_data(work, WORK_STRUCT_NO_CPU, 0); 567 set_work_data(work, WORK_STRUCT_NO_POOL, 0);
584} 568}
585 569
586static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) 570static struct pool_workqueue *get_work_pwq(struct work_struct *work)
587{ 571{
588 unsigned long data = atomic_long_read(&work->data); 572 unsigned long data = atomic_long_read(&work->data);
589 573
590 if (data & WORK_STRUCT_CWQ) 574 if (data & WORK_STRUCT_PWQ)
591 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); 575 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
592 else 576 else
593 return NULL; 577 return NULL;
594} 578}
595 579
596static struct global_cwq *get_work_gcwq(struct work_struct *work) 580/**
581 * get_work_pool - return the worker_pool a given work was associated with
582 * @work: the work item of interest
583 *
584 * Return the worker_pool @work was last associated with. %NULL if none.
585 */
586static struct worker_pool *get_work_pool(struct work_struct *work)
597{ 587{
598 unsigned long data = atomic_long_read(&work->data); 588 unsigned long data = atomic_long_read(&work->data);
599 unsigned int cpu; 589 struct worker_pool *pool;
590 int pool_id;
600 591
601 if (data & WORK_STRUCT_CWQ) 592 if (data & WORK_STRUCT_PWQ)
602 return ((struct cpu_workqueue_struct *) 593 return ((struct pool_workqueue *)
603 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; 594 (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
604 595
605 cpu = data >> WORK_OFFQ_CPU_SHIFT; 596 pool_id = data >> WORK_OFFQ_POOL_SHIFT;
606 if (cpu == WORK_CPU_NONE) 597 if (pool_id == WORK_OFFQ_POOL_NONE)
607 return NULL; 598 return NULL;
608 599
609 BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); 600 pool = worker_pool_by_id(pool_id);
610 return get_gcwq(cpu); 601 WARN_ON_ONCE(!pool);
602 return pool;
603}
604
605/**
606 * get_work_pool_id - return the worker pool ID a given work is associated with
607 * @work: the work item of interest
608 *
609 * Return the worker_pool ID @work was last associated with.
610 * %WORK_OFFQ_POOL_NONE if none.
611 */
612static int get_work_pool_id(struct work_struct *work)
613{
614 unsigned long data = atomic_long_read(&work->data);
615
616 if (data & WORK_STRUCT_PWQ)
617 return ((struct pool_workqueue *)
618 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
619
620 return data >> WORK_OFFQ_POOL_SHIFT;
611} 621}
612 622
613static void mark_work_canceling(struct work_struct *work) 623static void mark_work_canceling(struct work_struct *work)
614{ 624{
615 struct global_cwq *gcwq = get_work_gcwq(work); 625 unsigned long pool_id = get_work_pool_id(work);
616 unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
617 626
618 set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, 627 pool_id <<= WORK_OFFQ_POOL_SHIFT;
619 WORK_STRUCT_PENDING); 628 set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
620} 629}
621 630
622static bool work_is_canceling(struct work_struct *work) 631static bool work_is_canceling(struct work_struct *work)
623{ 632{
624 unsigned long data = atomic_long_read(&work->data); 633 unsigned long data = atomic_long_read(&work->data);
625 634
626 return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); 635 return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
627} 636}
628 637
629/* 638/*
630 * Policy functions. These define the policies on how the global worker 639 * Policy functions. These define the policies on how the global worker
631 * pools are managed. Unless noted otherwise, these functions assume that 640 * pools are managed. Unless noted otherwise, these functions assume that
632 * they're being called with gcwq->lock held. 641 * they're being called with pool->lock held.
633 */ 642 */
634 643
635static bool __need_more_worker(struct worker_pool *pool) 644static bool __need_more_worker(struct worker_pool *pool)
636{ 645{
637 return !atomic_read(get_pool_nr_running(pool)); 646 return !atomic_read(&pool->nr_running);
638} 647}
639 648
640/* 649/*
@@ -642,7 +651,7 @@ static bool __need_more_worker(struct worker_pool *pool)
642 * running workers. 651 * running workers.
643 * 652 *
644 * Note that, because unbound workers never contribute to nr_running, this 653 * Note that, because unbound workers never contribute to nr_running, this
645 * function will always return %true for unbound gcwq as long as the 654 * function will always return %true for unbound pools as long as the
646 * worklist isn't empty. 655 * worklist isn't empty.
647 */ 656 */
648static bool need_more_worker(struct worker_pool *pool) 657static bool need_more_worker(struct worker_pool *pool)
@@ -659,9 +668,8 @@ static bool may_start_working(struct worker_pool *pool)
659/* Do I need to keep working? Called from currently running workers. */ 668/* Do I need to keep working? Called from currently running workers. */
660static bool keep_working(struct worker_pool *pool) 669static bool keep_working(struct worker_pool *pool)
661{ 670{
662 atomic_t *nr_running = get_pool_nr_running(pool); 671 return !list_empty(&pool->worklist) &&
663 672 atomic_read(&pool->nr_running) <= 1;
664 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
665} 673}
666 674
667/* Do we need a new worker? Called from manager. */ 675/* Do we need a new worker? Called from manager. */
@@ -714,7 +722,7 @@ static struct worker *first_worker(struct worker_pool *pool)
714 * Wake up the first idle worker of @pool. 722 * Wake up the first idle worker of @pool.
715 * 723 *
716 * CONTEXT: 724 * CONTEXT:
717 * spin_lock_irq(gcwq->lock). 725 * spin_lock_irq(pool->lock).
718 */ 726 */
719static void wake_up_worker(struct worker_pool *pool) 727static void wake_up_worker(struct worker_pool *pool)
720{ 728{
@@ -740,8 +748,8 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
740 struct worker *worker = kthread_data(task); 748 struct worker *worker = kthread_data(task);
741 749
742 if (!(worker->flags & WORKER_NOT_RUNNING)) { 750 if (!(worker->flags & WORKER_NOT_RUNNING)) {
743 WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); 751 WARN_ON_ONCE(worker->pool->cpu != cpu);
744 atomic_inc(get_pool_nr_running(worker->pool)); 752 atomic_inc(&worker->pool->nr_running);
745 } 753 }
746} 754}
747 755
@@ -764,12 +772,18 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
764 unsigned int cpu) 772 unsigned int cpu)
765{ 773{
766 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 774 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
767 struct worker_pool *pool = worker->pool; 775 struct worker_pool *pool;
768 atomic_t *nr_running = get_pool_nr_running(pool);
769 776
777 /*
778 * Rescuers, which may not have all the fields set up like normal
779 * workers, also reach here, let's not access anything before
780 * checking NOT_RUNNING.
781 */
770 if (worker->flags & WORKER_NOT_RUNNING) 782 if (worker->flags & WORKER_NOT_RUNNING)
771 return NULL; 783 return NULL;
772 784
785 pool = worker->pool;
786
773 /* this can only happen on the local cpu */ 787 /* this can only happen on the local cpu */
774 BUG_ON(cpu != raw_smp_processor_id()); 788 BUG_ON(cpu != raw_smp_processor_id());
775 789
@@ -781,10 +795,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
781 * NOT_RUNNING is clear. This means that we're bound to and 795 * NOT_RUNNING is clear. This means that we're bound to and
782 * running on the local cpu w/ rq lock held and preemption 796 * running on the local cpu w/ rq lock held and preemption
783 * disabled, which in turn means that none else could be 797 * disabled, which in turn means that none else could be
784 * manipulating idle_list, so dereferencing idle_list without gcwq 798 * manipulating idle_list, so dereferencing idle_list without pool
785 * lock is safe. 799 * lock is safe.
786 */ 800 */
787 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) 801 if (atomic_dec_and_test(&pool->nr_running) &&
802 !list_empty(&pool->worklist))
788 to_wakeup = first_worker(pool); 803 to_wakeup = first_worker(pool);
789 return to_wakeup ? to_wakeup->task : NULL; 804 return to_wakeup ? to_wakeup->task : NULL;
790} 805}
@@ -800,7 +815,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
800 * woken up. 815 * woken up.
801 * 816 *
802 * CONTEXT: 817 * CONTEXT:
803 * spin_lock_irq(gcwq->lock) 818 * spin_lock_irq(pool->lock)
804 */ 819 */
805static inline void worker_set_flags(struct worker *worker, unsigned int flags, 820static inline void worker_set_flags(struct worker *worker, unsigned int flags,
806 bool wakeup) 821 bool wakeup)
@@ -816,14 +831,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
816 */ 831 */
817 if ((flags & WORKER_NOT_RUNNING) && 832 if ((flags & WORKER_NOT_RUNNING) &&
818 !(worker->flags & WORKER_NOT_RUNNING)) { 833 !(worker->flags & WORKER_NOT_RUNNING)) {
819 atomic_t *nr_running = get_pool_nr_running(pool);
820
821 if (wakeup) { 834 if (wakeup) {
822 if (atomic_dec_and_test(nr_running) && 835 if (atomic_dec_and_test(&pool->nr_running) &&
823 !list_empty(&pool->worklist)) 836 !list_empty(&pool->worklist))
824 wake_up_worker(pool); 837 wake_up_worker(pool);
825 } else 838 } else
826 atomic_dec(nr_running); 839 atomic_dec(&pool->nr_running);
827 } 840 }
828 841
829 worker->flags |= flags; 842 worker->flags |= flags;
@@ -837,7 +850,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
837 * Clear @flags in @worker->flags and adjust nr_running accordingly. 850 * Clear @flags in @worker->flags and adjust nr_running accordingly.
838 * 851 *
839 * CONTEXT: 852 * CONTEXT:
840 * spin_lock_irq(gcwq->lock) 853 * spin_lock_irq(pool->lock)
841 */ 854 */
842static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 855static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
843{ 856{
@@ -855,87 +868,56 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
855 */ 868 */
856 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 869 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
857 if (!(worker->flags & WORKER_NOT_RUNNING)) 870 if (!(worker->flags & WORKER_NOT_RUNNING))
858 atomic_inc(get_pool_nr_running(pool)); 871 atomic_inc(&pool->nr_running);
859} 872}
860 873
861/** 874/**
862 * busy_worker_head - return the busy hash head for a work 875 * find_worker_executing_work - find worker which is executing a work
863 * @gcwq: gcwq of interest 876 * @pool: pool of interest
864 * @work: work to be hashed
865 *
866 * Return hash head of @gcwq for @work.
867 *
868 * CONTEXT:
869 * spin_lock_irq(gcwq->lock).
870 *
871 * RETURNS:
872 * Pointer to the hash head.
873 */
874static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
875 struct work_struct *work)
876{
877 const int base_shift = ilog2(sizeof(struct work_struct));
878 unsigned long v = (unsigned long)work;
879
880 /* simple shift and fold hash, do we need something better? */
881 v >>= base_shift;
882 v += v >> BUSY_WORKER_HASH_ORDER;
883 v &= BUSY_WORKER_HASH_MASK;
884
885 return &gcwq->busy_hash[v];
886}
887
888/**
889 * __find_worker_executing_work - find worker which is executing a work
890 * @gcwq: gcwq of interest
891 * @bwh: hash head as returned by busy_worker_head()
892 * @work: work to find worker for 877 * @work: work to find worker for
893 * 878 *
894 * Find a worker which is executing @work on @gcwq. @bwh should be 879 * Find a worker which is executing @work on @pool by searching
895 * the hash head obtained by calling busy_worker_head() with the same 880 * @pool->busy_hash which is keyed by the address of @work. For a worker
896 * work. 881 * to match, its current execution should match the address of @work and
882 * its work function. This is to avoid unwanted dependency between
883 * unrelated work executions through a work item being recycled while still
884 * being executed.
885 *
886 * This is a bit tricky. A work item may be freed once its execution
887 * starts and nothing prevents the freed area from being recycled for
888 * another work item. If the same work item address ends up being reused
889 * before the original execution finishes, workqueue will identify the
890 * recycled work item as currently executing and make it wait until the
891 * current execution finishes, introducing an unwanted dependency.
892 *
893 * This function checks the work item address and work function
894 * to avoid false positives. Note that this isn't complete as one may
895 * construct a work function which can introduce dependency onto itself
896 * through a recycled work item. Well, if somebody wants to shoot oneself
897 * in the foot that badly, there's only so much we can do, and if such
898 * deadlock actually occurs, it should be easy to locate the culprit work
899 * function.
897 * 900 *
898 * CONTEXT: 901 * CONTEXT:
899 * spin_lock_irq(gcwq->lock). 902 * spin_lock_irq(pool->lock).
900 * 903 *
901 * RETURNS: 904 * RETURNS:
902 * Pointer to worker which is executing @work if found, NULL 905 * Pointer to worker which is executing @work if found, NULL
903 * otherwise. 906 * otherwise.
904 */ 907 */
905static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, 908static struct worker *find_worker_executing_work(struct worker_pool *pool,
906 struct hlist_head *bwh, 909 struct work_struct *work)
907 struct work_struct *work)
908{ 910{
909 struct worker *worker; 911 struct worker *worker;
910 struct hlist_node *tmp; 912 struct hlist_node *tmp;
911 913
912 hlist_for_each_entry(worker, tmp, bwh, hentry) 914 hash_for_each_possible(pool->busy_hash, worker, tmp, hentry,
913 if (worker->current_work == work) 915 (unsigned long)work)
916 if (worker->current_work == work &&
917 worker->current_func == work->func)
914 return worker; 918 return worker;
915 return NULL;
916}
917 919
918/** 920 return NULL;
919 * find_worker_executing_work - find worker which is executing a work
920 * @gcwq: gcwq of interest
921 * @work: work to find worker for
922 *
923 * Find a worker which is executing @work on @gcwq. This function is
924 * identical to __find_worker_executing_work() except that this
925 * function calculates @bwh itself.
926 *
927 * CONTEXT:
928 * spin_lock_irq(gcwq->lock).
929 *
930 * RETURNS:
931 * Pointer to worker which is executing @work if found, NULL
932 * otherwise.
933 */
934static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
935 struct work_struct *work)
936{
937 return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
938 work);
939} 921}
940 922
941/** 923/**
@@ -953,7 +935,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
953 * nested inside outer list_for_each_entry_safe(). 935 * nested inside outer list_for_each_entry_safe().
954 * 936 *
955 * CONTEXT: 937 * CONTEXT:
956 * spin_lock_irq(gcwq->lock). 938 * spin_lock_irq(pool->lock).
957 */ 939 */
958static void move_linked_works(struct work_struct *work, struct list_head *head, 940static void move_linked_works(struct work_struct *work, struct list_head *head,
959 struct work_struct **nextp) 941 struct work_struct **nextp)
@@ -979,67 +961,67 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
979 *nextp = n; 961 *nextp = n;
980} 962}
981 963
982static void cwq_activate_delayed_work(struct work_struct *work) 964static void pwq_activate_delayed_work(struct work_struct *work)
983{ 965{
984 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 966 struct pool_workqueue *pwq = get_work_pwq(work);
985 967
986 trace_workqueue_activate_work(work); 968 trace_workqueue_activate_work(work);
987 move_linked_works(work, &cwq->pool->worklist, NULL); 969 move_linked_works(work, &pwq->pool->worklist, NULL);
988 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 970 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
989 cwq->nr_active++; 971 pwq->nr_active++;
990} 972}
991 973
992static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) 974static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
993{ 975{
994 struct work_struct *work = list_first_entry(&cwq->delayed_works, 976 struct work_struct *work = list_first_entry(&pwq->delayed_works,
995 struct work_struct, entry); 977 struct work_struct, entry);
996 978
997 cwq_activate_delayed_work(work); 979 pwq_activate_delayed_work(work);
998} 980}
999 981
1000/** 982/**
1001 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight 983 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
1002 * @cwq: cwq of interest 984 * @pwq: pwq of interest
1003 * @color: color of work which left the queue 985 * @color: color of work which left the queue
1004 * 986 *
1005 * A work either has completed or is removed from pending queue, 987 * A work either has completed or is removed from pending queue,
1006 * decrement nr_in_flight of its cwq and handle workqueue flushing. 988 * decrement nr_in_flight of its pwq and handle workqueue flushing.
1007 * 989 *
1008 * CONTEXT: 990 * CONTEXT:
1009 * spin_lock_irq(gcwq->lock). 991 * spin_lock_irq(pool->lock).
1010 */ 992 */
1011static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) 993static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1012{ 994{
1013 /* ignore uncolored works */ 995 /* ignore uncolored works */
1014 if (color == WORK_NO_COLOR) 996 if (color == WORK_NO_COLOR)
1015 return; 997 return;
1016 998
1017 cwq->nr_in_flight[color]--; 999 pwq->nr_in_flight[color]--;
1018 1000
1019 cwq->nr_active--; 1001 pwq->nr_active--;
1020 if (!list_empty(&cwq->delayed_works)) { 1002 if (!list_empty(&pwq->delayed_works)) {
1021 /* one down, submit a delayed one */ 1003 /* one down, submit a delayed one */
1022 if (cwq->nr_active < cwq->max_active) 1004 if (pwq->nr_active < pwq->max_active)
1023 cwq_activate_first_delayed(cwq); 1005 pwq_activate_first_delayed(pwq);
1024 } 1006 }
1025 1007
1026 /* is flush in progress and are we at the flushing tip? */ 1008 /* is flush in progress and are we at the flushing tip? */
1027 if (likely(cwq->flush_color != color)) 1009 if (likely(pwq->flush_color != color))
1028 return; 1010 return;
1029 1011
1030 /* are there still in-flight works? */ 1012 /* are there still in-flight works? */
1031 if (cwq->nr_in_flight[color]) 1013 if (pwq->nr_in_flight[color])
1032 return; 1014 return;
1033 1015
1034 /* this cwq is done, clear flush_color */ 1016 /* this pwq is done, clear flush_color */
1035 cwq->flush_color = -1; 1017 pwq->flush_color = -1;
1036 1018
1037 /* 1019 /*
1038 * If this was the last cwq, wake up the first flusher. It 1020 * If this was the last pwq, wake up the first flusher. It
1039 * will handle the rest. 1021 * will handle the rest.
1040 */ 1022 */
1041 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) 1023 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1042 complete(&cwq->wq->first_flusher->done); 1024 complete(&pwq->wq->first_flusher->done);
1043} 1025}
1044 1026
1045/** 1027/**
@@ -1070,7 +1052,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1070static int try_to_grab_pending(struct work_struct *work, bool is_dwork, 1052static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1071 unsigned long *flags) 1053 unsigned long *flags)
1072{ 1054{
1073 struct global_cwq *gcwq; 1055 struct worker_pool *pool;
1056 struct pool_workqueue *pwq;
1074 1057
1075 local_irq_save(*flags); 1058 local_irq_save(*flags);
1076 1059
@@ -1095,41 +1078,43 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1095 * The queueing is in progress, or it is already queued. Try to 1078 * The queueing is in progress, or it is already queued. Try to
1096 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 1079 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1097 */ 1080 */
1098 gcwq = get_work_gcwq(work); 1081 pool = get_work_pool(work);
1099 if (!gcwq) 1082 if (!pool)
1100 goto fail; 1083 goto fail;
1101 1084
1102 spin_lock(&gcwq->lock); 1085 spin_lock(&pool->lock);
1103 if (!list_empty(&work->entry)) { 1086 /*
1087 * work->data is guaranteed to point to pwq only while the work
1088 * item is queued on pwq->wq, and both updating work->data to point
1089 * to pwq on queueing and to pool on dequeueing are done under
1090 * pwq->pool->lock. This in turn guarantees that, if work->data
1091 * points to pwq which is associated with a locked pool, the work
1092 * item is currently queued on that pool.
1093 */
1094 pwq = get_work_pwq(work);
1095 if (pwq && pwq->pool == pool) {
1096 debug_work_deactivate(work);
1097
1104 /* 1098 /*
1105 * This work is queued, but perhaps we locked the wrong gcwq. 1099 * A delayed work item cannot be grabbed directly because
1106 * In that case we must see the new value after rmb(), see 1100 * it might have linked NO_COLOR work items which, if left
1107 * insert_work()->wmb(). 1101 * on the delayed_list, will confuse pwq->nr_active
1102 * management later on and cause stall. Make sure the work
1103 * item is activated before grabbing.
1108 */ 1104 */
1109 smp_rmb(); 1105 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1110 if (gcwq == get_work_gcwq(work)) { 1106 pwq_activate_delayed_work(work);
1111 debug_work_deactivate(work);
1112 1107
1113 /* 1108 list_del_init(&work->entry);
1114 * A delayed work item cannot be grabbed directly 1109 pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
1115 * because it might have linked NO_COLOR work items
1116 * which, if left on the delayed_list, will confuse
1117 * cwq->nr_active management later on and cause
1118 * stall. Make sure the work item is activated
1119 * before grabbing.
1120 */
1121 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1122 cwq_activate_delayed_work(work);
1123 1110
1124 list_del_init(&work->entry); 1111 /* work->data points to pwq iff queued, point to pool */
1125 cwq_dec_nr_in_flight(get_work_cwq(work), 1112 set_work_pool_and_keep_pending(work, pool->id);
1126 get_work_color(work));
1127 1113
1128 spin_unlock(&gcwq->lock); 1114 spin_unlock(&pool->lock);
1129 return 1; 1115 return 1;
1130 }
1131 } 1116 }
1132 spin_unlock(&gcwq->lock); 1117 spin_unlock(&pool->lock);
1133fail: 1118fail:
1134 local_irq_restore(*flags); 1119 local_irq_restore(*flags);
1135 if (work_is_canceling(work)) 1120 if (work_is_canceling(work))
@@ -1139,33 +1124,25 @@ fail:
1139} 1124}
1140 1125
1141/** 1126/**
1142 * insert_work - insert a work into gcwq 1127 * insert_work - insert a work into a pool
1143 * @cwq: cwq @work belongs to 1128 * @pwq: pwq @work belongs to
1144 * @work: work to insert 1129 * @work: work to insert
1145 * @head: insertion point 1130 * @head: insertion point
1146 * @extra_flags: extra WORK_STRUCT_* flags to set 1131 * @extra_flags: extra WORK_STRUCT_* flags to set
1147 * 1132 *
1148 * Insert @work which belongs to @cwq into @gcwq after @head. 1133 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
1149 * @extra_flags is or'd to work_struct flags. 1134 * work_struct flags.
1150 * 1135 *
1151 * CONTEXT: 1136 * CONTEXT:
1152 * spin_lock_irq(gcwq->lock). 1137 * spin_lock_irq(pool->lock).
1153 */ 1138 */
1154static void insert_work(struct cpu_workqueue_struct *cwq, 1139static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1155 struct work_struct *work, struct list_head *head, 1140 struct list_head *head, unsigned int extra_flags)
1156 unsigned int extra_flags)
1157{ 1141{
1158 struct worker_pool *pool = cwq->pool; 1142 struct worker_pool *pool = pwq->pool;
1159 1143
1160 /* we own @work, set data and link */ 1144 /* we own @work, set data and link */
1161 set_work_cwq(work, cwq, extra_flags); 1145 set_work_pwq(work, pwq, extra_flags);
1162
1163 /*
1164 * Ensure that we get the right work->data if we see the
1165 * result of list_add() below, see try_to_grab_pending().
1166 */
1167 smp_wmb();
1168
1169 list_add_tail(&work->entry, head); 1146 list_add_tail(&work->entry, head);
1170 1147
1171 /* 1148 /*
@@ -1181,41 +1158,24 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
1181 1158
1182/* 1159/*
1183 * Test whether @work is being queued from another work executing on the 1160 * Test whether @work is being queued from another work executing on the
1184 * same workqueue. This is rather expensive and should only be used from 1161 * same workqueue.
1185 * cold paths.
1186 */ 1162 */
1187static bool is_chained_work(struct workqueue_struct *wq) 1163static bool is_chained_work(struct workqueue_struct *wq)
1188{ 1164{
1189 unsigned long flags; 1165 struct worker *worker;
1190 unsigned int cpu;
1191
1192 for_each_gcwq_cpu(cpu) {
1193 struct global_cwq *gcwq = get_gcwq(cpu);
1194 struct worker *worker;
1195 struct hlist_node *pos;
1196 int i;
1197 1166
1198 spin_lock_irqsave(&gcwq->lock, flags); 1167 worker = current_wq_worker();
1199 for_each_busy_worker(worker, i, pos, gcwq) { 1168 /*
1200 if (worker->task != current) 1169 * Return %true iff I'm a worker execuing a work item on @wq. If
1201 continue; 1170 * I'm @worker, it's safe to dereference it without locking.
1202 spin_unlock_irqrestore(&gcwq->lock, flags); 1171 */
1203 /* 1172 return worker && worker->current_pwq->wq == wq;
1204 * I'm @worker, no locking necessary. See if @work
1205 * is headed to the same workqueue.
1206 */
1207 return worker->current_cwq->wq == wq;
1208 }
1209 spin_unlock_irqrestore(&gcwq->lock, flags);
1210 }
1211 return false;
1212} 1173}
1213 1174
1214static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 1175static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1215 struct work_struct *work) 1176 struct work_struct *work)
1216{ 1177{
1217 struct global_cwq *gcwq; 1178 struct pool_workqueue *pwq;
1218 struct cpu_workqueue_struct *cwq;
1219 struct list_head *worklist; 1179 struct list_head *worklist;
1220 unsigned int work_flags; 1180 unsigned int work_flags;
1221 unsigned int req_cpu = cpu; 1181 unsigned int req_cpu = cpu;
@@ -1235,9 +1195,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1235 WARN_ON_ONCE(!is_chained_work(wq))) 1195 WARN_ON_ONCE(!is_chained_work(wq)))
1236 return; 1196 return;
1237 1197
1238 /* determine gcwq to use */ 1198 /* determine the pwq to use */
1239 if (!(wq->flags & WQ_UNBOUND)) { 1199 if (!(wq->flags & WQ_UNBOUND)) {
1240 struct global_cwq *last_gcwq; 1200 struct worker_pool *last_pool;
1241 1201
1242 if (cpu == WORK_CPU_UNBOUND) 1202 if (cpu == WORK_CPU_UNBOUND)
1243 cpu = raw_smp_processor_id(); 1203 cpu = raw_smp_processor_id();
@@ -1248,55 +1208,54 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1248 * work needs to be queued on that cpu to guarantee 1208 * work needs to be queued on that cpu to guarantee
1249 * non-reentrancy. 1209 * non-reentrancy.
1250 */ 1210 */
1251 gcwq = get_gcwq(cpu); 1211 pwq = get_pwq(cpu, wq);
1252 last_gcwq = get_work_gcwq(work); 1212 last_pool = get_work_pool(work);
1253 1213
1254 if (last_gcwq && last_gcwq != gcwq) { 1214 if (last_pool && last_pool != pwq->pool) {
1255 struct worker *worker; 1215 struct worker *worker;
1256 1216
1257 spin_lock(&last_gcwq->lock); 1217 spin_lock(&last_pool->lock);
1258 1218
1259 worker = find_worker_executing_work(last_gcwq, work); 1219 worker = find_worker_executing_work(last_pool, work);
1260 1220
1261 if (worker && worker->current_cwq->wq == wq) 1221 if (worker && worker->current_pwq->wq == wq) {
1262 gcwq = last_gcwq; 1222 pwq = get_pwq(last_pool->cpu, wq);
1263 else { 1223 } else {
1264 /* meh... not running there, queue here */ 1224 /* meh... not running there, queue here */
1265 spin_unlock(&last_gcwq->lock); 1225 spin_unlock(&last_pool->lock);
1266 spin_lock(&gcwq->lock); 1226 spin_lock(&pwq->pool->lock);
1267 } 1227 }
1268 } else { 1228 } else {
1269 spin_lock(&gcwq->lock); 1229 spin_lock(&pwq->pool->lock);
1270 } 1230 }
1271 } else { 1231 } else {
1272 gcwq = get_gcwq(WORK_CPU_UNBOUND); 1232 pwq = get_pwq(WORK_CPU_UNBOUND, wq);
1273 spin_lock(&gcwq->lock); 1233 spin_lock(&pwq->pool->lock);
1274 } 1234 }
1275 1235
1276 /* gcwq determined, get cwq and queue */ 1236 /* pwq determined, queue */
1277 cwq = get_cwq(gcwq->cpu, wq); 1237 trace_workqueue_queue_work(req_cpu, pwq, work);
1278 trace_workqueue_queue_work(req_cpu, cwq, work);
1279 1238
1280 if (WARN_ON(!list_empty(&work->entry))) { 1239 if (WARN_ON(!list_empty(&work->entry))) {
1281 spin_unlock(&gcwq->lock); 1240 spin_unlock(&pwq->pool->lock);
1282 return; 1241 return;
1283 } 1242 }
1284 1243
1285 cwq->nr_in_flight[cwq->work_color]++; 1244 pwq->nr_in_flight[pwq->work_color]++;
1286 work_flags = work_color_to_flags(cwq->work_color); 1245 work_flags = work_color_to_flags(pwq->work_color);
1287 1246
1288 if (likely(cwq->nr_active < cwq->max_active)) { 1247 if (likely(pwq->nr_active < pwq->max_active)) {
1289 trace_workqueue_activate_work(work); 1248 trace_workqueue_activate_work(work);
1290 cwq->nr_active++; 1249 pwq->nr_active++;
1291 worklist = &cwq->pool->worklist; 1250 worklist = &pwq->pool->worklist;
1292 } else { 1251 } else {
1293 work_flags |= WORK_STRUCT_DELAYED; 1252 work_flags |= WORK_STRUCT_DELAYED;
1294 worklist = &cwq->delayed_works; 1253 worklist = &pwq->delayed_works;
1295 } 1254 }
1296 1255
1297 insert_work(cwq, work, worklist, work_flags); 1256 insert_work(pwq, work, worklist, work_flags);
1298 1257
1299 spin_unlock(&gcwq->lock); 1258 spin_unlock(&pwq->pool->lock);
1300} 1259}
1301 1260
1302/** 1261/**
@@ -1347,19 +1306,17 @@ EXPORT_SYMBOL_GPL(queue_work);
1347void delayed_work_timer_fn(unsigned long __data) 1306void delayed_work_timer_fn(unsigned long __data)
1348{ 1307{
1349 struct delayed_work *dwork = (struct delayed_work *)__data; 1308 struct delayed_work *dwork = (struct delayed_work *)__data;
1350 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1351 1309
1352 /* should have been called from irqsafe timer with irq already off */ 1310 /* should have been called from irqsafe timer with irq already off */
1353 __queue_work(dwork->cpu, cwq->wq, &dwork->work); 1311 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
1354} 1312}
1355EXPORT_SYMBOL_GPL(delayed_work_timer_fn); 1313EXPORT_SYMBOL(delayed_work_timer_fn);
1356 1314
1357static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, 1315static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1358 struct delayed_work *dwork, unsigned long delay) 1316 struct delayed_work *dwork, unsigned long delay)
1359{ 1317{
1360 struct timer_list *timer = &dwork->timer; 1318 struct timer_list *timer = &dwork->timer;
1361 struct work_struct *work = &dwork->work; 1319 struct work_struct *work = &dwork->work;
1362 unsigned int lcpu;
1363 1320
1364 WARN_ON_ONCE(timer->function != delayed_work_timer_fn || 1321 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1365 timer->data != (unsigned long)dwork); 1322 timer->data != (unsigned long)dwork);
@@ -1379,30 +1336,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1379 1336
1380 timer_stats_timer_set_start_info(&dwork->timer); 1337 timer_stats_timer_set_start_info(&dwork->timer);
1381 1338
1382 /* 1339 dwork->wq = wq;
1383 * This stores cwq for the moment, for the timer_fn. Note that the
1384 * work's gcwq is preserved to allow reentrance detection for
1385 * delayed works.
1386 */
1387 if (!(wq->flags & WQ_UNBOUND)) {
1388 struct global_cwq *gcwq = get_work_gcwq(work);
1389
1390 /*
1391 * If we cannot get the last gcwq from @work directly,
1392 * select the last CPU such that it avoids unnecessarily
1393 * triggering non-reentrancy check in __queue_work().
1394 */
1395 lcpu = cpu;
1396 if (gcwq)
1397 lcpu = gcwq->cpu;
1398 if (lcpu == WORK_CPU_UNBOUND)
1399 lcpu = raw_smp_processor_id();
1400 } else {
1401 lcpu = WORK_CPU_UNBOUND;
1402 }
1403
1404 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1405
1406 dwork->cpu = cpu; 1340 dwork->cpu = cpu;
1407 timer->expires = jiffies + delay; 1341 timer->expires = jiffies + delay;
1408 1342
@@ -1519,12 +1453,11 @@ EXPORT_SYMBOL_GPL(mod_delayed_work);
1519 * necessary. 1453 * necessary.
1520 * 1454 *
1521 * LOCKING: 1455 * LOCKING:
1522 * spin_lock_irq(gcwq->lock). 1456 * spin_lock_irq(pool->lock).
1523 */ 1457 */
1524static void worker_enter_idle(struct worker *worker) 1458static void worker_enter_idle(struct worker *worker)
1525{ 1459{
1526 struct worker_pool *pool = worker->pool; 1460 struct worker_pool *pool = worker->pool;
1527 struct global_cwq *gcwq = pool->gcwq;
1528 1461
1529 BUG_ON(worker->flags & WORKER_IDLE); 1462 BUG_ON(worker->flags & WORKER_IDLE);
1530 BUG_ON(!list_empty(&worker->entry) && 1463 BUG_ON(!list_empty(&worker->entry) &&
@@ -1542,14 +1475,14 @@ static void worker_enter_idle(struct worker *worker)
1542 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); 1475 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1543 1476
1544 /* 1477 /*
1545 * Sanity check nr_running. Because gcwq_unbind_fn() releases 1478 * Sanity check nr_running. Because wq_unbind_fn() releases
1546 * gcwq->lock between setting %WORKER_UNBOUND and zapping 1479 * pool->lock between setting %WORKER_UNBOUND and zapping
1547 * nr_running, the warning may trigger spuriously. Check iff 1480 * nr_running, the warning may trigger spuriously. Check iff
1548 * unbind is not in progress. 1481 * unbind is not in progress.
1549 */ 1482 */
1550 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && 1483 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
1551 pool->nr_workers == pool->nr_idle && 1484 pool->nr_workers == pool->nr_idle &&
1552 atomic_read(get_pool_nr_running(pool))); 1485 atomic_read(&pool->nr_running));
1553} 1486}
1554 1487
1555/** 1488/**
@@ -1559,7 +1492,7 @@ static void worker_enter_idle(struct worker *worker)
1559 * @worker is leaving idle state. Update stats. 1492 * @worker is leaving idle state. Update stats.
1560 * 1493 *
1561 * LOCKING: 1494 * LOCKING:
1562 * spin_lock_irq(gcwq->lock). 1495 * spin_lock_irq(pool->lock).
1563 */ 1496 */
1564static void worker_leave_idle(struct worker *worker) 1497static void worker_leave_idle(struct worker *worker)
1565{ 1498{
@@ -1572,7 +1505,7 @@ static void worker_leave_idle(struct worker *worker)
1572} 1505}
1573 1506
1574/** 1507/**
1575 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq 1508 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool
1576 * @worker: self 1509 * @worker: self
1577 * 1510 *
1578 * Works which are scheduled while the cpu is online must at least be 1511 * Works which are scheduled while the cpu is online must at least be
@@ -1584,27 +1517,27 @@ static void worker_leave_idle(struct worker *worker)
1584 * themselves to the target cpu and may race with cpu going down or 1517 * themselves to the target cpu and may race with cpu going down or
1585 * coming online. kthread_bind() can't be used because it may put the 1518 * coming online. kthread_bind() can't be used because it may put the
1586 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used 1519 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1587 * verbatim as it's best effort and blocking and gcwq may be 1520 * verbatim as it's best effort and blocking and pool may be
1588 * [dis]associated in the meantime. 1521 * [dis]associated in the meantime.
1589 * 1522 *
1590 * This function tries set_cpus_allowed() and locks gcwq and verifies the 1523 * This function tries set_cpus_allowed() and locks pool and verifies the
1591 * binding against %GCWQ_DISASSOCIATED which is set during 1524 * binding against %POOL_DISASSOCIATED which is set during
1592 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker 1525 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1593 * enters idle state or fetches works without dropping lock, it can 1526 * enters idle state or fetches works without dropping lock, it can
1594 * guarantee the scheduling requirement described in the first paragraph. 1527 * guarantee the scheduling requirement described in the first paragraph.
1595 * 1528 *
1596 * CONTEXT: 1529 * CONTEXT:
1597 * Might sleep. Called without any lock but returns with gcwq->lock 1530 * Might sleep. Called without any lock but returns with pool->lock
1598 * held. 1531 * held.
1599 * 1532 *
1600 * RETURNS: 1533 * RETURNS:
1601 * %true if the associated gcwq is online (@worker is successfully 1534 * %true if the associated pool is online (@worker is successfully
1602 * bound), %false if offline. 1535 * bound), %false if offline.
1603 */ 1536 */
1604static bool worker_maybe_bind_and_lock(struct worker *worker) 1537static bool worker_maybe_bind_and_lock(struct worker *worker)
1605__acquires(&gcwq->lock) 1538__acquires(&pool->lock)
1606{ 1539{
1607 struct global_cwq *gcwq = worker->pool->gcwq; 1540 struct worker_pool *pool = worker->pool;
1608 struct task_struct *task = worker->task; 1541 struct task_struct *task = worker->task;
1609 1542
1610 while (true) { 1543 while (true) {
@@ -1612,19 +1545,19 @@ __acquires(&gcwq->lock)
1612 * The following call may fail, succeed or succeed 1545 * The following call may fail, succeed or succeed
1613 * without actually migrating the task to the cpu if 1546 * without actually migrating the task to the cpu if
1614 * it races with cpu hotunplug operation. Verify 1547 * it races with cpu hotunplug operation. Verify
1615 * against GCWQ_DISASSOCIATED. 1548 * against POOL_DISASSOCIATED.
1616 */ 1549 */
1617 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) 1550 if (!(pool->flags & POOL_DISASSOCIATED))
1618 set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); 1551 set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu));
1619 1552
1620 spin_lock_irq(&gcwq->lock); 1553 spin_lock_irq(&pool->lock);
1621 if (gcwq->flags & GCWQ_DISASSOCIATED) 1554 if (pool->flags & POOL_DISASSOCIATED)
1622 return false; 1555 return false;
1623 if (task_cpu(task) == gcwq->cpu && 1556 if (task_cpu(task) == pool->cpu &&
1624 cpumask_equal(&current->cpus_allowed, 1557 cpumask_equal(&current->cpus_allowed,
1625 get_cpu_mask(gcwq->cpu))) 1558 get_cpu_mask(pool->cpu)))
1626 return true; 1559 return true;
1627 spin_unlock_irq(&gcwq->lock); 1560 spin_unlock_irq(&pool->lock);
1628 1561
1629 /* 1562 /*
1630 * We've raced with CPU hot[un]plug. Give it a breather 1563 * We've raced with CPU hot[un]plug. Give it a breather
@@ -1643,15 +1576,13 @@ __acquires(&gcwq->lock)
1643 */ 1576 */
1644static void idle_worker_rebind(struct worker *worker) 1577static void idle_worker_rebind(struct worker *worker)
1645{ 1578{
1646 struct global_cwq *gcwq = worker->pool->gcwq;
1647
1648 /* CPU may go down again in between, clear UNBOUND only on success */ 1579 /* CPU may go down again in between, clear UNBOUND only on success */
1649 if (worker_maybe_bind_and_lock(worker)) 1580 if (worker_maybe_bind_and_lock(worker))
1650 worker_clr_flags(worker, WORKER_UNBOUND); 1581 worker_clr_flags(worker, WORKER_UNBOUND);
1651 1582
1652 /* rebind complete, become available again */ 1583 /* rebind complete, become available again */
1653 list_add(&worker->entry, &worker->pool->idle_list); 1584 list_add(&worker->entry, &worker->pool->idle_list);
1654 spin_unlock_irq(&gcwq->lock); 1585 spin_unlock_irq(&worker->pool->lock);
1655} 1586}
1656 1587
1657/* 1588/*
@@ -1663,19 +1594,18 @@ static void idle_worker_rebind(struct worker *worker)
1663static void busy_worker_rebind_fn(struct work_struct *work) 1594static void busy_worker_rebind_fn(struct work_struct *work)
1664{ 1595{
1665 struct worker *worker = container_of(work, struct worker, rebind_work); 1596 struct worker *worker = container_of(work, struct worker, rebind_work);
1666 struct global_cwq *gcwq = worker->pool->gcwq;
1667 1597
1668 if (worker_maybe_bind_and_lock(worker)) 1598 if (worker_maybe_bind_and_lock(worker))
1669 worker_clr_flags(worker, WORKER_UNBOUND); 1599 worker_clr_flags(worker, WORKER_UNBOUND);
1670 1600
1671 spin_unlock_irq(&gcwq->lock); 1601 spin_unlock_irq(&worker->pool->lock);
1672} 1602}
1673 1603
1674/** 1604/**
1675 * rebind_workers - rebind all workers of a gcwq to the associated CPU 1605 * rebind_workers - rebind all workers of a pool to the associated CPU
1676 * @gcwq: gcwq of interest 1606 * @pool: pool of interest
1677 * 1607 *
1678 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding 1608 * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
1679 * is different for idle and busy ones. 1609 * is different for idle and busy ones.
1680 * 1610 *
1681 * Idle ones will be removed from the idle_list and woken up. They will 1611 * Idle ones will be removed from the idle_list and woken up. They will
@@ -1693,38 +1623,32 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1693 * including the manager will not appear on @idle_list until rebind is 1623 * including the manager will not appear on @idle_list until rebind is
1694 * complete, making local wake-ups safe. 1624 * complete, making local wake-ups safe.
1695 */ 1625 */
1696static void rebind_workers(struct global_cwq *gcwq) 1626static void rebind_workers(struct worker_pool *pool)
1697{ 1627{
1698 struct worker_pool *pool;
1699 struct worker *worker, *n; 1628 struct worker *worker, *n;
1700 struct hlist_node *pos; 1629 struct hlist_node *pos;
1701 int i; 1630 int i;
1702 1631
1703 lockdep_assert_held(&gcwq->lock); 1632 lockdep_assert_held(&pool->assoc_mutex);
1704 1633 lockdep_assert_held(&pool->lock);
1705 for_each_worker_pool(pool, gcwq)
1706 lockdep_assert_held(&pool->assoc_mutex);
1707 1634
1708 /* dequeue and kick idle ones */ 1635 /* dequeue and kick idle ones */
1709 for_each_worker_pool(pool, gcwq) { 1636 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1710 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { 1637 /*
1711 /* 1638 * idle workers should be off @pool->idle_list until rebind
1712 * idle workers should be off @pool->idle_list 1639 * is complete to avoid receiving premature local wake-ups.
1713 * until rebind is complete to avoid receiving 1640 */
1714 * premature local wake-ups. 1641 list_del_init(&worker->entry);
1715 */
1716 list_del_init(&worker->entry);
1717 1642
1718 /* 1643 /*
1719 * worker_thread() will see the above dequeuing 1644 * worker_thread() will see the above dequeuing and call
1720 * and call idle_worker_rebind(). 1645 * idle_worker_rebind().
1721 */ 1646 */
1722 wake_up_process(worker->task); 1647 wake_up_process(worker->task);
1723 }
1724 } 1648 }
1725 1649
1726 /* rebind busy workers */ 1650 /* rebind busy workers */
1727 for_each_busy_worker(worker, i, pos, gcwq) { 1651 for_each_busy_worker(worker, i, pos, pool) {
1728 struct work_struct *rebind_work = &worker->rebind_work; 1652 struct work_struct *rebind_work = &worker->rebind_work;
1729 struct workqueue_struct *wq; 1653 struct workqueue_struct *wq;
1730 1654
@@ -1736,16 +1660,16 @@ static void rebind_workers(struct global_cwq *gcwq)
1736 1660
1737 /* 1661 /*
1738 * wq doesn't really matter but let's keep @worker->pool 1662 * wq doesn't really matter but let's keep @worker->pool
1739 * and @cwq->pool consistent for sanity. 1663 * and @pwq->pool consistent for sanity.
1740 */ 1664 */
1741 if (worker_pool_pri(worker->pool)) 1665 if (std_worker_pool_pri(worker->pool))
1742 wq = system_highpri_wq; 1666 wq = system_highpri_wq;
1743 else 1667 else
1744 wq = system_wq; 1668 wq = system_wq;
1745 1669
1746 insert_work(get_cwq(gcwq->cpu, wq), rebind_work, 1670 insert_work(get_pwq(pool->cpu, wq), rebind_work,
1747 worker->scheduled.next, 1671 worker->scheduled.next,
1748 work_color_to_flags(WORK_NO_COLOR)); 1672 work_color_to_flags(WORK_NO_COLOR));
1749 } 1673 }
1750} 1674}
1751 1675
@@ -1780,19 +1704,18 @@ static struct worker *alloc_worker(void)
1780 */ 1704 */
1781static struct worker *create_worker(struct worker_pool *pool) 1705static struct worker *create_worker(struct worker_pool *pool)
1782{ 1706{
1783 struct global_cwq *gcwq = pool->gcwq; 1707 const char *pri = std_worker_pool_pri(pool) ? "H" : "";
1784 const char *pri = worker_pool_pri(pool) ? "H" : "";
1785 struct worker *worker = NULL; 1708 struct worker *worker = NULL;
1786 int id = -1; 1709 int id = -1;
1787 1710
1788 spin_lock_irq(&gcwq->lock); 1711 spin_lock_irq(&pool->lock);
1789 while (ida_get_new(&pool->worker_ida, &id)) { 1712 while (ida_get_new(&pool->worker_ida, &id)) {
1790 spin_unlock_irq(&gcwq->lock); 1713 spin_unlock_irq(&pool->lock);
1791 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) 1714 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1792 goto fail; 1715 goto fail;
1793 spin_lock_irq(&gcwq->lock); 1716 spin_lock_irq(&pool->lock);
1794 } 1717 }
1795 spin_unlock_irq(&gcwq->lock); 1718 spin_unlock_irq(&pool->lock);
1796 1719
1797 worker = alloc_worker(); 1720 worker = alloc_worker();
1798 if (!worker) 1721 if (!worker)
@@ -1801,30 +1724,30 @@ static struct worker *create_worker(struct worker_pool *pool)
1801 worker->pool = pool; 1724 worker->pool = pool;
1802 worker->id = id; 1725 worker->id = id;
1803 1726
1804 if (gcwq->cpu != WORK_CPU_UNBOUND) 1727 if (pool->cpu != WORK_CPU_UNBOUND)
1805 worker->task = kthread_create_on_node(worker_thread, 1728 worker->task = kthread_create_on_node(worker_thread,
1806 worker, cpu_to_node(gcwq->cpu), 1729 worker, cpu_to_node(pool->cpu),
1807 "kworker/%u:%d%s", gcwq->cpu, id, pri); 1730 "kworker/%u:%d%s", pool->cpu, id, pri);
1808 else 1731 else
1809 worker->task = kthread_create(worker_thread, worker, 1732 worker->task = kthread_create(worker_thread, worker,
1810 "kworker/u:%d%s", id, pri); 1733 "kworker/u:%d%s", id, pri);
1811 if (IS_ERR(worker->task)) 1734 if (IS_ERR(worker->task))
1812 goto fail; 1735 goto fail;
1813 1736
1814 if (worker_pool_pri(pool)) 1737 if (std_worker_pool_pri(pool))
1815 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); 1738 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1816 1739
1817 /* 1740 /*
1818 * Determine CPU binding of the new worker depending on 1741 * Determine CPU binding of the new worker depending on
1819 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the 1742 * %POOL_DISASSOCIATED. The caller is responsible for ensuring the
1820 * flag remains stable across this function. See the comments 1743 * flag remains stable across this function. See the comments
1821 * above the flag definition for details. 1744 * above the flag definition for details.
1822 * 1745 *
1823 * As an unbound worker may later become a regular one if CPU comes 1746 * As an unbound worker may later become a regular one if CPU comes
1824 * online, make sure every worker has %PF_THREAD_BOUND set. 1747 * online, make sure every worker has %PF_THREAD_BOUND set.
1825 */ 1748 */
1826 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { 1749 if (!(pool->flags & POOL_DISASSOCIATED)) {
1827 kthread_bind(worker->task, gcwq->cpu); 1750 kthread_bind(worker->task, pool->cpu);
1828 } else { 1751 } else {
1829 worker->task->flags |= PF_THREAD_BOUND; 1752 worker->task->flags |= PF_THREAD_BOUND;
1830 worker->flags |= WORKER_UNBOUND; 1753 worker->flags |= WORKER_UNBOUND;
@@ -1833,9 +1756,9 @@ static struct worker *create_worker(struct worker_pool *pool)
1833 return worker; 1756 return worker;
1834fail: 1757fail:
1835 if (id >= 0) { 1758 if (id >= 0) {
1836 spin_lock_irq(&gcwq->lock); 1759 spin_lock_irq(&pool->lock);
1837 ida_remove(&pool->worker_ida, id); 1760 ida_remove(&pool->worker_ida, id);
1838 spin_unlock_irq(&gcwq->lock); 1761 spin_unlock_irq(&pool->lock);
1839 } 1762 }
1840 kfree(worker); 1763 kfree(worker);
1841 return NULL; 1764 return NULL;
@@ -1845,10 +1768,10 @@ fail:
1845 * start_worker - start a newly created worker 1768 * start_worker - start a newly created worker
1846 * @worker: worker to start 1769 * @worker: worker to start
1847 * 1770 *
1848 * Make the gcwq aware of @worker and start it. 1771 * Make the pool aware of @worker and start it.
1849 * 1772 *
1850 * CONTEXT: 1773 * CONTEXT:
1851 * spin_lock_irq(gcwq->lock). 1774 * spin_lock_irq(pool->lock).
1852 */ 1775 */
1853static void start_worker(struct worker *worker) 1776static void start_worker(struct worker *worker)
1854{ 1777{
@@ -1862,15 +1785,14 @@ static void start_worker(struct worker *worker)
1862 * destroy_worker - destroy a workqueue worker 1785 * destroy_worker - destroy a workqueue worker
1863 * @worker: worker to be destroyed 1786 * @worker: worker to be destroyed
1864 * 1787 *
1865 * Destroy @worker and adjust @gcwq stats accordingly. 1788 * Destroy @worker and adjust @pool stats accordingly.
1866 * 1789 *
1867 * CONTEXT: 1790 * CONTEXT:
1868 * spin_lock_irq(gcwq->lock) which is released and regrabbed. 1791 * spin_lock_irq(pool->lock) which is released and regrabbed.
1869 */ 1792 */
1870static void destroy_worker(struct worker *worker) 1793static void destroy_worker(struct worker *worker)
1871{ 1794{
1872 struct worker_pool *pool = worker->pool; 1795 struct worker_pool *pool = worker->pool;
1873 struct global_cwq *gcwq = pool->gcwq;
1874 int id = worker->id; 1796 int id = worker->id;
1875 1797
1876 /* sanity check frenzy */ 1798 /* sanity check frenzy */
@@ -1885,21 +1807,20 @@ static void destroy_worker(struct worker *worker)
1885 list_del_init(&worker->entry); 1807 list_del_init(&worker->entry);
1886 worker->flags |= WORKER_DIE; 1808 worker->flags |= WORKER_DIE;
1887 1809
1888 spin_unlock_irq(&gcwq->lock); 1810 spin_unlock_irq(&pool->lock);
1889 1811
1890 kthread_stop(worker->task); 1812 kthread_stop(worker->task);
1891 kfree(worker); 1813 kfree(worker);
1892 1814
1893 spin_lock_irq(&gcwq->lock); 1815 spin_lock_irq(&pool->lock);
1894 ida_remove(&pool->worker_ida, id); 1816 ida_remove(&pool->worker_ida, id);
1895} 1817}
1896 1818
1897static void idle_worker_timeout(unsigned long __pool) 1819static void idle_worker_timeout(unsigned long __pool)
1898{ 1820{
1899 struct worker_pool *pool = (void *)__pool; 1821 struct worker_pool *pool = (void *)__pool;
1900 struct global_cwq *gcwq = pool->gcwq;
1901 1822
1902 spin_lock_irq(&gcwq->lock); 1823 spin_lock_irq(&pool->lock);
1903 1824
1904 if (too_many_workers(pool)) { 1825 if (too_many_workers(pool)) {
1905 struct worker *worker; 1826 struct worker *worker;
@@ -1918,20 +1839,20 @@ static void idle_worker_timeout(unsigned long __pool)
1918 } 1839 }
1919 } 1840 }
1920 1841
1921 spin_unlock_irq(&gcwq->lock); 1842 spin_unlock_irq(&pool->lock);
1922} 1843}
1923 1844
1924static bool send_mayday(struct work_struct *work) 1845static bool send_mayday(struct work_struct *work)
1925{ 1846{
1926 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 1847 struct pool_workqueue *pwq = get_work_pwq(work);
1927 struct workqueue_struct *wq = cwq->wq; 1848 struct workqueue_struct *wq = pwq->wq;
1928 unsigned int cpu; 1849 unsigned int cpu;
1929 1850
1930 if (!(wq->flags & WQ_RESCUER)) 1851 if (!(wq->flags & WQ_RESCUER))
1931 return false; 1852 return false;
1932 1853
1933 /* mayday mayday mayday */ 1854 /* mayday mayday mayday */
1934 cpu = cwq->pool->gcwq->cpu; 1855 cpu = pwq->pool->cpu;
1935 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1856 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1936 if (cpu == WORK_CPU_UNBOUND) 1857 if (cpu == WORK_CPU_UNBOUND)
1937 cpu = 0; 1858 cpu = 0;
@@ -1940,13 +1861,12 @@ static bool send_mayday(struct work_struct *work)
1940 return true; 1861 return true;
1941} 1862}
1942 1863
1943static void gcwq_mayday_timeout(unsigned long __pool) 1864static void pool_mayday_timeout(unsigned long __pool)
1944{ 1865{
1945 struct worker_pool *pool = (void *)__pool; 1866 struct worker_pool *pool = (void *)__pool;
1946 struct global_cwq *gcwq = pool->gcwq;
1947 struct work_struct *work; 1867 struct work_struct *work;
1948 1868
1949 spin_lock_irq(&gcwq->lock); 1869 spin_lock_irq(&pool->lock);
1950 1870
1951 if (need_to_create_worker(pool)) { 1871 if (need_to_create_worker(pool)) {
1952 /* 1872 /*
@@ -1959,7 +1879,7 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1959 send_mayday(work); 1879 send_mayday(work);
1960 } 1880 }
1961 1881
1962 spin_unlock_irq(&gcwq->lock); 1882 spin_unlock_irq(&pool->lock);
1963 1883
1964 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1884 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1965} 1885}
@@ -1978,24 +1898,22 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1978 * may_start_working() true. 1898 * may_start_working() true.
1979 * 1899 *
1980 * LOCKING: 1900 * LOCKING:
1981 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 1901 * spin_lock_irq(pool->lock) which may be released and regrabbed
1982 * multiple times. Does GFP_KERNEL allocations. Called only from 1902 * multiple times. Does GFP_KERNEL allocations. Called only from
1983 * manager. 1903 * manager.
1984 * 1904 *
1985 * RETURNS: 1905 * RETURNS:
1986 * false if no action was taken and gcwq->lock stayed locked, true 1906 * false if no action was taken and pool->lock stayed locked, true
1987 * otherwise. 1907 * otherwise.
1988 */ 1908 */
1989static bool maybe_create_worker(struct worker_pool *pool) 1909static bool maybe_create_worker(struct worker_pool *pool)
1990__releases(&gcwq->lock) 1910__releases(&pool->lock)
1991__acquires(&gcwq->lock) 1911__acquires(&pool->lock)
1992{ 1912{
1993 struct global_cwq *gcwq = pool->gcwq;
1994
1995 if (!need_to_create_worker(pool)) 1913 if (!need_to_create_worker(pool))
1996 return false; 1914 return false;
1997restart: 1915restart:
1998 spin_unlock_irq(&gcwq->lock); 1916 spin_unlock_irq(&pool->lock);
1999 1917
2000 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1918 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
2001 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1919 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
@@ -2006,7 +1924,7 @@ restart:
2006 worker = create_worker(pool); 1924 worker = create_worker(pool);
2007 if (worker) { 1925 if (worker) {
2008 del_timer_sync(&pool->mayday_timer); 1926 del_timer_sync(&pool->mayday_timer);
2009 spin_lock_irq(&gcwq->lock); 1927 spin_lock_irq(&pool->lock);
2010 start_worker(worker); 1928 start_worker(worker);
2011 BUG_ON(need_to_create_worker(pool)); 1929 BUG_ON(need_to_create_worker(pool));
2012 return true; 1930 return true;
@@ -2023,7 +1941,7 @@ restart:
2023 } 1941 }
2024 1942
2025 del_timer_sync(&pool->mayday_timer); 1943 del_timer_sync(&pool->mayday_timer);
2026 spin_lock_irq(&gcwq->lock); 1944 spin_lock_irq(&pool->lock);
2027 if (need_to_create_worker(pool)) 1945 if (need_to_create_worker(pool))
2028 goto restart; 1946 goto restart;
2029 return true; 1947 return true;
@@ -2037,11 +1955,11 @@ restart:
2037 * IDLE_WORKER_TIMEOUT. 1955 * IDLE_WORKER_TIMEOUT.
2038 * 1956 *
2039 * LOCKING: 1957 * LOCKING:
2040 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 1958 * spin_lock_irq(pool->lock) which may be released and regrabbed
2041 * multiple times. Called only from manager. 1959 * multiple times. Called only from manager.
2042 * 1960 *
2043 * RETURNS: 1961 * RETURNS:
2044 * false if no action was taken and gcwq->lock stayed locked, true 1962 * false if no action was taken and pool->lock stayed locked, true
2045 * otherwise. 1963 * otherwise.
2046 */ 1964 */
2047static bool maybe_destroy_workers(struct worker_pool *pool) 1965static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2071,21 +1989,21 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2071 * manage_workers - manage worker pool 1989 * manage_workers - manage worker pool
2072 * @worker: self 1990 * @worker: self
2073 * 1991 *
2074 * Assume the manager role and manage gcwq worker pool @worker belongs 1992 * Assume the manager role and manage the worker pool @worker belongs
2075 * to. At any given time, there can be only zero or one manager per 1993 * to. At any given time, there can be only zero or one manager per
2076 * gcwq. The exclusion is handled automatically by this function. 1994 * pool. The exclusion is handled automatically by this function.
2077 * 1995 *
2078 * The caller can safely start processing works on false return. On 1996 * The caller can safely start processing works on false return. On
2079 * true return, it's guaranteed that need_to_create_worker() is false 1997 * true return, it's guaranteed that need_to_create_worker() is false
2080 * and may_start_working() is true. 1998 * and may_start_working() is true.
2081 * 1999 *
2082 * CONTEXT: 2000 * CONTEXT:
2083 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 2001 * spin_lock_irq(pool->lock) which may be released and regrabbed
2084 * multiple times. Does GFP_KERNEL allocations. 2002 * multiple times. Does GFP_KERNEL allocations.
2085 * 2003 *
2086 * RETURNS: 2004 * RETURNS:
2087 * false if no action was taken and gcwq->lock stayed locked, true if 2005 * false if no action was taken and pool->lock stayed locked, true if
2088 * some action was taken. 2006 * some action was taken.
2089 */ 2007 */
2090static bool manage_workers(struct worker *worker) 2008static bool manage_workers(struct worker *worker)
2091{ 2009{
@@ -2107,20 +2025,20 @@ static bool manage_workers(struct worker *worker)
2107 * manager against CPU hotplug. 2025 * manager against CPU hotplug.
2108 * 2026 *
2109 * assoc_mutex would always be free unless CPU hotplug is in 2027 * assoc_mutex would always be free unless CPU hotplug is in
2110 * progress. trylock first without dropping @gcwq->lock. 2028 * progress. trylock first without dropping @pool->lock.
2111 */ 2029 */
2112 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { 2030 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
2113 spin_unlock_irq(&pool->gcwq->lock); 2031 spin_unlock_irq(&pool->lock);
2114 mutex_lock(&pool->assoc_mutex); 2032 mutex_lock(&pool->assoc_mutex);
2115 /* 2033 /*
2116 * CPU hotplug could have happened while we were waiting 2034 * CPU hotplug could have happened while we were waiting
2117 * for assoc_mutex. Hotplug itself can't handle us 2035 * for assoc_mutex. Hotplug itself can't handle us
2118 * because manager isn't either on idle or busy list, and 2036 * because manager isn't either on idle or busy list, and
2119 * @gcwq's state and ours could have deviated. 2037 * @pool's state and ours could have deviated.
2120 * 2038 *
2121 * As hotplug is now excluded via assoc_mutex, we can 2039 * As hotplug is now excluded via assoc_mutex, we can
2122 * simply try to bind. It will succeed or fail depending 2040 * simply try to bind. It will succeed or fail depending
2123 * on @gcwq's current state. Try it and adjust 2041 * on @pool's current state. Try it and adjust
2124 * %WORKER_UNBOUND accordingly. 2042 * %WORKER_UNBOUND accordingly.
2125 */ 2043 */
2126 if (worker_maybe_bind_and_lock(worker)) 2044 if (worker_maybe_bind_and_lock(worker))
@@ -2157,18 +2075,15 @@ static bool manage_workers(struct worker *worker)
2157 * call this function to process a work. 2075 * call this function to process a work.
2158 * 2076 *
2159 * CONTEXT: 2077 * CONTEXT:
2160 * spin_lock_irq(gcwq->lock) which is released and regrabbed. 2078 * spin_lock_irq(pool->lock) which is released and regrabbed.
2161 */ 2079 */
2162static void process_one_work(struct worker *worker, struct work_struct *work) 2080static void process_one_work(struct worker *worker, struct work_struct *work)
2163__releases(&gcwq->lock) 2081__releases(&pool->lock)
2164__acquires(&gcwq->lock) 2082__acquires(&pool->lock)
2165{ 2083{
2166 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 2084 struct pool_workqueue *pwq = get_work_pwq(work);
2167 struct worker_pool *pool = worker->pool; 2085 struct worker_pool *pool = worker->pool;
2168 struct global_cwq *gcwq = pool->gcwq; 2086 bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
2169 struct hlist_head *bwh = busy_worker_head(gcwq, work);
2170 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
2171 work_func_t f = work->func;
2172 int work_color; 2087 int work_color;
2173 struct worker *collision; 2088 struct worker *collision;
2174#ifdef CONFIG_LOCKDEP 2089#ifdef CONFIG_LOCKDEP
@@ -2186,11 +2101,11 @@ __acquires(&gcwq->lock)
2186 /* 2101 /*
2187 * Ensure we're on the correct CPU. DISASSOCIATED test is 2102 * Ensure we're on the correct CPU. DISASSOCIATED test is
2188 * necessary to avoid spurious warnings from rescuers servicing the 2103 * necessary to avoid spurious warnings from rescuers servicing the
2189 * unbound or a disassociated gcwq. 2104 * unbound or a disassociated pool.
2190 */ 2105 */
2191 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && 2106 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2192 !(gcwq->flags & GCWQ_DISASSOCIATED) && 2107 !(pool->flags & POOL_DISASSOCIATED) &&
2193 raw_smp_processor_id() != gcwq->cpu); 2108 raw_smp_processor_id() != pool->cpu);
2194 2109
2195 /* 2110 /*
2196 * A single work shouldn't be executed concurrently by 2111 * A single work shouldn't be executed concurrently by
@@ -2198,7 +2113,7 @@ __acquires(&gcwq->lock)
2198 * already processing the work. If so, defer the work to the 2113 * already processing the work. If so, defer the work to the
2199 * currently executing one. 2114 * currently executing one.
2200 */ 2115 */
2201 collision = __find_worker_executing_work(gcwq, bwh, work); 2116 collision = find_worker_executing_work(pool, work);
2202 if (unlikely(collision)) { 2117 if (unlikely(collision)) {
2203 move_linked_works(work, &collision->scheduled, NULL); 2118 move_linked_works(work, &collision->scheduled, NULL);
2204 return; 2119 return;
@@ -2206,9 +2121,10 @@ __acquires(&gcwq->lock)
2206 2121
2207 /* claim and dequeue */ 2122 /* claim and dequeue */
2208 debug_work_deactivate(work); 2123 debug_work_deactivate(work);
2209 hlist_add_head(&worker->hentry, bwh); 2124 hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
2210 worker->current_work = work; 2125 worker->current_work = work;
2211 worker->current_cwq = cwq; 2126 worker->current_func = work->func;
2127 worker->current_pwq = pwq;
2212 work_color = get_work_color(work); 2128 work_color = get_work_color(work);
2213 2129
2214 list_del_init(&work->entry); 2130 list_del_init(&work->entry);
@@ -2221,53 +2137,55 @@ __acquires(&gcwq->lock)
2221 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 2137 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
2222 2138
2223 /* 2139 /*
2224 * Unbound gcwq isn't concurrency managed and work items should be 2140 * Unbound pool isn't concurrency managed and work items should be
2225 * executed ASAP. Wake up another worker if necessary. 2141 * executed ASAP. Wake up another worker if necessary.
2226 */ 2142 */
2227 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 2143 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2228 wake_up_worker(pool); 2144 wake_up_worker(pool);
2229 2145
2230 /* 2146 /*
2231 * Record the last CPU and clear PENDING which should be the last 2147 * Record the last pool and clear PENDING which should be the last
2232 * update to @work. Also, do this inside @gcwq->lock so that 2148 * update to @work. Also, do this inside @pool->lock so that
2233 * PENDING and queued state changes happen together while IRQ is 2149 * PENDING and queued state changes happen together while IRQ is
2234 * disabled. 2150 * disabled.
2235 */ 2151 */
2236 set_work_cpu_and_clear_pending(work, gcwq->cpu); 2152 set_work_pool_and_clear_pending(work, pool->id);
2237 2153
2238 spin_unlock_irq(&gcwq->lock); 2154 spin_unlock_irq(&pool->lock);
2239 2155
2240 lock_map_acquire_read(&cwq->wq->lockdep_map); 2156 lock_map_acquire_read(&pwq->wq->lockdep_map);
2241 lock_map_acquire(&lockdep_map); 2157 lock_map_acquire(&lockdep_map);
2242 trace_workqueue_execute_start(work); 2158 trace_workqueue_execute_start(work);
2243 f(work); 2159 worker->current_func(work);
2244 /* 2160 /*
2245 * While we must be careful to not use "work" after this, the trace 2161 * While we must be careful to not use "work" after this, the trace
2246 * point will only record its address. 2162 * point will only record its address.
2247 */ 2163 */
2248 trace_workqueue_execute_end(work); 2164 trace_workqueue_execute_end(work);
2249 lock_map_release(&lockdep_map); 2165 lock_map_release(&lockdep_map);
2250 lock_map_release(&cwq->wq->lockdep_map); 2166 lock_map_release(&pwq->wq->lockdep_map);
2251 2167
2252 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2168 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2253 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" 2169 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2254 " last function: %pf\n", 2170 " last function: %pf\n",
2255 current->comm, preempt_count(), task_pid_nr(current), f); 2171 current->comm, preempt_count(), task_pid_nr(current),
2172 worker->current_func);
2256 debug_show_held_locks(current); 2173 debug_show_held_locks(current);
2257 dump_stack(); 2174 dump_stack();
2258 } 2175 }
2259 2176
2260 spin_lock_irq(&gcwq->lock); 2177 spin_lock_irq(&pool->lock);
2261 2178
2262 /* clear cpu intensive status */ 2179 /* clear cpu intensive status */
2263 if (unlikely(cpu_intensive)) 2180 if (unlikely(cpu_intensive))
2264 worker_clr_flags(worker, WORKER_CPU_INTENSIVE); 2181 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
2265 2182
2266 /* we're done with it, release */ 2183 /* we're done with it, release */
2267 hlist_del_init(&worker->hentry); 2184 hash_del(&worker->hentry);
2268 worker->current_work = NULL; 2185 worker->current_work = NULL;
2269 worker->current_cwq = NULL; 2186 worker->current_func = NULL;
2270 cwq_dec_nr_in_flight(cwq, work_color); 2187 worker->current_pwq = NULL;
2188 pwq_dec_nr_in_flight(pwq, work_color);
2271} 2189}
2272 2190
2273/** 2191/**
@@ -2279,7 +2197,7 @@ __acquires(&gcwq->lock)
2279 * fetches a work from the top and executes it. 2197 * fetches a work from the top and executes it.
2280 * 2198 *
2281 * CONTEXT: 2199 * CONTEXT:
2282 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 2200 * spin_lock_irq(pool->lock) which may be released and regrabbed
2283 * multiple times. 2201 * multiple times.
2284 */ 2202 */
2285static void process_scheduled_works(struct worker *worker) 2203static void process_scheduled_works(struct worker *worker)
@@ -2295,8 +2213,8 @@ static void process_scheduled_works(struct worker *worker)
2295 * worker_thread - the worker thread function 2213 * worker_thread - the worker thread function
2296 * @__worker: self 2214 * @__worker: self
2297 * 2215 *
2298 * The gcwq worker thread function. There's a single dynamic pool of 2216 * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools
2299 * these per each cpu. These workers process all works regardless of 2217 * of these per each cpu. These workers process all works regardless of
2300 * their specific target workqueue. The only exception is works which 2218 * their specific target workqueue. The only exception is works which
2301 * belong to workqueues with a rescuer which will be explained in 2219 * belong to workqueues with a rescuer which will be explained in
2302 * rescuer_thread(). 2220 * rescuer_thread().
@@ -2305,16 +2223,15 @@ static int worker_thread(void *__worker)
2305{ 2223{
2306 struct worker *worker = __worker; 2224 struct worker *worker = __worker;
2307 struct worker_pool *pool = worker->pool; 2225 struct worker_pool *pool = worker->pool;
2308 struct global_cwq *gcwq = pool->gcwq;
2309 2226
2310 /* tell the scheduler that this is a workqueue worker */ 2227 /* tell the scheduler that this is a workqueue worker */
2311 worker->task->flags |= PF_WQ_WORKER; 2228 worker->task->flags |= PF_WQ_WORKER;
2312woke_up: 2229woke_up:
2313 spin_lock_irq(&gcwq->lock); 2230 spin_lock_irq(&pool->lock);
2314 2231
2315 /* we are off idle list if destruction or rebind is requested */ 2232 /* we are off idle list if destruction or rebind is requested */
2316 if (unlikely(list_empty(&worker->entry))) { 2233 if (unlikely(list_empty(&worker->entry))) {
2317 spin_unlock_irq(&gcwq->lock); 2234 spin_unlock_irq(&pool->lock);
2318 2235
2319 /* if DIE is set, destruction is requested */ 2236 /* if DIE is set, destruction is requested */
2320 if (worker->flags & WORKER_DIE) { 2237 if (worker->flags & WORKER_DIE) {
@@ -2373,52 +2290,59 @@ sleep:
2373 goto recheck; 2290 goto recheck;
2374 2291
2375 /* 2292 /*
2376 * gcwq->lock is held and there's no work to process and no 2293 * pool->lock is held and there's no work to process and no need to
2377 * need to manage, sleep. Workers are woken up only while 2294 * manage, sleep. Workers are woken up only while holding
2378 * holding gcwq->lock or from local cpu, so setting the 2295 * pool->lock or from local cpu, so setting the current state
2379 * current state before releasing gcwq->lock is enough to 2296 * before releasing pool->lock is enough to prevent losing any
2380 * prevent losing any event. 2297 * event.
2381 */ 2298 */
2382 worker_enter_idle(worker); 2299 worker_enter_idle(worker);
2383 __set_current_state(TASK_INTERRUPTIBLE); 2300 __set_current_state(TASK_INTERRUPTIBLE);
2384 spin_unlock_irq(&gcwq->lock); 2301 spin_unlock_irq(&pool->lock);
2385 schedule(); 2302 schedule();
2386 goto woke_up; 2303 goto woke_up;
2387} 2304}
2388 2305
2389/** 2306/**
2390 * rescuer_thread - the rescuer thread function 2307 * rescuer_thread - the rescuer thread function
2391 * @__wq: the associated workqueue 2308 * @__rescuer: self
2392 * 2309 *
2393 * Workqueue rescuer thread function. There's one rescuer for each 2310 * Workqueue rescuer thread function. There's one rescuer for each
2394 * workqueue which has WQ_RESCUER set. 2311 * workqueue which has WQ_RESCUER set.
2395 * 2312 *
2396 * Regular work processing on a gcwq may block trying to create a new 2313 * Regular work processing on a pool may block trying to create a new
2397 * worker which uses GFP_KERNEL allocation which has slight chance of 2314 * worker which uses GFP_KERNEL allocation which has slight chance of
2398 * developing into deadlock if some works currently on the same queue 2315 * developing into deadlock if some works currently on the same queue
2399 * need to be processed to satisfy the GFP_KERNEL allocation. This is 2316 * need to be processed to satisfy the GFP_KERNEL allocation. This is
2400 * the problem rescuer solves. 2317 * the problem rescuer solves.
2401 * 2318 *
2402 * When such condition is possible, the gcwq summons rescuers of all 2319 * When such condition is possible, the pool summons rescuers of all
2403 * workqueues which have works queued on the gcwq and let them process 2320 * workqueues which have works queued on the pool and let them process
2404 * those works so that forward progress can be guaranteed. 2321 * those works so that forward progress can be guaranteed.
2405 * 2322 *
2406 * This should happen rarely. 2323 * This should happen rarely.
2407 */ 2324 */
2408static int rescuer_thread(void *__wq) 2325static int rescuer_thread(void *__rescuer)
2409{ 2326{
2410 struct workqueue_struct *wq = __wq; 2327 struct worker *rescuer = __rescuer;
2411 struct worker *rescuer = wq->rescuer; 2328 struct workqueue_struct *wq = rescuer->rescue_wq;
2412 struct list_head *scheduled = &rescuer->scheduled; 2329 struct list_head *scheduled = &rescuer->scheduled;
2413 bool is_unbound = wq->flags & WQ_UNBOUND; 2330 bool is_unbound = wq->flags & WQ_UNBOUND;
2414 unsigned int cpu; 2331 unsigned int cpu;
2415 2332
2416 set_user_nice(current, RESCUER_NICE_LEVEL); 2333 set_user_nice(current, RESCUER_NICE_LEVEL);
2334
2335 /*
2336 * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
2337 * doesn't participate in concurrency management.
2338 */
2339 rescuer->task->flags |= PF_WQ_WORKER;
2417repeat: 2340repeat:
2418 set_current_state(TASK_INTERRUPTIBLE); 2341 set_current_state(TASK_INTERRUPTIBLE);
2419 2342
2420 if (kthread_should_stop()) { 2343 if (kthread_should_stop()) {
2421 __set_current_state(TASK_RUNNING); 2344 __set_current_state(TASK_RUNNING);
2345 rescuer->task->flags &= ~PF_WQ_WORKER;
2422 return 0; 2346 return 0;
2423 } 2347 }
2424 2348
@@ -2428,9 +2352,8 @@ repeat:
2428 */ 2352 */
2429 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2353 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2430 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2354 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2431 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2355 struct pool_workqueue *pwq = get_pwq(tcpu, wq);
2432 struct worker_pool *pool = cwq->pool; 2356 struct worker_pool *pool = pwq->pool;
2433 struct global_cwq *gcwq = pool->gcwq;
2434 struct work_struct *work, *n; 2357 struct work_struct *work, *n;
2435 2358
2436 __set_current_state(TASK_RUNNING); 2359 __set_current_state(TASK_RUNNING);
@@ -2446,22 +2369,24 @@ repeat:
2446 */ 2369 */
2447 BUG_ON(!list_empty(&rescuer->scheduled)); 2370 BUG_ON(!list_empty(&rescuer->scheduled));
2448 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2371 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2449 if (get_work_cwq(work) == cwq) 2372 if (get_work_pwq(work) == pwq)
2450 move_linked_works(work, scheduled, &n); 2373 move_linked_works(work, scheduled, &n);
2451 2374
2452 process_scheduled_works(rescuer); 2375 process_scheduled_works(rescuer);
2453 2376
2454 /* 2377 /*
2455 * Leave this gcwq. If keep_working() is %true, notify a 2378 * Leave this pool. If keep_working() is %true, notify a
2456 * regular worker; otherwise, we end up with 0 concurrency 2379 * regular worker; otherwise, we end up with 0 concurrency
2457 * and stalling the execution. 2380 * and stalling the execution.
2458 */ 2381 */
2459 if (keep_working(pool)) 2382 if (keep_working(pool))
2460 wake_up_worker(pool); 2383 wake_up_worker(pool);
2461 2384
2462 spin_unlock_irq(&gcwq->lock); 2385 spin_unlock_irq(&pool->lock);
2463 } 2386 }
2464 2387
2388 /* rescuers should never participate in concurrency management */
2389 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2465 schedule(); 2390 schedule();
2466 goto repeat; 2391 goto repeat;
2467} 2392}
@@ -2479,7 +2404,7 @@ static void wq_barrier_func(struct work_struct *work)
2479 2404
2480/** 2405/**
2481 * insert_wq_barrier - insert a barrier work 2406 * insert_wq_barrier - insert a barrier work
2482 * @cwq: cwq to insert barrier into 2407 * @pwq: pwq to insert barrier into
2483 * @barr: wq_barrier to insert 2408 * @barr: wq_barrier to insert
2484 * @target: target work to attach @barr to 2409 * @target: target work to attach @barr to
2485 * @worker: worker currently executing @target, NULL if @target is not executing 2410 * @worker: worker currently executing @target, NULL if @target is not executing
@@ -2496,12 +2421,12 @@ static void wq_barrier_func(struct work_struct *work)
2496 * after a work with LINKED flag set. 2421 * after a work with LINKED flag set.
2497 * 2422 *
2498 * Note that when @worker is non-NULL, @target may be modified 2423 * Note that when @worker is non-NULL, @target may be modified
2499 * underneath us, so we can't reliably determine cwq from @target. 2424 * underneath us, so we can't reliably determine pwq from @target.
2500 * 2425 *
2501 * CONTEXT: 2426 * CONTEXT:
2502 * spin_lock_irq(gcwq->lock). 2427 * spin_lock_irq(pool->lock).
2503 */ 2428 */
2504static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 2429static void insert_wq_barrier(struct pool_workqueue *pwq,
2505 struct wq_barrier *barr, 2430 struct wq_barrier *barr,
2506 struct work_struct *target, struct worker *worker) 2431 struct work_struct *target, struct worker *worker)
2507{ 2432{
@@ -2509,7 +2434,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2509 unsigned int linked = 0; 2434 unsigned int linked = 0;
2510 2435
2511 /* 2436 /*
2512 * debugobject calls are safe here even with gcwq->lock locked 2437 * debugobject calls are safe here even with pool->lock locked
2513 * as we know for sure that this will not trigger any of the 2438 * as we know for sure that this will not trigger any of the
2514 * checks and call back into the fixup functions where we 2439 * checks and call back into the fixup functions where we
2515 * might deadlock. 2440 * might deadlock.
@@ -2534,23 +2459,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2534 } 2459 }
2535 2460
2536 debug_work_activate(&barr->work); 2461 debug_work_activate(&barr->work);
2537 insert_work(cwq, &barr->work, head, 2462 insert_work(pwq, &barr->work, head,
2538 work_color_to_flags(WORK_NO_COLOR) | linked); 2463 work_color_to_flags(WORK_NO_COLOR) | linked);
2539} 2464}
2540 2465
2541/** 2466/**
2542 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing 2467 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
2543 * @wq: workqueue being flushed 2468 * @wq: workqueue being flushed
2544 * @flush_color: new flush color, < 0 for no-op 2469 * @flush_color: new flush color, < 0 for no-op
2545 * @work_color: new work color, < 0 for no-op 2470 * @work_color: new work color, < 0 for no-op
2546 * 2471 *
2547 * Prepare cwqs for workqueue flushing. 2472 * Prepare pwqs for workqueue flushing.
2548 * 2473 *
2549 * If @flush_color is non-negative, flush_color on all cwqs should be 2474 * If @flush_color is non-negative, flush_color on all pwqs should be
2550 * -1. If no cwq has in-flight commands at the specified color, all 2475 * -1. If no pwq has in-flight commands at the specified color, all
2551 * cwq->flush_color's stay at -1 and %false is returned. If any cwq 2476 * pwq->flush_color's stay at -1 and %false is returned. If any pwq
2552 * has in flight commands, its cwq->flush_color is set to 2477 * has in flight commands, its pwq->flush_color is set to
2553 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq 2478 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
2554 * wakeup logic is armed and %true is returned. 2479 * wakeup logic is armed and %true is returned.
2555 * 2480 *
2556 * The caller should have initialized @wq->first_flusher prior to 2481 * The caller should have initialized @wq->first_flusher prior to
@@ -2558,7 +2483,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2558 * @flush_color is negative, no flush color update is done and %false 2483 * @flush_color is negative, no flush color update is done and %false
2559 * is returned. 2484 * is returned.
2560 * 2485 *
2561 * If @work_color is non-negative, all cwqs should have the same 2486 * If @work_color is non-negative, all pwqs should have the same
2562 * work_color which is previous to @work_color and all will be 2487 * work_color which is previous to @work_color and all will be
2563 * advanced to @work_color. 2488 * advanced to @work_color.
2564 * 2489 *
@@ -2569,42 +2494,42 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2569 * %true if @flush_color >= 0 and there's something to flush. %false 2494 * %true if @flush_color >= 0 and there's something to flush. %false
2570 * otherwise. 2495 * otherwise.
2571 */ 2496 */
2572static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, 2497static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2573 int flush_color, int work_color) 2498 int flush_color, int work_color)
2574{ 2499{
2575 bool wait = false; 2500 bool wait = false;
2576 unsigned int cpu; 2501 unsigned int cpu;
2577 2502
2578 if (flush_color >= 0) { 2503 if (flush_color >= 0) {
2579 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); 2504 BUG_ON(atomic_read(&wq->nr_pwqs_to_flush));
2580 atomic_set(&wq->nr_cwqs_to_flush, 1); 2505 atomic_set(&wq->nr_pwqs_to_flush, 1);
2581 } 2506 }
2582 2507
2583 for_each_cwq_cpu(cpu, wq) { 2508 for_each_pwq_cpu(cpu, wq) {
2584 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2509 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2585 struct global_cwq *gcwq = cwq->pool->gcwq; 2510 struct worker_pool *pool = pwq->pool;
2586 2511
2587 spin_lock_irq(&gcwq->lock); 2512 spin_lock_irq(&pool->lock);
2588 2513
2589 if (flush_color >= 0) { 2514 if (flush_color >= 0) {
2590 BUG_ON(cwq->flush_color != -1); 2515 BUG_ON(pwq->flush_color != -1);
2591 2516
2592 if (cwq->nr_in_flight[flush_color]) { 2517 if (pwq->nr_in_flight[flush_color]) {
2593 cwq->flush_color = flush_color; 2518 pwq->flush_color = flush_color;
2594 atomic_inc(&wq->nr_cwqs_to_flush); 2519 atomic_inc(&wq->nr_pwqs_to_flush);
2595 wait = true; 2520 wait = true;
2596 } 2521 }
2597 } 2522 }
2598 2523
2599 if (work_color >= 0) { 2524 if (work_color >= 0) {
2600 BUG_ON(work_color != work_next_color(cwq->work_color)); 2525 BUG_ON(work_color != work_next_color(pwq->work_color));
2601 cwq->work_color = work_color; 2526 pwq->work_color = work_color;
2602 } 2527 }
2603 2528
2604 spin_unlock_irq(&gcwq->lock); 2529 spin_unlock_irq(&pool->lock);
2605 } 2530 }
2606 2531
2607 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) 2532 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
2608 complete(&wq->first_flusher->done); 2533 complete(&wq->first_flusher->done);
2609 2534
2610 return wait; 2535 return wait;
@@ -2655,7 +2580,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2655 2580
2656 wq->first_flusher = &this_flusher; 2581 wq->first_flusher = &this_flusher;
2657 2582
2658 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, 2583 if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
2659 wq->work_color)) { 2584 wq->work_color)) {
2660 /* nothing to flush, done */ 2585 /* nothing to flush, done */
2661 wq->flush_color = next_color; 2586 wq->flush_color = next_color;
@@ -2666,7 +2591,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2666 /* wait in queue */ 2591 /* wait in queue */
2667 BUG_ON(wq->flush_color == this_flusher.flush_color); 2592 BUG_ON(wq->flush_color == this_flusher.flush_color);
2668 list_add_tail(&this_flusher.list, &wq->flusher_queue); 2593 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2669 flush_workqueue_prep_cwqs(wq, -1, wq->work_color); 2594 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2670 } 2595 }
2671 } else { 2596 } else {
2672 /* 2597 /*
@@ -2733,7 +2658,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2733 2658
2734 list_splice_tail_init(&wq->flusher_overflow, 2659 list_splice_tail_init(&wq->flusher_overflow,
2735 &wq->flusher_queue); 2660 &wq->flusher_queue);
2736 flush_workqueue_prep_cwqs(wq, -1, wq->work_color); 2661 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2737 } 2662 }
2738 2663
2739 if (list_empty(&wq->flusher_queue)) { 2664 if (list_empty(&wq->flusher_queue)) {
@@ -2743,7 +2668,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2743 2668
2744 /* 2669 /*
2745 * Need to flush more colors. Make the next flusher 2670 * Need to flush more colors. Make the next flusher
2746 * the new first flusher and arm cwqs. 2671 * the new first flusher and arm pwqs.
2747 */ 2672 */
2748 BUG_ON(wq->flush_color == wq->work_color); 2673 BUG_ON(wq->flush_color == wq->work_color);
2749 BUG_ON(wq->flush_color != next->flush_color); 2674 BUG_ON(wq->flush_color != next->flush_color);
@@ -2751,7 +2676,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2751 list_del_init(&next->list); 2676 list_del_init(&next->list);
2752 wq->first_flusher = next; 2677 wq->first_flusher = next;
2753 2678
2754 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) 2679 if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
2755 break; 2680 break;
2756 2681
2757 /* 2682 /*
@@ -2794,13 +2719,13 @@ void drain_workqueue(struct workqueue_struct *wq)
2794reflush: 2719reflush:
2795 flush_workqueue(wq); 2720 flush_workqueue(wq);
2796 2721
2797 for_each_cwq_cpu(cpu, wq) { 2722 for_each_pwq_cpu(cpu, wq) {
2798 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2723 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2799 bool drained; 2724 bool drained;
2800 2725
2801 spin_lock_irq(&cwq->pool->gcwq->lock); 2726 spin_lock_irq(&pwq->pool->lock);
2802 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2727 drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
2803 spin_unlock_irq(&cwq->pool->gcwq->lock); 2728 spin_unlock_irq(&pwq->pool->lock);
2804 2729
2805 if (drained) 2730 if (drained)
2806 continue; 2731 continue;
@@ -2822,34 +2747,29 @@ EXPORT_SYMBOL_GPL(drain_workqueue);
2822static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) 2747static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2823{ 2748{
2824 struct worker *worker = NULL; 2749 struct worker *worker = NULL;
2825 struct global_cwq *gcwq; 2750 struct worker_pool *pool;
2826 struct cpu_workqueue_struct *cwq; 2751 struct pool_workqueue *pwq;
2827 2752
2828 might_sleep(); 2753 might_sleep();
2829 gcwq = get_work_gcwq(work); 2754 pool = get_work_pool(work);
2830 if (!gcwq) 2755 if (!pool)
2831 return false; 2756 return false;
2832 2757
2833 spin_lock_irq(&gcwq->lock); 2758 spin_lock_irq(&pool->lock);
2834 if (!list_empty(&work->entry)) { 2759 /* see the comment in try_to_grab_pending() with the same code */
2835 /* 2760 pwq = get_work_pwq(work);
2836 * See the comment near try_to_grab_pending()->smp_rmb(). 2761 if (pwq) {
2837 * If it was re-queued to a different gcwq under us, we 2762 if (unlikely(pwq->pool != pool))
2838 * are not going to wait.
2839 */
2840 smp_rmb();
2841 cwq = get_work_cwq(work);
2842 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2843 goto already_gone; 2763 goto already_gone;
2844 } else { 2764 } else {
2845 worker = find_worker_executing_work(gcwq, work); 2765 worker = find_worker_executing_work(pool, work);
2846 if (!worker) 2766 if (!worker)
2847 goto already_gone; 2767 goto already_gone;
2848 cwq = worker->current_cwq; 2768 pwq = worker->current_pwq;
2849 } 2769 }
2850 2770
2851 insert_wq_barrier(cwq, barr, work, worker); 2771 insert_wq_barrier(pwq, barr, work, worker);
2852 spin_unlock_irq(&gcwq->lock); 2772 spin_unlock_irq(&pool->lock);
2853 2773
2854 /* 2774 /*
2855 * If @max_active is 1 or rescuer is in use, flushing another work 2775 * If @max_active is 1 or rescuer is in use, flushing another work
@@ -2857,15 +2777,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2857 * flusher is not running on the same workqueue by verifying write 2777 * flusher is not running on the same workqueue by verifying write
2858 * access. 2778 * access.
2859 */ 2779 */
2860 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) 2780 if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
2861 lock_map_acquire(&cwq->wq->lockdep_map); 2781 lock_map_acquire(&pwq->wq->lockdep_map);
2862 else 2782 else
2863 lock_map_acquire_read(&cwq->wq->lockdep_map); 2783 lock_map_acquire_read(&pwq->wq->lockdep_map);
2864 lock_map_release(&cwq->wq->lockdep_map); 2784 lock_map_release(&pwq->wq->lockdep_map);
2865 2785
2866 return true; 2786 return true;
2867already_gone: 2787already_gone:
2868 spin_unlock_irq(&gcwq->lock); 2788 spin_unlock_irq(&pool->lock);
2869 return false; 2789 return false;
2870} 2790}
2871 2791
@@ -2961,8 +2881,7 @@ bool flush_delayed_work(struct delayed_work *dwork)
2961{ 2881{
2962 local_irq_disable(); 2882 local_irq_disable();
2963 if (del_timer_sync(&dwork->timer)) 2883 if (del_timer_sync(&dwork->timer))
2964 __queue_work(dwork->cpu, 2884 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
2965 get_work_cwq(&dwork->work)->wq, &dwork->work);
2966 local_irq_enable(); 2885 local_irq_enable();
2967 return flush_work(&dwork->work); 2886 return flush_work(&dwork->work);
2968} 2887}
@@ -2992,7 +2911,8 @@ bool cancel_delayed_work(struct delayed_work *dwork)
2992 if (unlikely(ret < 0)) 2911 if (unlikely(ret < 0))
2993 return false; 2912 return false;
2994 2913
2995 set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); 2914 set_work_pool_and_clear_pending(&dwork->work,
2915 get_work_pool_id(&dwork->work));
2996 local_irq_restore(flags); 2916 local_irq_restore(flags);
2997 return ret; 2917 return ret;
2998} 2918}
@@ -3171,46 +3091,46 @@ int keventd_up(void)
3171 return system_wq != NULL; 3091 return system_wq != NULL;
3172} 3092}
3173 3093
3174static int alloc_cwqs(struct workqueue_struct *wq) 3094static int alloc_pwqs(struct workqueue_struct *wq)
3175{ 3095{
3176 /* 3096 /*
3177 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. 3097 * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
3178 * Make sure that the alignment isn't lower than that of 3098 * Make sure that the alignment isn't lower than that of
3179 * unsigned long long. 3099 * unsigned long long.
3180 */ 3100 */
3181 const size_t size = sizeof(struct cpu_workqueue_struct); 3101 const size_t size = sizeof(struct pool_workqueue);
3182 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 3102 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
3183 __alignof__(unsigned long long)); 3103 __alignof__(unsigned long long));
3184 3104
3185 if (!(wq->flags & WQ_UNBOUND)) 3105 if (!(wq->flags & WQ_UNBOUND))
3186 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 3106 wq->pool_wq.pcpu = __alloc_percpu(size, align);
3187 else { 3107 else {
3188 void *ptr; 3108 void *ptr;
3189 3109
3190 /* 3110 /*
3191 * Allocate enough room to align cwq and put an extra 3111 * Allocate enough room to align pwq and put an extra
3192 * pointer at the end pointing back to the originally 3112 * pointer at the end pointing back to the originally
3193 * allocated pointer which will be used for free. 3113 * allocated pointer which will be used for free.
3194 */ 3114 */
3195 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); 3115 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
3196 if (ptr) { 3116 if (ptr) {
3197 wq->cpu_wq.single = PTR_ALIGN(ptr, align); 3117 wq->pool_wq.single = PTR_ALIGN(ptr, align);
3198 *(void **)(wq->cpu_wq.single + 1) = ptr; 3118 *(void **)(wq->pool_wq.single + 1) = ptr;
3199 } 3119 }
3200 } 3120 }
3201 3121
3202 /* just in case, make sure it's actually aligned */ 3122 /* just in case, make sure it's actually aligned */
3203 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 3123 BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align));
3204 return wq->cpu_wq.v ? 0 : -ENOMEM; 3124 return wq->pool_wq.v ? 0 : -ENOMEM;
3205} 3125}
3206 3126
3207static void free_cwqs(struct workqueue_struct *wq) 3127static void free_pwqs(struct workqueue_struct *wq)
3208{ 3128{
3209 if (!(wq->flags & WQ_UNBOUND)) 3129 if (!(wq->flags & WQ_UNBOUND))
3210 free_percpu(wq->cpu_wq.pcpu); 3130 free_percpu(wq->pool_wq.pcpu);
3211 else if (wq->cpu_wq.single) { 3131 else if (wq->pool_wq.single) {
3212 /* the pointer to free is stored right after the cwq */ 3132 /* the pointer to free is stored right after the pwq */
3213 kfree(*(void **)(wq->cpu_wq.single + 1)); 3133 kfree(*(void **)(wq->pool_wq.single + 1));
3214 } 3134 }
3215} 3135}
3216 3136
@@ -3264,27 +3184,25 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3264 wq->flags = flags; 3184 wq->flags = flags;
3265 wq->saved_max_active = max_active; 3185 wq->saved_max_active = max_active;
3266 mutex_init(&wq->flush_mutex); 3186 mutex_init(&wq->flush_mutex);
3267 atomic_set(&wq->nr_cwqs_to_flush, 0); 3187 atomic_set(&wq->nr_pwqs_to_flush, 0);
3268 INIT_LIST_HEAD(&wq->flusher_queue); 3188 INIT_LIST_HEAD(&wq->flusher_queue);
3269 INIT_LIST_HEAD(&wq->flusher_overflow); 3189 INIT_LIST_HEAD(&wq->flusher_overflow);
3270 3190
3271 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 3191 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3272 INIT_LIST_HEAD(&wq->list); 3192 INIT_LIST_HEAD(&wq->list);
3273 3193
3274 if (alloc_cwqs(wq) < 0) 3194 if (alloc_pwqs(wq) < 0)
3275 goto err; 3195 goto err;
3276 3196
3277 for_each_cwq_cpu(cpu, wq) { 3197 for_each_pwq_cpu(cpu, wq) {
3278 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3198 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3279 struct global_cwq *gcwq = get_gcwq(cpu); 3199
3280 int pool_idx = (bool)(flags & WQ_HIGHPRI); 3200 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3281 3201 pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
3282 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3202 pwq->wq = wq;
3283 cwq->pool = &gcwq->pools[pool_idx]; 3203 pwq->flush_color = -1;
3284 cwq->wq = wq; 3204 pwq->max_active = max_active;
3285 cwq->flush_color = -1; 3205 INIT_LIST_HEAD(&pwq->delayed_works);
3286 cwq->max_active = max_active;
3287 INIT_LIST_HEAD(&cwq->delayed_works);
3288 } 3206 }
3289 3207
3290 if (flags & WQ_RESCUER) { 3208 if (flags & WQ_RESCUER) {
@@ -3297,7 +3215,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3297 if (!rescuer) 3215 if (!rescuer)
3298 goto err; 3216 goto err;
3299 3217
3300 rescuer->task = kthread_create(rescuer_thread, wq, "%s", 3218 rescuer->rescue_wq = wq;
3219 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3301 wq->name); 3220 wq->name);
3302 if (IS_ERR(rescuer->task)) 3221 if (IS_ERR(rescuer->task))
3303 goto err; 3222 goto err;
@@ -3314,8 +3233,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3314 spin_lock(&workqueue_lock); 3233 spin_lock(&workqueue_lock);
3315 3234
3316 if (workqueue_freezing && wq->flags & WQ_FREEZABLE) 3235 if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
3317 for_each_cwq_cpu(cpu, wq) 3236 for_each_pwq_cpu(cpu, wq)
3318 get_cwq(cpu, wq)->max_active = 0; 3237 get_pwq(cpu, wq)->max_active = 0;
3319 3238
3320 list_add(&wq->list, &workqueues); 3239 list_add(&wq->list, &workqueues);
3321 3240
@@ -3324,7 +3243,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3324 return wq; 3243 return wq;
3325err: 3244err:
3326 if (wq) { 3245 if (wq) {
3327 free_cwqs(wq); 3246 free_pwqs(wq);
3328 free_mayday_mask(wq->mayday_mask); 3247 free_mayday_mask(wq->mayday_mask);
3329 kfree(wq->rescuer); 3248 kfree(wq->rescuer);
3330 kfree(wq); 3249 kfree(wq);
@@ -3355,14 +3274,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
3355 spin_unlock(&workqueue_lock); 3274 spin_unlock(&workqueue_lock);
3356 3275
3357 /* sanity check */ 3276 /* sanity check */
3358 for_each_cwq_cpu(cpu, wq) { 3277 for_each_pwq_cpu(cpu, wq) {
3359 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3278 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3360 int i; 3279 int i;
3361 3280
3362 for (i = 0; i < WORK_NR_COLORS; i++) 3281 for (i = 0; i < WORK_NR_COLORS; i++)
3363 BUG_ON(cwq->nr_in_flight[i]); 3282 BUG_ON(pwq->nr_in_flight[i]);
3364 BUG_ON(cwq->nr_active); 3283 BUG_ON(pwq->nr_active);
3365 BUG_ON(!list_empty(&cwq->delayed_works)); 3284 BUG_ON(!list_empty(&pwq->delayed_works));
3366 } 3285 }
3367 3286
3368 if (wq->flags & WQ_RESCUER) { 3287 if (wq->flags & WQ_RESCUER) {
@@ -3371,29 +3290,29 @@ void destroy_workqueue(struct workqueue_struct *wq)
3371 kfree(wq->rescuer); 3290 kfree(wq->rescuer);
3372 } 3291 }
3373 3292
3374 free_cwqs(wq); 3293 free_pwqs(wq);
3375 kfree(wq); 3294 kfree(wq);
3376} 3295}
3377EXPORT_SYMBOL_GPL(destroy_workqueue); 3296EXPORT_SYMBOL_GPL(destroy_workqueue);
3378 3297
3379/** 3298/**
3380 * cwq_set_max_active - adjust max_active of a cwq 3299 * pwq_set_max_active - adjust max_active of a pwq
3381 * @cwq: target cpu_workqueue_struct 3300 * @pwq: target pool_workqueue
3382 * @max_active: new max_active value. 3301 * @max_active: new max_active value.
3383 * 3302 *
3384 * Set @cwq->max_active to @max_active and activate delayed works if 3303 * Set @pwq->max_active to @max_active and activate delayed works if
3385 * increased. 3304 * increased.
3386 * 3305 *
3387 * CONTEXT: 3306 * CONTEXT:
3388 * spin_lock_irq(gcwq->lock). 3307 * spin_lock_irq(pool->lock).
3389 */ 3308 */
3390static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) 3309static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
3391{ 3310{
3392 cwq->max_active = max_active; 3311 pwq->max_active = max_active;
3393 3312
3394 while (!list_empty(&cwq->delayed_works) && 3313 while (!list_empty(&pwq->delayed_works) &&
3395 cwq->nr_active < cwq->max_active) 3314 pwq->nr_active < pwq->max_active)
3396 cwq_activate_first_delayed(cwq); 3315 pwq_activate_first_delayed(pwq);
3397} 3316}
3398 3317
3399/** 3318/**
@@ -3416,16 +3335,17 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3416 3335
3417 wq->saved_max_active = max_active; 3336 wq->saved_max_active = max_active;
3418 3337
3419 for_each_cwq_cpu(cpu, wq) { 3338 for_each_pwq_cpu(cpu, wq) {
3420 struct global_cwq *gcwq = get_gcwq(cpu); 3339 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3340 struct worker_pool *pool = pwq->pool;
3421 3341
3422 spin_lock_irq(&gcwq->lock); 3342 spin_lock_irq(&pool->lock);
3423 3343
3424 if (!(wq->flags & WQ_FREEZABLE) || 3344 if (!(wq->flags & WQ_FREEZABLE) ||
3425 !(gcwq->flags & GCWQ_FREEZING)) 3345 !(pool->flags & POOL_FREEZING))
3426 cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); 3346 pwq_set_max_active(pwq, max_active);
3427 3347
3428 spin_unlock_irq(&gcwq->lock); 3348 spin_unlock_irq(&pool->lock);
3429 } 3349 }
3430 3350
3431 spin_unlock(&workqueue_lock); 3351 spin_unlock(&workqueue_lock);
@@ -3446,57 +3366,38 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3446 */ 3366 */
3447bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) 3367bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3448{ 3368{
3449 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3369 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3450 3370
3451 return !list_empty(&cwq->delayed_works); 3371 return !list_empty(&pwq->delayed_works);
3452} 3372}
3453EXPORT_SYMBOL_GPL(workqueue_congested); 3373EXPORT_SYMBOL_GPL(workqueue_congested);
3454 3374
3455/** 3375/**
3456 * work_cpu - return the last known associated cpu for @work
3457 * @work: the work of interest
3458 *
3459 * RETURNS:
3460 * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
3461 */
3462unsigned int work_cpu(struct work_struct *work)
3463{
3464 struct global_cwq *gcwq = get_work_gcwq(work);
3465
3466 return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3467}
3468EXPORT_SYMBOL_GPL(work_cpu);
3469
3470/**
3471 * work_busy - test whether a work is currently pending or running 3376 * work_busy - test whether a work is currently pending or running
3472 * @work: the work to be tested 3377 * @work: the work to be tested
3473 * 3378 *
3474 * Test whether @work is currently pending or running. There is no 3379 * Test whether @work is currently pending or running. There is no
3475 * synchronization around this function and the test result is 3380 * synchronization around this function and the test result is
3476 * unreliable and only useful as advisory hints or for debugging. 3381 * unreliable and only useful as advisory hints or for debugging.
3477 * Especially for reentrant wqs, the pending state might hide the
3478 * running state.
3479 * 3382 *
3480 * RETURNS: 3383 * RETURNS:
3481 * OR'd bitmask of WORK_BUSY_* bits. 3384 * OR'd bitmask of WORK_BUSY_* bits.
3482 */ 3385 */
3483unsigned int work_busy(struct work_struct *work) 3386unsigned int work_busy(struct work_struct *work)
3484{ 3387{
3485 struct global_cwq *gcwq = get_work_gcwq(work); 3388 struct worker_pool *pool = get_work_pool(work);
3486 unsigned long flags; 3389 unsigned long flags;
3487 unsigned int ret = 0; 3390 unsigned int ret = 0;
3488 3391
3489 if (!gcwq)
3490 return 0;
3491
3492 spin_lock_irqsave(&gcwq->lock, flags);
3493
3494 if (work_pending(work)) 3392 if (work_pending(work))
3495 ret |= WORK_BUSY_PENDING; 3393 ret |= WORK_BUSY_PENDING;
3496 if (find_worker_executing_work(gcwq, work))
3497 ret |= WORK_BUSY_RUNNING;
3498 3394
3499 spin_unlock_irqrestore(&gcwq->lock, flags); 3395 if (pool) {
3396 spin_lock_irqsave(&pool->lock, flags);
3397 if (find_worker_executing_work(pool, work))
3398 ret |= WORK_BUSY_RUNNING;
3399 spin_unlock_irqrestore(&pool->lock, flags);
3400 }
3500 3401
3501 return ret; 3402 return ret;
3502} 3403}
@@ -3506,65 +3407,49 @@ EXPORT_SYMBOL_GPL(work_busy);
3506 * CPU hotplug. 3407 * CPU hotplug.
3507 * 3408 *
3508 * There are two challenges in supporting CPU hotplug. Firstly, there 3409 * There are two challenges in supporting CPU hotplug. Firstly, there
3509 * are a lot of assumptions on strong associations among work, cwq and 3410 * are a lot of assumptions on strong associations among work, pwq and
3510 * gcwq which make migrating pending and scheduled works very 3411 * pool which make migrating pending and scheduled works very
3511 * difficult to implement without impacting hot paths. Secondly, 3412 * difficult to implement without impacting hot paths. Secondly,
3512 * gcwqs serve mix of short, long and very long running works making 3413 * worker pools serve mix of short, long and very long running works making
3513 * blocked draining impractical. 3414 * blocked draining impractical.
3514 * 3415 *
3515 * This is solved by allowing a gcwq to be disassociated from the CPU 3416 * This is solved by allowing the pools to be disassociated from the CPU
3516 * running as an unbound one and allowing it to be reattached later if the 3417 * running as an unbound one and allowing it to be reattached later if the
3517 * cpu comes back online. 3418 * cpu comes back online.
3518 */ 3419 */
3519 3420
3520/* claim manager positions of all pools */ 3421static void wq_unbind_fn(struct work_struct *work)
3521static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
3522{ 3422{
3523 struct worker_pool *pool; 3423 int cpu = smp_processor_id();
3524
3525 for_each_worker_pool(pool, gcwq)
3526 mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
3527 spin_lock_irq(&gcwq->lock);
3528}
3529
3530/* release manager positions */
3531static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
3532{
3533 struct worker_pool *pool;
3534
3535 spin_unlock_irq(&gcwq->lock);
3536 for_each_worker_pool(pool, gcwq)
3537 mutex_unlock(&pool->assoc_mutex);
3538}
3539
3540static void gcwq_unbind_fn(struct work_struct *work)
3541{
3542 struct global_cwq *gcwq = get_gcwq(smp_processor_id());
3543 struct worker_pool *pool; 3424 struct worker_pool *pool;
3544 struct worker *worker; 3425 struct worker *worker;
3545 struct hlist_node *pos; 3426 struct hlist_node *pos;
3546 int i; 3427 int i;
3547 3428
3548 BUG_ON(gcwq->cpu != smp_processor_id()); 3429 for_each_std_worker_pool(pool, cpu) {
3430 BUG_ON(cpu != smp_processor_id());
3549 3431
3550 gcwq_claim_assoc_and_lock(gcwq); 3432 mutex_lock(&pool->assoc_mutex);
3433 spin_lock_irq(&pool->lock);
3551 3434
3552 /* 3435 /*
3553 * We've claimed all manager positions. Make all workers unbound 3436 * We've claimed all manager positions. Make all workers
3554 * and set DISASSOCIATED. Before this, all workers except for the 3437 * unbound and set DISASSOCIATED. Before this, all workers
3555 * ones which are still executing works from before the last CPU 3438 * except for the ones which are still executing works from
3556 * down must be on the cpu. After this, they may become diasporas. 3439 * before the last CPU down must be on the cpu. After
3557 */ 3440 * this, they may become diasporas.
3558 for_each_worker_pool(pool, gcwq) 3441 */
3559 list_for_each_entry(worker, &pool->idle_list, entry) 3442 list_for_each_entry(worker, &pool->idle_list, entry)
3560 worker->flags |= WORKER_UNBOUND; 3443 worker->flags |= WORKER_UNBOUND;
3561 3444
3562 for_each_busy_worker(worker, i, pos, gcwq) 3445 for_each_busy_worker(worker, i, pos, pool)
3563 worker->flags |= WORKER_UNBOUND; 3446 worker->flags |= WORKER_UNBOUND;
3564 3447
3565 gcwq->flags |= GCWQ_DISASSOCIATED; 3448 pool->flags |= POOL_DISASSOCIATED;
3566 3449
3567 gcwq_release_assoc_and_unlock(gcwq); 3450 spin_unlock_irq(&pool->lock);
3451 mutex_unlock(&pool->assoc_mutex);
3452 }
3568 3453
3569 /* 3454 /*
3570 * Call schedule() so that we cross rq->lock and thus can guarantee 3455 * Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3576,16 +3461,16 @@ static void gcwq_unbind_fn(struct work_struct *work)
3576 /* 3461 /*
3577 * Sched callbacks are disabled now. Zap nr_running. After this, 3462 * Sched callbacks are disabled now. Zap nr_running. After this,
3578 * nr_running stays zero and need_more_worker() and keep_working() 3463 * nr_running stays zero and need_more_worker() and keep_working()
3579 * are always true as long as the worklist is not empty. @gcwq now 3464 * are always true as long as the worklist is not empty. Pools on
3580 * behaves as unbound (in terms of concurrency management) gcwq 3465 * @cpu now behave as unbound (in terms of concurrency management)
3581 * which is served by workers tied to the CPU. 3466 * pools which are served by workers tied to the CPU.
3582 * 3467 *
3583 * On return from this function, the current worker would trigger 3468 * On return from this function, the current worker would trigger
3584 * unbound chain execution of pending work items if other workers 3469 * unbound chain execution of pending work items if other workers
3585 * didn't already. 3470 * didn't already.
3586 */ 3471 */
3587 for_each_worker_pool(pool, gcwq) 3472 for_each_std_worker_pool(pool, cpu)
3588 atomic_set(get_pool_nr_running(pool), 0); 3473 atomic_set(&pool->nr_running, 0);
3589} 3474}
3590 3475
3591/* 3476/*
@@ -3597,12 +3482,11 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3597 void *hcpu) 3482 void *hcpu)
3598{ 3483{
3599 unsigned int cpu = (unsigned long)hcpu; 3484 unsigned int cpu = (unsigned long)hcpu;
3600 struct global_cwq *gcwq = get_gcwq(cpu);
3601 struct worker_pool *pool; 3485 struct worker_pool *pool;
3602 3486
3603 switch (action & ~CPU_TASKS_FROZEN) { 3487 switch (action & ~CPU_TASKS_FROZEN) {
3604 case CPU_UP_PREPARE: 3488 case CPU_UP_PREPARE:
3605 for_each_worker_pool(pool, gcwq) { 3489 for_each_std_worker_pool(pool, cpu) {
3606 struct worker *worker; 3490 struct worker *worker;
3607 3491
3608 if (pool->nr_workers) 3492 if (pool->nr_workers)
@@ -3612,18 +3496,24 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3612 if (!worker) 3496 if (!worker)
3613 return NOTIFY_BAD; 3497 return NOTIFY_BAD;
3614 3498
3615 spin_lock_irq(&gcwq->lock); 3499 spin_lock_irq(&pool->lock);
3616 start_worker(worker); 3500 start_worker(worker);
3617 spin_unlock_irq(&gcwq->lock); 3501 spin_unlock_irq(&pool->lock);
3618 } 3502 }
3619 break; 3503 break;
3620 3504
3621 case CPU_DOWN_FAILED: 3505 case CPU_DOWN_FAILED:
3622 case CPU_ONLINE: 3506 case CPU_ONLINE:
3623 gcwq_claim_assoc_and_lock(gcwq); 3507 for_each_std_worker_pool(pool, cpu) {
3624 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3508 mutex_lock(&pool->assoc_mutex);
3625 rebind_workers(gcwq); 3509 spin_lock_irq(&pool->lock);
3626 gcwq_release_assoc_and_unlock(gcwq); 3510
3511 pool->flags &= ~POOL_DISASSOCIATED;
3512 rebind_workers(pool);
3513
3514 spin_unlock_irq(&pool->lock);
3515 mutex_unlock(&pool->assoc_mutex);
3516 }
3627 break; 3517 break;
3628 } 3518 }
3629 return NOTIFY_OK; 3519 return NOTIFY_OK;
@@ -3643,7 +3533,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3643 switch (action & ~CPU_TASKS_FROZEN) { 3533 switch (action & ~CPU_TASKS_FROZEN) {
3644 case CPU_DOWN_PREPARE: 3534 case CPU_DOWN_PREPARE:
3645 /* unbinding should happen on the local CPU */ 3535 /* unbinding should happen on the local CPU */
3646 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); 3536 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
3647 queue_work_on(cpu, system_highpri_wq, &unbind_work); 3537 queue_work_on(cpu, system_highpri_wq, &unbind_work);
3648 flush_work(&unbind_work); 3538 flush_work(&unbind_work);
3649 break; 3539 break;
@@ -3696,10 +3586,10 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3696 * 3586 *
3697 * Start freezing workqueues. After this function returns, all freezable 3587 * Start freezing workqueues. After this function returns, all freezable
3698 * workqueues will queue new works to their frozen_works list instead of 3588 * workqueues will queue new works to their frozen_works list instead of
3699 * gcwq->worklist. 3589 * pool->worklist.
3700 * 3590 *
3701 * CONTEXT: 3591 * CONTEXT:
3702 * Grabs and releases workqueue_lock and gcwq->lock's. 3592 * Grabs and releases workqueue_lock and pool->lock's.
3703 */ 3593 */
3704void freeze_workqueues_begin(void) 3594void freeze_workqueues_begin(void)
3705{ 3595{
@@ -3710,23 +3600,26 @@ void freeze_workqueues_begin(void)
3710 BUG_ON(workqueue_freezing); 3600 BUG_ON(workqueue_freezing);
3711 workqueue_freezing = true; 3601 workqueue_freezing = true;
3712 3602
3713 for_each_gcwq_cpu(cpu) { 3603 for_each_wq_cpu(cpu) {
3714 struct global_cwq *gcwq = get_gcwq(cpu); 3604 struct worker_pool *pool;
3715 struct workqueue_struct *wq; 3605 struct workqueue_struct *wq;
3716 3606
3717 spin_lock_irq(&gcwq->lock); 3607 for_each_std_worker_pool(pool, cpu) {
3608 spin_lock_irq(&pool->lock);
3718 3609
3719 BUG_ON(gcwq->flags & GCWQ_FREEZING); 3610 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3720 gcwq->flags |= GCWQ_FREEZING; 3611 pool->flags |= POOL_FREEZING;
3721 3612
3722 list_for_each_entry(wq, &workqueues, list) { 3613 list_for_each_entry(wq, &workqueues, list) {
3723 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3614 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3724 3615
3725 if (cwq && wq->flags & WQ_FREEZABLE) 3616 if (pwq && pwq->pool == pool &&
3726 cwq->max_active = 0; 3617 (wq->flags & WQ_FREEZABLE))
3727 } 3618 pwq->max_active = 0;
3619 }
3728 3620
3729 spin_unlock_irq(&gcwq->lock); 3621 spin_unlock_irq(&pool->lock);
3622 }
3730 } 3623 }
3731 3624
3732 spin_unlock(&workqueue_lock); 3625 spin_unlock(&workqueue_lock);
@@ -3754,20 +3647,20 @@ bool freeze_workqueues_busy(void)
3754 3647
3755 BUG_ON(!workqueue_freezing); 3648 BUG_ON(!workqueue_freezing);
3756 3649
3757 for_each_gcwq_cpu(cpu) { 3650 for_each_wq_cpu(cpu) {
3758 struct workqueue_struct *wq; 3651 struct workqueue_struct *wq;
3759 /* 3652 /*
3760 * nr_active is monotonically decreasing. It's safe 3653 * nr_active is monotonically decreasing. It's safe
3761 * to peek without lock. 3654 * to peek without lock.
3762 */ 3655 */
3763 list_for_each_entry(wq, &workqueues, list) { 3656 list_for_each_entry(wq, &workqueues, list) {
3764 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3657 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3765 3658
3766 if (!cwq || !(wq->flags & WQ_FREEZABLE)) 3659 if (!pwq || !(wq->flags & WQ_FREEZABLE))
3767 continue; 3660 continue;
3768 3661
3769 BUG_ON(cwq->nr_active < 0); 3662 BUG_ON(pwq->nr_active < 0);
3770 if (cwq->nr_active) { 3663 if (pwq->nr_active) {
3771 busy = true; 3664 busy = true;
3772 goto out_unlock; 3665 goto out_unlock;
3773 } 3666 }
@@ -3782,10 +3675,10 @@ out_unlock:
3782 * thaw_workqueues - thaw workqueues 3675 * thaw_workqueues - thaw workqueues
3783 * 3676 *
3784 * Thaw workqueues. Normal queueing is restored and all collected 3677 * Thaw workqueues. Normal queueing is restored and all collected
3785 * frozen works are transferred to their respective gcwq worklists. 3678 * frozen works are transferred to their respective pool worklists.
3786 * 3679 *
3787 * CONTEXT: 3680 * CONTEXT:
3788 * Grabs and releases workqueue_lock and gcwq->lock's. 3681 * Grabs and releases workqueue_lock and pool->lock's.
3789 */ 3682 */
3790void thaw_workqueues(void) 3683void thaw_workqueues(void)
3791{ 3684{
@@ -3796,30 +3689,31 @@ void thaw_workqueues(void)
3796 if (!workqueue_freezing) 3689 if (!workqueue_freezing)
3797 goto out_unlock; 3690 goto out_unlock;
3798 3691
3799 for_each_gcwq_cpu(cpu) { 3692 for_each_wq_cpu(cpu) {
3800 struct global_cwq *gcwq = get_gcwq(cpu);
3801 struct worker_pool *pool; 3693 struct worker_pool *pool;
3802 struct workqueue_struct *wq; 3694 struct workqueue_struct *wq;
3803 3695
3804 spin_lock_irq(&gcwq->lock); 3696 for_each_std_worker_pool(pool, cpu) {
3697 spin_lock_irq(&pool->lock);
3805 3698
3806 BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); 3699 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
3807 gcwq->flags &= ~GCWQ_FREEZING; 3700 pool->flags &= ~POOL_FREEZING;
3808 3701
3809 list_for_each_entry(wq, &workqueues, list) { 3702 list_for_each_entry(wq, &workqueues, list) {
3810 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3703 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3811 3704
3812 if (!cwq || !(wq->flags & WQ_FREEZABLE)) 3705 if (!pwq || pwq->pool != pool ||
3813 continue; 3706 !(wq->flags & WQ_FREEZABLE))
3707 continue;
3814 3708
3815 /* restore max_active and repopulate worklist */ 3709 /* restore max_active and repopulate worklist */
3816 cwq_set_max_active(cwq, wq->saved_max_active); 3710 pwq_set_max_active(pwq, wq->saved_max_active);
3817 } 3711 }
3818 3712
3819 for_each_worker_pool(pool, gcwq)
3820 wake_up_worker(pool); 3713 wake_up_worker(pool);
3821 3714
3822 spin_unlock_irq(&gcwq->lock); 3715 spin_unlock_irq(&pool->lock);
3716 }
3823 } 3717 }
3824 3718
3825 workqueue_freezing = false; 3719 workqueue_freezing = false;
@@ -3831,60 +3725,56 @@ out_unlock:
3831static int __init init_workqueues(void) 3725static int __init init_workqueues(void)
3832{ 3726{
3833 unsigned int cpu; 3727 unsigned int cpu;
3834 int i;
3835 3728
3836 /* make sure we have enough bits for OFFQ CPU number */ 3729 /* make sure we have enough bits for OFFQ pool ID */
3837 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < 3730 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
3838 WORK_CPU_LAST); 3731 WORK_CPU_END * NR_STD_WORKER_POOLS);
3839 3732
3840 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); 3733 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3841 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); 3734 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3842 3735
3843 /* initialize gcwqs */ 3736 /* initialize CPU pools */
3844 for_each_gcwq_cpu(cpu) { 3737 for_each_wq_cpu(cpu) {
3845 struct global_cwq *gcwq = get_gcwq(cpu);
3846 struct worker_pool *pool; 3738 struct worker_pool *pool;
3847 3739
3848 spin_lock_init(&gcwq->lock); 3740 for_each_std_worker_pool(pool, cpu) {
3849 gcwq->cpu = cpu; 3741 spin_lock_init(&pool->lock);
3850 gcwq->flags |= GCWQ_DISASSOCIATED; 3742 pool->cpu = cpu;
3851 3743 pool->flags |= POOL_DISASSOCIATED;
3852 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3853 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3854
3855 for_each_worker_pool(pool, gcwq) {
3856 pool->gcwq = gcwq;
3857 INIT_LIST_HEAD(&pool->worklist); 3744 INIT_LIST_HEAD(&pool->worklist);
3858 INIT_LIST_HEAD(&pool->idle_list); 3745 INIT_LIST_HEAD(&pool->idle_list);
3746 hash_init(pool->busy_hash);
3859 3747
3860 init_timer_deferrable(&pool->idle_timer); 3748 init_timer_deferrable(&pool->idle_timer);
3861 pool->idle_timer.function = idle_worker_timeout; 3749 pool->idle_timer.function = idle_worker_timeout;
3862 pool->idle_timer.data = (unsigned long)pool; 3750 pool->idle_timer.data = (unsigned long)pool;
3863 3751
3864 setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, 3752 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3865 (unsigned long)pool); 3753 (unsigned long)pool);
3866 3754
3867 mutex_init(&pool->assoc_mutex); 3755 mutex_init(&pool->assoc_mutex);
3868 ida_init(&pool->worker_ida); 3756 ida_init(&pool->worker_ida);
3757
3758 /* alloc pool ID */
3759 BUG_ON(worker_pool_assign_id(pool));
3869 } 3760 }
3870 } 3761 }
3871 3762
3872 /* create the initial worker */ 3763 /* create the initial worker */
3873 for_each_online_gcwq_cpu(cpu) { 3764 for_each_online_wq_cpu(cpu) {
3874 struct global_cwq *gcwq = get_gcwq(cpu);
3875 struct worker_pool *pool; 3765 struct worker_pool *pool;
3876 3766
3877 if (cpu != WORK_CPU_UNBOUND) 3767 for_each_std_worker_pool(pool, cpu) {
3878 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3879
3880 for_each_worker_pool(pool, gcwq) {
3881 struct worker *worker; 3768 struct worker *worker;
3882 3769
3770 if (cpu != WORK_CPU_UNBOUND)
3771 pool->flags &= ~POOL_DISASSOCIATED;
3772
3883 worker = create_worker(pool); 3773 worker = create_worker(pool);
3884 BUG_ON(!worker); 3774 BUG_ON(!worker);
3885 spin_lock_irq(&gcwq->lock); 3775 spin_lock_irq(&pool->lock);
3886 start_worker(worker); 3776 start_worker(worker);
3887 spin_unlock_irq(&gcwq->lock); 3777 spin_unlock_irq(&pool->lock);
3888 } 3778 }
3889 } 3779 }
3890 3780
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
new file mode 100644
index 000000000000..07650264ec15
--- /dev/null
+++ b/kernel/workqueue_internal.h
@@ -0,0 +1,65 @@
1/*
2 * kernel/workqueue_internal.h
3 *
4 * Workqueue internal header file. Only to be included by workqueue and
5 * core kernel subsystems.
6 */
7#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
8#define _KERNEL_WORKQUEUE_INTERNAL_H
9
10#include <linux/workqueue.h>
11#include <linux/kthread.h>
12
13struct worker_pool;
14
15/*
16 * The poor guys doing the actual heavy lifting. All on-duty workers are
17 * either serving the manager role, on idle list or on busy hash. For
18 * details on the locking annotation (L, I, X...), refer to workqueue.c.
19 *
20 * Only to be used in workqueue and async.
21 */
22struct worker {
23 /* on idle list while idle, on busy hash table while busy */
24 union {
25 struct list_head entry; /* L: while idle */
26 struct hlist_node hentry; /* L: while busy */
27 };
28
29 struct work_struct *current_work; /* L: work being processed */
30 work_func_t current_func; /* L: current_work's fn */
31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */
32 struct list_head scheduled; /* L: scheduled works */
33 struct task_struct *task; /* I: worker task */
34 struct worker_pool *pool; /* I: the associated pool */
35 /* 64 bytes boundary on 64bit, 32 on 32bit */
36 unsigned long last_active; /* L: last active timestamp */
37 unsigned int flags; /* X: flags */
38 int id; /* I: worker id */
39
40 /* for rebinding worker to CPU */
41 struct work_struct rebind_work; /* L: for busy worker */
42
43 /* used only by rescuers to point to the target workqueue */
44 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
45};
46
47/**
48 * current_wq_worker - return struct worker if %current is a workqueue worker
49 */
50static inline struct worker *current_wq_worker(void)
51{
52 if (current->flags & PF_WQ_WORKER)
53 return kthread_data(current);
54 return NULL;
55}
56
57/*
58 * Scheduler hooks for concurrency managed workqueue. Only to be used from
59 * sched.c and workqueue.c.
60 */
61void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
62struct task_struct *wq_worker_sleeping(struct task_struct *task,
63 unsigned int cpu);
64
65#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
deleted file mode 100644
index 2d10fc98dc79..000000000000
--- a/kernel/workqueue_sched.h
+++ /dev/null
@@ -1,9 +0,0 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
8struct task_struct *wq_worker_sleeping(struct task_struct *task,
9 unsigned int cpu);