Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 1
-rw-r--r--  kernel/Makefile | 3
-rw-r--r--  kernel/acct.c | 7
-rw-r--r--  kernel/async.c | 40
-rw-r--r--  kernel/audit.c | 538
-rw-r--r--  kernel/audit.h | 159
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/auditfilter.c | 367
-rw-r--r--  kernel/auditsc.c | 421
-rw-r--r--  kernel/capability.c | 24
-rw-r--r--  kernel/cgroup.c | 814
-rw-r--r--  kernel/compat.c | 84
-rw-r--r--  kernel/configs.c | 2
-rw-r--r--  kernel/cpu/Makefile | 1
-rw-r--r--  kernel/cpu/idle.c | 118
-rw-r--r--  kernel/cpuset.c | 158
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/events/core.c | 317
-rw-r--r--  kernel/events/internal.h | 2
-rw-r--r--  kernel/events/ring_buffer.c | 36
-rw-r--r--  kernel/events/uprobes.c | 300
-rw-r--r--  kernel/exit.c | 9
-rw-r--r--  kernel/extable.c | 6
-rw-r--r--  kernel/fork.c | 12
-rw-r--r--  kernel/hrtimer.c | 33
-rw-r--r--  kernel/irq/irqdomain.c | 29
-rw-r--r--  kernel/irq/proc.c | 20
-rw-r--r--  kernel/kallsyms.c | 26
-rw-r--r--  kernel/kexec.c | 161
-rw-r--r--  kernel/kmod.c | 103
-rw-r--r--  kernel/kprobes.c | 19
-rw-r--r--  kernel/kthread.c | 111
-rw-r--r--  kernel/lockdep.c | 47
-rw-r--r--  kernel/modsign_certificate.S | 13
-rw-r--r--  kernel/module.c | 39
-rw-r--r--  kernel/mutex.c | 151
-rw-r--r--  kernel/nsproxy.c | 6
-rw-r--r--  kernel/panic.c | 6
-rw-r--r--  kernel/params.c | 5
-rw-r--r--  kernel/pid.c | 12
-rw-r--r--  kernel/pid_namespace.c | 7
-rw-r--r--  kernel/posix-cpu-timers.c | 76
-rw-r--r--  kernel/posix-timers.c | 121
-rw-r--r--  kernel/power/console.c | 116
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/suspend.c | 22
-rw-r--r--  kernel/printk.c | 178
-rw-r--r--  kernel/profile.c | 6
-rw-r--r--  kernel/ptrace.c | 81
-rw-r--r--  kernel/range.c | 11
-rw-r--r--  kernel/rcutree.c | 276
-rw-r--r--  kernel/rcutree.h | 43
-rw-r--r--  kernel/rcutree_plugin.h | 622
-rw-r--r--  kernel/rcutree_trace.c | 10
-rw-r--r--  kernel/relay.c | 16
-rw-r--r--  kernel/resource.c | 198
-rw-r--r--  kernel/rtmutex-tester.c | 5
-rw-r--r--  kernel/rwsem.c | 16
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/clock.c | 26
-rw-r--r--  kernel/sched/core.c | 414
-rw-r--r--  kernel/sched/cpuacct.c | 296
-rw-r--r--  kernel/sched/cpuacct.h | 17
-rw-r--r--  kernel/sched/cputime.c | 240
-rw-r--r--  kernel/sched/fair.c | 158
-rw-r--r--  kernel/sched/features.h | 7
-rw-r--r--  kernel/sched/idle_task.c | 17
-rw-r--r--  kernel/sched/sched.h | 244
-rw-r--r--  kernel/sched/stats.c | 7
-rw-r--r--  kernel/seccomp.c | 2
-rw-r--r--  kernel/semaphore.c | 8
-rw-r--r--  kernel/signal.c | 13
-rw-r--r--  kernel/smp.c | 91
-rw-r--r--  kernel/smpboot.c | 14
-rw-r--r--  kernel/softirq.c | 25
-rw-r--r--  kernel/sys.c | 289
-rw-r--r--  kernel/sys_ni.c | 4
-rw-r--r--  kernel/sysctl.c | 15
-rw-r--r--  kernel/sysctl_binary.c | 4
-rw-r--r--  kernel/test_kprobes.c | 2
-rw-r--r--  kernel/time.c | 11
-rw-r--r--  kernel/time/Kconfig | 85
-rw-r--r--  kernel/time/ntp.c | 106
-rw-r--r--  kernel/time/ntp_internal.h | 12
-rw-r--r--  kernel/time/tick-broadcast.c | 251
-rw-r--r--  kernel/time/tick-common.c | 7
-rw-r--r--  kernel/time/tick-internal.h | 5
-rw-r--r--  kernel/time/tick-sched.c | 303
-rw-r--r--  kernel/time/timekeeping.c | 404
-rw-r--r--  kernel/time/timer_list.c | 104
-rw-r--r--  kernel/timer.c | 161
-rw-r--r--  kernel/trace/Kconfig | 51
-rw-r--r--  kernel/trace/blktrace.c | 31
-rw-r--r--  kernel/trace/ftrace.c | 294
-rw-r--r--  kernel/trace/ring_buffer.c | 503
-rw-r--r--  kernel/trace/trace.c | 2263
-rw-r--r--  kernel/trace/trace.h | 151
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_clock.c | 10
-rw-r--r--  kernel/trace/trace_entries.h | 23
-rw-r--r--  kernel/trace/trace_events.c | 1427
-rw-r--r--  kernel/trace/trace_events_filter.c | 38
-rw-r--r--  kernel/trace/trace_export.c | 4
-rw-r--r--  kernel/trace/trace_functions.c | 207
-rw-r--r--  kernel/trace/trace_functions_graph.c | 12
-rw-r--r--  kernel/trace/trace_irqsoff.c | 90
-rw-r--r--  kernel/trace/trace_kdb.c | 12
-rw-r--r--  kernel/trace/trace_kprobe.c | 320
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 12
-rw-r--r--  kernel/trace/trace_output.c | 119
-rw-r--r--  kernel/trace/trace_output.h | 4
-rw-r--r--  kernel/trace/trace_sched_switch.c | 8
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 93
-rw-r--r--  kernel/trace/trace_selftest.c | 53
-rw-r--r--  kernel/trace/trace_stack.c | 78
-rw-r--r--  kernel/trace/trace_stat.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 90
-rw-r--r--  kernel/trace/trace_uprobe.c | 203
-rw-r--r--  kernel/tracepoint.c | 21
-rw-r--r--  kernel/uid16.c | 55
-rw-r--r--  kernel/user.c | 4
-rw-r--r--  kernel/user_namespace.c | 35
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/watchdog.c | 5
-rw-r--r--  kernel/workqueue.c | 2964
-rw-r--r--  kernel/workqueue_internal.h | 19
126 files changed, 12447 insertions, 6117 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f1090f437..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,4 @@
 config_data.h
 config_data.gz
 timeconst.h
+hz.bc
diff --git a/kernel/Makefile b/kernel/Makefile
index bbde5f1a4486..271fd3119af9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -24,6 +24,7 @@ endif
 
 obj-y += sched/
 obj-y += power/
+obj-y += cpu/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -175,7 +176,7 @@ signing_key.priv signing_key.x509: x509.genkey
 	openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
 		-batch -x509 -config x509.genkey \
 		-outform DER -out signing_key.x509 \
-		-keyout signing_key.priv
+		-keyout signing_key.priv 2>&1
 	@echo "###"
 	@echo "### Key pair generated."
 	@echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index b9bd7f098ee5..8d6e145138bb 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -540,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	ac.ac_swaps = encode_comp_t(0);
 
 	/*
+	 * Get freeze protection. If the fs is frozen, just skip the write
+	 * as we could deadlock the system otherwise.
+	 */
+	if (!file_start_write_trylock(file))
+		goto out;
+	/*
 	 * Kernel segment override to datasegment and write it
 	 * to the accounting file.
 	 */
@@ -554,6 +560,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 			sizeof(acct_t), &file->f_pos);
 	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 	set_fs(fs);
+	file_end_write(file);
 out:
 	revert_creds(orig_cred);
 }
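
The two hunks above bracket the accounting write with freeze protection: the write is attempted only if file_start_write_trylock() succeeds, and file_end_write() releases it afterwards. A minimal sketch of that pairing, with a hypothetical helper name and the actual write elided, follows (not part of the diff):

#include <linux/fs.h>

/* Hypothetical helper illustrating the trylock/end pairing used above. */
static void write_record_if_not_frozen(struct file *file)
{
	/*
	 * If the filesystem is frozen, the blocking variant
	 * (file_start_write()) could deadlock; trying the lock and
	 * skipping the write is the safe choice for best-effort data
	 * such as process accounting.
	 */
	if (!file_start_write_trylock(file))
		return;

	/* ... perform the vfs write here, as do_acct_process() does ... */

	file_end_write(file);
}

The pairing has to hold on every exit path, which is why the hunk jumps to the existing out: label when the trylock fails, before file_end_write() would be reached.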
diff --git a/kernel/async.c b/kernel/async.c
index 8ddee2c3e5b0..61f023ce0228 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -73,7 +73,7 @@ struct async_entry {
 	struct list_head	global_list;
 	struct work_struct	work;
 	async_cookie_t		cookie;
-	async_func_ptr		*func;
+	async_func_t		func;
 	void			*data;
 	struct async_domain	*domain;
 };
@@ -84,24 +84,20 @@ static atomic_t entry_count;
 
 static async_cookie_t lowest_in_progress(struct async_domain *domain)
 {
-	struct async_entry *first = NULL;
+	struct list_head *pending;
 	async_cookie_t ret = ASYNC_COOKIE_MAX;
 	unsigned long flags;
 
 	spin_lock_irqsave(&async_lock, flags);
 
-	if (domain) {
-		if (!list_empty(&domain->pending))
-			first = list_first_entry(&domain->pending,
-					struct async_entry, domain_list);
-	} else {
-		if (!list_empty(&async_global_pending))
-			first = list_first_entry(&async_global_pending,
-					struct async_entry, global_list);
-	}
+	if (domain)
+		pending = &domain->pending;
+	else
+		pending = &async_global_pending;
 
-	if (first)
-		ret = first->cookie;
+	if (!list_empty(pending))
+		ret = list_first_entry(pending, struct async_entry,
+				       domain_list)->cookie;
 
 	spin_unlock_irqrestore(&async_lock, flags);
 	return ret;
@@ -149,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work)
 	wake_up(&async_done);
 }
 
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
+static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
 {
 	struct async_entry *entry;
 	unsigned long flags;
@@ -169,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
 		spin_unlock_irqrestore(&async_lock, flags);
 
 		/* low on memory.. run synchronously */
-		ptr(data, newcookie);
+		func(data, newcookie);
 		return newcookie;
 	}
 	INIT_LIST_HEAD(&entry->domain_list);
 	INIT_LIST_HEAD(&entry->global_list);
 	INIT_WORK(&entry->work, async_run_entry_fn);
-	entry->func = ptr;
+	entry->func = func;
 	entry->data = data;
 	entry->domain = domain;
 
@@ -202,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
 
 /**
  * async_schedule - schedule a function for asynchronous execution
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
  * @data: data pointer to pass to the function
  *
  * Returns an async_cookie_t that may be used for checkpointing later.
  * Note: This function may be called from atomic or non-atomic contexts.
  */
-async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+async_cookie_t async_schedule(async_func_t func, void *data)
 {
-	return __async_schedule(ptr, data, &async_dfl_domain);
+	return __async_schedule(func, data, &async_dfl_domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule);
 
 /**
  * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
  * @data: data pointer to pass to the function
  * @domain: the domain
  *
@@ -226,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule);
  * synchronization domain is specified via @domain. Note: This function
  * may be called from atomic or non-atomic contexts.
  */
-async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+async_cookie_t async_schedule_domain(async_func_t func, void *data,
 				     struct async_domain *domain)
 {
-	return __async_schedule(ptr, data, domain);
+	return __async_schedule(func, data, domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule_domain);
 
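
The async.c hunks above replace the old async_func_ptr pointer typedef with async_func_t and rename the parameters to match. A minimal sketch of a caller against the renamed API follows; the module, probe_one() and the device-name strings are illustrative only, not part of this diff:

#include <linux/async.h>
#include <linux/module.h>

/* Matches the renamed typedef: void (*async_func_t)(void *data, async_cookie_t cookie) */
static void probe_one(void *data, async_cookie_t cookie)
{
	pr_info("example: probing %s (cookie %llu)\n",
		(const char *)data, (unsigned long long)cookie);
}

static int __init example_init(void)
{
	/* each call queues probe_one() to run in a worker thread */
	async_schedule(probe_one, (void *)"dev0");
	async_schedule(probe_one, (void *)"dev1");

	/* wait for all outstanding async work before init returns */
	async_synchronize_full();
	return 0;
}
module_init(example_init);

static void __exit example_exit(void)
{
}
module_exit(example_exit);

MODULE_LICENSE("GPL");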
diff --git a/kernel/audit.c b/kernel/audit.c
index d596e5355f15..21c7fa615bd3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -49,6 +49,8 @@
49#include <linux/slab.h> 49#include <linux/slab.h>
50#include <linux/err.h> 50#include <linux/err.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
52#include <linux/kernel.h>
53#include <linux/syscalls.h>
52 54
53#include <linux/audit.h> 55#include <linux/audit.h>
54 56
@@ -58,7 +60,7 @@
58#ifdef CONFIG_SECURITY 60#ifdef CONFIG_SECURITY
59#include <linux/security.h> 61#include <linux/security.h>
60#endif 62#endif
61#include <linux/netlink.h> 63#include <net/netlink.h>
62#include <linux/freezer.h> 64#include <linux/freezer.h>
63#include <linux/tty.h> 65#include <linux/tty.h>
64#include <linux/pid_namespace.h> 66#include <linux/pid_namespace.h>
@@ -265,7 +267,6 @@ void audit_log_lost(const char *message)
265} 267}
266 268
267static int audit_log_config_change(char *function_name, int new, int old, 269static int audit_log_config_change(char *function_name, int new, int old,
268 kuid_t loginuid, u32 sessionid, u32 sid,
269 int allow_changes) 270 int allow_changes)
270{ 271{
271 struct audit_buffer *ab; 272 struct audit_buffer *ab;
@@ -274,29 +275,17 @@ static int audit_log_config_change(char *function_name, int new, int old,
274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 275 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
275 if (unlikely(!ab)) 276 if (unlikely(!ab))
276 return rc; 277 return rc;
277 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, 278 audit_log_format(ab, "%s=%d old=%d", function_name, new, old);
278 old, from_kuid(&init_user_ns, loginuid), sessionid); 279 audit_log_session_info(ab);
279 if (sid) { 280 rc = audit_log_task_context(ab);
280 char *ctx = NULL; 281 if (rc)
281 u32 len; 282 allow_changes = 0; /* Something weird, deny request */
282
283 rc = security_secid_to_secctx(sid, &ctx, &len);
284 if (rc) {
285 audit_log_format(ab, " sid=%u", sid);
286 allow_changes = 0; /* Something weird, deny request */
287 } else {
288 audit_log_format(ab, " subj=%s", ctx);
289 security_release_secctx(ctx, len);
290 }
291 }
292 audit_log_format(ab, " res=%d", allow_changes); 283 audit_log_format(ab, " res=%d", allow_changes);
293 audit_log_end(ab); 284 audit_log_end(ab);
294 return rc; 285 return rc;
295} 286}
296 287
297static int audit_do_config_change(char *function_name, int *to_change, 288static int audit_do_config_change(char *function_name, int *to_change, int new)
298 int new, kuid_t loginuid, u32 sessionid,
299 u32 sid)
300{ 289{
301 int allow_changes, rc = 0, old = *to_change; 290 int allow_changes, rc = 0, old = *to_change;
302 291
@@ -307,8 +296,7 @@ static int audit_do_config_change(char *function_name, int *to_change,
307 allow_changes = 1; 296 allow_changes = 1;
308 297
309 if (audit_enabled != AUDIT_OFF) { 298 if (audit_enabled != AUDIT_OFF) {
310 rc = audit_log_config_change(function_name, new, old, loginuid, 299 rc = audit_log_config_change(function_name, new, old, allow_changes);
311 sessionid, sid, allow_changes);
312 if (rc) 300 if (rc)
313 allow_changes = 0; 301 allow_changes = 0;
314 } 302 }
@@ -322,44 +310,37 @@ static int audit_do_config_change(char *function_name, int *to_change,
322 return rc; 310 return rc;
323} 311}
324 312
325static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, 313static int audit_set_rate_limit(int limit)
326 u32 sid)
327{ 314{
328 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, 315 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
329 limit, loginuid, sessionid, sid);
330} 316}
331 317
332static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, 318static int audit_set_backlog_limit(int limit)
333 u32 sid)
334{ 319{
335 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, 320 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
336 limit, loginuid, sessionid, sid);
337} 321}
338 322
339static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) 323static int audit_set_enabled(int state)
340{ 324{
341 int rc; 325 int rc;
342 if (state < AUDIT_OFF || state > AUDIT_LOCKED) 326 if (state < AUDIT_OFF || state > AUDIT_LOCKED)
343 return -EINVAL; 327 return -EINVAL;
344 328
345 rc = audit_do_config_change("audit_enabled", &audit_enabled, state, 329 rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
346 loginuid, sessionid, sid);
347
348 if (!rc) 330 if (!rc)
349 audit_ever_enabled |= !!state; 331 audit_ever_enabled |= !!state;
350 332
351 return rc; 333 return rc;
352} 334}
353 335
354static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) 336static int audit_set_failure(int state)
355{ 337{
356 if (state != AUDIT_FAIL_SILENT 338 if (state != AUDIT_FAIL_SILENT
357 && state != AUDIT_FAIL_PRINTK 339 && state != AUDIT_FAIL_PRINTK
358 && state != AUDIT_FAIL_PANIC) 340 && state != AUDIT_FAIL_PANIC)
359 return -EINVAL; 341 return -EINVAL;
360 342
361 return audit_do_config_change("audit_failure", &audit_failure, state, 343 return audit_do_config_change("audit_failure", &audit_failure, state);
362 loginuid, sessionid, sid);
363} 344}
364 345
365/* 346/*
@@ -417,34 +398,53 @@ static void kauditd_send_skb(struct sk_buff *skb)
417 consume_skb(skb); 398 consume_skb(skb);
418} 399}
419 400
420static int kauditd_thread(void *dummy) 401/*
402 * flush_hold_queue - empty the hold queue if auditd appears
403 *
404 * If auditd just started, drain the queue of messages already
405 * sent to syslog/printk. Remember loss here is ok. We already
406 * called audit_log_lost() if it didn't go out normally. so the
407 * race between the skb_dequeue and the next check for audit_pid
408 * doesn't matter.
409 *
410 * If you ever find kauditd to be too slow we can get a perf win
411 * by doing our own locking and keeping better track if there
412 * are messages in this queue. I don't see the need now, but
413 * in 5 years when I want to play with this again I'll see this
414 * note and still have no friggin idea what i'm thinking today.
415 */
416static void flush_hold_queue(void)
421{ 417{
422 struct sk_buff *skb; 418 struct sk_buff *skb;
423 419
420 if (!audit_default || !audit_pid)
421 return;
422
423 skb = skb_dequeue(&audit_skb_hold_queue);
424 if (likely(!skb))
425 return;
426
427 while (skb && audit_pid) {
428 kauditd_send_skb(skb);
429 skb = skb_dequeue(&audit_skb_hold_queue);
430 }
431
432 /*
433 * if auditd just disappeared but we
434 * dequeued an skb we need to drop ref
435 */
436 if (skb)
437 consume_skb(skb);
438}
439
440static int kauditd_thread(void *dummy)
441{
424 set_freezable(); 442 set_freezable();
425 while (!kthread_should_stop()) { 443 while (!kthread_should_stop()) {
426 /* 444 struct sk_buff *skb;
427 * if auditd just started drain the queue of messages already 445 DECLARE_WAITQUEUE(wait, current);
428 * sent to syslog/printk. remember loss here is ok. we already 446
429 * called audit_log_lost() if it didn't go out normally. so the 447 flush_hold_queue();
430 * race between the skb_dequeue and the next check for audit_pid
431 * doesn't matter.
432 *
433 * if you ever find kauditd to be too slow we can get a perf win
434 * by doing our own locking and keeping better track if there
435 * are messages in this queue. I don't see the need now, but
436 * in 5 years when I want to play with this again I'll see this
437 * note and still have no friggin idea what i'm thinking today.
438 */
439 if (audit_default && audit_pid) {
440 skb = skb_dequeue(&audit_skb_hold_queue);
441 if (unlikely(skb)) {
442 while (skb && audit_pid) {
443 kauditd_send_skb(skb);
444 skb = skb_dequeue(&audit_skb_hold_queue);
445 }
446 }
447 }
448 448
449 skb = skb_dequeue(&audit_skb_queue); 449 skb = skb_dequeue(&audit_skb_queue);
450 wake_up(&audit_backlog_wait); 450 wake_up(&audit_backlog_wait);
@@ -453,19 +453,18 @@ static int kauditd_thread(void *dummy)
453 kauditd_send_skb(skb); 453 kauditd_send_skb(skb);
454 else 454 else
455 audit_printk_skb(skb); 455 audit_printk_skb(skb);
456 } else { 456 continue;
457 DECLARE_WAITQUEUE(wait, current); 457 }
458 set_current_state(TASK_INTERRUPTIBLE); 458 set_current_state(TASK_INTERRUPTIBLE);
459 add_wait_queue(&kauditd_wait, &wait); 459 add_wait_queue(&kauditd_wait, &wait);
460
461 if (!skb_queue_len(&audit_skb_queue)) {
462 try_to_freeze();
463 schedule();
464 }
465 460
466 __set_current_state(TASK_RUNNING); 461 if (!skb_queue_len(&audit_skb_queue)) {
467 remove_wait_queue(&kauditd_wait, &wait); 462 try_to_freeze();
463 schedule();
468 } 464 }
465
466 __set_current_state(TASK_RUNNING);
467 remove_wait_queue(&kauditd_wait, &wait);
469 } 468 }
470 return 0; 469 return 0;
471} 470}
@@ -579,13 +578,14 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
579 return -EPERM; 578 return -EPERM;
580 579
581 switch (msg_type) { 580 switch (msg_type) {
582 case AUDIT_GET:
583 case AUDIT_LIST: 581 case AUDIT_LIST:
584 case AUDIT_LIST_RULES:
585 case AUDIT_SET:
586 case AUDIT_ADD: 582 case AUDIT_ADD:
587 case AUDIT_ADD_RULE:
588 case AUDIT_DEL: 583 case AUDIT_DEL:
584 return -EOPNOTSUPP;
585 case AUDIT_GET:
586 case AUDIT_SET:
587 case AUDIT_LIST_RULES:
588 case AUDIT_ADD_RULE:
589 case AUDIT_DEL_RULE: 589 case AUDIT_DEL_RULE:
590 case AUDIT_SIGNAL_INFO: 590 case AUDIT_SIGNAL_INFO:
591 case AUDIT_TTY_GET: 591 case AUDIT_TTY_GET:
@@ -608,12 +608,10 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
608 return err; 608 return err;
609} 609}
610 610
611static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, 611static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
612 kuid_t auid, u32 ses, u32 sid)
613{ 612{
614 int rc = 0; 613 int rc = 0;
615 char *ctx = NULL; 614 uid_t uid = from_kuid(&init_user_ns, current_uid());
616 u32 len;
617 615
618 if (!audit_enabled) { 616 if (!audit_enabled) {
619 *ab = NULL; 617 *ab = NULL;
@@ -623,33 +621,21 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
623 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 621 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
624 if (unlikely(!*ab)) 622 if (unlikely(!*ab))
625 return rc; 623 return rc;
626 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", 624 audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid);
627 task_tgid_vnr(current), 625 audit_log_session_info(*ab);
628 from_kuid(&init_user_ns, current_uid()), 626 audit_log_task_context(*ab);
629 from_kuid(&init_user_ns, auid), ses);
630 if (sid) {
631 rc = security_secid_to_secctx(sid, &ctx, &len);
632 if (rc)
633 audit_log_format(*ab, " ssid=%u", sid);
634 else {
635 audit_log_format(*ab, " subj=%s", ctx);
636 security_release_secctx(ctx, len);
637 }
638 }
639 627
640 return rc; 628 return rc;
641} 629}
642 630
643static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 631static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
644{ 632{
645 u32 seq, sid; 633 u32 seq;
646 void *data; 634 void *data;
647 struct audit_status *status_get, status_set; 635 struct audit_status *status_get, status_set;
648 int err; 636 int err;
649 struct audit_buffer *ab; 637 struct audit_buffer *ab;
650 u16 msg_type = nlh->nlmsg_type; 638 u16 msg_type = nlh->nlmsg_type;
651 kuid_t loginuid; /* loginuid of sender */
652 u32 sessionid;
653 struct audit_sig_info *sig_data; 639 struct audit_sig_info *sig_data;
654 char *ctx = NULL; 640 char *ctx = NULL;
655 u32 len; 641 u32 len;
@@ -660,17 +646,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
660 646
661 /* As soon as there's any sign of userspace auditd, 647 /* As soon as there's any sign of userspace auditd,
662 * start kauditd to talk to it */ 648 * start kauditd to talk to it */
663 if (!kauditd_task) 649 if (!kauditd_task) {
664 kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); 650 kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
665 if (IS_ERR(kauditd_task)) { 651 if (IS_ERR(kauditd_task)) {
666 err = PTR_ERR(kauditd_task); 652 err = PTR_ERR(kauditd_task);
667 kauditd_task = NULL; 653 kauditd_task = NULL;
668 return err; 654 return err;
655 }
669 } 656 }
670
671 loginuid = audit_get_loginuid(current);
672 sessionid = audit_get_sessionid(current);
673 security_task_getsecid(current, &sid);
674 seq = nlh->nlmsg_seq; 657 seq = nlh->nlmsg_seq;
675 data = nlmsg_data(nlh); 658 data = nlmsg_data(nlh);
676 659
@@ -691,14 +674,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
691 return -EINVAL; 674 return -EINVAL;
692 status_get = (struct audit_status *)data; 675 status_get = (struct audit_status *)data;
693 if (status_get->mask & AUDIT_STATUS_ENABLED) { 676 if (status_get->mask & AUDIT_STATUS_ENABLED) {
694 err = audit_set_enabled(status_get->enabled, 677 err = audit_set_enabled(status_get->enabled);
695 loginuid, sessionid, sid);
696 if (err < 0) 678 if (err < 0)
697 return err; 679 return err;
698 } 680 }
699 if (status_get->mask & AUDIT_STATUS_FAILURE) { 681 if (status_get->mask & AUDIT_STATUS_FAILURE) {
700 err = audit_set_failure(status_get->failure, 682 err = audit_set_failure(status_get->failure);
701 loginuid, sessionid, sid);
702 if (err < 0) 683 if (err < 0)
703 return err; 684 return err;
704 } 685 }
@@ -706,22 +687,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
706 int new_pid = status_get->pid; 687 int new_pid = status_get->pid;
707 688
708 if (audit_enabled != AUDIT_OFF) 689 if (audit_enabled != AUDIT_OFF)
709 audit_log_config_change("audit_pid", new_pid, 690 audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
710 audit_pid, loginuid,
711 sessionid, sid, 1);
712
713 audit_pid = new_pid; 691 audit_pid = new_pid;
714 audit_nlk_portid = NETLINK_CB(skb).portid; 692 audit_nlk_portid = NETLINK_CB(skb).portid;
715 } 693 }
716 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { 694 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
717 err = audit_set_rate_limit(status_get->rate_limit, 695 err = audit_set_rate_limit(status_get->rate_limit);
718 loginuid, sessionid, sid);
719 if (err < 0) 696 if (err < 0)
720 return err; 697 return err;
721 } 698 }
722 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 699 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
723 err = audit_set_backlog_limit(status_get->backlog_limit, 700 err = audit_set_backlog_limit(status_get->backlog_limit);
724 loginuid, sessionid, sid);
725 break; 701 break;
726 case AUDIT_USER: 702 case AUDIT_USER:
727 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 703 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
@@ -729,25 +705,22 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
729 if (!audit_enabled && msg_type != AUDIT_USER_AVC) 705 if (!audit_enabled && msg_type != AUDIT_USER_AVC)
730 return 0; 706 return 0;
731 707
732 err = audit_filter_user(); 708 err = audit_filter_user(msg_type);
733 if (err == 1) { 709 if (err == 1) {
734 err = 0; 710 err = 0;
735 if (msg_type == AUDIT_USER_TTY) { 711 if (msg_type == AUDIT_USER_TTY) {
736 err = tty_audit_push_task(current, loginuid, 712 err = tty_audit_push_current();
737 sessionid);
738 if (err) 713 if (err)
739 break; 714 break;
740 } 715 }
741 audit_log_common_recv_msg(&ab, msg_type, 716 audit_log_common_recv_msg(&ab, msg_type);
742 loginuid, sessionid, sid);
743
744 if (msg_type != AUDIT_USER_TTY) 717 if (msg_type != AUDIT_USER_TTY)
745 audit_log_format(ab, " msg='%.1024s'", 718 audit_log_format(ab, " msg='%.1024s'",
746 (char *)data); 719 (char *)data);
747 else { 720 else {
748 int size; 721 int size;
749 722
750 audit_log_format(ab, " msg="); 723 audit_log_format(ab, " data=");
751 size = nlmsg_len(nlh); 724 size = nlmsg_len(nlh);
752 if (size > 0 && 725 if (size > 0 &&
753 ((unsigned char *)data)[size - 1] == '\0') 726 ((unsigned char *)data)[size - 1] == '\0')
@@ -758,50 +731,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
758 audit_log_end(ab); 731 audit_log_end(ab);
759 } 732 }
760 break; 733 break;
761 case AUDIT_ADD:
762 case AUDIT_DEL:
763 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
764 return -EINVAL;
765 if (audit_enabled == AUDIT_LOCKED) {
766 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
767 loginuid, sessionid, sid);
768
769 audit_log_format(ab, " audit_enabled=%d res=0",
770 audit_enabled);
771 audit_log_end(ab);
772 return -EPERM;
773 }
774 /* fallthrough */
775 case AUDIT_LIST:
776 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
777 seq, data, nlmsg_len(nlh),
778 loginuid, sessionid, sid);
779 break;
780 case AUDIT_ADD_RULE: 734 case AUDIT_ADD_RULE:
781 case AUDIT_DEL_RULE: 735 case AUDIT_DEL_RULE:
782 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) 736 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
783 return -EINVAL; 737 return -EINVAL;
784 if (audit_enabled == AUDIT_LOCKED) { 738 if (audit_enabled == AUDIT_LOCKED) {
785 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, 739 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
786 loginuid, sessionid, sid); 740 audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
787
788 audit_log_format(ab, " audit_enabled=%d res=0",
789 audit_enabled);
790 audit_log_end(ab); 741 audit_log_end(ab);
791 return -EPERM; 742 return -EPERM;
792 } 743 }
793 /* fallthrough */ 744 /* fallthrough */
794 case AUDIT_LIST_RULES: 745 case AUDIT_LIST_RULES:
795 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, 746 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
796 seq, data, nlmsg_len(nlh), 747 seq, data, nlmsg_len(nlh));
797 loginuid, sessionid, sid);
798 break; 748 break;
799 case AUDIT_TRIM: 749 case AUDIT_TRIM:
800 audit_trim_trees(); 750 audit_trim_trees();
801 751 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
802 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
803 loginuid, sessionid, sid);
804
805 audit_log_format(ab, " op=trim res=1"); 752 audit_log_format(ab, " op=trim res=1");
806 audit_log_end(ab); 753 audit_log_end(ab);
807 break; 754 break;
@@ -831,8 +778,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
831 /* OK, here comes... */ 778 /* OK, here comes... */
832 err = audit_tag_tree(old, new); 779 err = audit_tag_tree(old, new);
833 780
834 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, 781 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
835 loginuid, sessionid, sid);
836 782
837 audit_log_format(ab, " op=make_equiv old="); 783 audit_log_format(ab, " op=make_equiv old=");
838 audit_log_untrustedstring(ab, old); 784 audit_log_untrustedstring(ab, old);
@@ -871,27 +817,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
871 struct audit_tty_status s; 817 struct audit_tty_status s;
872 struct task_struct *tsk = current; 818 struct task_struct *tsk = current;
873 819
874 spin_lock_irq(&tsk->sighand->siglock); 820 spin_lock(&tsk->sighand->siglock);
875 s.enabled = tsk->signal->audit_tty != 0; 821 s.enabled = tsk->signal->audit_tty != 0;
876 spin_unlock_irq(&tsk->sighand->siglock); 822 s.log_passwd = tsk->signal->audit_tty_log_passwd;
823 spin_unlock(&tsk->sighand->siglock);
877 824
878 audit_send_reply(NETLINK_CB(skb).portid, seq, 825 audit_send_reply(NETLINK_CB(skb).portid, seq,
879 AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); 826 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
880 break; 827 break;
881 } 828 }
882 case AUDIT_TTY_SET: { 829 case AUDIT_TTY_SET: {
883 struct audit_tty_status *s; 830 struct audit_tty_status s;
884 struct task_struct *tsk = current; 831 struct task_struct *tsk = current;
885 832
886 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 833 memset(&s, 0, sizeof(s));
887 return -EINVAL; 834 /* guard against past and future API changes */
888 s = data; 835 memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len));
889 if (s->enabled != 0 && s->enabled != 1) 836 if ((s.enabled != 0 && s.enabled != 1) ||
837 (s.log_passwd != 0 && s.log_passwd != 1))
890 return -EINVAL; 838 return -EINVAL;
891 839
892 spin_lock_irq(&tsk->sighand->siglock); 840 spin_lock(&tsk->sighand->siglock);
893 tsk->signal->audit_tty = s->enabled != 0; 841 tsk->signal->audit_tty = s.enabled;
894 spin_unlock_irq(&tsk->sighand->siglock); 842 tsk->signal->audit_tty_log_passwd = s.log_passwd;
843 spin_unlock(&tsk->sighand->siglock);
895 break; 844 break;
896 } 845 }
897 default: 846 default:
@@ -910,7 +859,7 @@ static void audit_receive_skb(struct sk_buff *skb)
910{ 859{
911 struct nlmsghdr *nlh; 860 struct nlmsghdr *nlh;
912 /* 861 /*
913 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0 862 * len MUST be signed for nlmsg_next to be able to dec it below 0
914 * if the nlmsg_len was not aligned 863 * if the nlmsg_len was not aligned
915 */ 864 */
916 int len; 865 int len;
@@ -919,13 +868,13 @@ static void audit_receive_skb(struct sk_buff *skb)
919 nlh = nlmsg_hdr(skb); 868 nlh = nlmsg_hdr(skb);
920 len = skb->len; 869 len = skb->len;
921 870
922 while (NLMSG_OK(nlh, len)) { 871 while (nlmsg_ok(nlh, len)) {
923 err = audit_receive_msg(skb, nlh); 872 err = audit_receive_msg(skb, nlh);
924 /* if err or if this message says it wants a response */ 873 /* if err or if this message says it wants a response */
925 if (err || (nlh->nlmsg_flags & NLM_F_ACK)) 874 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
926 netlink_ack(skb, nlh, err); 875 netlink_ack(skb, nlh, err);
927 876
928 nlh = NLMSG_NEXT(nlh, len); 877 nlh = nlmsg_next(nlh, &len);
929 } 878 }
930} 879}
931 880
@@ -1434,6 +1383,14 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1434 kfree(pathname); 1383 kfree(pathname);
1435} 1384}
1436 1385
1386void audit_log_session_info(struct audit_buffer *ab)
1387{
1388 u32 sessionid = audit_get_sessionid(current);
1389 uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
1390
1391 audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid);
1392}
1393
1437void audit_log_key(struct audit_buffer *ab, char *key) 1394void audit_log_key(struct audit_buffer *ab, char *key)
1438{ 1395{
1439 audit_log_format(ab, " key="); 1396 audit_log_format(ab, " key=");
@@ -1443,6 +1400,224 @@ void audit_log_key(struct audit_buffer *ab, char *key)
1443 audit_log_format(ab, "(null)"); 1400 audit_log_format(ab, "(null)");
1444} 1401}
1445 1402
1403void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1404{
1405 int i;
1406
1407 audit_log_format(ab, " %s=", prefix);
1408 CAP_FOR_EACH_U32(i) {
1409 audit_log_format(ab, "%08x",
1410 cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
1411 }
1412}
1413
1414void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1415{
1416 kernel_cap_t *perm = &name->fcap.permitted;
1417 kernel_cap_t *inh = &name->fcap.inheritable;
1418 int log = 0;
1419
1420 if (!cap_isclear(*perm)) {
1421 audit_log_cap(ab, "cap_fp", perm);
1422 log = 1;
1423 }
1424 if (!cap_isclear(*inh)) {
1425 audit_log_cap(ab, "cap_fi", inh);
1426 log = 1;
1427 }
1428
1429 if (log)
1430 audit_log_format(ab, " cap_fe=%d cap_fver=%x",
1431 name->fcap.fE, name->fcap_ver);
1432}
1433
1434static inline int audit_copy_fcaps(struct audit_names *name,
1435 const struct dentry *dentry)
1436{
1437 struct cpu_vfs_cap_data caps;
1438 int rc;
1439
1440 if (!dentry)
1441 return 0;
1442
1443 rc = get_vfs_caps_from_disk(dentry, &caps);
1444 if (rc)
1445 return rc;
1446
1447 name->fcap.permitted = caps.permitted;
1448 name->fcap.inheritable = caps.inheritable;
1449 name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
1450 name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
1451 VFS_CAP_REVISION_SHIFT;
1452
1453 return 0;
1454}
1455
1456/* Copy inode data into an audit_names. */
1457void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
1458 const struct inode *inode)
1459{
1460 name->ino = inode->i_ino;
1461 name->dev = inode->i_sb->s_dev;
1462 name->mode = inode->i_mode;
1463 name->uid = inode->i_uid;
1464 name->gid = inode->i_gid;
1465 name->rdev = inode->i_rdev;
1466 security_inode_getsecid(inode, &name->osid);
1467 audit_copy_fcaps(name, dentry);
1468}
1469
1470/**
1471 * audit_log_name - produce AUDIT_PATH record from struct audit_names
1472 * @context: audit_context for the task
1473 * @n: audit_names structure with reportable details
1474 * @path: optional path to report instead of audit_names->name
1475 * @record_num: record number to report when handling a list of names
1476 * @call_panic: optional pointer to int that will be updated if secid fails
1477 */
1478void audit_log_name(struct audit_context *context, struct audit_names *n,
1479 struct path *path, int record_num, int *call_panic)
1480{
1481 struct audit_buffer *ab;
1482 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
1483 if (!ab)
1484 return;
1485
1486 audit_log_format(ab, "item=%d", record_num);
1487
1488 if (path)
1489 audit_log_d_path(ab, " name=", path);
1490 else if (n->name) {
1491 switch (n->name_len) {
1492 case AUDIT_NAME_FULL:
1493 /* log the full path */
1494 audit_log_format(ab, " name=");
1495 audit_log_untrustedstring(ab, n->name->name);
1496 break;
1497 case 0:
1498 /* name was specified as a relative path and the
1499 * directory component is the cwd */
1500 audit_log_d_path(ab, " name=", &context->pwd);
1501 break;
1502 default:
1503 /* log the name's directory component */
1504 audit_log_format(ab, " name=");
1505 audit_log_n_untrustedstring(ab, n->name->name,
1506 n->name_len);
1507 }
1508 } else
1509 audit_log_format(ab, " name=(null)");
1510
1511 if (n->ino != (unsigned long)-1) {
1512 audit_log_format(ab, " inode=%lu"
1513 " dev=%02x:%02x mode=%#ho"
1514 " ouid=%u ogid=%u rdev=%02x:%02x",
1515 n->ino,
1516 MAJOR(n->dev),
1517 MINOR(n->dev),
1518 n->mode,
1519 from_kuid(&init_user_ns, n->uid),
1520 from_kgid(&init_user_ns, n->gid),
1521 MAJOR(n->rdev),
1522 MINOR(n->rdev));
1523 }
1524 if (n->osid != 0) {
1525 char *ctx = NULL;
1526 u32 len;
1527 if (security_secid_to_secctx(
1528 n->osid, &ctx, &len)) {
1529 audit_log_format(ab, " osid=%u", n->osid);
1530 if (call_panic)
1531 *call_panic = 2;
1532 } else {
1533 audit_log_format(ab, " obj=%s", ctx);
1534 security_release_secctx(ctx, len);
1535 }
1536 }
1537
1538 audit_log_fcaps(ab, n);
1539 audit_log_end(ab);
1540}
1541
1542int audit_log_task_context(struct audit_buffer *ab)
1543{
1544 char *ctx = NULL;
1545 unsigned len;
1546 int error;
1547 u32 sid;
1548
1549 security_task_getsecid(current, &sid);
1550 if (!sid)
1551 return 0;
1552
1553 error = security_secid_to_secctx(sid, &ctx, &len);
1554 if (error) {
1555 if (error != -EINVAL)
1556 goto error_path;
1557 return 0;
1558 }
1559
1560 audit_log_format(ab, " subj=%s", ctx);
1561 security_release_secctx(ctx, len);
1562 return 0;
1563
1564error_path:
1565 audit_panic("error in audit_log_task_context");
1566 return error;
1567}
1568EXPORT_SYMBOL(audit_log_task_context);
1569
1570void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1571{
1572 const struct cred *cred;
1573 char name[sizeof(tsk->comm)];
1574 struct mm_struct *mm = tsk->mm;
1575 char *tty;
1576
1577 if (!ab)
1578 return;
1579
1580 /* tsk == current */
1581 cred = current_cred();
1582
1583 spin_lock_irq(&tsk->sighand->siglock);
1584 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
1585 tty = tsk->signal->tty->name;
1586 else
1587 tty = "(none)";
1588 spin_unlock_irq(&tsk->sighand->siglock);
1589
1590 audit_log_format(ab,
1591 " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
1592 " euid=%u suid=%u fsuid=%u"
1593 " egid=%u sgid=%u fsgid=%u ses=%u tty=%s",
1594 sys_getppid(),
1595 tsk->pid,
1596 from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
1597 from_kuid(&init_user_ns, cred->uid),
1598 from_kgid(&init_user_ns, cred->gid),
1599 from_kuid(&init_user_ns, cred->euid),
1600 from_kuid(&init_user_ns, cred->suid),
1601 from_kuid(&init_user_ns, cred->fsuid),
1602 from_kgid(&init_user_ns, cred->egid),
1603 from_kgid(&init_user_ns, cred->sgid),
1604 from_kgid(&init_user_ns, cred->fsgid),
1605 audit_get_sessionid(tsk), tty);
1606
1607 get_task_comm(name, tsk);
1608 audit_log_format(ab, " comm=");
1609 audit_log_untrustedstring(ab, name);
1610
1611 if (mm) {
1612 down_read(&mm->mmap_sem);
1613 if (mm->exe_file)
1614 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
1615 up_read(&mm->mmap_sem);
1616 }
1617 audit_log_task_context(ab);
1618}
1619EXPORT_SYMBOL(audit_log_task_info);
1620
1446/** 1621/**
1447 * audit_log_link_denied - report a link restriction denial 1622 * audit_log_link_denied - report a link restriction denial
1448 * @operation: specific link opreation 1623 * @operation: specific link opreation
@@ -1451,19 +1626,28 @@ void audit_log_key(struct audit_buffer *ab, char *key)
1451void audit_log_link_denied(const char *operation, struct path *link) 1626void audit_log_link_denied(const char *operation, struct path *link)
1452{ 1627{
1453 struct audit_buffer *ab; 1628 struct audit_buffer *ab;
1629 struct audit_names *name;
1630
1631 name = kzalloc(sizeof(*name), GFP_NOFS);
1632 if (!name)
1633 return;
1454 1634
1635 /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
1455 ab = audit_log_start(current->audit_context, GFP_KERNEL, 1636 ab = audit_log_start(current->audit_context, GFP_KERNEL,
1456 AUDIT_ANOM_LINK); 1637 AUDIT_ANOM_LINK);
1457 if (!ab) 1638 if (!ab)
1458 return; 1639 goto out;
1459 audit_log_format(ab, "op=%s action=denied", operation); 1640 audit_log_format(ab, "op=%s", operation);
1460 audit_log_format(ab, " pid=%d comm=", current->pid); 1641 audit_log_task_info(ab, current);
1461 audit_log_untrustedstring(ab, current->comm); 1642 audit_log_format(ab, " res=0");
1462 audit_log_d_path(ab, " path=", link);
1463 audit_log_format(ab, " dev=");
1464 audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id);
1465 audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino);
1466 audit_log_end(ab); 1643 audit_log_end(ab);
1644
1645 /* Generate AUDIT_PATH record with object. */
1646 name->type = AUDIT_TYPE_NORMAL;
1647 audit_copy_inode(name, link->dentry, link->dentry->d_inode);
1648 audit_log_name(current->audit_context, name, link, 0, NULL);
1649out:
1650 kfree(name);
1467} 1651}
1468 1652
1469/** 1653/**
@@ -1483,7 +1667,7 @@ void audit_log_end(struct audit_buffer *ab)
1483 audit_log_lost("rate limit exceeded"); 1667 audit_log_lost("rate limit exceeded");
1484 } else { 1668 } else {
1485 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1669 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1486 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); 1670 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
1487 1671
1488 if (audit_pid) { 1672 if (audit_pid) {
1489 skb_queue_tail(&audit_skb_queue, ab->skb); 1673 skb_queue_tail(&audit_skb_queue, ab->skb);
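
Among the audit.c changes above, the AUDIT_TTY_SET handler now copies the userspace payload through memset() plus a memcpy() bounded by min(), so an older or newer userspace struct neither leaves fields uninitialized nor overruns the kernel copy. A stand-alone sketch of that pattern, using made-up names (example_status, handle_set), is:

#include <linux/errno.h>
#include <linux/kernel.h>	/* min() */
#include <linux/string.h>	/* memset(), memcpy() */
#include <linux/types.h>

struct example_status {
	__u32 enabled;
	__u32 log_passwd;	/* field added in a later ABI revision */
};

static int handle_set(const void *payload, size_t payload_len)
{
	struct example_status s;

	/* fields an old userspace did not send default to zero */
	memset(&s, 0, sizeof(s));
	/* never copy more than userspace sent, nor more than we can hold */
	memcpy(&s, payload, min(sizeof(s), payload_len));

	if ((s.enabled != 0 && s.enabled != 1) ||
	    (s.log_passwd != 0 && s.log_passwd != 1))
		return -EINVAL;

	/* ... apply s.enabled / s.log_passwd ... */
	return 0;
}

Tolerating skew in both directions is what the in-line comment in the hunk ("guard against past and future API changes") refers to.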
diff --git a/kernel/audit.h b/kernel/audit.h
index d51cba868e1b..1c95131ef760 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/audit.h> 23#include <linux/audit.h>
24#include <linux/skbuff.h> 24#include <linux/skbuff.h>
25#include <uapi/linux/mqueue.h>
25 26
26/* 0 = no checking 27/* 0 = no checking
27 1 = put_count checking 28 1 = put_count checking
@@ -29,6 +30,11 @@
29*/ 30*/
30#define AUDIT_DEBUG 0 31#define AUDIT_DEBUG 0
31 32
33/* AUDIT_NAMES is the number of slots we reserve in the audit_context
34 * for saving names from getname(). If we get more names we will allocate
35 * a name dynamically and also add those to the list anchored by names_list. */
36#define AUDIT_NAMES 5
37
32/* At task start time, the audit_state is set in the audit_context using 38/* At task start time, the audit_state is set in the audit_context using
33 a per-task filter. At syscall entry, the audit_state is augmented by 39 a per-task filter. At syscall entry, the audit_state is augmented by
34 the syscall filter. */ 40 the syscall filter. */
@@ -59,10 +65,157 @@ struct audit_entry {
59 struct audit_krule rule; 65 struct audit_krule rule;
60}; 66};
61 67
62#ifdef CONFIG_AUDIT 68struct audit_cap_data {
63extern int audit_enabled; 69 kernel_cap_t permitted;
64extern int audit_ever_enabled; 70 kernel_cap_t inheritable;
71 union {
72 unsigned int fE; /* effective bit of file cap */
73 kernel_cap_t effective; /* effective set of process */
74 };
75};
76
77/* When fs/namei.c:getname() is called, we store the pointer in name and
78 * we don't let putname() free it (instead we free all of the saved
79 * pointers at syscall exit time).
80 *
81 * Further, in fs/namei.c:path_lookup() we store the inode and device.
82 */
83struct audit_names {
84 struct list_head list; /* audit_context->names_list */
85
86 struct filename *name;
87 int name_len; /* number of chars to log */
88 bool name_put; /* call __putname()? */
89
90 unsigned long ino;
91 dev_t dev;
92 umode_t mode;
93 kuid_t uid;
94 kgid_t gid;
95 dev_t rdev;
96 u32 osid;
97 struct audit_cap_data fcap;
98 unsigned int fcap_ver;
99 unsigned char type; /* record type */
100 /*
101 * This was an allocated audit_names and not from the array of
102 * names allocated in the task audit context. Thus this name
103 * should be freed on syscall exit.
104 */
105 bool should_free;
106};
107
108/* The per-task audit context. */
109struct audit_context {
110 int dummy; /* must be the first element */
111 int in_syscall; /* 1 if task is in a syscall */
112 enum audit_state state, current_state;
113 unsigned int serial; /* serial number for record */
114 int major; /* syscall number */
115 struct timespec ctime; /* time of syscall entry */
116 unsigned long argv[4]; /* syscall arguments */
117 long return_code;/* syscall return code */
118 u64 prio;
119 int return_valid; /* return code is valid */
120 /*
121 * The names_list is the list of all audit_names collected during this
122 * syscall. The first AUDIT_NAMES entries in the names_list will
123 * actually be from the preallocated_names array for performance
124 * reasons. Except during allocation they should never be referenced
125 * through the preallocated_names array and should only be found/used
126 * by running the names_list.
127 */
128 struct audit_names preallocated_names[AUDIT_NAMES];
129 int name_count; /* total records in names_list */
130 struct list_head names_list; /* struct audit_names->list anchor */
131 char *filterkey; /* key for rule that triggered record */
132 struct path pwd;
133 struct audit_aux_data *aux;
134 struct audit_aux_data *aux_pids;
135 struct sockaddr_storage *sockaddr;
136 size_t sockaddr_len;
137 /* Save things to print about task_struct */
138 pid_t pid, ppid;
139 kuid_t uid, euid, suid, fsuid;
140 kgid_t gid, egid, sgid, fsgid;
141 unsigned long personality;
142 int arch;
143
144 pid_t target_pid;
145 kuid_t target_auid;
146 kuid_t target_uid;
147 unsigned int target_sessionid;
148 u32 target_sid;
149 char target_comm[TASK_COMM_LEN];
150
151 struct audit_tree_refs *trees, *first_trees;
152 struct list_head killed_trees;
153 int tree_count;
154
155 int type;
156 union {
157 struct {
158 int nargs;
159 long args[6];
160 } socketcall;
161 struct {
162 kuid_t uid;
163 kgid_t gid;
164 umode_t mode;
165 u32 osid;
166 int has_perm;
167 uid_t perm_uid;
168 gid_t perm_gid;
169 umode_t perm_mode;
170 unsigned long qbytes;
171 } ipc;
172 struct {
173 mqd_t mqdes;
174 struct mq_attr mqstat;
175 } mq_getsetattr;
176 struct {
177 mqd_t mqdes;
178 int sigev_signo;
179 } mq_notify;
180 struct {
181 mqd_t mqdes;
182 size_t msg_len;
183 unsigned int msg_prio;
184 struct timespec abs_timeout;
185 } mq_sendrecv;
186 struct {
187 int oflag;
188 umode_t mode;
189 struct mq_attr attr;
190 } mq_open;
191 struct {
192 pid_t pid;
193 struct audit_cap_data cap;
194 } capset;
195 struct {
196 int fd;
197 int flags;
198 } mmap;
199 };
200 int fds[2];
201
202#if AUDIT_DEBUG
203 int put_count;
204 int ino_count;
65#endif 205#endif
206};
207
208extern int audit_ever_enabled;
209
210extern void audit_copy_inode(struct audit_names *name,
211 const struct dentry *dentry,
212 const struct inode *inode);
213extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
214 kernel_cap_t *cap);
215extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name);
216extern void audit_log_name(struct audit_context *context,
217 struct audit_names *n, struct path *path,
218 int record_num, int *call_panic);
66 219
67extern int audit_pid; 220extern int audit_pid;
68 221
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 642a89c4f3d6..a291aa23fb3f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -617,9 +617,9 @@ void audit_trim_trees(void)
 		}
 		spin_unlock(&hash_lock);
 		trim_marked(tree);
-		put_tree(tree);
 		drop_collected_mounts(root_mnt);
 skip_it:
+		put_tree(tree);
 		mutex_lock(&audit_filter_mutex);
 	}
 	list_del(&cursor);
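
The one-line move above puts put_tree() after the skip_it: label, so the reference taken on the tree earlier in the loop is dropped on the early-exit path as well as on the normal one. The general shape of the fix, with hypothetical names (thing, next_thing, and so on), is:

struct thing;
struct thing *next_thing(void);		/* returns a referenced object, or NULL */
int prepare(struct thing *t);
void process(struct thing *t);
void put_thing(struct thing *t);	/* drops the reference */

static void walk_all(void)
{
	struct thing *t;

	while ((t = next_thing()) != NULL) {
		if (!prepare(t))
			goto skip_it;	/* early exit must not leak the reference */

		process(t);
skip_it:
		put_thing(t);		/* single drop point covers both paths */
	}
}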
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f9fc54bbe06f..6bd4a90d1991 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -310,121 +310,83 @@ static u32 audit_to_op(u32 op)
310 return n; 310 return n;
311} 311}
312 312
313 313/* check if an audit field is valid */
314/* Translate struct audit_rule to kernel's rule respresentation. 314static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
315 * Exists for backward compatibility with userspace. */
316static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
317{ 315{
318 struct audit_entry *entry; 316 switch(f->type) {
319 int err = 0; 317 case AUDIT_MSGTYPE:
320 int i; 318 if (entry->rule.listnr != AUDIT_FILTER_TYPE &&
321 319 entry->rule.listnr != AUDIT_FILTER_USER)
322 entry = audit_to_entry_common(rule); 320 return -EINVAL;
323 if (IS_ERR(entry)) 321 break;
324 goto exit_nofree; 322 };
325
326 for (i = 0; i < rule->field_count; i++) {
327 struct audit_field *f = &entry->rule.fields[i];
328 u32 n;
329
330 n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
331
332 /* Support for legacy operators where
333 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
334 if (n & AUDIT_NEGATE)
335 f->op = Audit_not_equal;
336 else if (!n)
337 f->op = Audit_equal;
338 else
339 f->op = audit_to_op(n);
340
341 entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
342
343 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
344 f->val = rule->values[i];
345 f->uid = INVALID_UID;
346 f->gid = INVALID_GID;
347
348 err = -EINVAL;
349 if (f->op == Audit_bad)
350 goto exit_free;
351
352 switch(f->type) {
353 default:
354 goto exit_free;
355 case AUDIT_UID:
356 case AUDIT_EUID:
357 case AUDIT_SUID:
358 case AUDIT_FSUID:
359 case AUDIT_LOGINUID:
360 /* bit ops not implemented for uid comparisons */
361 if (f->op == Audit_bitmask || f->op == Audit_bittest)
362 goto exit_free;
363
364 f->uid = make_kuid(current_user_ns(), f->val);
365 if (!uid_valid(f->uid))
366 goto exit_free;
367 break;
368 case AUDIT_GID:
369 case AUDIT_EGID:
370 case AUDIT_SGID:
371 case AUDIT_FSGID:
372 /* bit ops not implemented for gid comparisons */
373 if (f->op == Audit_bitmask || f->op == Audit_bittest)
374 goto exit_free;
375
376 f->gid = make_kgid(current_user_ns(), f->val);
377 if (!gid_valid(f->gid))
378 goto exit_free;
379 break;
380 case AUDIT_PID:
381 case AUDIT_PERS:
382 case AUDIT_MSGTYPE:
383 case AUDIT_PPID:
384 case AUDIT_DEVMAJOR:
385 case AUDIT_DEVMINOR:
386 case AUDIT_EXIT:
387 case AUDIT_SUCCESS:
388 /* bit ops are only useful on syscall args */
389 if (f->op == Audit_bitmask || f->op == Audit_bittest)
390 goto exit_free;
391 break;
392 case AUDIT_ARG0:
393 case AUDIT_ARG1:
394 case AUDIT_ARG2:
395 case AUDIT_ARG3:
396 break;
397 /* arch is only allowed to be = or != */
398 case AUDIT_ARCH:
399 if (f->op != Audit_not_equal && f->op != Audit_equal)
400 goto exit_free;
401 entry->rule.arch_f = f;
402 break;
403 case AUDIT_PERM:
404 if (f->val & ~15)
405 goto exit_free;
406 break;
407 case AUDIT_FILETYPE:
408 if (f->val & ~S_IFMT)
409 goto exit_free;
410 break;
411 case AUDIT_INODE:
412 err = audit_to_inode(&entry->rule, f);
413 if (err)
414 goto exit_free;
415 break;
416 }
417 }
418
419 if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
420 entry->rule.inode_f = NULL;
421
422exit_nofree:
423 return entry;
424 323
425exit_free: 324 switch(f->type) {
426 audit_free_rule(entry); 325 default:
427 return ERR_PTR(err); 326 return -EINVAL;
327 case AUDIT_UID:
328 case AUDIT_EUID:
329 case AUDIT_SUID:
330 case AUDIT_FSUID:
331 case AUDIT_LOGINUID:
332 case AUDIT_OBJ_UID:
333 case AUDIT_GID:
334 case AUDIT_EGID:
335 case AUDIT_SGID:
336 case AUDIT_FSGID:
337 case AUDIT_OBJ_GID:
338 case AUDIT_PID:
339 case AUDIT_PERS:
340 case AUDIT_MSGTYPE:
341 case AUDIT_PPID:
342 case AUDIT_DEVMAJOR:
343 case AUDIT_DEVMINOR:
344 case AUDIT_EXIT:
345 case AUDIT_SUCCESS:
346 /* bit ops are only useful on syscall args */
347 if (f->op == Audit_bitmask || f->op == Audit_bittest)
348 return -EINVAL;
349 break;
350 case AUDIT_ARG0:
351 case AUDIT_ARG1:
352 case AUDIT_ARG2:
353 case AUDIT_ARG3:
354 case AUDIT_SUBJ_USER:
355 case AUDIT_SUBJ_ROLE:
356 case AUDIT_SUBJ_TYPE:
357 case AUDIT_SUBJ_SEN:
358 case AUDIT_SUBJ_CLR:
359 case AUDIT_OBJ_USER:
360 case AUDIT_OBJ_ROLE:
361 case AUDIT_OBJ_TYPE:
362 case AUDIT_OBJ_LEV_LOW:
363 case AUDIT_OBJ_LEV_HIGH:
364 case AUDIT_WATCH:
365 case AUDIT_DIR:
366 case AUDIT_FILTERKEY:
367 break;
368 case AUDIT_LOGINUID_SET:
369 if ((f->val != 0) && (f->val != 1))
370 return -EINVAL;
371 /* FALL THROUGH */
372 case AUDIT_ARCH:
373 if (f->op != Audit_not_equal && f->op != Audit_equal)
374 return -EINVAL;
375 break;
376 case AUDIT_PERM:
377 if (f->val & ~15)
378 return -EINVAL;
379 break;
380 case AUDIT_FILETYPE:
381 if (f->val & ~S_IFMT)
382 return -EINVAL;
383 break;
384 case AUDIT_FIELD_COMPARE:
385 if (f->val > AUDIT_MAX_FIELD_COMPARE)
386 return -EINVAL;
387 break;
388 };
389 return 0;
428} 390}
429 391
430/* Translate struct audit_rule_data to kernel's rule respresentation. */ 392/* Translate struct audit_rule_data to kernel's rule respresentation. */
@@ -459,17 +421,25 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
459 f->gid = INVALID_GID; 421 f->gid = INVALID_GID;
460 f->lsm_str = NULL; 422 f->lsm_str = NULL;
461 f->lsm_rule = NULL; 423 f->lsm_rule = NULL;
462 switch(f->type) { 424
425 /* Support legacy tests for a valid loginuid */
426 if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) {
427 f->type = AUDIT_LOGINUID_SET;
428 f->val = 0;
429 }
430
431 err = audit_field_valid(entry, f);
432 if (err)
433 goto exit_free;
434
435 err = -EINVAL;
436 switch (f->type) {
437 case AUDIT_LOGINUID:
463 case AUDIT_UID: 438 case AUDIT_UID:
464 case AUDIT_EUID: 439 case AUDIT_EUID:
465 case AUDIT_SUID: 440 case AUDIT_SUID:
466 case AUDIT_FSUID: 441 case AUDIT_FSUID:
467 case AUDIT_LOGINUID:
468 case AUDIT_OBJ_UID: 442 case AUDIT_OBJ_UID:
469 /* bit ops not implemented for uid comparisons */
470 if (f->op == Audit_bitmask || f->op == Audit_bittest)
471 goto exit_free;
472
473 f->uid = make_kuid(current_user_ns(), f->val); 443 f->uid = make_kuid(current_user_ns(), f->val);
474 if (!uid_valid(f->uid)) 444 if (!uid_valid(f->uid))
475 goto exit_free; 445 goto exit_free;
@@ -479,27 +449,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
479 case AUDIT_SGID: 449 case AUDIT_SGID:
480 case AUDIT_FSGID: 450 case AUDIT_FSGID:
481 case AUDIT_OBJ_GID: 451 case AUDIT_OBJ_GID:
482 /* bit ops not implemented for gid comparisons */
483 if (f->op == Audit_bitmask || f->op == Audit_bittest)
484 goto exit_free;
485
486 f->gid = make_kgid(current_user_ns(), f->val); 452 f->gid = make_kgid(current_user_ns(), f->val);
487 if (!gid_valid(f->gid)) 453 if (!gid_valid(f->gid))
488 goto exit_free; 454 goto exit_free;
489 break; 455 break;
490 case AUDIT_PID:
491 case AUDIT_PERS:
492 case AUDIT_MSGTYPE:
493 case AUDIT_PPID:
494 case AUDIT_DEVMAJOR:
495 case AUDIT_DEVMINOR:
496 case AUDIT_EXIT:
497 case AUDIT_SUCCESS:
498 case AUDIT_ARG0:
499 case AUDIT_ARG1:
500 case AUDIT_ARG2:
501 case AUDIT_ARG3:
502 break;
503 case AUDIT_ARCH: 456 case AUDIT_ARCH:
504 entry->rule.arch_f = f; 457 entry->rule.arch_f = f;
505 break; 458 break;
@@ -570,20 +523,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
570 entry->rule.buflen += f->val; 523 entry->rule.buflen += f->val;
571 entry->rule.filterkey = str; 524 entry->rule.filterkey = str;
572 break; 525 break;
573 case AUDIT_PERM:
574 if (f->val & ~15)
575 goto exit_free;
576 break;
577 case AUDIT_FILETYPE:
578 if (f->val & ~S_IFMT)
579 goto exit_free;
580 break;
581 case AUDIT_FIELD_COMPARE:
582 if (f->val > AUDIT_MAX_FIELD_COMPARE)
583 goto exit_free;
584 break;
585 default:
586 goto exit_free;
587 } 526 }
588 } 527 }
589 528
@@ -594,6 +533,10 @@ exit_nofree:
594 return entry; 533 return entry;
595 534
596exit_free: 535exit_free:
536 if (entry->rule.watch)
537 audit_put_watch(entry->rule.watch); /* matches initial get */
538 if (entry->rule.tree)
539 audit_put_tree(entry->rule.tree); /* that's the temporary one */
597 audit_free_rule(entry); 540 audit_free_rule(entry);
598 return ERR_PTR(err); 541 return ERR_PTR(err);
599} 542}
@@ -609,36 +552,6 @@ static inline size_t audit_pack_string(void **bufp, const char *str)
609 return len; 552 return len;
610} 553}
611 554
612/* Translate kernel rule respresentation to struct audit_rule.
613 * Exists for backward compatibility with userspace. */
614static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
615{
616 struct audit_rule *rule;
617 int i;
618
619 rule = kzalloc(sizeof(*rule), GFP_KERNEL);
620 if (unlikely(!rule))
621 return NULL;
622
623 rule->flags = krule->flags | krule->listnr;
624 rule->action = krule->action;
625 rule->field_count = krule->field_count;
626 for (i = 0; i < rule->field_count; i++) {
627 rule->values[i] = krule->fields[i].val;
628 rule->fields[i] = krule->fields[i].type;
629
630 if (krule->vers_ops == 1) {
631 if (krule->fields[i].op == Audit_not_equal)
632 rule->fields[i] |= AUDIT_NEGATE;
633 } else {
634 rule->fields[i] |= audit_ops[krule->fields[i].op];
635 }
636 }
637 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
638
639 return rule;
640}
641
 642/* Translate kernel rule representation to struct audit_rule_data. */ 555/* Translate kernel rule representation to struct audit_rule_data. */
643static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) 556static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
644{ 557{
@@ -1051,35 +964,6 @@ out:
1051 return ret; 964 return ret;
1052} 965}
1053 966
1054/* List rules using struct audit_rule. Exists for backward
1055 * compatibility with userspace. */
1056static void audit_list(int pid, int seq, struct sk_buff_head *q)
1057{
1058 struct sk_buff *skb;
1059 struct audit_krule *r;
1060 int i;
1061
1062 /* This is a blocking read, so use audit_filter_mutex instead of rcu
1063 * iterator to sync with list writers. */
1064 for (i=0; i<AUDIT_NR_FILTERS; i++) {
1065 list_for_each_entry(r, &audit_rules_list[i], list) {
1066 struct audit_rule *rule;
1067
1068 rule = audit_krule_to_rule(r);
1069 if (unlikely(!rule))
1070 break;
1071 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
1072 rule, sizeof(*rule));
1073 if (skb)
1074 skb_queue_tail(q, skb);
1075 kfree(rule);
1076 }
1077 }
1078 skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
1079 if (skb)
1080 skb_queue_tail(q, skb);
1081}
1082
1083/* List rules using struct audit_rule_data. */ 967/* List rules using struct audit_rule_data. */
1084static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) 968static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1085{ 969{
@@ -1109,11 +993,11 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1109} 993}
1110 994
1111/* Log rule additions and removals */ 995/* Log rule additions and removals */
1112static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, 996static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
1113 char *action, struct audit_krule *rule,
1114 int res)
1115{ 997{
1116 struct audit_buffer *ab; 998 struct audit_buffer *ab;
999 uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
1000 u32 sessionid = audit_get_sessionid(current);
1117 1001
1118 if (!audit_enabled) 1002 if (!audit_enabled)
1119 return; 1003 return;
@@ -1121,18 +1005,8 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1121 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 1005 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1122 if (!ab) 1006 if (!ab)
1123 return; 1007 return;
1124 audit_log_format(ab, "auid=%u ses=%u", 1008 audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
1125 from_kuid(&init_user_ns, loginuid), sessionid); 1009 audit_log_task_context(ab);
1126 if (sid) {
1127 char *ctx = NULL;
1128 u32 len;
1129 if (security_secid_to_secctx(sid, &ctx, &len))
1130 audit_log_format(ab, " ssid=%u", sid);
1131 else {
1132 audit_log_format(ab, " subj=%s", ctx);
1133 security_release_secctx(ctx, len);
1134 }
1135 }
1136 audit_log_format(ab, " op="); 1010 audit_log_format(ab, " op=");
1137 audit_log_string(ab, action); 1011 audit_log_string(ab, action);
1138 audit_log_key(ab, rule->filterkey); 1012 audit_log_key(ab, rule->filterkey);
@@ -1147,12 +1021,8 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1147 * @seq: netlink audit message sequence (serial) number 1021 * @seq: netlink audit message sequence (serial) number
1148 * @data: payload data 1022 * @data: payload data
1149 * @datasz: size of payload data 1023 * @datasz: size of payload data
1150 * @loginuid: loginuid of sender
1151 * @sessionid: sessionid for netlink audit message
1152 * @sid: SE Linux Security ID of sender
1153 */ 1024 */
1154int audit_receive_filter(int type, int pid, int seq, void *data, 1025int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz)
1155 size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid)
1156{ 1026{
1157 struct task_struct *tsk; 1027 struct task_struct *tsk;
1158 struct audit_netlink_list *dest; 1028 struct audit_netlink_list *dest;
@@ -1160,7 +1030,6 @@ int audit_receive_filter(int type, int pid, int seq, void *data,
1160 struct audit_entry *entry; 1030 struct audit_entry *entry;
1161 1031
1162 switch (type) { 1032 switch (type) {
1163 case AUDIT_LIST:
1164 case AUDIT_LIST_RULES: 1033 case AUDIT_LIST_RULES:
1165 /* We can't just spew out the rules here because we might fill 1034 /* We can't just spew out the rules here because we might fill
1166 * the available socket buffer space and deadlock waiting for 1035 * the available socket buffer space and deadlock waiting for
@@ -1175,10 +1044,7 @@ int audit_receive_filter(int type, int pid, int seq, void *data,
1175 skb_queue_head_init(&dest->q); 1044 skb_queue_head_init(&dest->q);
1176 1045
1177 mutex_lock(&audit_filter_mutex); 1046 mutex_lock(&audit_filter_mutex);
1178 if (type == AUDIT_LIST) 1047 audit_list_rules(pid, seq, &dest->q);
1179 audit_list(pid, seq, &dest->q);
1180 else
1181 audit_list_rules(pid, seq, &dest->q);
1182 mutex_unlock(&audit_filter_mutex); 1048 mutex_unlock(&audit_filter_mutex);
1183 1049
1184 tsk = kthread_run(audit_send_list, dest, "audit_send_list"); 1050 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
@@ -1188,35 +1054,23 @@ int audit_receive_filter(int type, int pid, int seq, void *data,
1188 err = PTR_ERR(tsk); 1054 err = PTR_ERR(tsk);
1189 } 1055 }
1190 break; 1056 break;
1191 case AUDIT_ADD:
1192 case AUDIT_ADD_RULE: 1057 case AUDIT_ADD_RULE:
1193 if (type == AUDIT_ADD) 1058 entry = audit_data_to_entry(data, datasz);
1194 entry = audit_rule_to_entry(data);
1195 else
1196 entry = audit_data_to_entry(data, datasz);
1197 if (IS_ERR(entry)) 1059 if (IS_ERR(entry))
1198 return PTR_ERR(entry); 1060 return PTR_ERR(entry);
1199 1061
1200 err = audit_add_rule(entry); 1062 err = audit_add_rule(entry);
1201 audit_log_rule_change(loginuid, sessionid, sid, "add rule", 1063 audit_log_rule_change("add rule", &entry->rule, !err);
1202 &entry->rule, !err);
1203
1204 if (err) 1064 if (err)
1205 audit_free_rule(entry); 1065 audit_free_rule(entry);
1206 break; 1066 break;
1207 case AUDIT_DEL:
1208 case AUDIT_DEL_RULE: 1067 case AUDIT_DEL_RULE:
1209 if (type == AUDIT_DEL) 1068 entry = audit_data_to_entry(data, datasz);
1210 entry = audit_rule_to_entry(data);
1211 else
1212 entry = audit_data_to_entry(data, datasz);
1213 if (IS_ERR(entry)) 1069 if (IS_ERR(entry))
1214 return PTR_ERR(entry); 1070 return PTR_ERR(entry);
1215 1071
1216 err = audit_del_rule(entry); 1072 err = audit_del_rule(entry);
1217 audit_log_rule_change(loginuid, sessionid, sid, "remove rule", 1073 audit_log_rule_change("remove rule", &entry->rule, !err);
1218 &entry->rule, !err);
1219
1220 audit_free_rule(entry); 1074 audit_free_rule(entry);
1221 break; 1075 break;
1222 default: 1076 default:
@@ -1354,7 +1208,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
1354 return strncmp(p, dname, dlen); 1208 return strncmp(p, dname, dlen);
1355} 1209}
1356 1210
1357static int audit_filter_user_rules(struct audit_krule *rule, 1211static int audit_filter_user_rules(struct audit_krule *rule, int type,
1358 enum audit_state *state) 1212 enum audit_state *state)
1359{ 1213{
1360 int i; 1214 int i;
@@ -1378,6 +1232,13 @@ static int audit_filter_user_rules(struct audit_krule *rule,
1378 result = audit_uid_comparator(audit_get_loginuid(current), 1232 result = audit_uid_comparator(audit_get_loginuid(current),
1379 f->op, f->uid); 1233 f->op, f->uid);
1380 break; 1234 break;
1235 case AUDIT_LOGINUID_SET:
1236 result = audit_comparator(audit_loginuid_set(current),
1237 f->op, f->val);
1238 break;
1239 case AUDIT_MSGTYPE:
1240 result = audit_comparator(type, f->op, f->val);
1241 break;
1381 case AUDIT_SUBJ_USER: 1242 case AUDIT_SUBJ_USER:
1382 case AUDIT_SUBJ_ROLE: 1243 case AUDIT_SUBJ_ROLE:
1383 case AUDIT_SUBJ_TYPE: 1244 case AUDIT_SUBJ_TYPE:
@@ -1404,7 +1265,7 @@ static int audit_filter_user_rules(struct audit_krule *rule,
1404 return 1; 1265 return 1;
1405} 1266}
1406 1267
1407int audit_filter_user(void) 1268int audit_filter_user(int type)
1408{ 1269{
1409 enum audit_state state = AUDIT_DISABLED; 1270 enum audit_state state = AUDIT_DISABLED;
1410 struct audit_entry *e; 1271 struct audit_entry *e;
@@ -1412,7 +1273,7 @@ int audit_filter_user(void)
1412 1273
1413 rcu_read_lock(); 1274 rcu_read_lock();
1414 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { 1275 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
1415 if (audit_filter_user_rules(&e->rule, &state)) { 1276 if (audit_filter_user_rules(&e->rule, type, &state)) {
1416 if (state == AUDIT_DISABLED) 1277 if (state == AUDIT_DISABLED)
1417 ret = 0; 1278 ret = 0;
1418 break; 1279 break;
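
With the message type now matched inside the filter itself (the new AUDIT_MSGTYPE and AUDIT_LOGINUID_SET cases above), callers pass the netlink message type straight to audit_filter_user(). A minimal sketch of the expected calling pattern follows; the handler name is illustrative, the real call site being the userspace-message path in kernel/audit.c:

#include <linux/audit.h>

/* Illustrative only, not the in-tree handler. */
static int example_handle_user_msg(int msg_type, const char *payload)
{
	/* Non-zero means "record this message"; a matching rule whose
	 * action is AUDIT_NEVER makes audit_filter_user() return 0. */
	if (!audit_filter_user(msg_type))
		return 0;		/* filtered out, nothing to log */

	/* ... emit the AUDIT_USER* record carrying @payload ... */
	return 0;
}
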
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a371f857a0a9..3c8a601324a2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -76,11 +76,6 @@
76#define AUDITSC_SUCCESS 1 76#define AUDITSC_SUCCESS 1
77#define AUDITSC_FAILURE 2 77#define AUDITSC_FAILURE 2
78 78
79/* AUDIT_NAMES is the number of slots we reserve in the audit_context
80 * for saving names from getname(). If we get more names we will allocate
81 * a name dynamically and also add those to the list anchored by names_list. */
82#define AUDIT_NAMES 5
83
84/* no execve audit message should be longer than this (userspace limits) */ 79/* no execve audit message should be longer than this (userspace limits) */
85#define MAX_EXECVE_AUDIT_LEN 7500 80#define MAX_EXECVE_AUDIT_LEN 7500
86 81
@@ -90,44 +85,6 @@ int audit_n_rules;
90/* determines whether we collect data for signals sent */ 85/* determines whether we collect data for signals sent */
91int audit_signals; 86int audit_signals;
92 87
93struct audit_cap_data {
94 kernel_cap_t permitted;
95 kernel_cap_t inheritable;
96 union {
97 unsigned int fE; /* effective bit of a file capability */
98 kernel_cap_t effective; /* effective set of a process */
99 };
100};
101
102/* When fs/namei.c:getname() is called, we store the pointer in name and
103 * we don't let putname() free it (instead we free all of the saved
104 * pointers at syscall exit time).
105 *
106 * Further, in fs/namei.c:path_lookup() we store the inode and device.
107 */
108struct audit_names {
109 struct list_head list; /* audit_context->names_list */
110 struct filename *name;
111 unsigned long ino;
112 dev_t dev;
113 umode_t mode;
114 kuid_t uid;
115 kgid_t gid;
116 dev_t rdev;
117 u32 osid;
118 struct audit_cap_data fcap;
119 unsigned int fcap_ver;
120 int name_len; /* number of name's characters to log */
121 unsigned char type; /* record type */
122 bool name_put; /* call __putname() for this name */
123 /*
124 * This was an allocated audit_names and not from the array of
125 * names allocated in the task audit context. Thus this name
126 * should be freed on syscall exit
127 */
128 bool should_free;
129};
130
131struct audit_aux_data { 88struct audit_aux_data {
132 struct audit_aux_data *next; 89 struct audit_aux_data *next;
133 int type; 90 int type;
@@ -175,106 +132,6 @@ struct audit_tree_refs {
175 struct audit_chunk *c[31]; 132 struct audit_chunk *c[31];
176}; 133};
177 134
178/* The per-task audit context. */
179struct audit_context {
180 int dummy; /* must be the first element */
181 int in_syscall; /* 1 if task is in a syscall */
182 enum audit_state state, current_state;
183 unsigned int serial; /* serial number for record */
184 int major; /* syscall number */
185 struct timespec ctime; /* time of syscall entry */
186 unsigned long argv[4]; /* syscall arguments */
187 long return_code;/* syscall return code */
188 u64 prio;
189 int return_valid; /* return code is valid */
190 /*
191 * The names_list is the list of all audit_names collected during this
192 * syscall. The first AUDIT_NAMES entries in the names_list will
193 * actually be from the preallocated_names array for performance
194 * reasons. Except during allocation they should never be referenced
195 * through the preallocated_names array and should only be found/used
196 * by running the names_list.
197 */
198 struct audit_names preallocated_names[AUDIT_NAMES];
199 int name_count; /* total records in names_list */
200 struct list_head names_list; /* anchor for struct audit_names->list */
201 char * filterkey; /* key for rule that triggered record */
202 struct path pwd;
203 struct audit_aux_data *aux;
204 struct audit_aux_data *aux_pids;
205 struct sockaddr_storage *sockaddr;
206 size_t sockaddr_len;
207 /* Save things to print about task_struct */
208 pid_t pid, ppid;
209 kuid_t uid, euid, suid, fsuid;
210 kgid_t gid, egid, sgid, fsgid;
211 unsigned long personality;
212 int arch;
213
214 pid_t target_pid;
215 kuid_t target_auid;
216 kuid_t target_uid;
217 unsigned int target_sessionid;
218 u32 target_sid;
219 char target_comm[TASK_COMM_LEN];
220
221 struct audit_tree_refs *trees, *first_trees;
222 struct list_head killed_trees;
223 int tree_count;
224
225 int type;
226 union {
227 struct {
228 int nargs;
229 long args[6];
230 } socketcall;
231 struct {
232 kuid_t uid;
233 kgid_t gid;
234 umode_t mode;
235 u32 osid;
236 int has_perm;
237 uid_t perm_uid;
238 gid_t perm_gid;
239 umode_t perm_mode;
240 unsigned long qbytes;
241 } ipc;
242 struct {
243 mqd_t mqdes;
244 struct mq_attr mqstat;
245 } mq_getsetattr;
246 struct {
247 mqd_t mqdes;
248 int sigev_signo;
249 } mq_notify;
250 struct {
251 mqd_t mqdes;
252 size_t msg_len;
253 unsigned int msg_prio;
254 struct timespec abs_timeout;
255 } mq_sendrecv;
256 struct {
257 int oflag;
258 umode_t mode;
259 struct mq_attr attr;
260 } mq_open;
261 struct {
262 pid_t pid;
263 struct audit_cap_data cap;
264 } capset;
265 struct {
266 int fd;
267 int flags;
268 } mmap;
269 };
270 int fds[2];
271
272#if AUDIT_DEBUG
273 int put_count;
274 int ino_count;
275#endif
276};
277
278static inline int open_arg(int flags, int mask) 135static inline int open_arg(int flags, int mask)
279{ 136{
280 int n = ACC_MODE(flags); 137 int n = ACC_MODE(flags);
@@ -633,9 +490,23 @@ static int audit_filter_rules(struct task_struct *tsk,
633 break; 490 break;
634 case AUDIT_GID: 491 case AUDIT_GID:
635 result = audit_gid_comparator(cred->gid, f->op, f->gid); 492 result = audit_gid_comparator(cred->gid, f->op, f->gid);
493 if (f->op == Audit_equal) {
494 if (!result)
495 result = in_group_p(f->gid);
496 } else if (f->op == Audit_not_equal) {
497 if (result)
498 result = !in_group_p(f->gid);
499 }
636 break; 500 break;
637 case AUDIT_EGID: 501 case AUDIT_EGID:
638 result = audit_gid_comparator(cred->egid, f->op, f->gid); 502 result = audit_gid_comparator(cred->egid, f->op, f->gid);
503 if (f->op == Audit_equal) {
504 if (!result)
505 result = in_egroup_p(f->gid);
506 } else if (f->op == Audit_not_equal) {
507 if (result)
508 result = !in_egroup_p(f->gid);
509 }
639 break; 510 break;
640 case AUDIT_SGID: 511 case AUDIT_SGID:
641 result = audit_gid_comparator(cred->sgid, f->op, f->gid); 512 result = audit_gid_comparator(cred->sgid, f->op, f->gid);
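
The AUDIT_GID/AUDIT_EGID hunk above changes what an equality filter means: a "=" rule now also matches when the requested gid is one of the task's supplementary groups, and "!=" only matches when it is in neither place. A condensed, hedged restatement of the "=" case (the helper name is made up; gid_eq() and in_group_p() are the primitives used above, and in_group_p() consults the calling task's group list):

/* Not kernel code: what an AUDIT_GID "=" rule effectively tests now. */
static bool audit_gid_equal_matches(const struct cred *cred, kgid_t want)
{
	return gid_eq(cred->gid, want) || in_group_p(want);
}
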
@@ -742,6 +613,9 @@ static int audit_filter_rules(struct task_struct *tsk,
742 if (ctx) 613 if (ctx)
743 result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); 614 result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
744 break; 615 break;
616 case AUDIT_LOGINUID_SET:
617 result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
618 break;
745 case AUDIT_SUBJ_USER: 619 case AUDIT_SUBJ_USER:
746 case AUDIT_SUBJ_ROLE: 620 case AUDIT_SUBJ_ROLE:
747 case AUDIT_SUBJ_TYPE: 621 case AUDIT_SUBJ_TYPE:
@@ -987,6 +861,8 @@ static inline void audit_free_names(struct audit_context *context)
987 861
988#if AUDIT_DEBUG == 2 862#if AUDIT_DEBUG == 2
989 if (context->put_count + context->ino_count != context->name_count) { 863 if (context->put_count + context->ino_count != context->name_count) {
864 int i = 0;
865
990 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" 866 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
991 " name_count=%d put_count=%d" 867 " name_count=%d put_count=%d"
992 " ino_count=%d [NOT freeing]\n", 868 " ino_count=%d [NOT freeing]\n",
@@ -995,7 +871,7 @@ static inline void audit_free_names(struct audit_context *context)
995 context->name_count, context->put_count, 871 context->name_count, context->put_count,
996 context->ino_count); 872 context->ino_count);
997 list_for_each_entry(n, &context->names_list, list) { 873 list_for_each_entry(n, &context->names_list, list) {
998 printk(KERN_ERR "names[%d] = %p = %s\n", i, 874 printk(KERN_ERR "names[%d] = %p = %s\n", i++,
999 n->name, n->name->name ?: "(null)"); 875 n->name, n->name->name ?: "(null)");
1000 } 876 }
1001 dump_stack(); 877 dump_stack();
@@ -1010,7 +886,7 @@ static inline void audit_free_names(struct audit_context *context)
1010 list_for_each_entry_safe(n, next, &context->names_list, list) { 886 list_for_each_entry_safe(n, next, &context->names_list, list) {
1011 list_del(&n->list); 887 list_del(&n->list);
1012 if (n->name && n->name_put) 888 if (n->name && n->name_put)
1013 __putname(n->name); 889 final_putname(n->name);
1014 if (n->should_free) 890 if (n->should_free)
1015 kfree(n); 891 kfree(n);
1016 } 892 }
@@ -1034,21 +910,15 @@ static inline void audit_free_aux(struct audit_context *context)
1034 } 910 }
1035} 911}
1036 912
1037static inline void audit_zero_context(struct audit_context *context,
1038 enum audit_state state)
1039{
1040 memset(context, 0, sizeof(*context));
1041 context->state = state;
1042 context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1043}
1044
1045static inline struct audit_context *audit_alloc_context(enum audit_state state) 913static inline struct audit_context *audit_alloc_context(enum audit_state state)
1046{ 914{
1047 struct audit_context *context; 915 struct audit_context *context;
1048 916
1049 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) 917 context = kzalloc(sizeof(*context), GFP_KERNEL);
918 if (!context)
1050 return NULL; 919 return NULL;
1051 audit_zero_context(context, state); 920 context->state = state;
921 context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1052 INIT_LIST_HEAD(&context->killed_trees); 922 INIT_LIST_HEAD(&context->killed_trees);
1053 INIT_LIST_HEAD(&context->names_list); 923 INIT_LIST_HEAD(&context->names_list);
1054 return context; 924 return context;
@@ -1099,88 +969,6 @@ static inline void audit_free_context(struct audit_context *context)
1099 kfree(context); 969 kfree(context);
1100} 970}
1101 971
1102void audit_log_task_context(struct audit_buffer *ab)
1103{
1104 char *ctx = NULL;
1105 unsigned len;
1106 int error;
1107 u32 sid;
1108
1109 security_task_getsecid(current, &sid);
1110 if (!sid)
1111 return;
1112
1113 error = security_secid_to_secctx(sid, &ctx, &len);
1114 if (error) {
1115 if (error != -EINVAL)
1116 goto error_path;
1117 return;
1118 }
1119
1120 audit_log_format(ab, " subj=%s", ctx);
1121 security_release_secctx(ctx, len);
1122 return;
1123
1124error_path:
1125 audit_panic("error in audit_log_task_context");
1126 return;
1127}
1128
1129EXPORT_SYMBOL(audit_log_task_context);
1130
1131void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1132{
1133 const struct cred *cred;
1134 char name[sizeof(tsk->comm)];
1135 struct mm_struct *mm = tsk->mm;
1136 char *tty;
1137
1138 if (!ab)
1139 return;
1140
1141 /* tsk == current */
1142 cred = current_cred();
1143
1144 spin_lock_irq(&tsk->sighand->siglock);
1145 if (tsk->signal && tsk->signal->tty)
1146 tty = tsk->signal->tty->name;
1147 else
1148 tty = "(none)";
1149 spin_unlock_irq(&tsk->sighand->siglock);
1150
1151
1152 audit_log_format(ab,
1153 " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
1154 " euid=%u suid=%u fsuid=%u"
1155 " egid=%u sgid=%u fsgid=%u ses=%u tty=%s",
1156 sys_getppid(),
1157 tsk->pid,
1158 from_kuid(&init_user_ns, tsk->loginuid),
1159 from_kuid(&init_user_ns, cred->uid),
1160 from_kgid(&init_user_ns, cred->gid),
1161 from_kuid(&init_user_ns, cred->euid),
1162 from_kuid(&init_user_ns, cred->suid),
1163 from_kuid(&init_user_ns, cred->fsuid),
1164 from_kgid(&init_user_ns, cred->egid),
1165 from_kgid(&init_user_ns, cred->sgid),
1166 from_kgid(&init_user_ns, cred->fsgid),
1167 tsk->sessionid, tty);
1168
1169 get_task_comm(name, tsk);
1170 audit_log_format(ab, " comm=");
1171 audit_log_untrustedstring(ab, name);
1172
1173 if (mm) {
1174 down_read(&mm->mmap_sem);
1175 if (mm->exe_file)
1176 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
1177 up_read(&mm->mmap_sem);
1178 }
1179 audit_log_task_context(ab);
1180}
1181
1182EXPORT_SYMBOL(audit_log_task_info);
1183
1184static int audit_log_pid_context(struct audit_context *context, pid_t pid, 972static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1185 kuid_t auid, kuid_t uid, unsigned int sessionid, 973 kuid_t auid, kuid_t uid, unsigned int sessionid,
1186 u32 sid, char *comm) 974 u32 sid, char *comm)
@@ -1197,12 +985,14 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1197 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, 985 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
1198 from_kuid(&init_user_ns, auid), 986 from_kuid(&init_user_ns, auid),
1199 from_kuid(&init_user_ns, uid), sessionid); 987 from_kuid(&init_user_ns, uid), sessionid);
1200 if (security_secid_to_secctx(sid, &ctx, &len)) { 988 if (sid) {
1201 audit_log_format(ab, " obj=(none)"); 989 if (security_secid_to_secctx(sid, &ctx, &len)) {
1202 rc = 1; 990 audit_log_format(ab, " obj=(none)");
1203 } else { 991 rc = 1;
1204 audit_log_format(ab, " obj=%s", ctx); 992 } else {
1205 security_release_secctx(ctx, len); 993 audit_log_format(ab, " obj=%s", ctx);
994 security_release_secctx(ctx, len);
995 }
1206 } 996 }
1207 audit_log_format(ab, " ocomm="); 997 audit_log_format(ab, " ocomm=");
1208 audit_log_untrustedstring(ab, comm); 998 audit_log_untrustedstring(ab, comm);
@@ -1396,35 +1186,6 @@ static void audit_log_execve_info(struct audit_context *context,
1396 kfree(buf); 1186 kfree(buf);
1397} 1187}
1398 1188
1399static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1400{
1401 int i;
1402
1403 audit_log_format(ab, " %s=", prefix);
1404 CAP_FOR_EACH_U32(i) {
1405 audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
1406 }
1407}
1408
1409static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1410{
1411 kernel_cap_t *perm = &name->fcap.permitted;
1412 kernel_cap_t *inh = &name->fcap.inheritable;
1413 int log = 0;
1414
1415 if (!cap_isclear(*perm)) {
1416 audit_log_cap(ab, "cap_fp", perm);
1417 log = 1;
1418 }
1419 if (!cap_isclear(*inh)) {
1420 audit_log_cap(ab, "cap_fi", inh);
1421 log = 1;
1422 }
1423
1424 if (log)
1425 audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
1426}
1427
1428static void show_special(struct audit_context *context, int *call_panic) 1189static void show_special(struct audit_context *context, int *call_panic)
1429{ 1190{
1430 struct audit_buffer *ab; 1191 struct audit_buffer *ab;
@@ -1522,68 +1283,6 @@ static void show_special(struct audit_context *context, int *call_panic)
1522 audit_log_end(ab); 1283 audit_log_end(ab);
1523} 1284}
1524 1285
1525static void audit_log_name(struct audit_context *context, struct audit_names *n,
1526 int record_num, int *call_panic)
1527{
1528 struct audit_buffer *ab;
1529 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
1530 if (!ab)
1531 return; /* audit_panic has been called */
1532
1533 audit_log_format(ab, "item=%d", record_num);
1534
1535 if (n->name) {
1536 switch (n->name_len) {
1537 case AUDIT_NAME_FULL:
1538 /* log the full path */
1539 audit_log_format(ab, " name=");
1540 audit_log_untrustedstring(ab, n->name->name);
1541 break;
1542 case 0:
1543 /* name was specified as a relative path and the
1544 * directory component is the cwd */
1545 audit_log_d_path(ab, " name=", &context->pwd);
1546 break;
1547 default:
1548 /* log the name's directory component */
1549 audit_log_format(ab, " name=");
1550 audit_log_n_untrustedstring(ab, n->name->name,
1551 n->name_len);
1552 }
1553 } else
1554 audit_log_format(ab, " name=(null)");
1555
1556 if (n->ino != (unsigned long)-1) {
1557 audit_log_format(ab, " inode=%lu"
1558 " dev=%02x:%02x mode=%#ho"
1559 " ouid=%u ogid=%u rdev=%02x:%02x",
1560 n->ino,
1561 MAJOR(n->dev),
1562 MINOR(n->dev),
1563 n->mode,
1564 from_kuid(&init_user_ns, n->uid),
1565 from_kgid(&init_user_ns, n->gid),
1566 MAJOR(n->rdev),
1567 MINOR(n->rdev));
1568 }
1569 if (n->osid != 0) {
1570 char *ctx = NULL;
1571 u32 len;
1572 if (security_secid_to_secctx(
1573 n->osid, &ctx, &len)) {
1574 audit_log_format(ab, " osid=%u", n->osid);
1575 *call_panic = 2;
1576 } else {
1577 audit_log_format(ab, " obj=%s", ctx);
1578 security_release_secctx(ctx, len);
1579 }
1580 }
1581
1582 audit_log_fcaps(ab, n);
1583
1584 audit_log_end(ab);
1585}
1586
1587static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1286static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1588{ 1287{
1589 int i, call_panic = 0; 1288 int i, call_panic = 0;
@@ -1701,7 +1400,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1701 1400
1702 i = 0; 1401 i = 0;
1703 list_for_each_entry(n, &context->names_list, list) 1402 list_for_each_entry(n, &context->names_list, list)
1704 audit_log_name(context, n, i++, &call_panic); 1403 audit_log_name(context, n, NULL, i++, &call_panic);
1705 1404
1706 /* Send end of event record to help user space know we are finished */ 1405 /* Send end of event record to help user space know we are finished */
1707 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); 1406 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -2036,18 +1735,18 @@ void audit_putname(struct filename *name)
2036 BUG_ON(!context); 1735 BUG_ON(!context);
2037 if (!context->in_syscall) { 1736 if (!context->in_syscall) {
2038#if AUDIT_DEBUG == 2 1737#if AUDIT_DEBUG == 2
2039 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", 1738 printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n",
2040 __FILE__, __LINE__, context->serial, name); 1739 __FILE__, __LINE__, context->serial, name);
2041 if (context->name_count) { 1740 if (context->name_count) {
2042 struct audit_names *n; 1741 struct audit_names *n;
2043 int i; 1742 int i = 0;
2044 1743
2045 list_for_each_entry(n, &context->names_list, list) 1744 list_for_each_entry(n, &context->names_list, list)
2046 printk(KERN_ERR "name[%d] = %p = %s\n", i, 1745 printk(KERN_ERR "name[%d] = %p = %s\n", i++,
2047 n->name, n->name->name ?: "(null)"); 1746 n->name, n->name->name ?: "(null)");
2048 } 1747 }
2049#endif 1748#endif
2050 __putname(name); 1749 final_putname(name);
2051 } 1750 }
2052#if AUDIT_DEBUG 1751#if AUDIT_DEBUG
2053 else { 1752 else {
@@ -2066,41 +1765,6 @@ void audit_putname(struct filename *name)
2066#endif 1765#endif
2067} 1766}
2068 1767
2069static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
2070{
2071 struct cpu_vfs_cap_data caps;
2072 int rc;
2073
2074 if (!dentry)
2075 return 0;
2076
2077 rc = get_vfs_caps_from_disk(dentry, &caps);
2078 if (rc)
2079 return rc;
2080
2081 name->fcap.permitted = caps.permitted;
2082 name->fcap.inheritable = caps.inheritable;
2083 name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
2084 name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
2085
2086 return 0;
2087}
2088
2089
2090/* Copy inode data into an audit_names. */
2091static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
2092 const struct inode *inode)
2093{
2094 name->ino = inode->i_ino;
2095 name->dev = inode->i_sb->s_dev;
2096 name->mode = inode->i_mode;
2097 name->uid = inode->i_uid;
2098 name->gid = inode->i_gid;
2099 name->rdev = inode->i_rdev;
2100 security_inode_getsecid(inode, &name->osid);
2101 audit_copy_fcaps(name, dentry);
2102}
2103
2104/** 1768/**
2105 * __audit_inode - store the inode and device from a lookup 1769 * __audit_inode - store the inode and device from a lookup
2106 * @name: name being audited 1770 * @name: name being audited
@@ -2309,7 +1973,7 @@ int audit_set_loginuid(kuid_t loginuid)
2309 unsigned int sessionid; 1973 unsigned int sessionid;
2310 1974
2311#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE 1975#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
2312 if (uid_valid(task->loginuid)) 1976 if (audit_loginuid_set(task))
2313 return -EPERM; 1977 return -EPERM;
2314#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ 1978#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2315 if (!capable(CAP_AUDIT_CONTROL)) 1979 if (!capable(CAP_AUDIT_CONTROL))
@@ -2477,17 +2141,20 @@ int __audit_bprm(struct linux_binprm *bprm)
2477 2141
2478/** 2142/**
2479 * audit_socketcall - record audit data for sys_socketcall 2143 * audit_socketcall - record audit data for sys_socketcall
2480 * @nargs: number of args 2144 * @nargs: number of args, which should not be more than AUDITSC_ARGS.
2481 * @args: args array 2145 * @args: args array
2482 * 2146 *
2483 */ 2147 */
2484void __audit_socketcall(int nargs, unsigned long *args) 2148int __audit_socketcall(int nargs, unsigned long *args)
2485{ 2149{
2486 struct audit_context *context = current->audit_context; 2150 struct audit_context *context = current->audit_context;
2487 2151
2152 if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
2153 return -EINVAL;
2488 context->type = AUDIT_SOCKETCALL; 2154 context->type = AUDIT_SOCKETCALL;
2489 context->socketcall.nargs = nargs; 2155 context->socketcall.nargs = nargs;
2490 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); 2156 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
2157 return 0;
2491} 2158}
2492 2159
2493/** 2160/**
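
Because __audit_socketcall() now returns an int, a caller is expected to check the result before anything is copied into context->socketcall.args; the validation rejects counts outside 1..AUDITSC_ARGS. A hedged sketch of that pattern (the dispatcher below is illustrative; in the tree the check reaches net/socket.c through the audit_socketcall() inline wrapper):

/* Illustrative caller: fail the syscall early on a bogus argument count. */
static long example_socketcall(int call, unsigned long *args, int nargs)
{
	int err = __audit_socketcall(nargs, args);

	if (err)
		return err;	/* -EINVAL: nargs out of range or args == NULL */

	/* ... dispatch to the individual socket syscalls ... */
	return 0;
}
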
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d97259484..f6c2ce5701e1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
393EXPORT_SYMBOL(ns_capable); 393EXPORT_SYMBOL(ns_capable);
394 394
395/** 395/**
396 * file_ns_capable - Determine if the file's opener had a capability in effect
397 * @file: The file we want to check
398 * @ns: The usernamespace we want the capability in
399 * @cap: The capability to be tested for
400 *
401 * Return true if task that opened the file had a capability in effect
402 * when the file was opened.
403 *
404 * This does not set PF_SUPERPRIV because the caller may not
405 * actually be privileged.
406 */
407bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
408{
409 if (WARN_ON_ONCE(!cap_valid(cap)))
410 return false;
411
412 if (security_capable(file->f_cred, ns, cap) == 0)
413 return true;
414
415 return false;
416}
417EXPORT_SYMBOL(file_ns_capable);
418
419/**
396 * capable - Determine if the current task has a superior capability in effect 420 * capable - Determine if the current task has a superior capability in effect
397 * @cap: The capability to be tested for 421 * @cap: The capability to be tested for
398 * 422 *
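
file_ns_capable() tests the credentials recorded at open time (file->f_cred) rather than the caller's, which matters when a privileged opener later hands the descriptor to a less privileged task. A minimal sketch of the intended use; the handler and the choice of CAP_SYS_ADMIN in init_user_ns are illustrative:

#include <linux/capability.h>
#include <linux/fs.h>

static ssize_t example_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	/* Trust the opener, not whoever is writing now. */
	if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	/* ... perform the privileged update ... */
	return count;
}
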
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a32f9432666c..a7c9e6ddb979 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -30,7 +30,6 @@
30#include <linux/cred.h> 30#include <linux/cred.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
32#include <linux/errno.h> 32#include <linux/errno.h>
33#include <linux/fs.h>
34#include <linux/init_task.h> 33#include <linux/init_task.h>
35#include <linux/kernel.h> 34#include <linux/kernel.h>
36#include <linux/list.h> 35#include <linux/list.h>
@@ -59,7 +58,7 @@
59#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
60#include <linux/eventfd.h> 59#include <linux/eventfd.h>
61#include <linux/poll.h> 60#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 61#include <linux/flex_array.h> /* used in cgroup_attach_task */
63#include <linux/kthread.h> 62#include <linux/kthread.h>
64 63
65#include <linux/atomic.h> 64#include <linux/atomic.h>
@@ -83,7 +82,13 @@
83 * B happens only through cgroup_show_options() and using cgroup_root_mutex 82 * B happens only through cgroup_show_options() and using cgroup_root_mutex
84 * breaks it. 83 * breaks it.
85 */ 84 */
85#ifdef CONFIG_PROVE_RCU
86DEFINE_MUTEX(cgroup_mutex);
87EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */
88#else
86static DEFINE_MUTEX(cgroup_mutex); 89static DEFINE_MUTEX(cgroup_mutex);
90#endif
91
87static DEFINE_MUTEX(cgroup_root_mutex); 92static DEFINE_MUTEX(cgroup_root_mutex);
88 93
89/* 94/*
@@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
98#include <linux/cgroup_subsys.h> 103#include <linux/cgroup_subsys.h>
99}; 104};
100 105
101#define MAX_CGROUP_ROOT_NAMELEN 64
102
103/*
104 * A cgroupfs_root represents the root of a cgroup hierarchy,
105 * and may be associated with a superblock to form an active
106 * hierarchy
107 */
108struct cgroupfs_root {
109 struct super_block *sb;
110
111 /*
112 * The bitmask of subsystems intended to be attached to this
113 * hierarchy
114 */
115 unsigned long subsys_mask;
116
117 /* Unique id for this hierarchy. */
118 int hierarchy_id;
119
120 /* The bitmask of subsystems currently attached to this hierarchy */
121 unsigned long actual_subsys_mask;
122
123 /* A list running through the attached subsystems */
124 struct list_head subsys_list;
125
126 /* The root cgroup for this hierarchy */
127 struct cgroup top_cgroup;
128
129 /* Tracks how many cgroups are currently defined in hierarchy.*/
130 int number_of_cgroups;
131
132 /* A list running through the active hierarchies */
133 struct list_head root_list;
134
135 /* All cgroups on this root, cgroup_mutex protected */
136 struct list_head allcg_list;
137
138 /* Hierarchy-specific flags */
139 unsigned long flags;
140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
144 /* The path to use for release notifications. */
145 char release_agent_path[PATH_MAX];
146
147 /* The name for this hierarchy - may be empty */
148 char name[MAX_CGROUP_ROOT_NAMELEN];
149};
150
151/* 106/*
152 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 107 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
153 * subsystems that are otherwise unattached - it never has more than a 108 * subsystems that are otherwise unattached - it never has more than a
@@ -162,6 +117,9 @@ struct cfent {
162 struct list_head node; 117 struct list_head node;
163 struct dentry *dentry; 118 struct dentry *dentry;
164 struct cftype *type; 119 struct cftype *type;
120
121 /* file xattrs */
122 struct simple_xattrs xattrs;
165}; 123};
166 124
167/* 125/*
@@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
238/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 196/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
239#define dummytop (&rootnode.top_cgroup) 197#define dummytop (&rootnode.top_cgroup)
240 198
199static struct cgroup_name root_cgroup_name = { .name = "/" };
200
241/* This flag indicates whether tasks in the fork and exit paths should 201/* This flag indicates whether tasks in the fork and exit paths should
242 * check for fork/exit handlers to call. This avoids us having to do 202 * check for fork/exit handlers to call. This avoids us having to do
243 * extra work in the fork/exit path if none of the subsystems need to 203 * extra work in the fork/exit path if none of the subsystems need to
@@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 209static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add); 210 struct cftype cfts[], bool is_add);
251 211
252#ifdef CONFIG_PROVE_LOCKING
253int cgroup_lock_is_held(void)
254{
255 return lockdep_is_held(&cgroup_mutex);
256}
257#else /* #ifdef CONFIG_PROVE_LOCKING */
258int cgroup_lock_is_held(void)
259{
260 return mutex_is_locked(&cgroup_mutex);
261}
262#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
263
264EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
265
266static int css_unbias_refcnt(int refcnt) 212static int css_unbias_refcnt(int refcnt)
267{ 213{
268 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; 214 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
@@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
282 return test_bit(CGRP_REMOVED, &cgrp->flags); 228 return test_bit(CGRP_REMOVED, &cgrp->flags);
283} 229}
284 230
285/* bits in struct cgroupfs_root flags field */ 231/**
286enum { 232 * cgroup_is_descendant - test ancestry
287 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 233 * @cgrp: the cgroup to be tested
288 ROOT_XATTR, /* supports extended attributes */ 234 * @ancestor: possible ancestor of @cgrp
289}; 235 *
236 * Test whether @cgrp is a descendant of @ancestor. It also returns %true
237 * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
238 * and @ancestor are accessible.
239 */
240bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
241{
242 while (cgrp) {
243 if (cgrp == ancestor)
244 return true;
245 cgrp = cgrp->parent;
246 }
247 return false;
248}
249EXPORT_SYMBOL_GPL(cgroup_is_descendant);
290 250
291static int cgroup_is_releasable(const struct cgroup *cgrp) 251static int cgroup_is_releasable(const struct cgroup *cgrp)
292{ 252{
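
A short, hedged sketch of how a controller might use the new ancestry helper; the policy and function name are invented. Per the kernel-doc above, the walk is safe as long as both cgroups stay accessible, and @cgrp == @ancestor counts as a match:

#include <linux/cgroup.h>

/* Illustrative: confine an operation to a subtree of the hierarchy. */
static int example_check_subtree(struct cgroup *cgrp, struct cgroup *policy_root)
{
	if (!cgroup_is_descendant(cgrp, policy_root))
		return -EACCES;	/* @cgrp lives outside the policed subtree */
	return 0;
}
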
@@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
327 return __d_cfe(dentry)->type; 287 return __d_cfe(dentry)->type;
328} 288}
329 289
290/**
291 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
292 * @cgrp: the cgroup to be checked for liveness
293 *
294 * On success, returns true; the mutex should be later unlocked. On
295 * failure returns false with no lock held.
296 */
297static bool cgroup_lock_live_group(struct cgroup *cgrp)
298{
299 mutex_lock(&cgroup_mutex);
300 if (cgroup_is_removed(cgrp)) {
301 mutex_unlock(&cgroup_mutex);
302 return false;
303 }
304 return true;
305}
306
330/* the list of cgroups eligible for automatic release. Protected by 307/* the list of cgroups eligible for automatic release. Protected by
331 * release_list_lock */ 308 * release_list_lock */
332static LIST_HEAD(release_list); 309static LIST_HEAD(release_list);
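
cgroup_lock_live_group() packages the common "take cgroup_mutex, but bail out if the cgroup was removed underneath us" step used by cgroup file write handlers. A sketch of the usual shape of a caller inside kernel/cgroup.c (the handler itself is hypothetical; the matching unlock is a plain mutex_unlock(&cgroup_mutex)):

/* Illustrative write handler pattern. */
static int example_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;		/* cgroup already removed */

	/* ... update @cgrp state under cgroup_mutex ... */

	mutex_unlock(&cgroup_mutex);
	return 0;
}
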
@@ -800,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
800 * update of a tasks cgroup pointer by cgroup_attach_task() 777 * update of a tasks cgroup pointer by cgroup_attach_task()
801 */ 778 */
802 779
803/**
804 * cgroup_lock - lock out any changes to cgroup structures
805 *
806 */
807void cgroup_lock(void)
808{
809 mutex_lock(&cgroup_mutex);
810}
811EXPORT_SYMBOL_GPL(cgroup_lock);
812
813/**
814 * cgroup_unlock - release lock on cgroup changes
815 *
816 * Undo the lock taken in a previous cgroup_lock() call.
817 */
818void cgroup_unlock(void)
819{
820 mutex_unlock(&cgroup_mutex);
821}
822EXPORT_SYMBOL_GPL(cgroup_unlock);
823
824/* 780/*
825 * A couple of forward declarations required, due to cyclic reference loop: 781 * A couple of forward declarations required, due to cyclic reference loop:
826 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> 782 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
@@ -859,6 +815,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
859 return inode; 815 return inode;
860} 816}
861 817
818static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
819{
820 struct cgroup_name *name;
821
822 name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
823 if (!name)
824 return NULL;
825 strcpy(name->name, dentry->d_name.name);
826 return name;
827}
828
862static void cgroup_free_fn(struct work_struct *work) 829static void cgroup_free_fn(struct work_struct *work)
863{ 830{
864 struct cgroup *cgrp = container_of(work, struct cgroup, free_work); 831 struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
@@ -875,8 +842,18 @@ static void cgroup_free_fn(struct work_struct *work)
875 mutex_unlock(&cgroup_mutex); 842 mutex_unlock(&cgroup_mutex);
876 843
877 /* 844 /*
845 * We get a ref to the parent's dentry, and put the ref when
846 * this cgroup is being freed, so it's guaranteed that the
847 * parent won't be destroyed before its children.
848 */
849 dput(cgrp->parent->dentry);
850
851 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
852
853 /*
878 * Drop the active superblock reference that we took when we 854 * Drop the active superblock reference that we took when we
879 * created the cgroup 855 * created the cgroup. This will free cgrp->root, if we are
856 * holding the last reference to @sb.
880 */ 857 */
881 deactivate_super(cgrp->root->sb); 858 deactivate_super(cgrp->root->sb);
882 859
@@ -888,7 +865,7 @@ static void cgroup_free_fn(struct work_struct *work)
888 865
889 simple_xattrs_free(&cgrp->xattrs); 866 simple_xattrs_free(&cgrp->xattrs);
890 867
891 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); 868 kfree(rcu_dereference_raw(cgrp->name));
892 kfree(cgrp); 869 kfree(cgrp);
893} 870}
894 871
@@ -910,13 +887,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
910 } else { 887 } else {
911 struct cfent *cfe = __d_cfe(dentry); 888 struct cfent *cfe = __d_cfe(dentry);
912 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 889 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
913 struct cftype *cft = cfe->type;
914 890
915 WARN_ONCE(!list_empty(&cfe->node) && 891 WARN_ONCE(!list_empty(&cfe->node) &&
916 cgrp != &cgrp->root->top_cgroup, 892 cgrp != &cgrp->root->top_cgroup,
917 "cfe still linked for %s\n", cfe->type->name); 893 "cfe still linked for %s\n", cfe->type->name);
894 simple_xattrs_free(&cfe->xattrs);
918 kfree(cfe); 895 kfree(cfe);
919 simple_xattrs_free(&cft->xattrs);
920 } 896 }
921 iput(inode); 897 iput(inode);
922} 898}
@@ -1108,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1108 mutex_lock(&cgroup_root_mutex); 1084 mutex_lock(&cgroup_root_mutex);
1109 for_each_subsys(root, ss) 1085 for_each_subsys(root, ss)
1110 seq_printf(seq, ",%s", ss->name); 1086 seq_printf(seq, ",%s", ss->name);
1111 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1087 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1088 seq_puts(seq, ",sane_behavior");
1089 if (root->flags & CGRP_ROOT_NOPREFIX)
1112 seq_puts(seq, ",noprefix"); 1090 seq_puts(seq, ",noprefix");
1113 if (test_bit(ROOT_XATTR, &root->flags)) 1091 if (root->flags & CGRP_ROOT_XATTR)
1114 seq_puts(seq, ",xattr"); 1092 seq_puts(seq, ",xattr");
1115 if (strlen(root->release_agent_path)) 1093 if (strlen(root->release_agent_path))
1116 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1094 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
@@ -1172,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1172 all_ss = true; 1150 all_ss = true;
1173 continue; 1151 continue;
1174 } 1152 }
1153 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1154 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1155 continue;
1156 }
1175 if (!strcmp(token, "noprefix")) { 1157 if (!strcmp(token, "noprefix")) {
1176 set_bit(ROOT_NOPREFIX, &opts->flags); 1158 opts->flags |= CGRP_ROOT_NOPREFIX;
1177 continue; 1159 continue;
1178 } 1160 }
1179 if (!strcmp(token, "clone_children")) { 1161 if (!strcmp(token, "clone_children")) {
@@ -1181,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1181 continue; 1163 continue;
1182 } 1164 }
1183 if (!strcmp(token, "xattr")) { 1165 if (!strcmp(token, "xattr")) {
1184 set_bit(ROOT_XATTR, &opts->flags); 1166 opts->flags |= CGRP_ROOT_XATTR;
1185 continue; 1167 continue;
1186 } 1168 }
1187 if (!strncmp(token, "release_agent=", 14)) { 1169 if (!strncmp(token, "release_agent=", 14)) {
@@ -1259,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1259 1241
1260 /* Consistency checks */ 1242 /* Consistency checks */
1261 1243
1244 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1245 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1246
1247 if (opts->flags & CGRP_ROOT_NOPREFIX) {
1248 pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
1249 return -EINVAL;
1250 }
1251
1252 if (opts->cpuset_clone_children) {
1253 pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
1254 return -EINVAL;
1255 }
1256 }
1257
1262 /* 1258 /*
1263 * Option noprefix was introduced just for backward compatibility 1259 * Option noprefix was introduced just for backward compatibility
1264 * with the old cpuset, so we allow noprefix only if mounting just 1260 * with the old cpuset, so we allow noprefix only if mounting just
1265 * the cpuset subsystem. 1261 * the cpuset subsystem.
1266 */ 1262 */
1267 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1263 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1268 (opts->subsys_mask & mask))
1269 return -EINVAL; 1264 return -EINVAL;
1270 1265
1271 1266
@@ -1336,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1336 struct cgroup_sb_opts opts; 1331 struct cgroup_sb_opts opts;
1337 unsigned long added_mask, removed_mask; 1332 unsigned long added_mask, removed_mask;
1338 1333
1334 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1335 pr_err("cgroup: sane_behavior: remount is not allowed\n");
1336 return -EINVAL;
1337 }
1338
1339 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1339 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1340 mutex_lock(&cgroup_mutex); 1340 mutex_lock(&cgroup_mutex);
1341 mutex_lock(&cgroup_root_mutex); 1341 mutex_lock(&cgroup_root_mutex);
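
The __DEVEL__sane_behavior token parsed earlier is an ordinary cgroupfs mount option; remounting with it is rejected above, and later hunks require it to match on every mount of the same superblock. A hedged userspace sketch of enabling it on a fresh hierarchy; the mount point and the choice of the cpu controller are arbitrary and assume that controller is not already bound elsewhere:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Assumes /sys/fs/cgroup/test already exists. */
	if (mount("cgroup", "/sys/fs/cgroup/test", "cgroup", 0,
		  "__DEVEL__sane_behavior,cpu") < 0) {
		perror("mount cgroup");
		return 1;
	}
	return 0;
}
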
@@ -1421,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1421 INIT_LIST_HEAD(&root->allcg_list); 1421 INIT_LIST_HEAD(&root->allcg_list);
1422 root->number_of_cgroups = 1; 1422 root->number_of_cgroups = 1;
1423 cgrp->root = root; 1423 cgrp->root = root;
1424 cgrp->top_cgroup = cgrp; 1424 cgrp->name = &root_cgroup_name;
1425 init_cgroup_housekeeping(cgrp); 1425 init_cgroup_housekeeping(cgrp);
1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1427} 1427}
@@ -1685,6 +1685,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1685 * any) is not needed 1685 * any) is not needed
1686 */ 1686 */
1687 cgroup_drop_root(opts.new_root); 1687 cgroup_drop_root(opts.new_root);
1688
1689 if (root->flags != opts.flags) {
1690 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1691 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1692 ret = -EINVAL;
1693 goto drop_new_super;
1694 } else {
1695 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1696 }
1697 }
1698
1688 /* no subsys rebinding, so refcounts don't change */ 1699 /* no subsys rebinding, so refcounts don't change */
1689 drop_parsed_module_refcounts(opts.subsys_mask); 1700 drop_parsed_module_refcounts(opts.subsys_mask);
1690 } 1701 }
@@ -1769,49 +1780,48 @@ static struct kobject *cgroup_kobj;
1769 * @buf: the buffer to write the path into 1780 * @buf: the buffer to write the path into
1770 * @buflen: the length of the buffer 1781 * @buflen: the length of the buffer
1771 * 1782 *
1772 * Called with cgroup_mutex held or else with an RCU-protected cgroup 1783 * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
1773 * reference. Writes path of cgroup into buf. Returns 0 on success, 1784 *
1774 * -errno on error. 1785 * We can't generate cgroup path using dentry->d_name, as accessing
1786 * dentry->d_name must be protected by irq-unsafe dentry->d_lock or parent
1787 * inode's i_mutex, while on the other hand cgroup_path() can be called
1788 * with some irq-safe spinlocks held.
1775 */ 1789 */
1776int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1790int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1777{ 1791{
1778 struct dentry *dentry = cgrp->dentry; 1792 int ret = -ENAMETOOLONG;
1779 char *start; 1793 char *start;
1780 1794
1781 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), 1795 if (!cgrp->parent) {
1782 "cgroup_path() called without proper locking"); 1796 if (strlcpy(buf, "/", buflen) >= buflen)
1783 1797 return -ENAMETOOLONG;
1784 if (cgrp == dummytop) {
1785 /*
1786 * Inactive subsystems have no dentry for their root
1787 * cgroup
1788 */
1789 strcpy(buf, "/");
1790 return 0; 1798 return 0;
1791 } 1799 }
1792 1800
1793 start = buf + buflen - 1; 1801 start = buf + buflen - 1;
1794
1795 *start = '\0'; 1802 *start = '\0';
1796 for (;;) {
1797 int len = dentry->d_name.len;
1798 1803
1804 rcu_read_lock();
1805 do {
1806 const char *name = cgroup_name(cgrp);
1807 int len;
1808
1809 len = strlen(name);
1799 if ((start -= len) < buf) 1810 if ((start -= len) < buf)
1800 return -ENAMETOOLONG; 1811 goto out;
1801 memcpy(start, dentry->d_name.name, len); 1812 memcpy(start, name, len);
1802 cgrp = cgrp->parent;
1803 if (!cgrp)
1804 break;
1805 1813
1806 dentry = cgrp->dentry;
1807 if (!cgrp->parent)
1808 continue;
1809 if (--start < buf) 1814 if (--start < buf)
1810 return -ENAMETOOLONG; 1815 goto out;
1811 *start = '/'; 1816 *start = '/';
1812 } 1817
1818 cgrp = cgrp->parent;
1819 } while (cgrp->parent);
1820 ret = 0;
1813 memmove(buf, start, buf + buflen - start); 1821 memmove(buf, start, buf + buflen - start);
1814 return 0; 1822out:
1823 rcu_read_unlock();
1824 return ret;
1815} 1825}
1816EXPORT_SYMBOL_GPL(cgroup_path); 1826EXPORT_SYMBOL_GPL(cgroup_path);
1817 1827
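
Because the name is now kept in the RCU-managed cgrp->name rather than read from the dentry, cgroup_path() no longer requires cgroup_mutex or an external RCU read lock and may be called with irq-safe spinlocks held. A minimal caller sketch; buffer size and error handling are up to the caller:

#include <linux/cgroup.h>

/* Illustrative: format the cgroup's path for a debug message. */
static void example_report_cgroup(struct cgroup *cgrp)
{
	char buf[256];		/* callers often use PATH_MAX-sized buffers */

	if (cgroup_path(cgrp, buf, sizeof(buf)) < 0)
		return;		/* -ENAMETOOLONG: the path did not fit */

	printk(KERN_DEBUG "cgroup path: %s\n", buf);
}
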
@@ -1900,7 +1910,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1900 * 1910 *
1901 * Must be called with cgroup_mutex and threadgroup locked. 1911 * Must be called with cgroup_mutex and threadgroup locked.
1902 */ 1912 */
1903static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1913static void cgroup_task_migrate(struct cgroup *oldcgrp,
1904 struct task_struct *tsk, struct css_set *newcg) 1914 struct task_struct *tsk, struct css_set *newcg)
1905{ 1915{
1906 struct css_set *oldcg; 1916 struct css_set *oldcg;
@@ -1933,121 +1943,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1933} 1943}
1934 1944
1935/** 1945/**
1936 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1946 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
1937 * @cgrp: the cgroup the task is attaching to
1938 * @tsk: the task to be attached
1939 *
1940 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1941 * @tsk during call.
1942 */
1943int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1944{
1945 int retval = 0;
1946 struct cgroup_subsys *ss, *failed_ss = NULL;
1947 struct cgroup *oldcgrp;
1948 struct cgroupfs_root *root = cgrp->root;
1949 struct cgroup_taskset tset = { };
1950 struct css_set *newcg;
1951
1952 /* @tsk either already exited or can't exit until the end */
1953 if (tsk->flags & PF_EXITING)
1954 return -ESRCH;
1955
1956 /* Nothing to do if the task is already in that cgroup */
1957 oldcgrp = task_cgroup_from_root(tsk, root);
1958 if (cgrp == oldcgrp)
1959 return 0;
1960
1961 tset.single.task = tsk;
1962 tset.single.cgrp = oldcgrp;
1963
1964 for_each_subsys(root, ss) {
1965 if (ss->can_attach) {
1966 retval = ss->can_attach(cgrp, &tset);
1967 if (retval) {
1968 /*
1969 * Remember on which subsystem the can_attach()
1970 * failed, so that we only call cancel_attach()
1971 * against the subsystems whose can_attach()
1972 * succeeded. (See below)
1973 */
1974 failed_ss = ss;
1975 goto out;
1976 }
1977 }
1978 }
1979
1980 newcg = find_css_set(tsk->cgroups, cgrp);
1981 if (!newcg) {
1982 retval = -ENOMEM;
1983 goto out;
1984 }
1985
1986 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
1987
1988 for_each_subsys(root, ss) {
1989 if (ss->attach)
1990 ss->attach(cgrp, &tset);
1991 }
1992
1993out:
1994 if (retval) {
1995 for_each_subsys(root, ss) {
1996 if (ss == failed_ss)
1997 /*
1998 * This subsystem was the one that failed the
1999 * can_attach() check earlier, so we don't need
2000 * to call cancel_attach() against it or any
2001 * remaining subsystems.
2002 */
2003 break;
2004 if (ss->cancel_attach)
2005 ss->cancel_attach(cgrp, &tset);
2006 }
2007 }
2008 return retval;
2009}
2010
2011/**
2012 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2013 * @from: attach to all cgroups of a given task
2014 * @tsk: the task to be attached
2015 */
2016int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2017{
2018 struct cgroupfs_root *root;
2019 int retval = 0;
2020
2021 cgroup_lock();
2022 for_each_active_root(root) {
2023 struct cgroup *from_cg = task_cgroup_from_root(from, root);
2024
2025 retval = cgroup_attach_task(from_cg, tsk);
2026 if (retval)
2027 break;
2028 }
2029 cgroup_unlock();
2030
2031 return retval;
2032}
2033EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2034
2035/**
2036 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
2037 * @cgrp: the cgroup to attach to 1947 * @cgrp: the cgroup to attach to
2038 * @leader: the threadgroup leader task_struct of the group to be attached 1948 * @tsk: the task or the leader of the threadgroup to be attached
1949 * @threadgroup: attach the whole threadgroup?
2039 * 1950 *
2040 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1951 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
2041 * task_lock of each thread in leader's threadgroup individually in turn. 1952 * task_lock of @tsk or each thread in the threadgroup individually in turn.
2042 */ 1953 */
2043static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 1954static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1955 bool threadgroup)
2044{ 1956{
2045 int retval, i, group_size; 1957 int retval, i, group_size;
2046 struct cgroup_subsys *ss, *failed_ss = NULL; 1958 struct cgroup_subsys *ss, *failed_ss = NULL;
2047 /* guaranteed to be initialized later, but the compiler needs this */
2048 struct cgroupfs_root *root = cgrp->root; 1959 struct cgroupfs_root *root = cgrp->root;
2049 /* threadgroup list cursor and array */ 1960 /* threadgroup list cursor and array */
2050 struct task_struct *tsk; 1961 struct task_struct *leader = tsk;
2051 struct task_and_cgroup *tc; 1962 struct task_and_cgroup *tc;
2052 struct flex_array *group; 1963 struct flex_array *group;
2053 struct cgroup_taskset tset = { }; 1964 struct cgroup_taskset tset = { };
@@ -2059,17 +1970,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2059 * group - group_rwsem prevents new threads from appearing, and if 1970 * group - group_rwsem prevents new threads from appearing, and if
2060 * threads exit, this will just be an over-estimate. 1971 * threads exit, this will just be an over-estimate.
2061 */ 1972 */
2062 group_size = get_nr_threads(leader); 1973 if (threadgroup)
1974 group_size = get_nr_threads(tsk);
1975 else
1976 group_size = 1;
2063 /* flex_array supports very large thread-groups better than kmalloc. */ 1977 /* flex_array supports very large thread-groups better than kmalloc. */
2064 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 1978 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2065 if (!group) 1979 if (!group)
2066 return -ENOMEM; 1980 return -ENOMEM;
2067 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 1981 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2068 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); 1982 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
2069 if (retval) 1983 if (retval)
2070 goto out_free_group_list; 1984 goto out_free_group_list;
2071 1985
2072 tsk = leader;
2073 i = 0; 1986 i = 0;
2074 /* 1987 /*
2075 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1988 * Prevent freeing of tasks while we take a snapshot. Tasks that are
@@ -2098,6 +2011,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2098 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2011 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2099 BUG_ON(retval != 0); 2012 BUG_ON(retval != 0);
2100 i++; 2013 i++;
2014
2015 if (!threadgroup)
2016 break;
2101 } while_each_thread(leader, tsk); 2017 } while_each_thread(leader, tsk);
2102 rcu_read_unlock(); 2018 rcu_read_unlock();
2103 /* remember the number of threads in the array for later. */ 2019 /* remember the number of threads in the array for later. */
@@ -2143,7 +2059,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2143 */ 2059 */
2144 for (i = 0; i < group_size; i++) { 2060 for (i = 0; i < group_size; i++) {
2145 tc = flex_array_get(group, i); 2061 tc = flex_array_get(group, i);
2146 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); 2062 cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
2147 } 2063 }
2148 /* nothing is sensitive to fork() after this point. */ 2064 /* nothing is sensitive to fork() after this point. */
2149 2065
@@ -2224,11 +2140,11 @@ retry_find_task:
2224 tsk = tsk->group_leader; 2140 tsk = tsk->group_leader;
2225 2141
2226 /* 2142 /*
2227 * Workqueue threads may acquire PF_THREAD_BOUND and become 2143 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2228 * trapped in a cpuset, or RT worker may be born in a cgroup 2144 * trapped in a cpuset, or RT worker may be born in a cgroup
2229 * with no rt_runtime allocated. Just say no. 2145 * with no rt_runtime allocated. Just say no.
2230 */ 2146 */
2231 if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { 2147 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2232 ret = -EINVAL; 2148 ret = -EINVAL;
2233 rcu_read_unlock(); 2149 rcu_read_unlock();
2234 goto out_unlock_cgroup; 2150 goto out_unlock_cgroup;
@@ -2251,17 +2167,42 @@ retry_find_task:
2251 put_task_struct(tsk); 2167 put_task_struct(tsk);
2252 goto retry_find_task; 2168 goto retry_find_task;
2253 } 2169 }
2254 ret = cgroup_attach_proc(cgrp, tsk); 2170 }
2255 } else 2171
2256 ret = cgroup_attach_task(cgrp, tsk); 2172 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2173
2257 threadgroup_unlock(tsk); 2174 threadgroup_unlock(tsk);
2258 2175
2259 put_task_struct(tsk); 2176 put_task_struct(tsk);
2260out_unlock_cgroup: 2177out_unlock_cgroup:
2261 cgroup_unlock(); 2178 mutex_unlock(&cgroup_mutex);
2262 return ret; 2179 return ret;
2263} 2180}
2264 2181
2182/**
2183 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2184 * @from: attach to all cgroups of a given task
2185 * @tsk: the task to be attached
2186 */
2187int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2188{
2189 struct cgroupfs_root *root;
2190 int retval = 0;
2191
2192 mutex_lock(&cgroup_mutex);
2193 for_each_active_root(root) {
2194 struct cgroup *from_cg = task_cgroup_from_root(from, root);
2195
2196 retval = cgroup_attach_task(from_cg, tsk, false);
2197 if (retval)
2198 break;
2199 }
2200 mutex_unlock(&cgroup_mutex);
2201
2202 return retval;
2203}
2204EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2205
2265static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2206static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2266{ 2207{
2267 return attach_task_by_pid(cgrp, pid, false); 2208 return attach_task_by_pid(cgrp, pid, false);
@@ -2272,24 +2213,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2272 return attach_task_by_pid(cgrp, tgid, true); 2213 return attach_task_by_pid(cgrp, tgid, true);
2273} 2214}
2274 2215
2275/**
2276 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
2277 * @cgrp: the cgroup to be checked for liveness
2278 *
2279 * On success, returns true; the lock should be later released with
2280 * cgroup_unlock(). On failure returns false with no lock held.
2281 */
2282bool cgroup_lock_live_group(struct cgroup *cgrp)
2283{
2284 mutex_lock(&cgroup_mutex);
2285 if (cgroup_is_removed(cgrp)) {
2286 mutex_unlock(&cgroup_mutex);
2287 return false;
2288 }
2289 return true;
2290}
2291EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
2292
2293static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2216static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2294 const char *buffer) 2217 const char *buffer)
2295{ 2218{
@@ -2301,7 +2224,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2301 mutex_lock(&cgroup_root_mutex); 2224 mutex_lock(&cgroup_root_mutex);
2302 strcpy(cgrp->root->release_agent_path, buffer); 2225 strcpy(cgrp->root->release_agent_path, buffer);
2303 mutex_unlock(&cgroup_root_mutex); 2226 mutex_unlock(&cgroup_root_mutex);
2304 cgroup_unlock(); 2227 mutex_unlock(&cgroup_mutex);
2305 return 0; 2228 return 0;
2306} 2229}
2307 2230
@@ -2312,7 +2235,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2312 return -ENODEV; 2235 return -ENODEV;
2313 seq_puts(seq, cgrp->root->release_agent_path); 2236 seq_puts(seq, cgrp->root->release_agent_path);
2314 seq_putc(seq, '\n'); 2237 seq_putc(seq, '\n');
2315 cgroup_unlock(); 2238 mutex_unlock(&cgroup_mutex);
2239 return 0;
2240}
2241
2242static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
2243 struct seq_file *seq)
2244{
2245 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2316 return 0; 2246 return 0;
2317} 2247}
2318 2248
@@ -2537,13 +2467,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
2537static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2467static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2538 struct inode *new_dir, struct dentry *new_dentry) 2468 struct inode *new_dir, struct dentry *new_dentry)
2539{ 2469{
2470 int ret;
2471 struct cgroup_name *name, *old_name;
2472 struct cgroup *cgrp;
2473
2474 /*
 2475	 * It's convenient to use the parent dir's i_mutex to protect
 2476	 * cgrp->name.
2477 */
2478 lockdep_assert_held(&old_dir->i_mutex);
2479
2540 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2480 if (!S_ISDIR(old_dentry->d_inode->i_mode))
2541 return -ENOTDIR; 2481 return -ENOTDIR;
2542 if (new_dentry->d_inode) 2482 if (new_dentry->d_inode)
2543 return -EEXIST; 2483 return -EEXIST;
2544 if (old_dir != new_dir) 2484 if (old_dir != new_dir)
2545 return -EIO; 2485 return -EIO;
2546 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2486
2487 cgrp = __d_cgrp(old_dentry);
2488
2489 name = cgroup_alloc_name(new_dentry);
2490 if (!name)
2491 return -ENOMEM;
2492
2493 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2494 if (ret) {
2495 kfree(name);
2496 return ret;
2497 }
2498
2499 old_name = cgrp->name;
2500 rcu_assign_pointer(cgrp->name, name);
2501
2502 kfree_rcu(old_name, rcu_head);
2503 return 0;
2547} 2504}
2548 2505
2549static struct simple_xattrs *__d_xattrs(struct dentry *dentry) 2506static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
@@ -2551,13 +2508,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2551 if (S_ISDIR(dentry->d_inode->i_mode)) 2508 if (S_ISDIR(dentry->d_inode->i_mode))
2552 return &__d_cgrp(dentry)->xattrs; 2509 return &__d_cgrp(dentry)->xattrs;
2553 else 2510 else
2554 return &__d_cft(dentry)->xattrs; 2511 return &__d_cfe(dentry)->xattrs;
2555} 2512}
2556 2513
2557static inline int xattr_enabled(struct dentry *dentry) 2514static inline int xattr_enabled(struct dentry *dentry)
2558{ 2515{
2559 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 2516 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2560 return test_bit(ROOT_XATTR, &root->flags); 2517 return root->flags & CGRP_ROOT_XATTR;
2561} 2518}
2562 2519
2563static bool is_valid_xattr(const char *name) 2520static bool is_valid_xattr(const char *name)
@@ -2727,9 +2684,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2727 umode_t mode; 2684 umode_t mode;
2728 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2685 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2729 2686
2730 simple_xattrs_init(&cft->xattrs); 2687 if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2731
2732 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2733 strcpy(name, subsys->name); 2688 strcpy(name, subsys->name);
2734 strcat(name, "."); 2689 strcat(name, ".");
2735 } 2690 }
@@ -2747,12 +2702,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 goto out; 2702 goto out;
2748 } 2703 }
2749 2704
2705 cfe->type = (void *)cft;
2706 cfe->dentry = dentry;
2707 dentry->d_fsdata = cfe;
2708 simple_xattrs_init(&cfe->xattrs);
2709
2750 mode = cgroup_file_mode(cft); 2710 mode = cgroup_file_mode(cft);
2751 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); 2711 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2752 if (!error) { 2712 if (!error) {
2753 cfe->type = (void *)cft;
2754 cfe->dentry = dentry;
2755 dentry->d_fsdata = cfe;
2756 list_add_tail(&cfe->node, &parent->files); 2713 list_add_tail(&cfe->node, &parent->files);
2757 cfe = NULL; 2714 cfe = NULL;
2758 } 2715 }
@@ -2770,6 +2727,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2770 2727
2771 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2728 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2772 /* does cft->flags tell us to skip this file on @cgrp? */ 2729 /* does cft->flags tell us to skip this file on @cgrp? */
2730 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2731 continue;
2773 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2732 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2774 continue; 2733 continue;
2775 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2734 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
@@ -2998,11 +2957,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2998 WARN_ON_ONCE(!rcu_read_lock_held()); 2957 WARN_ON_ONCE(!rcu_read_lock_held());
2999 2958
3000 /* if first iteration, pretend we just visited @cgroup */ 2959 /* if first iteration, pretend we just visited @cgroup */
3001 if (!pos) { 2960 if (!pos)
3002 if (list_empty(&cgroup->children))
3003 return NULL;
3004 pos = cgroup; 2961 pos = cgroup;
3005 }
3006 2962
3007 /* visit the first child if exists */ 2963 /* visit the first child if exists */
3008 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 2964 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
@@ -3010,14 +2966,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
3010 return next; 2966 return next;
3011 2967
3012 /* no child, visit my or the closest ancestor's next sibling */ 2968 /* no child, visit my or the closest ancestor's next sibling */
3013 do { 2969 while (pos != cgroup) {
3014 next = list_entry_rcu(pos->sibling.next, struct cgroup, 2970 next = list_entry_rcu(pos->sibling.next, struct cgroup,
3015 sibling); 2971 sibling);
3016 if (&next->sibling != &pos->parent->children) 2972 if (&next->sibling != &pos->parent->children)
3017 return next; 2973 return next;
3018 2974
3019 pos = pos->parent; 2975 pos = pos->parent;
3020 } while (pos != cgroup); 2976 }
3021 2977
3022 return NULL; 2978 return NULL;
3023} 2979}
@@ -3300,6 +3256,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3300 return 0; 3256 return 0;
3301} 3257}
3302 3258
3259static void cgroup_transfer_one_task(struct task_struct *task,
3260 struct cgroup_scanner *scan)
3261{
3262 struct cgroup *new_cgroup = scan->data;
3263
3264 mutex_lock(&cgroup_mutex);
3265 cgroup_attach_task(new_cgroup, task, false);
3266 mutex_unlock(&cgroup_mutex);
3267}
3268
3269/**
 3270 * cgroup_transfer_tasks - move tasks from one cgroup to another
3271 * @to: cgroup to which the tasks will be moved
3272 * @from: cgroup in which the tasks currently reside
3273 */
3274int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3275{
3276 struct cgroup_scanner scan;
3277
3278 scan.cg = from;
3279 scan.test_task = NULL; /* select all tasks in cgroup */
3280 scan.process_task = cgroup_transfer_one_task;
3281 scan.heap = NULL;
3282 scan.data = to;
3283
3284 return cgroup_scan_tasks(&scan);
3285}
3286
3303/* 3287/*
3304 * Stuff for reading the 'tasks'/'procs' files. 3288 * Stuff for reading the 'tasks'/'procs' files.
3305 * 3289 *
@@ -3362,35 +3346,14 @@ static void pidlist_free(void *p)
3362 else 3346 else
3363 kfree(p); 3347 kfree(p);
3364} 3348}
3365static void *pidlist_resize(void *p, int newcount)
3366{
3367 void *newlist;
3368 /* note: if new alloc fails, old p will still be valid either way */
3369 if (is_vmalloc_addr(p)) {
3370 newlist = vmalloc(newcount * sizeof(pid_t));
3371 if (!newlist)
3372 return NULL;
3373 memcpy(newlist, p, newcount * sizeof(pid_t));
3374 vfree(p);
3375 } else {
3376 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
3377 }
3378 return newlist;
3379}
3380 3349
3381/* 3350/*
3382 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3351 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3383 * If the new stripped list is sufficiently smaller and there's enough memory 3352 * Returns the number of unique elements.
3384 * to allocate a new buffer, will let go of the unneeded memory. Returns the
3385 * number of unique elements.
3386 */ 3353 */
3387/* is the size difference enough that we should re-allocate the array? */ 3354static int pidlist_uniq(pid_t *list, int length)
3388#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
3389static int pidlist_uniq(pid_t **p, int length)
3390{ 3355{
3391 int src, dest = 1; 3356 int src, dest = 1;
3392 pid_t *list = *p;
3393 pid_t *newlist;
3394 3357
3395 /* 3358 /*
3396 * we presume the 0th element is unique, so i starts at 1. trivial 3359 * we presume the 0th element is unique, so i starts at 1. trivial
@@ -3411,16 +3374,6 @@ static int pidlist_uniq(pid_t **p, int length)
3411 dest++; 3374 dest++;
3412 } 3375 }
3413after: 3376after:
3414 /*
3415 * if the length difference is large enough, we want to allocate a
3416 * smaller buffer to save memory. if this fails due to out of memory,
3417 * we'll just stay with what we've got.
3418 */
3419 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
3420 newlist = pidlist_resize(list, dest);
3421 if (newlist)
3422 *p = newlist;
3423 }
3424 return dest; 3377 return dest;
3425} 3378}
3426 3379
@@ -3516,7 +3469,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3516 /* now sort & (if procs) strip out duplicates */ 3469 /* now sort & (if procs) strip out duplicates */
3517 sort(array, length, sizeof(pid_t), cmppid, NULL); 3470 sort(array, length, sizeof(pid_t), cmppid, NULL);
3518 if (type == CGROUP_FILE_PROCS) 3471 if (type == CGROUP_FILE_PROCS)
3519 length = pidlist_uniq(&array, length); 3472 length = pidlist_uniq(array, length);
3520 l = cgroup_pidlist_find(cgrp, type); 3473 l = cgroup_pidlist_find(cgrp, type);
3521 if (!l) { 3474 if (!l) {
3522 pidlist_free(array); 3475 pidlist_free(array);
@@ -3930,11 +3883,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3930 if (ret) 3883 if (ret)
3931 goto fail; 3884 goto fail;
3932 3885
3933 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { 3886 efile->f_op->poll(efile, &event->pt);
3934 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3935 ret = 0;
3936 goto fail;
3937 }
3938 3887
3939 /* 3888 /*
3940 * Events should be removed after rmdir of cgroup directory, but before 3889 * Events should be removed after rmdir of cgroup directory, but before
@@ -4016,10 +3965,16 @@ static struct cftype files[] = {
4016 }, 3965 },
4017 { 3966 {
4018 .name = "cgroup.clone_children", 3967 .name = "cgroup.clone_children",
3968 .flags = CFTYPE_INSANE,
4019 .read_u64 = cgroup_clone_children_read, 3969 .read_u64 = cgroup_clone_children_read,
4020 .write_u64 = cgroup_clone_children_write, 3970 .write_u64 = cgroup_clone_children_write,
4021 }, 3971 },
4022 { 3972 {
3973 .name = "cgroup.sane_behavior",
3974 .flags = CFTYPE_ONLY_ON_ROOT,
3975 .read_seq_string = cgroup_sane_behavior_show,
3976 },
3977 {
4023 .name = "release_agent", 3978 .name = "release_agent",
4024 .flags = CFTYPE_ONLY_ON_ROOT, 3979 .flags = CFTYPE_ONLY_ON_ROOT,
4025 .read_seq_string = cgroup_release_agent_show, 3980 .read_seq_string = cgroup_release_agent_show,
@@ -4131,17 +4086,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4131 if (!(css->flags & CSS_ONLINE)) 4086 if (!(css->flags & CSS_ONLINE))
4132 return; 4087 return;
4133 4088
4134 /* 4089 if (ss->css_offline)
4135 * css_offline() should be called with cgroup_mutex unlocked. See
4136 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4137 * details. This temporary unlocking should go away once
4138 * cgroup_mutex is unexported from controllers.
4139 */
4140 if (ss->css_offline) {
4141 mutex_unlock(&cgroup_mutex);
4142 ss->css_offline(cgrp); 4090 ss->css_offline(cgrp);
4143 mutex_lock(&cgroup_mutex);
4144 }
4145 4091
4146 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4092 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4147} 4093}
@@ -4158,6 +4104,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4158 umode_t mode) 4104 umode_t mode)
4159{ 4105{
4160 struct cgroup *cgrp; 4106 struct cgroup *cgrp;
4107 struct cgroup_name *name;
4161 struct cgroupfs_root *root = parent->root; 4108 struct cgroupfs_root *root = parent->root;
4162 int err = 0; 4109 int err = 0;
4163 struct cgroup_subsys *ss; 4110 struct cgroup_subsys *ss;
@@ -4168,9 +4115,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4168 if (!cgrp) 4115 if (!cgrp)
4169 return -ENOMEM; 4116 return -ENOMEM;
4170 4117
4118 name = cgroup_alloc_name(dentry);
4119 if (!name)
4120 goto err_free_cgrp;
4121 rcu_assign_pointer(cgrp->name, name);
4122
4171 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4123 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4172 if (cgrp->id < 0) 4124 if (cgrp->id < 0)
4173 goto err_free_cgrp; 4125 goto err_free_name;
4174 4126
4175 /* 4127 /*
4176 * Only live parents can have children. Note that the liveliness 4128 * Only live parents can have children. Note that the liveliness
@@ -4198,7 +4150,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4198 4150
4199 cgrp->parent = parent; 4151 cgrp->parent = parent;
4200 cgrp->root = parent->root; 4152 cgrp->root = parent->root;
4201 cgrp->top_cgroup = parent->top_cgroup;
4202 4153
4203 if (notify_on_release(parent)) 4154 if (notify_on_release(parent))
4204 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4155 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4241,6 +4192,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4241 for_each_subsys(root, ss) 4192 for_each_subsys(root, ss)
4242 dget(dentry); 4193 dget(dentry);
4243 4194
4195 /* hold a ref to the parent's dentry */
4196 dget(parent->dentry);
4197
4244 /* creation succeeded, notify subsystems */ 4198 /* creation succeeded, notify subsystems */
4245 for_each_subsys(root, ss) { 4199 for_each_subsys(root, ss) {
4246 err = online_css(ss, cgrp); 4200 err = online_css(ss, cgrp);
@@ -4276,6 +4230,8 @@ err_free_all:
4276 deactivate_super(sb); 4230 deactivate_super(sb);
4277err_free_id: 4231err_free_id:
4278 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4232 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4233err_free_name:
4234 kfree(rcu_dereference_raw(cgrp->name));
4279err_free_cgrp: 4235err_free_cgrp:
4280 kfree(cgrp); 4236 kfree(cgrp);
4281 return err; 4237 return err;
@@ -4295,56 +4251,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4295 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4251 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4296} 4252}
4297 4253
4298/*
4299 * Check the reference count on each subsystem. Since we already
4300 * established that there are no tasks in the cgroup, if the css refcount
4301 * is also 1, then there should be no outstanding references, so the
4302 * subsystem is safe to destroy. We scan across all subsystems rather than
4303 * using the per-hierarchy linked list of mounted subsystems since we can
4304 * be called via check_for_release() with no synchronization other than
4305 * RCU, and the subsystem linked list isn't RCU-safe.
4306 */
4307static int cgroup_has_css_refs(struct cgroup *cgrp)
4308{
4309 int i;
4310
4311 /*
4312 * We won't need to lock the subsys array, because the subsystems
4313 * we're concerned about aren't going anywhere since our cgroup root
4314 * has a reference on them.
4315 */
4316 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4317 struct cgroup_subsys *ss = subsys[i];
4318 struct cgroup_subsys_state *css;
4319
4320 /* Skip subsystems not present or not in this hierarchy */
4321 if (ss == NULL || ss->root != cgrp->root)
4322 continue;
4323
4324 css = cgrp->subsys[ss->subsys_id];
4325 /*
4326 * When called from check_for_release() it's possible
4327 * that by this point the cgroup has been removed
4328 * and the css deleted. But a false-positive doesn't
4329 * matter, since it can only happen if the cgroup
4330 * has been deleted and hence no longer needs the
4331 * release agent to be called anyway.
4332 */
4333 if (css && css_refcnt(css) > 1)
4334 return 1;
4335 }
4336 return 0;
4337}
4338
4339static int cgroup_destroy_locked(struct cgroup *cgrp) 4254static int cgroup_destroy_locked(struct cgroup *cgrp)
4340 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4255 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4341{ 4256{
4342 struct dentry *d = cgrp->dentry; 4257 struct dentry *d = cgrp->dentry;
4343 struct cgroup *parent = cgrp->parent; 4258 struct cgroup *parent = cgrp->parent;
4344 DEFINE_WAIT(wait);
4345 struct cgroup_event *event, *tmp; 4259 struct cgroup_event *event, *tmp;
4346 struct cgroup_subsys *ss; 4260 struct cgroup_subsys *ss;
4347 LIST_HEAD(tmp_list);
4348 4261
4349 lockdep_assert_held(&d->d_inode->i_mutex); 4262 lockdep_assert_held(&d->d_inode->i_mutex);
4350 lockdep_assert_held(&cgroup_mutex); 4263 lockdep_assert_held(&cgroup_mutex);
@@ -4468,7 +4381,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4468 * need to invoke fork callbacks here. */ 4381 * need to invoke fork callbacks here. */
4469 BUG_ON(!list_empty(&init_task.tasks)); 4382 BUG_ON(!list_empty(&init_task.tasks));
4470 4383
4471 ss->active = 1;
4472 BUG_ON(online_css(ss, dummytop)); 4384 BUG_ON(online_css(ss, dummytop));
4473 4385
4474 mutex_unlock(&cgroup_mutex); 4386 mutex_unlock(&cgroup_mutex);
@@ -4573,7 +4485,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4573 } 4485 }
4574 write_unlock(&css_set_lock); 4486 write_unlock(&css_set_lock);
4575 4487
4576 ss->active = 1;
4577 ret = online_css(ss, dummytop); 4488 ret = online_css(ss, dummytop);
4578 if (ret) 4489 if (ret)
4579 goto err_unload; 4490 goto err_unload;
@@ -4614,7 +4525,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4614 mutex_lock(&cgroup_mutex); 4525 mutex_lock(&cgroup_mutex);
4615 4526
4616 offline_css(ss, dummytop); 4527 offline_css(ss, dummytop);
4617 ss->active = 0;
4618 4528
4619 if (ss->use_id) 4529 if (ss->use_id)
4620 idr_destroy(&ss->idr); 4530 idr_destroy(&ss->idr);
@@ -4769,7 +4679,7 @@ out:
4769 */ 4679 */
4770 4680
4771/* TODO: Use a proper seq_file iterator */ 4681/* TODO: Use a proper seq_file iterator */
4772static int proc_cgroup_show(struct seq_file *m, void *v) 4682int proc_cgroup_show(struct seq_file *m, void *v)
4773{ 4683{
4774 struct pid *pid; 4684 struct pid *pid;
4775 struct task_struct *tsk; 4685 struct task_struct *tsk;
@@ -4821,19 +4731,6 @@ out:
4821 return retval; 4731 return retval;
4822} 4732}
4823 4733
4824static int cgroup_open(struct inode *inode, struct file *file)
4825{
4826 struct pid *pid = PROC_I(inode)->pid;
4827 return single_open(file, proc_cgroup_show, pid);
4828}
4829
4830const struct file_operations proc_cgroup_operations = {
4831 .open = cgroup_open,
4832 .read = seq_read,
4833 .llseek = seq_lseek,
4834 .release = single_release,
4835};
4836
4837/* Display information about each subsystem and each hierarchy */ 4734/* Display information about each subsystem and each hierarchy */
4838static int proc_cgroupstats_show(struct seq_file *m, void *v) 4735static int proc_cgroupstats_show(struct seq_file *m, void *v)
4839{ 4736{
@@ -4935,17 +4832,17 @@ void cgroup_post_fork(struct task_struct *child)
4935 * and addition to css_set. 4832 * and addition to css_set.
4936 */ 4833 */
4937 if (need_forkexit_callback) { 4834 if (need_forkexit_callback) {
4938 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4835 /*
4836 * fork/exit callbacks are supported only for builtin
4837 * subsystems, and the builtin section of the subsys
4838 * array is immutable, so we don't need to lock the
4839 * subsys array here. On the other hand, modular section
4840 * of the array can be freed at module unload, so we
4841 * can't touch that.
4842 */
4843 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4939 struct cgroup_subsys *ss = subsys[i]; 4844 struct cgroup_subsys *ss = subsys[i];
4940 4845
4941 /*
4942 * fork/exit callbacks are supported only for
4943 * builtin subsystems and we don't need further
4944 * synchronization as they never go away.
4945 */
4946 if (!ss || ss->module)
4947 continue;
4948
4949 if (ss->fork) 4846 if (ss->fork)
4950 ss->fork(child); 4847 ss->fork(child);
4951 } 4848 }
@@ -5010,13 +4907,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5010 tsk->cgroups = &init_css_set; 4907 tsk->cgroups = &init_css_set;
5011 4908
5012 if (run_callbacks && need_forkexit_callback) { 4909 if (run_callbacks && need_forkexit_callback) {
5013 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4910 /*
4911 * fork/exit callbacks are supported only for builtin
4912 * subsystems, see cgroup_post_fork() for details.
4913 */
4914 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
5014 struct cgroup_subsys *ss = subsys[i]; 4915 struct cgroup_subsys *ss = subsys[i];
5015 4916
5016 /* modular subsystems can't use callbacks */
5017 if (!ss || ss->module)
5018 continue;
5019
5020 if (ss->exit) { 4917 if (ss->exit) {
5021 struct cgroup *old_cgrp = 4918 struct cgroup *old_cgrp =
5022 rcu_dereference_raw(cg->subsys[i])->cgroup; 4919 rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5030,44 +4927,19 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5030 put_css_set_taskexit(cg); 4927 put_css_set_taskexit(cg);
5031} 4928}
5032 4929
5033/**
5034 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
5035 * @cgrp: the cgroup in question
5036 * @task: the task in question
5037 *
5038 * See if @cgrp is a descendant of @task's cgroup in the appropriate
5039 * hierarchy.
5040 *
5041 * If we are sending in dummytop, then presumably we are creating
5042 * the top cgroup in the subsystem.
5043 *
5044 * Called only by the ns (nsproxy) cgroup.
5045 */
5046int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
5047{
5048 int ret;
5049 struct cgroup *target;
5050
5051 if (cgrp == dummytop)
5052 return 1;
5053
5054 target = task_cgroup_from_root(task, cgrp->root);
5055 while (cgrp != target && cgrp!= cgrp->top_cgroup)
5056 cgrp = cgrp->parent;
5057 ret = (cgrp == target);
5058 return ret;
5059}
5060
5061static void check_for_release(struct cgroup *cgrp) 4930static void check_for_release(struct cgroup *cgrp)
5062{ 4931{
5063 /* All of these checks rely on RCU to keep the cgroup 4932 /* All of these checks rely on RCU to keep the cgroup
5064 * structure alive */ 4933 * structure alive */
5065 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) 4934 if (cgroup_is_releasable(cgrp) &&
5066 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { 4935 !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
5067 /* Control Group is currently removeable. If it's not 4936 /*
4937 * Control Group is currently removeable. If it's not
5068 * already queued for a userspace notification, queue 4938 * already queued for a userspace notification, queue
5069 * it now */ 4939 * it now
4940 */
5070 int need_schedule_work = 0; 4941 int need_schedule_work = 0;
4942
5071 raw_spin_lock(&release_list_lock); 4943 raw_spin_lock(&release_list_lock);
5072 if (!cgroup_is_removed(cgrp) && 4944 if (!cgroup_is_removed(cgrp) &&
5073 list_empty(&cgrp->release_list)) { 4945 list_empty(&cgrp->release_list)) {
@@ -5100,24 +4972,11 @@ EXPORT_SYMBOL_GPL(__css_tryget);
5100/* Caller must verify that the css is not for root cgroup */ 4972/* Caller must verify that the css is not for root cgroup */
5101void __css_put(struct cgroup_subsys_state *css) 4973void __css_put(struct cgroup_subsys_state *css)
5102{ 4974{
5103 struct cgroup *cgrp = css->cgroup;
5104 int v; 4975 int v;
5105 4976
5106 rcu_read_lock();
5107 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); 4977 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
5108 4978 if (v == 0)
5109 switch (v) {
5110 case 1:
5111 if (notify_on_release(cgrp)) {
5112 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5113 check_for_release(cgrp);
5114 }
5115 break;
5116 case 0:
5117 schedule_work(&css->dput_work); 4979 schedule_work(&css->dput_work);
5118 break;
5119 }
5120 rcu_read_unlock();
5121} 4980}
5122EXPORT_SYMBOL_GPL(__css_put); 4981EXPORT_SYMBOL_GPL(__css_put);
5123 4982
@@ -5416,55 +5275,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5416} 5275}
5417EXPORT_SYMBOL_GPL(css_lookup); 5276EXPORT_SYMBOL_GPL(css_lookup);
5418 5277
5419/**
5420 * css_get_next - lookup next cgroup under specified hierarchy.
5421 * @ss: pointer to subsystem
5422 * @id: current position of iteration.
5423 * @root: pointer to css. search tree under this.
5424 * @foundid: position of found object.
5425 *
5426 * Search next css under the specified hierarchy of rootid. Calling under
5427 * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
5428 */
5429struct cgroup_subsys_state *
5430css_get_next(struct cgroup_subsys *ss, int id,
5431 struct cgroup_subsys_state *root, int *foundid)
5432{
5433 struct cgroup_subsys_state *ret = NULL;
5434 struct css_id *tmp;
5435 int tmpid;
5436 int rootid = css_id(root);
5437 int depth = css_depth(root);
5438
5439 if (!rootid)
5440 return NULL;
5441
5442 BUG_ON(!ss->use_id);
5443 WARN_ON_ONCE(!rcu_read_lock_held());
5444
5445 /* fill start point for scan */
5446 tmpid = id;
5447 while (1) {
5448 /*
5449 * scan next entry from bitmap(tree), tmpid is updated after
5450 * idr_get_next().
5451 */
5452 tmp = idr_get_next(&ss->idr, &tmpid);
5453 if (!tmp)
5454 break;
5455 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
5456 ret = rcu_dereference(tmp->css);
5457 if (ret) {
5458 *foundid = tmpid;
5459 break;
5460 }
5461 }
5462 /* continue to scan from next id */
5463 tmpid = tmpid + 1;
5464 }
5465 return ret;
5466}
5467
5468/* 5278/*
5469 * get corresponding css from file open on cgroupfs directory 5279 * get corresponding css from file open on cgroupfs directory
5470 */ 5280 */
diff --git a/kernel/compat.c b/kernel/compat.c
index 19971d8c7299..0a09e481b70b 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -516,25 +516,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
516 return 0; 516 return 0;
517} 517}
518 518
519asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
520{
521 struct rusage r;
522 int ret;
523 mm_segment_t old_fs = get_fs();
524
525 set_fs(KERNEL_DS);
526 ret = sys_getrusage(who, (struct rusage __user *) &r);
527 set_fs(old_fs);
528
529 if (ret)
530 return ret;
531
532 if (put_compat_rusage(&r, ru))
533 return -EFAULT;
534
535 return 0;
536}
537
538COMPAT_SYSCALL_DEFINE4(wait4, 519COMPAT_SYSCALL_DEFINE4(wait4,
539 compat_pid_t, pid, 520 compat_pid_t, pid,
540 compat_uint_t __user *, stat_addr, 521 compat_uint_t __user *, stat_addr,
@@ -1138,71 +1119,6 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
1138} 1119}
1139#endif 1120#endif
1140 1121
1141struct compat_sysinfo {
1142 s32 uptime;
1143 u32 loads[3];
1144 u32 totalram;
1145 u32 freeram;
1146 u32 sharedram;
1147 u32 bufferram;
1148 u32 totalswap;
1149 u32 freeswap;
1150 u16 procs;
1151 u16 pad;
1152 u32 totalhigh;
1153 u32 freehigh;
1154 u32 mem_unit;
1155 char _f[20-2*sizeof(u32)-sizeof(int)];
1156};
1157
1158asmlinkage long
1159compat_sys_sysinfo(struct compat_sysinfo __user *info)
1160{
1161 struct sysinfo s;
1162
1163 do_sysinfo(&s);
1164
1165 /* Check to see if any memory value is too large for 32-bit and scale
1166 * down if needed
1167 */
1168 if ((s.totalram >> 32) || (s.totalswap >> 32)) {
1169 int bitcount = 0;
1170
1171 while (s.mem_unit < PAGE_SIZE) {
1172 s.mem_unit <<= 1;
1173 bitcount++;
1174 }
1175
1176 s.totalram >>= bitcount;
1177 s.freeram >>= bitcount;
1178 s.sharedram >>= bitcount;
1179 s.bufferram >>= bitcount;
1180 s.totalswap >>= bitcount;
1181 s.freeswap >>= bitcount;
1182 s.totalhigh >>= bitcount;
1183 s.freehigh >>= bitcount;
1184 }
1185
1186 if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
1187 __put_user (s.uptime, &info->uptime) ||
1188 __put_user (s.loads[0], &info->loads[0]) ||
1189 __put_user (s.loads[1], &info->loads[1]) ||
1190 __put_user (s.loads[2], &info->loads[2]) ||
1191 __put_user (s.totalram, &info->totalram) ||
1192 __put_user (s.freeram, &info->freeram) ||
1193 __put_user (s.sharedram, &info->sharedram) ||
1194 __put_user (s.bufferram, &info->bufferram) ||
1195 __put_user (s.totalswap, &info->totalswap) ||
1196 __put_user (s.freeswap, &info->freeswap) ||
1197 __put_user (s.procs, &info->procs) ||
1198 __put_user (s.totalhigh, &info->totalhigh) ||
1199 __put_user (s.freehigh, &info->freehigh) ||
1200 __put_user (s.mem_unit, &info->mem_unit))
1201 return -EFAULT;
1202
1203 return 0;
1204}
1205
1206COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, 1122COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
1207 compat_pid_t, pid, 1123 compat_pid_t, pid,
1208 struct compat_timespec __user *, interval) 1124 struct compat_timespec __user *, interval)
diff --git a/kernel/configs.c b/kernel/configs.c
index 42e8fa075eed..c18b1f1ae515 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,7 +79,7 @@ static int __init ikconfig_init(void)
79 if (!entry) 79 if (!entry)
80 return -ENOMEM; 80 return -ENOMEM;
81 81
82 entry->size = kernel_config_data_size; 82 proc_set_size(entry, kernel_config_data_size);
83 83
84 return 0; 84 return 0;
85} 85}
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
new file mode 100644
index 000000000000..59ab052ef7a0
--- /dev/null
+++ b/kernel/cpu/Makefile
@@ -0,0 +1 @@
obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
new file mode 100644
index 000000000000..d5585f5e038e
--- /dev/null
+++ b/kernel/cpu/idle.c
@@ -0,0 +1,118 @@
1/*
2 * Generic entry point for the idle threads
3 */
4#include <linux/sched.h>
5#include <linux/cpu.h>
6#include <linux/tick.h>
7#include <linux/mm.h>
8
9#include <asm/tlb.h>
10
11#include <trace/events/power.h>
12
13static int __read_mostly cpu_idle_force_poll;
14
15void cpu_idle_poll_ctrl(bool enable)
16{
17 if (enable) {
18 cpu_idle_force_poll++;
19 } else {
20 cpu_idle_force_poll--;
21 WARN_ON_ONCE(cpu_idle_force_poll < 0);
22 }
23}
24
25#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
26static int __init cpu_idle_poll_setup(char *__unused)
27{
28 cpu_idle_force_poll = 1;
29 return 1;
30}
31__setup("nohlt", cpu_idle_poll_setup);
32
33static int __init cpu_idle_nopoll_setup(char *__unused)
34{
35 cpu_idle_force_poll = 0;
36 return 1;
37}
38__setup("hlt", cpu_idle_nopoll_setup);
39#endif
40
41static inline int cpu_idle_poll(void)
42{
43 rcu_idle_enter();
44 trace_cpu_idle_rcuidle(0, smp_processor_id());
45 local_irq_enable();
46 while (!need_resched())
47 cpu_relax();
48 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
49 rcu_idle_exit();
50 return 1;
51}
52
53/* Weak implementations for optional arch specific functions */
54void __weak arch_cpu_idle_prepare(void) { }
55void __weak arch_cpu_idle_enter(void) { }
56void __weak arch_cpu_idle_exit(void) { }
57void __weak arch_cpu_idle_dead(void) { }
58void __weak arch_cpu_idle(void)
59{
60 cpu_idle_force_poll = 1;
61}
62
63/*
64 * Generic idle loop implementation
65 */
66static void cpu_idle_loop(void)
67{
68 while (1) {
69 tick_nohz_idle_enter();
70
71 while (!need_resched()) {
72 check_pgt_cache();
73 rmb();
74
75 if (cpu_is_offline(smp_processor_id()))
76 arch_cpu_idle_dead();
77
78 local_irq_disable();
79 arch_cpu_idle_enter();
80
81 /*
82 * In poll mode we reenable interrupts and spin.
83 *
84 * Also if we detected in the wakeup from idle
85 * path that the tick broadcast device expired
86 * for us, we don't want to go deep idle as we
87 * know that the IPI is going to arrive right
88 * away
89 */
90 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
91 cpu_idle_poll();
92 } else {
93 current_clr_polling();
94 if (!need_resched()) {
95 stop_critical_timings();
96 rcu_idle_enter();
97 arch_cpu_idle();
98 WARN_ON_ONCE(irqs_disabled());
99 rcu_idle_exit();
100 start_critical_timings();
101 } else {
102 local_irq_enable();
103 }
104 current_set_polling();
105 }
106 arch_cpu_idle_exit();
107 }
108 tick_nohz_idle_exit();
109 schedule_preempt_disabled();
110 }
111}
112
113void cpu_startup_entry(enum cpuhp_state state)
114{
115 current_set_polling();
116 arch_cpu_idle_prepare();
117 cpu_idle_loop();
118}
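
The new kernel/cpu/idle.c above builds the generic idle loop around a set of __weak arch_cpu_idle*() hooks plus cpu_idle_poll_ctrl(). A minimal sketch of how an architecture might override the main hook, assuming only the contract visible in this file (the halt placeholder is illustrative and implies no particular architecture):

#include <linux/cpu.h>
#include <linux/irqflags.h>

/* The generic loop calls this with interrupts disabled and warns if they
 * are still disabled on return, so the override re-enables them around
 * its low-power wait. */
void arch_cpu_idle(void)
{
	local_irq_enable();
	/* An architecture-specific halt (hlt/wfi/...) would go here; */
	/* without one, cpu_idle_poll_ctrl(true) can force the polling path. */
}

Boot code then enters the loop once per CPU via cpu_startup_entry(), as the file's final function shows.
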
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4f9dfe43ecbd..64b3f791bbe5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -265,17 +265,6 @@ static DEFINE_MUTEX(cpuset_mutex);
265static DEFINE_MUTEX(callback_mutex); 265static DEFINE_MUTEX(callback_mutex);
266 266
267/* 267/*
268 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
269 * buffers. They are statically allocated to prevent using excess stack
270 * when calling cpuset_print_task_mems_allowed().
271 */
272#define CPUSET_NAME_LEN (128)
273#define CPUSET_NODELIST_LEN (256)
274static char cpuset_name[CPUSET_NAME_LEN];
275static char cpuset_nodelist[CPUSET_NODELIST_LEN];
276static DEFINE_SPINLOCK(cpuset_buffer_lock);
277
278/*
279 * CPU / memory hotplug is handled asynchronously. 268 * CPU / memory hotplug is handled asynchronously.
280 */ 269 */
281static struct workqueue_struct *cpuset_propagate_hotplug_wq; 270static struct workqueue_struct *cpuset_propagate_hotplug_wq;
@@ -780,25 +769,26 @@ static void rebuild_sched_domains_locked(void)
780 lockdep_assert_held(&cpuset_mutex); 769 lockdep_assert_held(&cpuset_mutex);
781 get_online_cpus(); 770 get_online_cpus();
782 771
772 /*
773 * We have raced with CPU hotplug. Don't do anything to avoid
774 * passing doms with offlined cpu to partition_sched_domains().
775 * Anyways, hotplug work item will rebuild sched domains.
776 */
777 if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
778 goto out;
779
783 /* Generate domain masks and attrs */ 780 /* Generate domain masks and attrs */
784 ndoms = generate_sched_domains(&doms, &attr); 781 ndoms = generate_sched_domains(&doms, &attr);
785 782
786 /* Have scheduler rebuild the domains */ 783 /* Have scheduler rebuild the domains */
787 partition_sched_domains(ndoms, doms, attr); 784 partition_sched_domains(ndoms, doms, attr);
788 785out:
789 put_online_cpus(); 786 put_online_cpus();
790} 787}
791#else /* !CONFIG_SMP */ 788#else /* !CONFIG_SMP */
792static void rebuild_sched_domains_locked(void) 789static void rebuild_sched_domains_locked(void)
793{ 790{
794} 791}
795
796static int generate_sched_domains(cpumask_var_t **domains,
797 struct sched_domain_attr **attributes)
798{
799 *domains = NULL;
800 return 1;
801}
802#endif /* CONFIG_SMP */ 792#endif /* CONFIG_SMP */
803 793
804void rebuild_sched_domains(void) 794void rebuild_sched_domains(void)
@@ -1388,16 +1378,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1388 1378
1389 cgroup_taskset_for_each(task, cgrp, tset) { 1379 cgroup_taskset_for_each(task, cgrp, tset) {
1390 /* 1380 /*
1391 * Kthreads bound to specific cpus cannot be moved to a new 1381 * Kthreads which disallow setaffinity shouldn't be moved
1392 * cpuset; we cannot change their cpu affinity and 1382 * to a new cpuset; we don't want to change their cpu
1393 * isolating such threads by their set of allowed nodes is 1383 * affinity and isolating such threads by their set of
1394 * unnecessary. Thus, cpusets are not applicable for such 1384 * allowed nodes is unnecessary. Thus, cpusets are not
1395 * threads. This prevents checking for success of 1385 * applicable for such threads. This prevents checking for
1396 * set_cpus_allowed_ptr() on all attached tasks before 1386 * success of set_cpus_allowed_ptr() on all attached tasks
1397 * cpus_allowed may be changed. 1387 * before cpus_allowed may be changed.
1398 */ 1388 */
1399 ret = -EINVAL; 1389 ret = -EINVAL;
1400 if (task->flags & PF_THREAD_BOUND) 1390 if (task->flags & PF_NO_SETAFFINITY)
1401 goto out_unlock; 1391 goto out_unlock;
1402 ret = security_task_setscheduler(task); 1392 ret = security_task_setscheduler(task);
1403 if (ret) 1393 if (ret)
@@ -2005,50 +1995,6 @@ int __init cpuset_init(void)
2005 return 0; 1995 return 0;
2006} 1996}
2007 1997
2008/**
2009 * cpuset_do_move_task - move a given task to another cpuset
2010 * @tsk: pointer to task_struct the task to move
2011 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
2012 *
2013 * Called by cgroup_scan_tasks() for each task in a cgroup.
2014 * Return nonzero to stop the walk through the tasks.
2015 */
2016static void cpuset_do_move_task(struct task_struct *tsk,
2017 struct cgroup_scanner *scan)
2018{
2019 struct cgroup *new_cgroup = scan->data;
2020
2021 cgroup_lock();
2022 cgroup_attach_task(new_cgroup, tsk);
2023 cgroup_unlock();
2024}
2025
2026/**
2027 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
2028 * @from: cpuset in which the tasks currently reside
2029 * @to: cpuset to which the tasks will be moved
2030 *
2031 * Called with cpuset_mutex held
2032 * callback_mutex must not be held, as cpuset_attach() will take it.
2033 *
2034 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
2035 * calling callback functions for each.
2036 */
2037static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
2038{
2039 struct cgroup_scanner scan;
2040
2041 scan.cg = from->css.cgroup;
2042 scan.test_task = NULL; /* select all tasks in cgroup */
2043 scan.process_task = cpuset_do_move_task;
2044 scan.heap = NULL;
2045 scan.data = to->css.cgroup;
2046
2047 if (cgroup_scan_tasks(&scan))
2048 printk(KERN_ERR "move_member_tasks_to_cpuset: "
2049 "cgroup_scan_tasks failed\n");
2050}
2051
2052/* 1998/*
2053 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 1999 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
2054 * or memory nodes, we need to walk over the cpuset hierarchy, 2000 * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2069,7 +2015,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2069 nodes_empty(parent->mems_allowed)) 2015 nodes_empty(parent->mems_allowed))
2070 parent = parent_cs(parent); 2016 parent = parent_cs(parent);
2071 2017
2072 move_member_tasks_to_cpuset(cs, parent); 2018 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2019 rcu_read_lock();
2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
2021 cgroup_name(cs->css.cgroup));
2022 rcu_read_unlock();
2023 }
2073} 2024}
2074 2025
2075/** 2026/**
@@ -2222,17 +2173,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2222 flush_workqueue(cpuset_propagate_hotplug_wq); 2173 flush_workqueue(cpuset_propagate_hotplug_wq);
2223 2174
2224 /* rebuild sched domains if cpus_allowed has changed */ 2175 /* rebuild sched domains if cpus_allowed has changed */
2225 if (cpus_updated) { 2176 if (cpus_updated)
2226 struct sched_domain_attr *attr; 2177 rebuild_sched_domains();
2227 cpumask_var_t *doms;
2228 int ndoms;
2229
2230 mutex_lock(&cpuset_mutex);
2231 ndoms = generate_sched_domains(&doms, &attr);
2232 mutex_unlock(&cpuset_mutex);
2233
2234 partition_sched_domains(ndoms, doms, attr);
2235 }
2236} 2178}
2237 2179
2238void cpuset_update_active_cpus(bool cpu_online) 2180void cpuset_update_active_cpus(bool cpu_online)
@@ -2251,7 +2193,6 @@ void cpuset_update_active_cpus(bool cpu_online)
2251 schedule_work(&cpuset_hotplug_work); 2193 schedule_work(&cpuset_hotplug_work);
2252} 2194}
2253 2195
2254#ifdef CONFIG_MEMORY_HOTPLUG
2255/* 2196/*
2256 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. 2197 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2257 * Call this routine anytime after node_states[N_MEMORY] changes. 2198 * Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2263,20 +2204,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2263 schedule_work(&cpuset_hotplug_work); 2204 schedule_work(&cpuset_hotplug_work);
2264 return NOTIFY_OK; 2205 return NOTIFY_OK;
2265} 2206}
2266#endif 2207
2208static struct notifier_block cpuset_track_online_nodes_nb = {
2209 .notifier_call = cpuset_track_online_nodes,
2210 .priority = 10, /* ??! */
2211};
2267 2212
2268/** 2213/**
2269 * cpuset_init_smp - initialize cpus_allowed 2214 * cpuset_init_smp - initialize cpus_allowed
2270 * 2215 *
2271 * Description: Finish top cpuset after cpu, node maps are initialized 2216 * Description: Finish top cpuset after cpu, node maps are initialized
2272 **/ 2217 */
2273
2274void __init cpuset_init_smp(void) 2218void __init cpuset_init_smp(void)
2275{ 2219{
2276 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2220 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2277 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2221 top_cpuset.mems_allowed = node_states[N_MEMORY];
2278 2222
2279 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2223 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2280 2224
2281 cpuset_propagate_hotplug_wq = 2225 cpuset_propagate_hotplug_wq =
2282 alloc_ordered_workqueue("cpuset_hotplug", 0); 2226 alloc_ordered_workqueue("cpuset_hotplug", 0);
@@ -2592,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2592 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2536 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2593} 2537}
2594 2538
2539#define CPUSET_NODELIST_LEN (256)
2540
2595/** 2541/**
2596 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2542 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2597 * @task: pointer to task_struct of some task. 2543 * @task: pointer to task_struct of some task.
@@ -2602,25 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2602 */ 2548 */
2603void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2549void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2604{ 2550{
2605 struct dentry *dentry; 2551 /* Statically allocated to prevent using excess stack. */
2552 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2553 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2606 2554
2607 dentry = task_cs(tsk)->css.cgroup->dentry; 2555 struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
2608 spin_lock(&cpuset_buffer_lock);
2609 2556
2610 if (!dentry) { 2557 rcu_read_lock();
2611 strcpy(cpuset_name, "/"); 2558 spin_lock(&cpuset_buffer_lock);
2612 } else {
2613 spin_lock(&dentry->d_lock);
2614 strlcpy(cpuset_name, (const char *)dentry->d_name.name,
2615 CPUSET_NAME_LEN);
2616 spin_unlock(&dentry->d_lock);
2617 }
2618 2559
2619 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2560 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2620 tsk->mems_allowed); 2561 tsk->mems_allowed);
2621 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2562 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2622 tsk->comm, cpuset_name, cpuset_nodelist); 2563 tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
2564
2623 spin_unlock(&cpuset_buffer_lock); 2565 spin_unlock(&cpuset_buffer_lock);
2566 rcu_read_unlock();
2624} 2567}
2625 2568
2626/* 2569/*
@@ -2666,7 +2609,7 @@ void __cpuset_memory_pressure_bump(void)
2666 * and we take cpuset_mutex, keeping cpuset_attach() from changing it 2609 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2667 * anyway. 2610 * anyway.
2668 */ 2611 */
2669static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2612int proc_cpuset_show(struct seq_file *m, void *unused_v)
2670{ 2613{
2671 struct pid *pid; 2614 struct pid *pid;
2672 struct task_struct *tsk; 2615 struct task_struct *tsk;
@@ -2700,19 +2643,6 @@ out_free:
2700out: 2643out:
2701 return retval; 2644 return retval;
2702} 2645}
2703
2704static int cpuset_open(struct inode *inode, struct file *file)
2705{
2706 struct pid *pid = PROC_I(inode)->pid;
2707 return single_open(file, proc_cpuset_show, pid);
2708}
2709
2710const struct file_operations proc_cpuset_operations = {
2711 .open = cpuset_open,
2712 .read = seq_read,
2713 .llseek = seq_lseek,
2714 .release = single_release,
2715};
2716#endif /* CONFIG_PROC_PID_CPUSET */ 2646#endif /* CONFIG_PROC_PID_CPUSET */
2717 2647
2718/* Display task mems_allowed in /proc/<pid>/status file. */ 2648/* Display task mems_allowed in /proc/<pid>/status file. */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index c26278fd4851..0506d447aed2 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -775,7 +775,7 @@ static void sysrq_handle_dbg(int key)
775 775
776static struct sysrq_key_op sysrq_dbg_op = { 776static struct sysrq_key_op sysrq_dbg_op = {
777 .handler = sysrq_handle_dbg, 777 .handler = sysrq_handle_dbg,
778 .help_msg = "debug(G)", 778 .help_msg = "debug(g)",
779 .action_msg = "DEBUG", 779 .action_msg = "DEBUG",
780}; 780};
781#endif 781#endif
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b0cd86501c30..9dc297faf7c0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -18,6 +18,7 @@
18#include <linux/poll.h> 18#include <linux/poll.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/hash.h> 20#include <linux/hash.h>
21#include <linux/tick.h>
21#include <linux/sysfs.h> 22#include <linux/sysfs.h>
22#include <linux/dcache.h> 23#include <linux/dcache.h>
23#include <linux/percpu.h> 24#include <linux/percpu.h>
@@ -37,6 +38,7 @@
37#include <linux/ftrace_event.h> 38#include <linux/ftrace_event.h>
38#include <linux/hw_breakpoint.h> 39#include <linux/hw_breakpoint.h>
39#include <linux/mm_types.h> 40#include <linux/mm_types.h>
41#include <linux/cgroup.h>
40 42
41#include "internal.h" 43#include "internal.h"
42 44
@@ -234,6 +236,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
234#ifdef CONFIG_CGROUP_PERF 236#ifdef CONFIG_CGROUP_PERF
235 237
236/* 238/*
239 * perf_cgroup_info keeps track of time_enabled for a cgroup.
240 * This is a per-cpu dynamically allocated data structure.
241 */
242struct perf_cgroup_info {
243 u64 time;
244 u64 timestamp;
245};
246
247struct perf_cgroup {
248 struct cgroup_subsys_state css;
249 struct perf_cgroup_info __percpu *info;
250};
251
252/*
237 * Must ensure cgroup is pinned (css_get) before calling 253 * Must ensure cgroup is pinned (css_get) before calling
238 * this function. In other words, we cannot call this function 254 * this function. In other words, we cannot call this function
239 * if there is no cgroup event for the current CPU context. 255 * if there is no cgroup event for the current CPU context.
@@ -251,7 +267,22 @@ perf_cgroup_match(struct perf_event *event)
251 struct perf_event_context *ctx = event->ctx; 267 struct perf_event_context *ctx = event->ctx;
252 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 268 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
253 269
254 return !event->cgrp || event->cgrp == cpuctx->cgrp; 270 /* @event doesn't care about cgroup */
271 if (!event->cgrp)
272 return true;
273
274 /* wants specific cgroup scope but @cpuctx isn't associated with any */
275 if (!cpuctx->cgrp)
276 return false;
277
278 /*
279 * Cgroup scoping is recursive. An event enabled for a cgroup is
280 * also enabled for all its descendant cgroups. If @cpuctx's
281 * cgroup is a descendant of @event's (the test covers identity
282 * case), it's a match.
283 */
284 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
285 event->cgrp->css.cgroup);
255} 286}
256 287
257static inline bool perf_tryget_cgroup(struct perf_event *event) 288static inline bool perf_tryget_cgroup(struct perf_event *event)
@@ -655,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
655 686
656 WARN_ON(!irqs_disabled()); 687 WARN_ON(!irqs_disabled());
657 688
658 if (list_empty(&cpuctx->rotation_list)) 689 if (list_empty(&cpuctx->rotation_list)) {
690 int was_empty = list_empty(head);
659 list_add(&cpuctx->rotation_list, head); 691 list_add(&cpuctx->rotation_list, head);
692 if (was_empty)
693 tick_nohz_full_kick();
694 }
660} 695}
661 696
662static void get_ctx(struct perf_event_context *ctx) 697static void get_ctx(struct perf_event_context *ctx)
@@ -961,9 +996,15 @@ static void perf_event__header_size(struct perf_event *event)
961 if (sample_type & PERF_SAMPLE_PERIOD) 996 if (sample_type & PERF_SAMPLE_PERIOD)
962 size += sizeof(data->period); 997 size += sizeof(data->period);
963 998
999 if (sample_type & PERF_SAMPLE_WEIGHT)
1000 size += sizeof(data->weight);
1001
964 if (sample_type & PERF_SAMPLE_READ) 1002 if (sample_type & PERF_SAMPLE_READ)
965 size += event->read_size; 1003 size += event->read_size;
966 1004
1005 if (sample_type & PERF_SAMPLE_DATA_SRC)
1006 size += sizeof(data->data_src.val);
1007
967 event->header_size = size; 1008 event->header_size = size;
968} 1009}
969 1010
@@ -2555,6 +2596,16 @@ done:
2555 list_del_init(&cpuctx->rotation_list); 2596 list_del_init(&cpuctx->rotation_list);
2556} 2597}
2557 2598
2599#ifdef CONFIG_NO_HZ_FULL
2600bool perf_event_can_stop_tick(void)
2601{
2602 if (list_empty(&__get_cpu_var(rotation_list)))
2603 return true;
2604 else
2605 return false;
2606}
2607#endif
2608
2558void perf_event_task_tick(void) 2609void perf_event_task_tick(void)
2559{ 2610{
2560 struct list_head *head = &__get_cpu_var(rotation_list); 2611 struct list_head *head = &__get_cpu_var(rotation_list);
@@ -4178,6 +4229,12 @@ void perf_output_sample(struct perf_output_handle *handle,
4178 perf_output_sample_ustack(handle, 4229 perf_output_sample_ustack(handle,
4179 data->stack_user_size, 4230 data->stack_user_size,
4180 data->regs_user.regs); 4231 data->regs_user.regs);
4232
4233 if (sample_type & PERF_SAMPLE_WEIGHT)
4234 perf_output_put(handle, data->weight);
4235
4236 if (sample_type & PERF_SAMPLE_DATA_SRC)
4237 perf_output_put(handle, data->data_src.val);
4181} 4238}
4182 4239
4183void perf_prepare_sample(struct perf_event_header *header, 4240void perf_prepare_sample(struct perf_event_header *header,
@@ -4337,6 +4394,64 @@ perf_event_read_event(struct perf_event *event,
4337 perf_output_end(&handle); 4394 perf_output_end(&handle);
4338} 4395}
4339 4396
4397typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data);
4398typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
4399
4400static void
4401perf_event_aux_ctx(struct perf_event_context *ctx,
4402 perf_event_aux_match_cb match,
4403 perf_event_aux_output_cb output,
4404 void *data)
4405{
4406 struct perf_event *event;
4407
4408 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4409 if (event->state < PERF_EVENT_STATE_INACTIVE)
4410 continue;
4411 if (!event_filter_match(event))
4412 continue;
4413 if (match(event, data))
4414 output(event, data);
4415 }
4416}
4417
4418static void
4419perf_event_aux(perf_event_aux_match_cb match,
4420 perf_event_aux_output_cb output,
4421 void *data,
4422 struct perf_event_context *task_ctx)
4423{
4424 struct perf_cpu_context *cpuctx;
4425 struct perf_event_context *ctx;
4426 struct pmu *pmu;
4427 int ctxn;
4428
4429 rcu_read_lock();
4430 list_for_each_entry_rcu(pmu, &pmus, entry) {
4431 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4432 if (cpuctx->unique_pmu != pmu)
4433 goto next;
4434 perf_event_aux_ctx(&cpuctx->ctx, match, output, data);
4435 if (task_ctx)
4436 goto next;
4437 ctxn = pmu->task_ctx_nr;
4438 if (ctxn < 0)
4439 goto next;
4440 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4441 if (ctx)
4442 perf_event_aux_ctx(ctx, match, output, data);
4443next:
4444 put_cpu_ptr(pmu->pmu_cpu_context);
4445 }
4446
4447 if (task_ctx) {
4448 preempt_disable();
4449 perf_event_aux_ctx(task_ctx, match, output, data);
4450 preempt_enable();
4451 }
4452 rcu_read_unlock();
4453}
4454
4340/* 4455/*
4341 * task tracking -- fork/exit 4456 * task tracking -- fork/exit
4342 * 4457 *
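
The perf_event_aux()/perf_event_aux_ctx() pair introduced above is, stripped of the per-PMU iteration and RCU plumbing, a generic "filter, then emit" walk driven by two callbacks. The stand-alone sketch below shows only that callback shape; the item type and names are illustrative, not kernel code.

#include <stdbool.h>
#include <stdio.h>

struct item { int kind; const char *name; };

typedef bool (match_cb)(const struct item *it, void *data);
typedef void (output_cb)(const struct item *it, void *data);

static void for_each_matching(const struct item *items, int n,
			      match_cb *match, output_cb *output, void *data)
{
	for (int i = 0; i < n; i++)
		if (match(&items[i], data))
			output(&items[i], data);
}

static bool kind_matches(const struct item *it, void *data)
{
	return it->kind == *(int *)data;
}

static void print_item(const struct item *it, void *data)
{
	(void)data;
	printf("%s\n", it->name);
}

int main(void)
{
	struct item items[] = { {1, "comm"}, {2, "mmap"}, {1, "task"} };
	int wanted = 1;

	for_each_matching(items, 3, kind_matches, print_item, &wanted);
	return 0;
}
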
@@ -4359,8 +4474,9 @@ struct perf_task_event {
4359}; 4474};
4360 4475
4361static void perf_event_task_output(struct perf_event *event, 4476static void perf_event_task_output(struct perf_event *event,
4362 struct perf_task_event *task_event) 4477 void *data)
4363{ 4478{
4479 struct perf_task_event *task_event = data;
4364 struct perf_output_handle handle; 4480 struct perf_output_handle handle;
4365 struct perf_sample_data sample; 4481 struct perf_sample_data sample;
4366 struct task_struct *task = task_event->task; 4482 struct task_struct *task = task_event->task;
@@ -4388,59 +4504,11 @@ out:
4388 task_event->event_id.header.size = size; 4504 task_event->event_id.header.size = size;
4389} 4505}
4390 4506
4391static int perf_event_task_match(struct perf_event *event) 4507static int perf_event_task_match(struct perf_event *event,
4392{ 4508 void *data __maybe_unused)
4393 if (event->state < PERF_EVENT_STATE_INACTIVE)
4394 return 0;
4395
4396 if (!event_filter_match(event))
4397 return 0;
4398
4399 if (event->attr.comm || event->attr.mmap ||
4400 event->attr.mmap_data || event->attr.task)
4401 return 1;
4402
4403 return 0;
4404}
4405
4406static void perf_event_task_ctx(struct perf_event_context *ctx,
4407 struct perf_task_event *task_event)
4408{ 4509{
4409 struct perf_event *event; 4510 return event->attr.comm || event->attr.mmap ||
4410 4511 event->attr.mmap_data || event->attr.task;
4411 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4412 if (perf_event_task_match(event))
4413 perf_event_task_output(event, task_event);
4414 }
4415}
4416
4417static void perf_event_task_event(struct perf_task_event *task_event)
4418{
4419 struct perf_cpu_context *cpuctx;
4420 struct perf_event_context *ctx;
4421 struct pmu *pmu;
4422 int ctxn;
4423
4424 rcu_read_lock();
4425 list_for_each_entry_rcu(pmu, &pmus, entry) {
4426 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4427 if (cpuctx->unique_pmu != pmu)
4428 goto next;
4429 perf_event_task_ctx(&cpuctx->ctx, task_event);
4430
4431 ctx = task_event->task_ctx;
4432 if (!ctx) {
4433 ctxn = pmu->task_ctx_nr;
4434 if (ctxn < 0)
4435 goto next;
4436 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4437 }
4438 if (ctx)
4439 perf_event_task_ctx(ctx, task_event);
4440next:
4441 put_cpu_ptr(pmu->pmu_cpu_context);
4442 }
4443 rcu_read_unlock();
4444} 4512}
4445 4513
4446static void perf_event_task(struct task_struct *task, 4514static void perf_event_task(struct task_struct *task,
@@ -4471,7 +4539,10 @@ static void perf_event_task(struct task_struct *task,
4471 }, 4539 },
4472 }; 4540 };
4473 4541
4474 perf_event_task_event(&task_event); 4542 perf_event_aux(perf_event_task_match,
4543 perf_event_task_output,
4544 &task_event,
4545 task_ctx);
4475} 4546}
4476 4547
4477void perf_event_fork(struct task_struct *task) 4548void perf_event_fork(struct task_struct *task)
@@ -4497,8 +4568,9 @@ struct perf_comm_event {
4497}; 4568};
4498 4569
4499static void perf_event_comm_output(struct perf_event *event, 4570static void perf_event_comm_output(struct perf_event *event,
4500 struct perf_comm_event *comm_event) 4571 void *data)
4501{ 4572{
4573 struct perf_comm_event *comm_event = data;
4502 struct perf_output_handle handle; 4574 struct perf_output_handle handle;
4503 struct perf_sample_data sample; 4575 struct perf_sample_data sample;
4504 int size = comm_event->event_id.header.size; 4576 int size = comm_event->event_id.header.size;
@@ -4525,39 +4597,16 @@ out:
4525 comm_event->event_id.header.size = size; 4597 comm_event->event_id.header.size = size;
4526} 4598}
4527 4599
4528static int perf_event_comm_match(struct perf_event *event) 4600static int perf_event_comm_match(struct perf_event *event,
4601 void *data __maybe_unused)
4529{ 4602{
4530 if (event->state < PERF_EVENT_STATE_INACTIVE) 4603 return event->attr.comm;
4531 return 0;
4532
4533 if (!event_filter_match(event))
4534 return 0;
4535
4536 if (event->attr.comm)
4537 return 1;
4538
4539 return 0;
4540}
4541
4542static void perf_event_comm_ctx(struct perf_event_context *ctx,
4543 struct perf_comm_event *comm_event)
4544{
4545 struct perf_event *event;
4546
4547 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4548 if (perf_event_comm_match(event))
4549 perf_event_comm_output(event, comm_event);
4550 }
4551} 4604}
4552 4605
4553static void perf_event_comm_event(struct perf_comm_event *comm_event) 4606static void perf_event_comm_event(struct perf_comm_event *comm_event)
4554{ 4607{
4555 struct perf_cpu_context *cpuctx;
4556 struct perf_event_context *ctx;
4557 char comm[TASK_COMM_LEN]; 4608 char comm[TASK_COMM_LEN];
4558 unsigned int size; 4609 unsigned int size;
4559 struct pmu *pmu;
4560 int ctxn;
4561 4610
4562 memset(comm, 0, sizeof(comm)); 4611 memset(comm, 0, sizeof(comm));
4563 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 4612 strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -4567,24 +4616,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
4567 comm_event->comm_size = size; 4616 comm_event->comm_size = size;
4568 4617
4569 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4618 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4570 rcu_read_lock();
4571 list_for_each_entry_rcu(pmu, &pmus, entry) {
4572 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4573 if (cpuctx->unique_pmu != pmu)
4574 goto next;
4575 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
4576 4619
4577 ctxn = pmu->task_ctx_nr; 4620 perf_event_aux(perf_event_comm_match,
4578 if (ctxn < 0) 4621 perf_event_comm_output,
4579 goto next; 4622 comm_event,
4580 4623 NULL);
4581 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4582 if (ctx)
4583 perf_event_comm_ctx(ctx, comm_event);
4584next:
4585 put_cpu_ptr(pmu->pmu_cpu_context);
4586 }
4587 rcu_read_unlock();
4588} 4624}
4589 4625
4590void perf_event_comm(struct task_struct *task) 4626void perf_event_comm(struct task_struct *task)
@@ -4593,6 +4629,7 @@ void perf_event_comm(struct task_struct *task)
4593 struct perf_event_context *ctx; 4629 struct perf_event_context *ctx;
4594 int ctxn; 4630 int ctxn;
4595 4631
4632 rcu_read_lock();
4596 for_each_task_context_nr(ctxn) { 4633 for_each_task_context_nr(ctxn) {
4597 ctx = task->perf_event_ctxp[ctxn]; 4634 ctx = task->perf_event_ctxp[ctxn];
4598 if (!ctx) 4635 if (!ctx)
@@ -4600,6 +4637,7 @@ void perf_event_comm(struct task_struct *task)
4600 4637
4601 perf_event_enable_on_exec(ctx); 4638 perf_event_enable_on_exec(ctx);
4602 } 4639 }
4640 rcu_read_unlock();
4603 4641
4604 if (!atomic_read(&nr_comm_events)) 4642 if (!atomic_read(&nr_comm_events))
4605 return; 4643 return;
@@ -4644,8 +4682,9 @@ struct perf_mmap_event {
4644}; 4682};
4645 4683
4646static void perf_event_mmap_output(struct perf_event *event, 4684static void perf_event_mmap_output(struct perf_event *event,
4647 struct perf_mmap_event *mmap_event) 4685 void *data)
4648{ 4686{
4687 struct perf_mmap_event *mmap_event = data;
4649 struct perf_output_handle handle; 4688 struct perf_output_handle handle;
4650 struct perf_sample_data sample; 4689 struct perf_sample_data sample;
4651 int size = mmap_event->event_id.header.size; 4690 int size = mmap_event->event_id.header.size;
@@ -4672,46 +4711,24 @@ out:
4672} 4711}
4673 4712
4674static int perf_event_mmap_match(struct perf_event *event, 4713static int perf_event_mmap_match(struct perf_event *event,
4675 struct perf_mmap_event *mmap_event, 4714 void *data)
4676 int executable)
4677{
4678 if (event->state < PERF_EVENT_STATE_INACTIVE)
4679 return 0;
4680
4681 if (!event_filter_match(event))
4682 return 0;
4683
4684 if ((!executable && event->attr.mmap_data) ||
4685 (executable && event->attr.mmap))
4686 return 1;
4687
4688 return 0;
4689}
4690
4691static void perf_event_mmap_ctx(struct perf_event_context *ctx,
4692 struct perf_mmap_event *mmap_event,
4693 int executable)
4694{ 4715{
4695 struct perf_event *event; 4716 struct perf_mmap_event *mmap_event = data;
4717 struct vm_area_struct *vma = mmap_event->vma;
4718 int executable = vma->vm_flags & VM_EXEC;
4696 4719
4697 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4720 return (!executable && event->attr.mmap_data) ||
4698 if (perf_event_mmap_match(event, mmap_event, executable)) 4721 (executable && event->attr.mmap);
4699 perf_event_mmap_output(event, mmap_event);
4700 }
4701} 4722}
4702 4723
4703static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 4724static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4704{ 4725{
4705 struct perf_cpu_context *cpuctx;
4706 struct perf_event_context *ctx;
4707 struct vm_area_struct *vma = mmap_event->vma; 4726 struct vm_area_struct *vma = mmap_event->vma;
4708 struct file *file = vma->vm_file; 4727 struct file *file = vma->vm_file;
4709 unsigned int size; 4728 unsigned int size;
4710 char tmp[16]; 4729 char tmp[16];
4711 char *buf = NULL; 4730 char *buf = NULL;
4712 const char *name; 4731 const char *name;
4713 struct pmu *pmu;
4714 int ctxn;
4715 4732
4716 memset(tmp, 0, sizeof(tmp)); 4733 memset(tmp, 0, sizeof(tmp));
4717 4734
@@ -4734,7 +4751,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4734 } else { 4751 } else {
4735 if (arch_vma_name(mmap_event->vma)) { 4752 if (arch_vma_name(mmap_event->vma)) {
4736 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 4753 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4737 sizeof(tmp)); 4754 sizeof(tmp) - 1);
4755 tmp[sizeof(tmp) - 1] = '\0';
4738 goto got_name; 4756 goto got_name;
4739 } 4757 }
4740 4758
@@ -4761,29 +4779,15 @@ got_name:
4761 mmap_event->file_name = name; 4779 mmap_event->file_name = name;
4762 mmap_event->file_size = size; 4780 mmap_event->file_size = size;
4763 4781
4764 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4782 if (!(vma->vm_flags & VM_EXEC))
4765 4783 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
4766 rcu_read_lock();
4767 list_for_each_entry_rcu(pmu, &pmus, entry) {
4768 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4769 if (cpuctx->unique_pmu != pmu)
4770 goto next;
4771 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4772 vma->vm_flags & VM_EXEC);
4773 4784
4774 ctxn = pmu->task_ctx_nr; 4785 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4775 if (ctxn < 0)
4776 goto next;
4777 4786
4778 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 4787 perf_event_aux(perf_event_mmap_match,
4779 if (ctx) { 4788 perf_event_mmap_output,
4780 perf_event_mmap_ctx(ctx, mmap_event, 4789 mmap_event,
4781 vma->vm_flags & VM_EXEC); 4790 NULL);
4782 }
4783next:
4784 put_cpu_ptr(pmu->pmu_cpu_context);
4785 }
4786 rcu_read_unlock();
4787 4791
4788 kfree(buf); 4792 kfree(buf);
4789} 4793}
@@ -5327,7 +5331,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
5327 5331
5328static int perf_swevent_init(struct perf_event *event) 5332static int perf_swevent_init(struct perf_event *event)
5329{ 5333{
5330 int event_id = event->attr.config; 5334 u64 event_id = event->attr.config;
5331 5335
5332 if (event->attr.type != PERF_TYPE_SOFTWARE) 5336 if (event->attr.type != PERF_TYPE_SOFTWARE)
5333 return -ENOENT; 5337 return -ENOENT;
@@ -5647,6 +5651,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
5647 event->attr.sample_period = NSEC_PER_SEC / freq; 5651 event->attr.sample_period = NSEC_PER_SEC / freq;
5648 hwc->sample_period = event->attr.sample_period; 5652 hwc->sample_period = event->attr.sample_period;
5649 local64_set(&hwc->period_left, hwc->sample_period); 5653 local64_set(&hwc->period_left, hwc->sample_period);
5654 hwc->last_period = hwc->sample_period;
5650 event->attr.freq = 0; 5655 event->attr.freq = 0;
5651 } 5656 }
5652} 5657}
@@ -5982,6 +5987,7 @@ skip_type:
5982 if (pmu->pmu_cpu_context) 5987 if (pmu->pmu_cpu_context)
5983 goto got_cpu_context; 5988 goto got_cpu_context;
5984 5989
5990 ret = -ENOMEM;
5985 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 5991 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5986 if (!pmu->pmu_cpu_context) 5992 if (!pmu->pmu_cpu_context)
5987 goto free_dev; 5993 goto free_dev;
@@ -7509,12 +7515,5 @@ struct cgroup_subsys perf_subsys = {
7509 .css_free = perf_cgroup_css_free, 7515 .css_free = perf_cgroup_css_free,
7510 .exit = perf_cgroup_exit, 7516 .exit = perf_cgroup_exit,
7511 .attach = perf_cgroup_attach, 7517 .attach = perf_cgroup_attach,
7512
7513 /*
7514 * perf_event cgroup doesn't handle nesting correctly.
7515 * ctx->nr_cgroups adjustments should be propagated through the
7516 * cgroup hierarchy. Fix it and remove the following.
7517 */
7518 .broken_hierarchy = true,
7519}; 7518};
7520#endif /* CONFIG_CGROUP_PERF */ 7519#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8b..eb675c4d59df 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
16 int page_order; /* allocation order */ 16 int page_order; /* allocation order */
17#endif 17#endif
18 int nr_pages; /* nr of data pages */ 18 int nr_pages; /* nr of data pages */
19 int writable; /* are we writable */ 19 int overwrite; /* can overwrite itself */
20 20
21 atomic_t poll; /* POLL_ for wakeups */ 21 atomic_t poll; /* POLL_ for wakeups */
22 22
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff3973..cd55144270b5 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, 18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head) 19 unsigned long offset, unsigned long head)
20{ 20{
21 unsigned long mask; 21 unsigned long sz = perf_data_size(rb);
22 unsigned long mask = sz - 1;
22 23
23 if (!rb->writable) 24 /*
25 * check if user-writable
26 * overwrite : over-write its own tail
27 * !overwrite: buffer possibly drops events.
28 */
29 if (rb->overwrite)
24 return true; 30 return true;
25 31
26 mask = perf_data_size(rb) - 1; 32 /*
33 * verify that payload is not bigger than buffer
34 * otherwise masking logic may fail to detect
35 * the "not enough space" condition
36 */
37 if ((head - offset) > sz)
38 return false;
27 39
28 offset = (offset - tail) & mask; 40 offset = (offset - tail) & mask;
29 head = (head - tail) & mask; 41 head = (head - tail) & mask;
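
Why the added "(head - offset) > sz" guard matters: with a power-of-two buffer, positions are reduced modulo the size, so a payload larger than the whole buffer wraps around and can masquerade as fitting. The toy calculation below reproduces that point in plain user-space arithmetic; the trailing comparison is paraphrased, since the rest of the function is not part of this hunk.

#include <stdbool.h>
#include <stdio.h>

static bool output_space_ok(unsigned long sz, unsigned long tail,
			    unsigned long offset, unsigned long head)
{
	unsigned long mask = sz - 1;	/* sz must be a power of two */

	/* A payload larger than the whole buffer can never fit; without
	 * this guard the masking below drops whole multiples of sz and
	 * the request may appear to fit. */
	if (head - offset > sz)
		return false;

	offset = (offset - tail) & mask;
	head   = (head   - tail) & mask;

	/* paraphrase of the trailing comparison: the write must not wrap
	 * past the position the reader is still consuming from */
	return head >= offset;
}

int main(void)
{
	/* 4 KiB buffer, reader at 0: a 5 KiB append must be rejected,
	 * a 512-byte append is fine. */
	printf("%d\n", output_space_ok(4096, 0, 100, 100 + 5120)); /* 0 */
	printf("%d\n", output_space_ok(4096, 0, 100, 100 + 512));  /* 1 */
	return 0;
}
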
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
212 rb->watermark = max_size / 2; 224 rb->watermark = max_size / 2;
213 225
214 if (flags & RING_BUFFER_WRITABLE) 226 if (flags & RING_BUFFER_WRITABLE)
215 rb->writable = 1; 227 rb->overwrite = 0;
228 else
229 rb->overwrite = 1;
216 230
217 atomic_set(&rb->refcount, 1); 231 atomic_set(&rb->refcount, 1);
218 232
@@ -312,11 +326,16 @@ void rb_free(struct ring_buffer *rb)
312} 326}
313 327
314#else 328#else
329static int data_page_nr(struct ring_buffer *rb)
330{
331 return rb->nr_pages << page_order(rb);
332}
315 333
316struct page * 334struct page *
317perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) 335perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
318{ 336{
319 if (pgoff > (1UL << page_order(rb))) 337 /* The '>' counts in the user page. */
338 if (pgoff > data_page_nr(rb))
320 return NULL; 339 return NULL;
321 340
322 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); 341 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
@@ -336,10 +355,11 @@ static void rb_free_work(struct work_struct *work)
336 int i, nr; 355 int i, nr;
337 356
338 rb = container_of(work, struct ring_buffer, work); 357 rb = container_of(work, struct ring_buffer, work);
339 nr = 1 << page_order(rb); 358 nr = data_page_nr(rb);
340 359
341 base = rb->user_page; 360 base = rb->user_page;
342 for (i = 0; i < nr + 1; i++) 361 /* The '<=' counts in the user page. */
362 for (i = 0; i <= nr; i++)
343 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 363 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
344 364
345 vfree(base); 365 vfree(base);
@@ -373,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
373 rb->user_page = all_buf; 393 rb->user_page = all_buf;
374 rb->data_pages[0] = all_buf + PAGE_SIZE; 394 rb->data_pages[0] = all_buf + PAGE_SIZE;
375 rb->page_order = ilog2(nr_pages); 395 rb->page_order = ilog2(nr_pages);
376 rb->nr_pages = 1; 396 rb->nr_pages = !!nr_pages;
377 397
378 ring_buffer_init(rb, watermark, flags); 398 ring_buffer_init(rb, watermark, flags);
379 399
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a567c8c7ef31..f3569747d629 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -75,6 +75,15 @@ struct uprobe {
75 struct arch_uprobe arch; 75 struct arch_uprobe arch;
76}; 76};
77 77
78struct return_instance {
79 struct uprobe *uprobe;
80 unsigned long func;
81 unsigned long orig_ret_vaddr; /* original return address */
82 bool chained; /* true, if instance is nested */
83
84 struct return_instance *next; /* keep as stack */
85};
86
78/* 87/*
79 * valid_vma: Verify if the specified vma is an executable vma 88 * valid_vma: Verify if the specified vma is an executable vma
80 * Relax restrictions while unregistering: vm_flags might have 89 * Relax restrictions while unregistering: vm_flags might have
@@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
173 return *insn == UPROBE_SWBP_INSN; 182 return *insn == UPROBE_SWBP_INSN;
174} 183}
175 184
176static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) 185/**
186 * is_trap_insn - check if instruction is breakpoint instruction.
187 * @insn: instruction to be checked.
188 * Default implementation of is_trap_insn
189 * Returns true if @insn is a breakpoint instruction.
190 *
191 * This function is needed for the case where an architecture has multiple
192 * trap instructions (like powerpc).
193 */
194bool __weak is_trap_insn(uprobe_opcode_t *insn)
195{
196 return is_swbp_insn(insn);
197}
198
199static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
177{ 200{
178 void *kaddr = kmap_atomic(page); 201 void *kaddr = kmap_atomic(page);
179 memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); 202 memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
203 kunmap_atomic(kaddr);
204}
205
206static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
207{
208 void *kaddr = kmap_atomic(page);
209 memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
180 kunmap_atomic(kaddr); 210 kunmap_atomic(kaddr);
181} 211}
182 212
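
is_trap_insn() above uses the same weak-symbol mechanism as is_swbp_insn(): the generic definition is compiled in, and an architecture that knows about additional trap encodings supplies a strong definition that replaces it at link time. The mechanism itself fits in a few lines of ordinary C; the functions and the 0xcc/0x0f encodings below are placeholders for illustration, not the real arch hooks.

/* generic.c - default definition, overridable at link time */
#include <stdbool.h>
#include <stdio.h>

bool __attribute__((weak)) is_trap_byte(unsigned char insn)
{
	return insn == 0xcc;            /* single generic encoding */
}

int main(void)
{
	printf("%d %d\n", is_trap_byte(0xcc), is_trap_byte(0x0f));
	return 0;
}

/* arch.c - optional strong override; when linked in, it wins:
 *
 * #include <stdbool.h>
 * bool is_trap_byte(unsigned char insn)
 * {
 *         return insn == 0xcc || insn == 0x0f;   // extra trap variant
 * }
 */
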
@@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
185 uprobe_opcode_t old_opcode; 215 uprobe_opcode_t old_opcode;
186 bool is_swbp; 216 bool is_swbp;
187 217
188 copy_opcode(page, vaddr, &old_opcode); 218 /*
219 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
220 * We do not check if it is any other 'trap variant' which could
221 * be conditional trap instruction such as the one powerpc supports.
222 *
223 * The logic is that we do not care if the underlying instruction
224 * is a trap variant; uprobes always wins over any other (gdb)
225 * breakpoint.
226 */
227 copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
189 is_swbp = is_swbp_insn(&old_opcode); 228 is_swbp = is_swbp_insn(&old_opcode);
190 229
191 if (is_swbp_insn(new_opcode)) { 230 if (is_swbp_insn(new_opcode)) {
@@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
204 * Expect the breakpoint instruction to be the smallest size instruction for 243 * Expect the breakpoint instruction to be the smallest size instruction for
205 * the architecture. If an arch has variable length instruction and the 244 * the architecture. If an arch has variable length instruction and the
206 * breakpoint instruction is not of the smallest length instruction 245 * breakpoint instruction is not of the smallest length instruction
207 * supported by that architecture then we need to modify is_swbp_at_addr and 246 * supported by that architecture then we need to modify is_trap_at_addr and
208 * write_opcode accordingly. This would never be a problem for archs that 247 * write_opcode accordingly. This would never be a problem for archs that
209 * have fixed length instructions. 248 * have fixed length instructions.
210 */ 249 */
@@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
225 uprobe_opcode_t opcode) 264 uprobe_opcode_t opcode)
226{ 265{
227 struct page *old_page, *new_page; 266 struct page *old_page, *new_page;
228 void *vaddr_old, *vaddr_new;
229 struct vm_area_struct *vma; 267 struct vm_area_struct *vma;
230 int ret; 268 int ret;
231 269
@@ -246,15 +284,8 @@ retry:
246 284
247 __SetPageUptodate(new_page); 285 __SetPageUptodate(new_page);
248 286
249 /* copy the page now that we've got it stable */ 287 copy_highpage(new_page, old_page);
250 vaddr_old = kmap_atomic(old_page); 288 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
251 vaddr_new = kmap_atomic(new_page);
252
253 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
254 memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
255
256 kunmap_atomic(vaddr_new);
257 kunmap_atomic(vaddr_old);
258 289
259 ret = anon_vma_prepare(vma); 290 ret = anon_vma_prepare(vma);
260 if (ret) 291 if (ret)
@@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
477 unsigned long nbytes, loff_t offset) 508 unsigned long nbytes, loff_t offset)
478{ 509{
479 struct page *page; 510 struct page *page;
480 void *vaddr;
481 unsigned long off;
482 pgoff_t idx;
483
484 if (!filp)
485 return -EINVAL;
486 511
487 if (!mapping->a_ops->readpage) 512 if (!mapping->a_ops->readpage)
488 return -EIO; 513 return -EIO;
489
490 idx = offset >> PAGE_CACHE_SHIFT;
491 off = offset & ~PAGE_MASK;
492
493 /* 514 /*
494 * Ensure that the page that has the original instruction is 515 * Ensure that the page that has the original instruction is
495 * populated and in page-cache. 516 * populated and in page-cache.
496 */ 517 */
497 page = read_mapping_page(mapping, idx, filp); 518 page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
498 if (IS_ERR(page)) 519 if (IS_ERR(page))
499 return PTR_ERR(page); 520 return PTR_ERR(page);
500 521
501 vaddr = kmap_atomic(page); 522 copy_from_page(page, offset, insn, nbytes);
502 memcpy(insn, vaddr + off, nbytes);
503 kunmap_atomic(vaddr);
504 page_cache_release(page); 523 page_cache_release(page);
505 524
506 return 0; 525 return 0;
@@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
550 goto out; 569 goto out;
551 570
552 ret = -ENOTSUPP; 571 ret = -ENOTSUPP;
553 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) 572 if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
554 goto out; 573 goto out;
555 574
556 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); 575 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
758 down_write(&mm->mmap_sem); 777 down_write(&mm->mmap_sem);
759 vma = find_vma(mm, info->vaddr); 778 vma = find_vma(mm, info->vaddr);
760 if (!vma || !valid_vma(vma, is_register) || 779 if (!vma || !valid_vma(vma, is_register) ||
761 vma->vm_file->f_mapping->host != uprobe->inode) 780 file_inode(vma->vm_file) != uprobe->inode)
762 goto unlock; 781 goto unlock;
763 782
764 if (vma->vm_start > info->vaddr || 783 if (vma->vm_start > info->vaddr ||
@@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
828 struct uprobe *uprobe; 847 struct uprobe *uprobe;
829 int ret; 848 int ret;
830 849
850 /* Uprobe must have at least one set consumer */
851 if (!uc->handler && !uc->ret_handler)
852 return -EINVAL;
853
831 /* Racy, just to catch the obvious mistakes */ 854 /* Racy, just to catch the obvious mistakes */
832 if (offset > i_size_read(inode)) 855 if (offset > i_size_read(inode))
833 return -EINVAL; 856 return -EINVAL;
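
With the check above, a consumer is valid as long as it sets at least one of the two hooks. A registration by an in-kernel user would look roughly like the fragment below; treat it as a sketch: the inode/offset lookup, error handling and module boilerplate are omitted, and the my_* names are invented for illustration.

#include <linux/uprobes.h>
#include <linux/ptrace.h>

static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	/* the probed instruction was hit */
	return 0;                       /* keep the uprobe installed */
}

static int my_ret_handler(struct uprobe_consumer *self, unsigned long func,
			  struct pt_regs *regs)
{
	/* the probed function (entry address 'func') just returned */
	return 0;
}

static struct uprobe_consumer my_consumer = {
	.handler     = my_handler,      /* either hook alone would also do */
	.ret_handler = my_ret_handler,
};

/* somewhere with 'inode' and 'offset' already resolved:        */
/*         err = uprobe_register(inode, offset, &my_consumer);  */
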
@@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
917 loff_t offset; 940 loff_t offset;
918 941
919 if (!valid_vma(vma, false) || 942 if (!valid_vma(vma, false) ||
920 vma->vm_file->f_mapping->host != uprobe->inode) 943 file_inode(vma->vm_file) != uprobe->inode)
921 continue; 944 continue;
922 945
923 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; 946 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
@@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
1010 if (no_uprobe_events() || !valid_vma(vma, true)) 1033 if (no_uprobe_events() || !valid_vma(vma, true))
1011 return 0; 1034 return 0;
1012 1035
1013 inode = vma->vm_file->f_mapping->host; 1036 inode = file_inode(vma->vm_file);
1014 if (!inode) 1037 if (!inode)
1015 return 0; 1038 return 0;
1016 1039
@@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
1041 struct inode *inode; 1064 struct inode *inode;
1042 struct rb_node *n; 1065 struct rb_node *n;
1043 1066
1044 inode = vma->vm_file->f_mapping->host; 1067 inode = file_inode(vma->vm_file);
1045 1068
1046 min = vaddr_to_offset(vma, start); 1069 min = vaddr_to_offset(vma, start);
1047 max = min + (end - start) - 1; 1070 max = min + (end - start) - 1;
@@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void)
1114{ 1137{
1115 struct mm_struct *mm = current->mm; 1138 struct mm_struct *mm = current->mm;
1116 struct xol_area *area; 1139 struct xol_area *area;
1140 uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1117 1141
1118 area = mm->uprobes_state.xol_area; 1142 area = mm->uprobes_state.xol_area;
1119 if (area) 1143 if (area)
@@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void)
1131 if (!area->page) 1155 if (!area->page)
1132 goto free_bitmap; 1156 goto free_bitmap;
1133 1157
1158 /* allocate first slot of task's xol_area for the return probes */
1159 set_bit(0, area->bitmap);
1160 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1161 atomic_set(&area->slot_count, 1);
1134 init_waitqueue_head(&area->wq); 1162 init_waitqueue_head(&area->wq);
1163
1135 if (!xol_add_vma(area)) 1164 if (!xol_add_vma(area))
1136 return area; 1165 return area;
1137 1166
@@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
1216static unsigned long xol_get_insn_slot(struct uprobe *uprobe) 1245static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1217{ 1246{
1218 struct xol_area *area; 1247 struct xol_area *area;
1219 unsigned long offset;
1220 unsigned long xol_vaddr; 1248 unsigned long xol_vaddr;
1221 void *vaddr;
1222 1249
1223 area = get_xol_area(); 1250 area = get_xol_area();
1224 if (!area) 1251 if (!area)
@@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1229 return 0; 1256 return 0;
1230 1257
1231 /* Initialize the slot */ 1258 /* Initialize the slot */
1232 offset = xol_vaddr & ~PAGE_MASK; 1259 copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
1233 vaddr = kmap_atomic(area->page);
1234 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1235 kunmap_atomic(vaddr);
1236 /* 1260 /*
1237 * We probably need flush_icache_user_range() but it needs vma. 1261 * We probably need flush_icache_user_range() but it needs vma.
1238 * This should work on supported architectures too. 1262 * This should work on supported architectures too.
@@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1298void uprobe_free_utask(struct task_struct *t) 1322void uprobe_free_utask(struct task_struct *t)
1299{ 1323{
1300 struct uprobe_task *utask = t->utask; 1324 struct uprobe_task *utask = t->utask;
1325 struct return_instance *ri, *tmp;
1301 1326
1302 if (!utask) 1327 if (!utask)
1303 return; 1328 return;
@@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t)
1305 if (utask->active_uprobe) 1330 if (utask->active_uprobe)
1306 put_uprobe(utask->active_uprobe); 1331 put_uprobe(utask->active_uprobe);
1307 1332
1333 ri = utask->return_instances;
1334 while (ri) {
1335 tmp = ri;
1336 ri = ri->next;
1337
1338 put_uprobe(tmp->uprobe);
1339 kfree(tmp);
1340 }
1341
1308 xol_free_insn_slot(t); 1342 xol_free_insn_slot(t);
1309 kfree(utask); 1343 kfree(utask);
1310 t->utask = NULL; 1344 t->utask = NULL;
@@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void)
1333 return current->utask; 1367 return current->utask;
1334} 1368}
1335 1369
1370/*
1371 * The current area->vaddr notion assumes the trampoline address is always
1372 * equal to area->vaddr.

1373 *
1374 * Returns -1 in case the xol_area is not allocated.
1375 */
1376static unsigned long get_trampoline_vaddr(void)
1377{
1378 struct xol_area *area;
1379 unsigned long trampoline_vaddr = -1;
1380
1381 area = current->mm->uprobes_state.xol_area;
1382 smp_read_barrier_depends();
1383 if (area)
1384 trampoline_vaddr = area->vaddr;
1385
1386 return trampoline_vaddr;
1387}
1388
1389static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1390{
1391 struct return_instance *ri;
1392 struct uprobe_task *utask;
1393 unsigned long orig_ret_vaddr, trampoline_vaddr;
1394 bool chained = false;
1395
1396 if (!get_xol_area())
1397 return;
1398
1399 utask = get_utask();
1400 if (!utask)
1401 return;
1402
1403 if (utask->depth >= MAX_URETPROBE_DEPTH) {
1404 printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
1405 " nestedness limit pid/tgid=%d/%d\n",
1406 current->pid, current->tgid);
1407 return;
1408 }
1409
1410 ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
1411 if (!ri)
1412 goto fail;
1413
1414 trampoline_vaddr = get_trampoline_vaddr();
1415 orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
1416 if (orig_ret_vaddr == -1)
1417 goto fail;
1418
1419 /*
1420 * We don't want to keep the trampoline address on the stack; rather, keep
1421 * the original return address of the first caller through all the
1422 * subsequent instances. This also makes breakpoint unwrapping easier.
1423 */
1424 if (orig_ret_vaddr == trampoline_vaddr) {
1425 if (!utask->return_instances) {
1426 /*
1427 * This situation is not possible. Likely we have an
1428 * attack from user-space.
1429 */
1430 pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
1431 current->pid, current->tgid);
1432 goto fail;
1433 }
1434
1435 chained = true;
1436 orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
1437 }
1438
1439 atomic_inc(&uprobe->ref);
1440 ri->uprobe = uprobe;
1441 ri->func = instruction_pointer(regs);
1442 ri->orig_ret_vaddr = orig_ret_vaddr;
1443 ri->chained = chained;
1444
1445 utask->depth++;
1446
1447 /* add instance to the stack */
1448 ri->next = utask->return_instances;
1449 utask->return_instances = ri;
1450
1451 return;
1452
1453 fail:
1454 kfree(ri);
1455}
1456
1336/* Prepare to single-step probed instruction out of line. */ 1457/* Prepare to single-step probed instruction out of line. */
1337static int 1458static int
1338pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) 1459pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
@@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1431 clear_bit(MMF_HAS_UPROBES, &mm->flags); 1552 clear_bit(MMF_HAS_UPROBES, &mm->flags);
1432} 1553}
1433 1554
1434static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) 1555static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
1435{ 1556{
1436 struct page *page; 1557 struct page *page;
1437 uprobe_opcode_t opcode; 1558 uprobe_opcode_t opcode;
@@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
1449 if (result < 0) 1570 if (result < 0)
1450 return result; 1571 return result;
1451 1572
1452 copy_opcode(page, vaddr, &opcode); 1573 copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
1453 put_page(page); 1574 put_page(page);
1454 out: 1575 out:
1455 return is_swbp_insn(&opcode); 1576 /* This needs to return true for any variant of the trap insn */
1577 return is_trap_insn(&opcode);
1456} 1578}
1457 1579
1458static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) 1580static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
@@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1465 vma = find_vma(mm, bp_vaddr); 1587 vma = find_vma(mm, bp_vaddr);
1466 if (vma && vma->vm_start <= bp_vaddr) { 1588 if (vma && vma->vm_start <= bp_vaddr) {
1467 if (valid_vma(vma, false)) { 1589 if (valid_vma(vma, false)) {
1468 struct inode *inode = vma->vm_file->f_mapping->host; 1590 struct inode *inode = file_inode(vma->vm_file);
1469 loff_t offset = vaddr_to_offset(vma, bp_vaddr); 1591 loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1470 1592
1471 uprobe = find_uprobe(inode, offset); 1593 uprobe = find_uprobe(inode, offset);
1472 } 1594 }
1473 1595
1474 if (!uprobe) 1596 if (!uprobe)
1475 *is_swbp = is_swbp_at_addr(mm, bp_vaddr); 1597 *is_swbp = is_trap_at_addr(mm, bp_vaddr);
1476 } else { 1598 } else {
1477 *is_swbp = -EFAULT; 1599 *is_swbp = -EFAULT;
1478 } 1600 }
@@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1488{ 1610{
1489 struct uprobe_consumer *uc; 1611 struct uprobe_consumer *uc;
1490 int remove = UPROBE_HANDLER_REMOVE; 1612 int remove = UPROBE_HANDLER_REMOVE;
1613 bool need_prep = false; /* prepare return uprobe, when needed */
1491 1614
1492 down_read(&uprobe->register_rwsem); 1615 down_read(&uprobe->register_rwsem);
1493 for (uc = uprobe->consumers; uc; uc = uc->next) { 1616 for (uc = uprobe->consumers; uc; uc = uc->next) {
1494 int rc = uc->handler(uc, regs); 1617 int rc = 0;
1618
1619 if (uc->handler) {
1620 rc = uc->handler(uc, regs);
1621 WARN(rc & ~UPROBE_HANDLER_MASK,
1622 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1623 }
1624
1625 if (uc->ret_handler)
1626 need_prep = true;
1495 1627
1496 WARN(rc & ~UPROBE_HANDLER_MASK,
1497 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1498 remove &= rc; 1628 remove &= rc;
1499 } 1629 }
1500 1630
1631 if (need_prep && !remove)
1632 prepare_uretprobe(uprobe, regs); /* put bp at return */
1633
1501 if (remove && uprobe->consumers) { 1634 if (remove && uprobe->consumers) {
1502 WARN_ON(!uprobe_is_active(uprobe)); 1635 WARN_ON(!uprobe_is_active(uprobe));
1503 unapply_uprobe(uprobe, current->mm); 1636 unapply_uprobe(uprobe, current->mm);
@@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1505 up_read(&uprobe->register_rwsem); 1638 up_read(&uprobe->register_rwsem);
1506} 1639}
1507 1640
1641static void
1642handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
1643{
1644 struct uprobe *uprobe = ri->uprobe;
1645 struct uprobe_consumer *uc;
1646
1647 down_read(&uprobe->register_rwsem);
1648 for (uc = uprobe->consumers; uc; uc = uc->next) {
1649 if (uc->ret_handler)
1650 uc->ret_handler(uc, ri->func, regs);
1651 }
1652 up_read(&uprobe->register_rwsem);
1653}
1654
1655static bool handle_trampoline(struct pt_regs *regs)
1656{
1657 struct uprobe_task *utask;
1658 struct return_instance *ri, *tmp;
1659 bool chained;
1660
1661 utask = current->utask;
1662 if (!utask)
1663 return false;
1664
1665 ri = utask->return_instances;
1666 if (!ri)
1667 return false;
1668
1669 /*
1670 * TODO: we should throw out return_instances invalidated by
1671 * longjmp(); currently we assume that the probed function always
1672 * returns.
1673 */
1674 instruction_pointer_set(regs, ri->orig_ret_vaddr);
1675
1676 for (;;) {
1677 handle_uretprobe_chain(ri, regs);
1678
1679 chained = ri->chained;
1680 put_uprobe(ri->uprobe);
1681
1682 tmp = ri;
1683 ri = ri->next;
1684 kfree(tmp);
1685
1686 if (!chained)
1687 break;
1688
1689 utask->depth--;
1690
1691 BUG_ON(!ri);
1692 }
1693
1694 utask->return_instances = ri;
1695
1696 return true;
1697}
1698
1508/* 1699/*
1509 * Run handler and ask thread to singlestep. 1700 * Run handler and ask thread to singlestep.
1510 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1701 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
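
The loop in handle_trampoline() pops chained entries until it reaches the instance whose saved return address is a real caller. That collapsing behaviour, detached from uprobes entirely, is easier to see on a plain singly linked stack; the sketch below is illustrative only (the kernel additionally drops uprobe references and adjusts utask->depth).

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct ret_instance {
	unsigned long orig_ret;
	bool chained;                   /* true: shares the caller below it */
	struct ret_instance *next;
};

/* Pop entries until (and including) the first non-chained one; that entry
 * holds the genuine caller address the task should resume at. */
static struct ret_instance *unwind(struct ret_instance *top,
				   unsigned long *resume_at)
{
	*resume_at = top->orig_ret;

	for (;;) {
		struct ret_instance *done = top;
		bool chained = top->chained;

		top = top->next;
		free(done);
		if (!chained)
			return top;
	}
}

int main(void)
{
	/* innermost probe first; the two chained entries reuse caller 0x1000 */
	struct ret_instance *a = malloc(sizeof(*a));
	struct ret_instance *b = malloc(sizeof(*b));
	struct ret_instance *c = malloc(sizeof(*c));
	unsigned long ip;

	*c = (struct ret_instance){ 0x1000, false, NULL };
	*b = (struct ret_instance){ 0x1000, true,  c    };
	*a = (struct ret_instance){ 0x1000, true,  b    };

	a = unwind(a, &ip);
	printf("resume at %#lx, remaining stack %p\n", ip, (void *)a);
	return 0;
}
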
@@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs)
1516 int uninitialized_var(is_swbp); 1707 int uninitialized_var(is_swbp);
1517 1708
1518 bp_vaddr = uprobe_get_swbp_addr(regs); 1709 bp_vaddr = uprobe_get_swbp_addr(regs);
1519 uprobe = find_active_uprobe(bp_vaddr, &is_swbp); 1710 if (bp_vaddr == get_trampoline_vaddr()) {
1711 if (handle_trampoline(regs))
1712 return;
1713
1714 pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
1715 current->pid, current->tgid);
1716 }
1520 1717
1718 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1521 if (!uprobe) { 1719 if (!uprobe) {
1522 if (is_swbp > 0) { 1720 if (is_swbp > 0) {
1523 /* No matching uprobe; signal SIGTRAP. */ 1721 /* No matching uprobe; signal SIGTRAP. */
@@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs)
1616 */ 1814 */
1617int uprobe_pre_sstep_notifier(struct pt_regs *regs) 1815int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1618{ 1816{
1619 if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags)) 1817 if (!current->mm)
1818 return 0;
1819
1820 if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
1821 (!current->utask || !current->utask->return_instances))
1620 return 0; 1822 return 0;
1621 1823
1622 set_thread_flag(TIF_UPROBE); 1824 set_thread_flag(TIF_UPROBE);
diff --git a/kernel/exit.c b/kernel/exit.c
index 51e485ca9935..af2eb3cbd499 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -835,7 +835,7 @@ void do_exit(long code)
835 /* 835 /*
836 * Make sure we are holding no locks: 836 * Make sure we are holding no locks:
837 */ 837 */
838 debug_check_no_locks_held(); 838 debug_check_no_locks_held(tsk);
839 /* 839 /*
840 * We can do this unlocked here. The futex code uses this flag 840 * We can do this unlocked here. The futex code uses this flag
841 * just to verify whether the pi state cleanup has been done 841 * just to verify whether the pi state cleanup has been done
@@ -847,7 +847,7 @@ void do_exit(long code)
847 exit_io_context(tsk); 847 exit_io_context(tsk);
848 848
849 if (tsk->splice_pipe) 849 if (tsk->splice_pipe)
850 __free_pipe_info(tsk->splice_pipe); 850 free_pipe_info(tsk->splice_pipe);
851 851
852 if (tsk->task_frag.page) 852 if (tsk->task_frag.page)
853 put_page(tsk->task_frag.page); 853 put_page(tsk->task_frag.page);
@@ -1629,9 +1629,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1629 } 1629 }
1630 1630
1631 put_pid(pid); 1631 put_pid(pid);
1632
1633 /* avoid REGPARM breakage on x86: */
1634 asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1635 return ret; 1632 return ret;
1636} 1633}
1637 1634
@@ -1669,8 +1666,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1669 ret = do_wait(&wo); 1666 ret = do_wait(&wo);
1670 put_pid(pid); 1667 put_pid(pid);
1671 1668
1672 /* avoid REGPARM breakage on x86: */
1673 asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1674 return ret; 1669 return ret;
1675} 1670}
1676 1671
diff --git a/kernel/extable.c b/kernel/extable.c
index fe35a634bf76..67460b93b1a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,10 +41,10 @@ u32 __initdata main_extable_sort_needed = 1;
41/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 42void __init sort_main_extable(void)
43{ 43{
44 if (main_extable_sort_needed) 44 if (main_extable_sort_needed) {
45 pr_notice("Sorting __ex_table...\n");
45 sort_extable(__start___ex_table, __stop___ex_table); 46 sort_extable(__start___ex_table, __stop___ex_table);
46 else 47 }
47 pr_notice("__ex_table already sorted, skipping sort\n");
48} 48}
49 49
50/* Given an address, look for it in the exception tables. */ 50/* Given an address, look for it in the exception tables. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 1766d324d5e3..987b28a1f01b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -70,6 +70,7 @@
70#include <linux/khugepaged.h> 70#include <linux/khugepaged.h>
71#include <linux/signalfd.h> 71#include <linux/signalfd.h>
72#include <linux/uprobes.h> 72#include <linux/uprobes.h>
73#include <linux/aio.h>
73 74
74#include <asm/pgtable.h> 75#include <asm/pgtable.h>
75#include <asm/pgalloc.h> 76#include <asm/pgalloc.h>
@@ -1233,7 +1234,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1233 1234
1234 p->utime = p->stime = p->gtime = 0; 1235 p->utime = p->stime = p->gtime = 0;
1235 p->utimescaled = p->stimescaled = 0; 1236 p->utimescaled = p->stimescaled = 0;
1236#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1237#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1237 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1238 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1238#endif 1239#endif
1239#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1240#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -1303,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1303 p->memcg_batch.do_batch = 0; 1304 p->memcg_batch.do_batch = 0;
1304 p->memcg_batch.memcg = NULL; 1305 p->memcg_batch.memcg = NULL;
1305#endif 1306#endif
1307#ifdef CONFIG_BCACHE
1308 p->sequential_io = 0;
1309 p->sequential_io_avg = 0;
1310#endif
1306 1311
1307 /* Perform scheduler related setup. Assign this task to a CPU. */ 1312 /* Perform scheduler related setup. Assign this task to a CPU. */
1308 sched_fork(p); 1313 sched_fork(p);
@@ -1677,10 +1682,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1677 int, tls_val) 1682 int, tls_val)
1678#endif 1683#endif
1679{ 1684{
1680 long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); 1685 return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
1681 asmlinkage_protect(5, ret, clone_flags, newsp,
1682 parent_tidptr, child_tidptr, tls_val);
1683 return ret;
1684} 1686}
1685#endif 1687#endif
1686 1688
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cc47812d3feb..fd4b13b131f8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -63,6 +63,7 @@
63DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 63DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
64{ 64{
65 65
66 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
66 .clock_base = 67 .clock_base =
67 { 68 {
68 { 69 {
@@ -83,6 +84,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
83 .get_time = &ktime_get_boottime, 84 .get_time = &ktime_get_boottime,
84 .resolution = KTIME_LOW_RES, 85 .resolution = KTIME_LOW_RES,
85 }, 86 },
87 {
88 .index = HRTIMER_BASE_TAI,
89 .clockid = CLOCK_TAI,
90 .get_time = &ktime_get_clocktai,
91 .resolution = KTIME_LOW_RES,
92 },
86 } 93 }
87}; 94};
88 95
@@ -90,6 +97,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
90 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, 97 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
91 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, 98 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
92 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, 99 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
100 [CLOCK_TAI] = HRTIMER_BASE_TAI,
93}; 101};
94 102
95static inline int hrtimer_clockid_to_base(clockid_t clock_id) 103static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@ -106,8 +114,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
106{ 114{
107 ktime_t xtim, mono, boot; 115 ktime_t xtim, mono, boot;
108 struct timespec xts, tom, slp; 116 struct timespec xts, tom, slp;
117 s32 tai_offset;
109 118
110 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); 119 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
120 tai_offset = timekeeping_get_tai_offset();
111 121
112 xtim = timespec_to_ktime(xts); 122 xtim = timespec_to_ktime(xts);
113 mono = ktime_add(xtim, timespec_to_ktime(tom)); 123 mono = ktime_add(xtim, timespec_to_ktime(tom));
@@ -115,6 +125,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
115 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; 125 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
116 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 126 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
117 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; 127 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
128 base->clock_base[HRTIMER_BASE_TAI].softirq_time =
129 ktime_add(xtim, ktime_set(tai_offset, 0));
118} 130}
119 131
120/* 132/*
@@ -160,7 +172,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
160 */ 172 */
161static int hrtimer_get_target(int this_cpu, int pinned) 173static int hrtimer_get_target(int this_cpu, int pinned)
162{ 174{
163#ifdef CONFIG_NO_HZ 175#ifdef CONFIG_NO_HZ_COMMON
164 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) 176 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
165 return get_nohz_timer_target(); 177 return get_nohz_timer_target();
166#endif 178#endif
@@ -275,6 +287,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
275 } else { 287 } else {
276 unsigned long rem = do_div(nsec, NSEC_PER_SEC); 288 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
277 289
290 /* Make sure nsec fits into long */
291 if (unlikely(nsec > KTIME_SEC_MAX))
292 return (ktime_t){ .tv64 = KTIME_MAX };
293
278 tmp = ktime_set((long)nsec, rem); 294 tmp = ktime_set((long)nsec, rem);
279 } 295 }
280 296
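
The new guard keeps ktime_add_ns() from overflowing when an absurdly large nanosecond delta is passed in. In spirit it is the usual clamp-at-maximum add; the simplified sketch below ignores the ktime_t representation details and assumes a non-negative starting timestamp.

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define KTIME_MAX   ((int64_t)~((uint64_t)1 << 63))   /* largest s64 */

/* Add nsec to a signed 64-bit nanosecond timestamp, clamping at KTIME_MAX
 * instead of wrapping to a negative value. Assumes kt >= 0, which holds
 * for the absolute clock bases. */
static int64_t ktime_add_ns_sat(int64_t kt, uint64_t nsec)
{
	if (nsec > (uint64_t)(KTIME_MAX - kt))
		return KTIME_MAX;
	return kt + (int64_t)nsec;
}

int main(void)
{
	printf("%" PRId64 "\n", ktime_add_ns_sat(1000, 500));           /* 1500    */
	printf("%" PRId64 "\n", ktime_add_ns_sat(KTIME_MAX - 10, 500)); /* clamped */
	return 0;
}
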
@@ -651,8 +667,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
651{ 667{
652 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; 668 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
653 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; 669 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
670 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
654 671
655 return ktime_get_update_offsets(offs_real, offs_boot); 672 return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
656} 673}
657 674
658/* 675/*
@@ -1010,7 +1027,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1010 * @timer: the timer to be added 1027 * @timer: the timer to be added
1011 * @tim: expiry time 1028 * @tim: expiry time
1012 * @delta_ns: "slack" range for the timer 1029 * @delta_ns: "slack" range for the timer
1013 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 1030 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1031 * relative (HRTIMER_MODE_REL)
1014 * 1032 *
1015 * Returns: 1033 * Returns:
1016 * 0 on success 1034 * 0 on success
@@ -1027,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1027 * hrtimer_start - (re)start an hrtimer on the current CPU 1045 * hrtimer_start - (re)start an hrtimer on the current CPU
1028 * @timer: the timer to be added 1046 * @timer: the timer to be added
1029 * @tim: expiry time 1047 * @tim: expiry time
1030 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 1048 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1049 * relative (HRTIMER_MODE_REL)
1031 * 1050 *
1032 * Returns: 1051 * Returns:
1033 * 0 on success 1052 * 0 on success
@@ -1106,7 +1125,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1106} 1125}
1107EXPORT_SYMBOL_GPL(hrtimer_get_remaining); 1126EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1108 1127
1109#ifdef CONFIG_NO_HZ 1128#ifdef CONFIG_NO_HZ_COMMON
1110/** 1129/**
1111 * hrtimer_get_next_event - get the time until next expiry event 1130 * hrtimer_get_next_event - get the time until next expiry event
1112 * 1131 *
@@ -1309,6 +1328,8 @@ retry:
1309 1328
1310 expires = ktime_sub(hrtimer_get_expires(timer), 1329 expires = ktime_sub(hrtimer_get_expires(timer),
1311 base->offset); 1330 base->offset);
1331 if (expires.tv64 < 0)
1332 expires.tv64 = KTIME_MAX;
1312 if (expires.tv64 < expires_next.tv64) 1333 if (expires.tv64 < expires_next.tv64)
1313 expires_next = expires; 1334 expires_next = expires;
1314 break; 1335 break;
@@ -1642,8 +1663,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1642 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1663 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1643 int i; 1664 int i;
1644 1665
1645 raw_spin_lock_init(&cpu_base->lock);
1646
1647 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1666 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1648 cpu_base->clock_base[i].cpu_base = cpu_base; 1667 cpu_base->clock_base[i].cpu_base = cpu_base;
1649 timerqueue_init_head(&cpu_base->clock_base[i].active); 1668 timerqueue_init_head(&cpu_base->clock_base[i].active);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 96f3a1d9c379..54a4d5223238 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -143,7 +143,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain. 143 * irq_domain_add_simple() - Allocate and register a simple irq_domain.
144 * @of_node: pointer to interrupt controller's device tree node. 144 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping 145 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain 146 * @first_irq: first number of irq block assigned to the domain,
147 * pass zero to assign irqs on-the-fly. This will result in a
148 * linear IRQ domain so it is important to use irq_create_mapping()
149 * for each used IRQ, especially when SPARSE_IRQ is enabled.
147 * @ops: map/unmap domain callbacks 150 * @ops: map/unmap domain callbacks
148 * @host_data: Controller private data pointer 151 * @host_data: Controller private data pointer
149 * 152 *
@@ -191,6 +194,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
191 /* A linear domain is the default */ 194 /* A linear domain is the default */
192 return irq_domain_add_linear(of_node, size, ops, host_data); 195 return irq_domain_add_linear(of_node, size, ops, host_data);
193} 196}
197EXPORT_SYMBOL_GPL(irq_domain_add_simple);
194 198
195/** 199/**
196 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. 200 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
@@ -397,11 +401,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain,
397 while (count--) { 401 while (count--) {
398 int irq = irq_base + count; 402 int irq = irq_base + count;
399 struct irq_data *irq_data = irq_get_irq_data(irq); 403 struct irq_data *irq_data = irq_get_irq_data(irq);
400 irq_hw_number_t hwirq = irq_data->hwirq; 404 irq_hw_number_t hwirq;
401 405
402 if (WARN_ON(!irq_data || irq_data->domain != domain)) 406 if (WARN_ON(!irq_data || irq_data->domain != domain))
403 continue; 407 continue;
404 408
409 hwirq = irq_data->hwirq;
405 irq_set_status_flags(irq, IRQ_NOREQUEST); 410 irq_set_status_flags(irq, IRQ_NOREQUEST);
406 411
407 /* remove chip and handler */ 412 /* remove chip and handler */
@@ -462,9 +467,23 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
462 if (domain->ops->map) { 467 if (domain->ops->map) {
463 ret = domain->ops->map(domain, virq, hwirq); 468 ret = domain->ops->map(domain, virq, hwirq);
464 if (ret != 0) { 469 if (ret != 0) {
465 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", 470 /*
466 virq, hwirq, ret); 471 * If map() returns -EPERM, this interrupt is protected
467 WARN_ON(1); 472 * by the firmware or some other service and shall not
473 * be mapped.
474 *
475 * Since on some platforms we blindly try to map everything
476 * we end up with a log full of backtraces.
477 *
478 * So instead, we silently fail on -EPERM, it is the
479 * responsibility of the PIC driver to display a relevant
480 * message if needed.
481 */
482 if (ret != -EPERM) {
483 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
484 virq, hwirq, ret);
485 WARN_ON(1);
486 }
468 irq_data->domain = NULL; 487 irq_data->domain = NULL;
469 irq_data->hwirq = 0; 488 irq_data->hwirq = 0;
470 goto err_unmap; 489 goto err_unmap;
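
A driver taking advantage of the quieter failure path simply returns -EPERM from its .map() callback for interrupts it knows are reserved. The fragment below is a rough sketch of a hypothetical driver; hwirq_is_firmware_owned() is an invented placeholder for whatever policy the hardware or firmware dictates.

#include <linux/errno.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>

static bool hwirq_is_firmware_owned(irq_hw_number_t hw)
{
	return hw < 8;                          /* placeholder policy */
}

static int my_irq_map(struct irq_domain *d, unsigned int virq,
		      irq_hw_number_t hw)
{
	if (hwirq_is_firmware_owned(hw))
		return -EPERM;                  /* silently skipped by the core */

	irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
	return 0;
}

static const struct irq_domain_ops my_irq_ops = {
	.map   = my_irq_map,
	.xlate = irq_domain_xlate_onecell,
};
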
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 397db02209ed..19ed5c425c3b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
76static ssize_t write_irq_affinity(int type, struct file *file, 76static ssize_t write_irq_affinity(int type, struct file *file,
77 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
78{ 78{
79 unsigned int irq = (int)(long)PDE(file_inode(file))->data; 79 unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
80 cpumask_var_t new_value; 80 cpumask_var_t new_value;
81 int err; 81 int err;
82 82
@@ -131,17 +131,17 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
131 131
132static int irq_affinity_proc_open(struct inode *inode, struct file *file) 132static int irq_affinity_proc_open(struct inode *inode, struct file *file)
133{ 133{
134 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 134 return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
135} 135}
136 136
137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) 137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
138{ 138{
139 return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); 139 return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
140} 140}
141 141
142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) 142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
143{ 143{
144 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); 144 return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
145} 145}
146 146
147static const struct file_operations irq_affinity_proc_fops = { 147static const struct file_operations irq_affinity_proc_fops = {
@@ -212,7 +212,7 @@ out:
212 212
213static int default_affinity_open(struct inode *inode, struct file *file) 213static int default_affinity_open(struct inode *inode, struct file *file)
214{ 214{
215 return single_open(file, default_affinity_show, PDE(inode)->data); 215 return single_open(file, default_affinity_show, PDE_DATA(inode));
216} 216}
217 217
218static const struct file_operations default_affinity_proc_fops = { 218static const struct file_operations default_affinity_proc_fops = {
@@ -233,7 +233,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
233 233
234static int irq_node_proc_open(struct inode *inode, struct file *file) 234static int irq_node_proc_open(struct inode *inode, struct file *file)
235{ 235{
236 return single_open(file, irq_node_proc_show, PDE(inode)->data); 236 return single_open(file, irq_node_proc_show, PDE_DATA(inode));
237} 237}
238 238
239static const struct file_operations irq_node_proc_fops = { 239static const struct file_operations irq_node_proc_fops = {
@@ -256,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
256 256
257static int irq_spurious_proc_open(struct inode *inode, struct file *file) 257static int irq_spurious_proc_open(struct inode *inode, struct file *file)
258{ 258{
259 return single_open(file, irq_spurious_proc_show, PDE(inode)->data); 259 return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
260} 260}
261 261
262static const struct file_operations irq_spurious_proc_fops = { 262static const struct file_operations irq_spurious_proc_fops = {
@@ -366,11 +366,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
366 366
367void unregister_handler_proc(unsigned int irq, struct irqaction *action) 367void unregister_handler_proc(unsigned int irq, struct irqaction *action)
368{ 368{
369 if (action->dir) { 369 proc_remove(action->dir);
370 struct irq_desc *desc = irq_to_desc(irq);
371
372 remove_proc_entry(action->dir->name, desc->dir);
373 }
374} 370}
375 371
376static void register_default_affinity_proc(void) 372static void register_default_affinity_proc(void)
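The kernel/irq/proc.c hunks above replace direct PDE(inode)->data dereferences with the PDE_DATA() accessor. A minimal sketch of the same open/show pattern follows; the entry name, show body and fops are hypothetical and only illustrate how PDE_DATA() feeds single_open().

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/* Hypothetical /proc entry: the open handler recovers the entry's private
 * data with PDE_DATA() instead of reaching into the proc_dir_entry. */
static int example_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "data=%p\n", m->private);	/* whatever single_open() got */
	return 0;
}

static int example_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_proc_show, PDE_DATA(inode));
}

static const struct file_operations example_proc_fops = {
	.open		= example_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};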
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2169feeba529..3127ad52cdb2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -84,9 +84,11 @@ static int is_ksym_addr(unsigned long addr)
84 84
85/* 85/*
86 * Expand a compressed symbol data into the resulting uncompressed string, 86 * Expand a compressed symbol data into the resulting uncompressed string,
87 * if uncompressed string is too long (>= maxlen), it will be truncated,
87 * given the offset to where the symbol is in the compressed stream. 88 * given the offset to where the symbol is in the compressed stream.
88 */ 89 */
89static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 90static unsigned int kallsyms_expand_symbol(unsigned int off,
91 char *result, size_t maxlen)
90{ 92{
91 int len, skipped_first = 0; 93 int len, skipped_first = 0;
92 const u8 *tptr, *data; 94 const u8 *tptr, *data;
@@ -113,15 +115,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
113 115
114 while (*tptr) { 116 while (*tptr) {
115 if (skipped_first) { 117 if (skipped_first) {
118 if (maxlen <= 1)
119 goto tail;
116 *result = *tptr; 120 *result = *tptr;
117 result++; 121 result++;
122 maxlen--;
118 } else 123 } else
119 skipped_first = 1; 124 skipped_first = 1;
120 tptr++; 125 tptr++;
121 } 126 }
122 } 127 }
123 128
124 *result = '\0'; 129tail:
130 if (maxlen)
131 *result = '\0';
125 132
126 /* Return to offset to the next symbol. */ 133 /* Return to offset to the next symbol. */
127 return off; 134 return off;
@@ -176,7 +183,7 @@ unsigned long kallsyms_lookup_name(const char *name)
176 unsigned int off; 183 unsigned int off;
177 184
178 for (i = 0, off = 0; i < kallsyms_num_syms; i++) { 185 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
179 off = kallsyms_expand_symbol(off, namebuf); 186 off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
180 187
181 if (strcmp(namebuf, name) == 0) 188 if (strcmp(namebuf, name) == 0)
182 return kallsyms_addresses[i]; 189 return kallsyms_addresses[i];
@@ -195,7 +202,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
195 int ret; 202 int ret;
196 203
197 for (i = 0, off = 0; i < kallsyms_num_syms; i++) { 204 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
198 off = kallsyms_expand_symbol(off, namebuf); 205 off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
199 ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); 206 ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
200 if (ret != 0) 207 if (ret != 0)
201 return ret; 208 return ret;
@@ -294,7 +301,8 @@ const char *kallsyms_lookup(unsigned long addr,
294 301
295 pos = get_symbol_pos(addr, symbolsize, offset); 302 pos = get_symbol_pos(addr, symbolsize, offset);
296 /* Grab name */ 303 /* Grab name */
297 kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); 304 kallsyms_expand_symbol(get_symbol_offset(pos),
305 namebuf, KSYM_NAME_LEN);
298 if (modname) 306 if (modname)
299 *modname = NULL; 307 *modname = NULL;
300 return namebuf; 308 return namebuf;
@@ -315,7 +323,8 @@ int lookup_symbol_name(unsigned long addr, char *symname)
315 323
316 pos = get_symbol_pos(addr, NULL, NULL); 324 pos = get_symbol_pos(addr, NULL, NULL);
317 /* Grab name */ 325 /* Grab name */
318 kallsyms_expand_symbol(get_symbol_offset(pos), symname); 326 kallsyms_expand_symbol(get_symbol_offset(pos),
327 symname, KSYM_NAME_LEN);
319 return 0; 328 return 0;
320 } 329 }
321 /* See if it's in a module. */ 330 /* See if it's in a module. */
@@ -333,7 +342,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
333 342
334 pos = get_symbol_pos(addr, size, offset); 343 pos = get_symbol_pos(addr, size, offset);
335 /* Grab name */ 344 /* Grab name */
336 kallsyms_expand_symbol(get_symbol_offset(pos), name); 345 kallsyms_expand_symbol(get_symbol_offset(pos),
346 name, KSYM_NAME_LEN);
337 modname[0] = '\0'; 347 modname[0] = '\0';
338 return 0; 348 return 0;
339 } 349 }
@@ -463,7 +473,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
463 473
464 iter->type = kallsyms_get_symbol_type(off); 474 iter->type = kallsyms_get_symbol_type(off);
465 475
466 off = kallsyms_expand_symbol(off, iter->name); 476 off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name));
467 477
468 return off - iter->nameoff; 478 return off - iter->nameoff;
469} 479}
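The new maxlen parameter makes kallsyms_expand_symbol() stop copying once only the terminator byte would fit, and it writes the NUL only when there is room for it. A standalone userspace sketch of that bounded-copy shape, with a plain string standing in for the compressed token stream:

#include <stddef.h>

/* Copy src into result but never write more than maxlen bytes, NUL included.
 * Mirrors the "goto tail" / "if (maxlen)" structure of the patched function. */
static size_t copy_bounded(const char *src, char *result, size_t maxlen)
{
	size_t copied = 0;

	while (*src) {
		if (maxlen <= 1)	/* keep the last byte for the terminator */
			goto tail;
		*result++ = *src++;
		maxlen--;
		copied++;
	}
tail:
	if (maxlen)			/* maxlen == 0 means no room even for NUL */
		*result = '\0';
	return copied;
}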
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bddd3d7a74b6..59f7b55ba745 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -55,7 +55,7 @@ struct resource crashk_res = {
55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
56}; 56};
57struct resource crashk_low_res = { 57struct resource crashk_low_res = {
58 .name = "Crash kernel low", 58 .name = "Crash kernel",
59 .start = 0, 59 .start = 0,
60 .end = 0, 60 .end = 0,
61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
@@ -786,7 +786,7 @@ static int kimage_load_normal_segment(struct kimage *image,
786 struct kexec_segment *segment) 786 struct kexec_segment *segment)
787{ 787{
788 unsigned long maddr; 788 unsigned long maddr;
789 unsigned long ubytes, mbytes; 789 size_t ubytes, mbytes;
790 int result; 790 int result;
791 unsigned char __user *buf; 791 unsigned char __user *buf;
792 792
@@ -819,13 +819,9 @@ static int kimage_load_normal_segment(struct kimage *image,
819 /* Start with a clear page */ 819 /* Start with a clear page */
820 clear_page(ptr); 820 clear_page(ptr);
821 ptr += maddr & ~PAGE_MASK; 821 ptr += maddr & ~PAGE_MASK;
822 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 822 mchunk = min_t(size_t, mbytes,
823 if (mchunk > mbytes) 823 PAGE_SIZE - (maddr & ~PAGE_MASK));
824 mchunk = mbytes; 824 uchunk = min(ubytes, mchunk);
825
826 uchunk = mchunk;
827 if (uchunk > ubytes)
828 uchunk = ubytes;
829 825
830 result = copy_from_user(ptr, buf, uchunk); 826 result = copy_from_user(ptr, buf, uchunk);
831 kunmap(page); 827 kunmap(page);
@@ -850,7 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image,
850 * We do things a page at a time for the sake of kmap. 846 * We do things a page at a time for the sake of kmap.
851 */ 847 */
852 unsigned long maddr; 848 unsigned long maddr;
853 unsigned long ubytes, mbytes; 849 size_t ubytes, mbytes;
854 int result; 850 int result;
855 unsigned char __user *buf; 851 unsigned char __user *buf;
856 852
@@ -871,13 +867,10 @@ static int kimage_load_crash_segment(struct kimage *image,
871 } 867 }
872 ptr = kmap(page); 868 ptr = kmap(page);
873 ptr += maddr & ~PAGE_MASK; 869 ptr += maddr & ~PAGE_MASK;
874 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 870 mchunk = min_t(size_t, mbytes,
875 if (mchunk > mbytes) 871 PAGE_SIZE - (maddr & ~PAGE_MASK));
876 mchunk = mbytes; 872 uchunk = min(ubytes, mchunk);
877 873 if (mchunk > uchunk) {
878 uchunk = mchunk;
879 if (uchunk > ubytes) {
880 uchunk = ubytes;
881 /* Zero the trailing part of the page */ 874 /* Zero the trailing part of the page */
882 memset(ptr + uchunk, 0, mchunk - uchunk); 875 memset(ptr + uchunk, 0, mchunk - uchunk);
883 } 876 }
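Both load loops now compute the per-iteration chunk with min_t()/min(): each copy is clamped to what is left in the current destination page and to what remains in the source buffer. A userspace restatement of that arithmetic, with illustrative names:

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

/* mchunk: bytes we may touch in the page that maddr points into.
 * uchunk: bytes we may actually copy from the source buffer. */
static size_t next_chunk(uintptr_t maddr, size_t mbytes, size_t ubytes,
			 size_t *uchunk)
{
	size_t mchunk = MIN(mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK));

	*uchunk = MIN(ubytes, mchunk);
	return mchunk;
}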
@@ -1118,12 +1111,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
1118{ 1111{
1119 unsigned long addr; 1112 unsigned long addr;
1120 1113
1121 for (addr = begin; addr < end; addr += PAGE_SIZE) { 1114 for (addr = begin; addr < end; addr += PAGE_SIZE)
1122 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); 1115 free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
1123 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1124 free_page((unsigned long)__va(addr));
1125 totalram_pages++;
1126 }
1127} 1116}
1128 1117
1129int crash_shrink_memory(unsigned long new_size) 1118int crash_shrink_memory(unsigned long new_size)
@@ -1368,35 +1357,114 @@ static int __init parse_crashkernel_simple(char *cmdline,
1368 return 0; 1357 return 0;
1369} 1358}
1370 1359
1360#define SUFFIX_HIGH 0
1361#define SUFFIX_LOW 1
1362#define SUFFIX_NULL 2
1363static __initdata char *suffix_tbl[] = {
1364 [SUFFIX_HIGH] = ",high",
1365 [SUFFIX_LOW] = ",low",
1366 [SUFFIX_NULL] = NULL,
1367};
1368
1371/* 1369/*
1372 * That function is the entry point for command line parsing and should be 1370 * That function parses "suffix" crashkernel command lines like
1373 * called from the arch-specific code. 1371 *
1372 * crashkernel=size,[high|low]
1373 *
1374 * It returns 0 on success and -EINVAL on failure.
1374 */ 1375 */
1376static int __init parse_crashkernel_suffix(char *cmdline,
1377 unsigned long long *crash_size,
1378 unsigned long long *crash_base,
1379 const char *suffix)
1380{
1381 char *cur = cmdline;
1382
1383 *crash_size = memparse(cmdline, &cur);
1384 if (cmdline == cur) {
1385 pr_warn("crashkernel: memory value expected\n");
1386 return -EINVAL;
1387 }
1388
1389 /* check with suffix */
1390 if (strncmp(cur, suffix, strlen(suffix))) {
1391 pr_warn("crashkernel: unrecognized char\n");
1392 return -EINVAL;
1393 }
1394 cur += strlen(suffix);
1395 if (*cur != ' ' && *cur != '\0') {
1396 pr_warn("crashkernel: unrecognized char\n");
1397 return -EINVAL;
1398 }
1399
1400 return 0;
1401}
1402
1403static __init char *get_last_crashkernel(char *cmdline,
1404 const char *name,
1405 const char *suffix)
1406{
1407 char *p = cmdline, *ck_cmdline = NULL;
1408
1409 /* find crashkernel and use the last one if there are more */
1410 p = strstr(p, name);
1411 while (p) {
1412 char *end_p = strchr(p, ' ');
1413 char *q;
1414
1415 if (!end_p)
1416 end_p = p + strlen(p);
1417
1418 if (!suffix) {
1419 int i;
1420
1421 /* skip the one with any known suffix */
1422 for (i = 0; suffix_tbl[i]; i++) {
1423 q = end_p - strlen(suffix_tbl[i]);
1424 if (!strncmp(q, suffix_tbl[i],
1425 strlen(suffix_tbl[i])))
1426 goto next;
1427 }
1428 ck_cmdline = p;
1429 } else {
1430 q = end_p - strlen(suffix);
1431 if (!strncmp(q, suffix, strlen(suffix)))
1432 ck_cmdline = p;
1433 }
1434next:
1435 p = strstr(p+1, name);
1436 }
1437
1438 if (!ck_cmdline)
1439 return NULL;
1440
1441 return ck_cmdline;
1442}
1443
1375static int __init __parse_crashkernel(char *cmdline, 1444static int __init __parse_crashkernel(char *cmdline,
1376 unsigned long long system_ram, 1445 unsigned long long system_ram,
1377 unsigned long long *crash_size, 1446 unsigned long long *crash_size,
1378 unsigned long long *crash_base, 1447 unsigned long long *crash_base,
1379 const char *name) 1448 const char *name,
1449 const char *suffix)
1380{ 1450{
1381 char *p = cmdline, *ck_cmdline = NULL;
1382 char *first_colon, *first_space; 1451 char *first_colon, *first_space;
1452 char *ck_cmdline;
1383 1453
1384 BUG_ON(!crash_size || !crash_base); 1454 BUG_ON(!crash_size || !crash_base);
1385 *crash_size = 0; 1455 *crash_size = 0;
1386 *crash_base = 0; 1456 *crash_base = 0;
1387 1457
1388 /* find crashkernel and use the last one if there are more */ 1458 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1389 p = strstr(p, name);
1390 while (p) {
1391 ck_cmdline = p;
1392 p = strstr(p+1, name);
1393 }
1394 1459
1395 if (!ck_cmdline) 1460 if (!ck_cmdline)
1396 return -EINVAL; 1461 return -EINVAL;
1397 1462
1398 ck_cmdline += strlen(name); 1463 ck_cmdline += strlen(name);
1399 1464
1465 if (suffix)
1466 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1467 crash_base, suffix);
1400 /* 1468 /*
1401 * if the commandline contains a ':', then that's the extended 1469 * if the commandline contains a ':', then that's the extended
1402 * syntax -- if not, it must be the classic syntax 1470 * syntax -- if not, it must be the classic syntax
@@ -1413,13 +1481,26 @@ static int __init __parse_crashkernel(char *cmdline,
1413 return 0; 1481 return 0;
1414} 1482}
1415 1483
1484/*
1485 * That function is the entry point for command line parsing and should be
1486 * called from the arch-specific code.
1487 */
1416int __init parse_crashkernel(char *cmdline, 1488int __init parse_crashkernel(char *cmdline,
1417 unsigned long long system_ram, 1489 unsigned long long system_ram,
1418 unsigned long long *crash_size, 1490 unsigned long long *crash_size,
1419 unsigned long long *crash_base) 1491 unsigned long long *crash_base)
1420{ 1492{
1421 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, 1493 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1422 "crashkernel="); 1494 "crashkernel=", NULL);
1495}
1496
1497int __init parse_crashkernel_high(char *cmdline,
1498 unsigned long long system_ram,
1499 unsigned long long *crash_size,
1500 unsigned long long *crash_base)
1501{
1502 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1503 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1423} 1504}
1424 1505
1425int __init parse_crashkernel_low(char *cmdline, 1506int __init parse_crashkernel_low(char *cmdline,
@@ -1428,7 +1509,7 @@ int __init parse_crashkernel_low(char *cmdline,
1428 unsigned long long *crash_base) 1509 unsigned long long *crash_base)
1429{ 1510{
1430 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, 1511 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1431 "crashkernel_low="); 1512 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1432} 1513}
1433 1514
1434static void update_vmcoreinfo_note(void) 1515static void update_vmcoreinfo_note(void)
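parse_crashkernel_suffix() accepts a size immediately followed by the expected suffix and then a space or end of string, and get_last_crashkernel() picks the last matching crashkernel= option, skipping entries that carry a different known suffix. The userspace sketch below reproduces only the suffix check; strtoull() stands in for memparse(), so the K/M/G multipliers are not handled and the names are illustrative.

#include <stdlib.h>
#include <string.h>

/* Accepts e.g. "268435456,high" for suffix ",high"; rejects a missing size,
 * a wrong suffix, or trailing characters other than a space. */
static int parse_size_suffix(const char *arg, const char *suffix,
			     unsigned long long *size)
{
	char *cur;

	*size = strtoull(arg, &cur, 0);
	if (cur == arg)
		return -1;			/* no number at all */
	if (strncmp(cur, suffix, strlen(suffix)))
		return -1;			/* suffix does not match */
	cur += strlen(suffix);
	if (*cur != ' ' && *cur != '\0')
		return -1;			/* unrecognized trailing char */
	return 0;
}

On a real command line this corresponds to options such as crashkernel=512M,high crashkernel=72M,low, where the high and low reservations are parsed independently through parse_crashkernel_high() and parse_crashkernel_low().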
@@ -1452,14 +1533,13 @@ void vmcoreinfo_append_str(const char *fmt, ...)
1452{ 1533{
1453 va_list args; 1534 va_list args;
1454 char buf[0x50]; 1535 char buf[0x50];
1455 int r; 1536 size_t r;
1456 1537
1457 va_start(args, fmt); 1538 va_start(args, fmt);
1458 r = vsnprintf(buf, sizeof(buf), fmt, args); 1539 r = vsnprintf(buf, sizeof(buf), fmt, args);
1459 va_end(args); 1540 va_end(args);
1460 1541
1461 if (r + vmcoreinfo_size > vmcoreinfo_max_size) 1542 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1462 r = vmcoreinfo_max_size - vmcoreinfo_size;
1463 1543
1464 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); 1544 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1465 1545
@@ -1489,7 +1569,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1489 VMCOREINFO_SYMBOL(swapper_pg_dir); 1569 VMCOREINFO_SYMBOL(swapper_pg_dir);
1490#endif 1570#endif
1491 VMCOREINFO_SYMBOL(_stext); 1571 VMCOREINFO_SYMBOL(_stext);
1492 VMCOREINFO_SYMBOL(vmlist); 1572 VMCOREINFO_SYMBOL(vmap_area_list);
1493 1573
1494#ifndef CONFIG_NEED_MULTIPLE_NODES 1574#ifndef CONFIG_NEED_MULTIPLE_NODES
1495 VMCOREINFO_SYMBOL(mem_map); 1575 VMCOREINFO_SYMBOL(mem_map);
@@ -1527,7 +1607,8 @@ static int __init crash_save_vmcoreinfo_init(void)
1527 VMCOREINFO_OFFSET(free_area, free_list); 1607 VMCOREINFO_OFFSET(free_area, free_list);
1528 VMCOREINFO_OFFSET(list_head, next); 1608 VMCOREINFO_OFFSET(list_head, next);
1529 VMCOREINFO_OFFSET(list_head, prev); 1609 VMCOREINFO_OFFSET(list_head, prev);
1530 VMCOREINFO_OFFSET(vm_struct, addr); 1610 VMCOREINFO_OFFSET(vmap_area, va_start);
1611 VMCOREINFO_OFFSET(vmap_area, list);
1531 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1612 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1532 log_buf_kexec_setup(); 1613 log_buf_kexec_setup();
1533 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1614 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 56dd34976d7b..8241906c4b61 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -77,6 +77,7 @@ static void free_modprobe_argv(struct subprocess_info *info)
77 77
78static int call_modprobe(char *module_name, int wait) 78static int call_modprobe(char *module_name, int wait)
79{ 79{
80 struct subprocess_info *info;
80 static char *envp[] = { 81 static char *envp[] = {
81 "HOME=/", 82 "HOME=/",
82 "TERM=linux", 83 "TERM=linux",
@@ -98,8 +99,15 @@ static int call_modprobe(char *module_name, int wait)
98 argv[3] = module_name; /* check free_modprobe_argv() */ 99 argv[3] = module_name; /* check free_modprobe_argv() */
99 argv[4] = NULL; 100 argv[4] = NULL;
100 101
101 return call_usermodehelper_fns(modprobe_path, argv, envp, 102 info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
102 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); 103 NULL, free_modprobe_argv, NULL);
104 if (!info)
105 goto free_module_name;
106
107 return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
108
109free_module_name:
110 kfree(module_name);
103free_argv: 111free_argv:
104 kfree(argv); 112 kfree(argv);
105out: 113out:
@@ -502,14 +510,28 @@ static void helper_unlock(void)
502 * @argv: arg vector for process 510 * @argv: arg vector for process
503 * @envp: environment for process 511 * @envp: environment for process
504 * @gfp_mask: gfp mask for memory allocation 512 * @gfp_mask: gfp mask for memory allocation
513 * @cleanup: a cleanup function
514 * @init: an init function
515 * @data: arbitrary context sensitive data
505 * 516 *
506 * Returns either %NULL on allocation failure, or a subprocess_info 517 * Returns either %NULL on allocation failure, or a subprocess_info
507 * structure. This should be passed to call_usermodehelper_exec to 518 * structure. This should be passed to call_usermodehelper_exec to
508 * exec the process and free the structure. 519 * exec the process and free the structure.
520 *
521 * The init function is used to customize the helper process prior to
522 * exec. A non-zero return code causes the process to error out, exit,
523 * and return the failure to the calling process
524 *
 525 * The cleanup function is called just before the subprocess_info is
 526 * freed. This can be used for freeing the argv and envp. The
 527 * function must be runnable in either a process context or the
528 * context in which call_usermodehelper_exec is called.
509 */ 529 */
510static
511struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, 530struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
512 char **envp, gfp_t gfp_mask) 531 char **envp, gfp_t gfp_mask,
532 int (*init)(struct subprocess_info *info, struct cred *new),
533 void (*cleanup)(struct subprocess_info *info),
534 void *data)
513{ 535{
514 struct subprocess_info *sub_info; 536 struct subprocess_info *sub_info;
515 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); 537 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
@@ -520,56 +542,38 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
520 sub_info->path = path; 542 sub_info->path = path;
521 sub_info->argv = argv; 543 sub_info->argv = argv;
522 sub_info->envp = envp; 544 sub_info->envp = envp;
545
546 sub_info->cleanup = cleanup;
547 sub_info->init = init;
548 sub_info->data = data;
523 out: 549 out:
524 return sub_info; 550 return sub_info;
525} 551}
526 552EXPORT_SYMBOL(call_usermodehelper_setup);
527/**
528 * call_usermodehelper_setfns - set a cleanup/init function
529 * @info: a subprocess_info returned by call_usermodehelper_setup
530 * @cleanup: a cleanup function
531 * @init: an init function
532 * @data: arbitrary context sensitive data
533 *
534 * The init function is used to customize the helper process prior to
535 * exec. A non-zero return code causes the process to error out, exit,
536 * and return the failure to the calling process
537 *
538 * The cleanup function is just before ethe subprocess_info is about to
539 * be freed. This can be used for freeing the argv and envp. The
540 * Function must be runnable in either a process context or the
541 * context in which call_usermodehelper_exec is called.
542 */
543static
544void call_usermodehelper_setfns(struct subprocess_info *info,
545 int (*init)(struct subprocess_info *info, struct cred *new),
546 void (*cleanup)(struct subprocess_info *info),
547 void *data)
548{
549 info->cleanup = cleanup;
550 info->init = init;
551 info->data = data;
552}
553 553
554/** 554/**
555 * call_usermodehelper_exec - start a usermode application 555 * call_usermodehelper_exec - start a usermode application
 556 * @sub_info: information about the subprocess 556 * @sub_info: information about the subprocess
557 * @wait: wait for the application to finish and return status. 557 * @wait: wait for the application to finish and return status.
558 * when -1 don't wait at all, but you get no useful error back when 558 * when UMH_NO_WAIT don't wait at all, but you get no useful error back
559 * the program couldn't be exec'ed. This makes it safe to call 559 * when the program couldn't be exec'ed. This makes it safe to call
560 * from interrupt context. 560 * from interrupt context.
561 * 561 *
562 * Runs a user-space application. The application is started 562 * Runs a user-space application. The application is started
563 * asynchronously if wait is not set, and runs as a child of keventd. 563 * asynchronously if wait is not set, and runs as a child of keventd.
564 * (ie. it runs with full root capabilities). 564 * (ie. it runs with full root capabilities).
565 */ 565 */
566static
567int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 566int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
568{ 567{
569 DECLARE_COMPLETION_ONSTACK(done); 568 DECLARE_COMPLETION_ONSTACK(done);
570 int retval = 0; 569 int retval = 0;
571 570
572 helper_lock(); 571 helper_lock();
572 if (!sub_info->path) {
573 retval = -EINVAL;
574 goto out;
575 }
576
573 if (sub_info->path[0] == '\0') 577 if (sub_info->path[0] == '\0')
574 goto out; 578 goto out;
575 579
@@ -615,31 +619,34 @@ unlock:
615 helper_unlock(); 619 helper_unlock();
616 return retval; 620 return retval;
617} 621}
622EXPORT_SYMBOL(call_usermodehelper_exec);
618 623
619/* 624/**
620 * call_usermodehelper_fns() will not run the caller-provided cleanup function 625 * call_usermodehelper() - prepare and start a usermode application
621 * if a memory allocation failure is experienced. So the caller might need to 626 * @path: path to usermode executable
622 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform 627 * @argv: arg vector for process
623 * the necessaary cleanup within the caller. 628 * @envp: environment for process
629 * @wait: wait for the application to finish and return status.
630 * when UMH_NO_WAIT don't wait at all, but you get no useful error back
631 * when the program couldn't be exec'ed. This makes it safe to call
632 * from interrupt context.
633 *
 634 * This function is equivalent to using call_usermodehelper_setup() and
635 * call_usermodehelper_exec().
624 */ 636 */
625int call_usermodehelper_fns( 637int call_usermodehelper(char *path, char **argv, char **envp, int wait)
626 char *path, char **argv, char **envp, int wait,
627 int (*init)(struct subprocess_info *info, struct cred *new),
628 void (*cleanup)(struct subprocess_info *), void *data)
629{ 638{
630 struct subprocess_info *info; 639 struct subprocess_info *info;
631 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; 640 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
632 641
633 info = call_usermodehelper_setup(path, argv, envp, gfp_mask); 642 info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
634 643 NULL, NULL, NULL);
635 if (info == NULL) 644 if (info == NULL)
636 return -ENOMEM; 645 return -ENOMEM;
637 646
638 call_usermodehelper_setfns(info, init, cleanup, data);
639
640 return call_usermodehelper_exec(info, wait); 647 return call_usermodehelper_exec(info, wait);
641} 648}
642EXPORT_SYMBOL(call_usermodehelper_fns); 649EXPORT_SYMBOL(call_usermodehelper);
643 650
644static int proc_cap_handler(struct ctl_table *table, int write, 651static int proc_cap_handler(struct ctl_table *table, int write,
645 void __user *buffer, size_t *lenp, loff_t *ppos) 652 void __user *buffer, size_t *lenp, loff_t *ppos)
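With call_usermodehelper_setfns() and call_usermodehelper_fns() gone, callers either use the call_usermodehelper() convenience wrapper or the now-exported two-step pair. A hypothetical caller of the two-step form, using only the signatures introduced by this patch (the helper path is invented):

static int run_example_helper(void)
{
	static char *envp[] = {
		"HOME=/",
		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
		NULL
	};
	char *argv[] = { "/sbin/example-helper", NULL };	/* hypothetical path */
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL,
					 NULL, NULL, NULL);	/* no init/cleanup/data */
	if (!info)
		return -ENOMEM;

	/* UMH_WAIT_PROC: wait for the helper and return its exit status */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}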
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e35be53f6613..3fed7f0cbcdf 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -794,16 +794,16 @@ out:
794} 794}
795 795
796#ifdef CONFIG_SYSCTL 796#ifdef CONFIG_SYSCTL
797/* This should be called with kprobe_mutex locked */
798static void __kprobes optimize_all_kprobes(void) 797static void __kprobes optimize_all_kprobes(void)
799{ 798{
800 struct hlist_head *head; 799 struct hlist_head *head;
801 struct kprobe *p; 800 struct kprobe *p;
802 unsigned int i; 801 unsigned int i;
803 802
803 mutex_lock(&kprobe_mutex);
804 /* If optimization is already allowed, just return */ 804 /* If optimization is already allowed, just return */
805 if (kprobes_allow_optimization) 805 if (kprobes_allow_optimization)
806 return; 806 goto out;
807 807
808 kprobes_allow_optimization = true; 808 kprobes_allow_optimization = true;
809 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 809 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void)
813 optimize_kprobe(p); 813 optimize_kprobe(p);
814 } 814 }
815 printk(KERN_INFO "Kprobes globally optimized\n"); 815 printk(KERN_INFO "Kprobes globally optimized\n");
816out:
817 mutex_unlock(&kprobe_mutex);
816} 818}
817 819
818/* This should be called with kprobe_mutex locked */
819static void __kprobes unoptimize_all_kprobes(void) 820static void __kprobes unoptimize_all_kprobes(void)
820{ 821{
821 struct hlist_head *head; 822 struct hlist_head *head;
822 struct kprobe *p; 823 struct kprobe *p;
823 unsigned int i; 824 unsigned int i;
824 825
826 mutex_lock(&kprobe_mutex);
825 /* If optimization is already prohibited, just return */ 827 /* If optimization is already prohibited, just return */
826 if (!kprobes_allow_optimization) 828 if (!kprobes_allow_optimization) {
829 mutex_unlock(&kprobe_mutex);
827 return; 830 return;
831 }
828 832
829 kprobes_allow_optimization = false; 833 kprobes_allow_optimization = false;
830 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 834 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void)
834 unoptimize_kprobe(p, false); 838 unoptimize_kprobe(p, false);
835 } 839 }
836 } 840 }
841 mutex_unlock(&kprobe_mutex);
842
837 /* Wait for unoptimizing completion */ 843 /* Wait for unoptimizing completion */
838 wait_for_kprobe_optimizer(); 844 wait_for_kprobe_optimizer();
839 printk(KERN_INFO "Kprobes globally unoptimized\n"); 845 printk(KERN_INFO "Kprobes globally unoptimized\n");
840} 846}
841 847
848static DEFINE_MUTEX(kprobe_sysctl_mutex);
842int sysctl_kprobes_optimization; 849int sysctl_kprobes_optimization;
843int proc_kprobes_optimization_handler(struct ctl_table *table, int write, 850int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
844 void __user *buffer, size_t *length, 851 void __user *buffer, size_t *length,
@@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
846{ 853{
847 int ret; 854 int ret;
848 855
849 mutex_lock(&kprobe_mutex); 856 mutex_lock(&kprobe_sysctl_mutex);
850 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; 857 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
851 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 858 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
852 859
@@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
854 optimize_all_kprobes(); 861 optimize_all_kprobes();
855 else 862 else
856 unoptimize_all_kprobes(); 863 unoptimize_all_kprobes();
857 mutex_unlock(&kprobe_mutex); 864 mutex_unlock(&kprobe_sysctl_mutex);
858 865
859 return ret; 866 return ret;
860} 867}
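The toggling functions now take kprobe_mutex themselves, which lets unoptimize_all_kprobes() drop it before waiting for the optimizer, while the new kprobe_sysctl_mutex keeps concurrent sysctl writers from interleaving their enable/disable decisions. A stripped-down sketch of that two-mutex shape, with invented names:

static DEFINE_MUTEX(subsys_mutex);		/* guards the real state */
static DEFINE_MUTEX(subsys_sysctl_mutex);	/* serializes sysctl writers */
static bool feature_enabled;

static void feature_enable(void)
{
	mutex_lock(&subsys_mutex);
	if (!feature_enabled)
		feature_enabled = true;
	mutex_unlock(&subsys_mutex);
}

static void feature_disable(void)
{
	mutex_lock(&subsys_mutex);
	feature_enabled = false;
	mutex_unlock(&subsys_mutex);
	/* heavier teardown (cf. wait_for_kprobe_optimizer()) runs here,
	 * outside subsys_mutex */
}

static int feature_sysctl_write(int val)
{
	mutex_lock(&subsys_sysctl_mutex);
	if (val)
		feature_enable();
	else
		feature_disable();
	mutex_unlock(&subsys_sysctl_mutex);
	return 0;
}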
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 691dc2ef9baf..760e86df8c20 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -17,6 +17,7 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
19#include <linux/ptrace.h> 19#include <linux/ptrace.h>
20#include <linux/uaccess.h>
20#include <trace/events/sched.h> 21#include <trace/events/sched.h>
21 22
22static DEFINE_SPINLOCK(kthread_create_lock); 23static DEFINE_SPINLOCK(kthread_create_lock);
@@ -52,8 +53,21 @@ enum KTHREAD_BITS {
52 KTHREAD_IS_PARKED, 53 KTHREAD_IS_PARKED,
53}; 54};
54 55
55#define to_kthread(tsk) \ 56#define __to_kthread(vfork) \
56 container_of((tsk)->vfork_done, struct kthread, exited) 57 container_of(vfork, struct kthread, exited)
58
59static inline struct kthread *to_kthread(struct task_struct *k)
60{
61 return __to_kthread(k->vfork_done);
62}
63
64static struct kthread *to_live_kthread(struct task_struct *k)
65{
66 struct completion *vfork = ACCESS_ONCE(k->vfork_done);
67 if (likely(vfork))
68 return __to_kthread(vfork);
69 return NULL;
70}
57 71
58/** 72/**
59 * kthread_should_stop - should this kthread return now? 73 * kthread_should_stop - should this kthread return now?
@@ -122,14 +136,32 @@ void *kthread_data(struct task_struct *task)
122 return to_kthread(task)->data; 136 return to_kthread(task)->data;
123} 137}
124 138
139/**
140 * probe_kthread_data - speculative version of kthread_data()
141 * @task: possible kthread task in question
142 *
143 * @task could be a kthread task. Return the data value specified when it
144 * was created if accessible. If @task isn't a kthread task or its data is
145 * inaccessible for any reason, %NULL is returned. This function requires
146 * that @task itself is safe to dereference.
147 */
148void *probe_kthread_data(struct task_struct *task)
149{
150 struct kthread *kthread = to_kthread(task);
151 void *data = NULL;
152
153 probe_kernel_read(&data, &kthread->data, sizeof(data));
154 return data;
155}
156
125static void __kthread_parkme(struct kthread *self) 157static void __kthread_parkme(struct kthread *self)
126{ 158{
127 __set_current_state(TASK_INTERRUPTIBLE); 159 __set_current_state(TASK_PARKED);
128 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { 160 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
129 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) 161 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
130 complete(&self->parked); 162 complete(&self->parked);
131 schedule(); 163 schedule();
132 __set_current_state(TASK_INTERRUPTIBLE); 164 __set_current_state(TASK_PARKED);
133 } 165 }
134 clear_bit(KTHREAD_IS_PARKED, &self->flags); 166 clear_bit(KTHREAD_IS_PARKED, &self->flags);
135 __set_current_state(TASK_RUNNING); 167 __set_current_state(TASK_RUNNING);
@@ -256,11 +288,16 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
256} 288}
257EXPORT_SYMBOL(kthread_create_on_node); 289EXPORT_SYMBOL(kthread_create_on_node);
258 290
259static void __kthread_bind(struct task_struct *p, unsigned int cpu) 291static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
260{ 292{
293 /* Must have done schedule() in kthread() before we set_task_cpu */
294 if (!wait_task_inactive(p, state)) {
295 WARN_ON(1);
296 return;
297 }
261 /* It's safe because the task is inactive. */ 298 /* It's safe because the task is inactive. */
262 do_set_cpus_allowed(p, cpumask_of(cpu)); 299 do_set_cpus_allowed(p, cpumask_of(cpu));
263 p->flags |= PF_THREAD_BOUND; 300 p->flags |= PF_NO_SETAFFINITY;
264} 301}
265 302
266/** 303/**
@@ -274,12 +311,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
274 */ 311 */
275void kthread_bind(struct task_struct *p, unsigned int cpu) 312void kthread_bind(struct task_struct *p, unsigned int cpu)
276{ 313{
277 /* Must have done schedule() in kthread() before we set_task_cpu */ 314 __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
278 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
279 WARN_ON(1);
280 return;
281 }
282 __kthread_bind(p, cpu);
283} 315}
284EXPORT_SYMBOL(kthread_bind); 316EXPORT_SYMBOL(kthread_bind);
285 317
@@ -311,17 +343,20 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
311 return p; 343 return p;
312} 344}
313 345
314static struct kthread *task_get_live_kthread(struct task_struct *k) 346static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
315{ 347{
316 struct kthread *kthread; 348 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
317 349 /*
318 get_task_struct(k); 350 * We clear the IS_PARKED bit here as we don't wait
319 kthread = to_kthread(k); 351 * until the task has left the park code. So if we'd
320 /* It might have exited */ 352 * park before that happens we'd see the IS_PARKED bit
321 barrier(); 353 * which might be about to be cleared.
322 if (k->vfork_done != NULL) 354 */
323 return kthread; 355 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
324 return NULL; 356 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
357 __kthread_bind(k, kthread->cpu, TASK_PARKED);
358 wake_up_state(k, TASK_PARKED);
359 }
325} 360}
326 361
327/** 362/**
@@ -334,23 +369,10 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
334 */ 369 */
335void kthread_unpark(struct task_struct *k) 370void kthread_unpark(struct task_struct *k)
336{ 371{
337 struct kthread *kthread = task_get_live_kthread(k); 372 struct kthread *kthread = to_live_kthread(k);
338 373
339 if (kthread) { 374 if (kthread)
340 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 375 __kthread_unpark(k, kthread);
341 /*
342 * We clear the IS_PARKED bit here as we don't wait
343 * until the task has left the park code. So if we'd
344 * park before that happens we'd see the IS_PARKED bit
345 * which might be about to be cleared.
346 */
347 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
348 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
349 __kthread_bind(k, kthread->cpu);
350 wake_up_process(k);
351 }
352 }
353 put_task_struct(k);
354} 376}
355 377
356/** 378/**
@@ -367,7 +389,7 @@ void kthread_unpark(struct task_struct *k)
367 */ 389 */
368int kthread_park(struct task_struct *k) 390int kthread_park(struct task_struct *k)
369{ 391{
370 struct kthread *kthread = task_get_live_kthread(k); 392 struct kthread *kthread = to_live_kthread(k);
371 int ret = -ENOSYS; 393 int ret = -ENOSYS;
372 394
373 if (kthread) { 395 if (kthread) {
@@ -380,7 +402,6 @@ int kthread_park(struct task_struct *k)
380 } 402 }
381 ret = 0; 403 ret = 0;
382 } 404 }
383 put_task_struct(k);
384 return ret; 405 return ret;
385} 406}
386 407
@@ -401,21 +422,23 @@ int kthread_park(struct task_struct *k)
401 */ 422 */
402int kthread_stop(struct task_struct *k) 423int kthread_stop(struct task_struct *k)
403{ 424{
404 struct kthread *kthread = task_get_live_kthread(k); 425 struct kthread *kthread;
405 int ret; 426 int ret;
406 427
407 trace_sched_kthread_stop(k); 428 trace_sched_kthread_stop(k);
429
430 get_task_struct(k);
431 kthread = to_live_kthread(k);
408 if (kthread) { 432 if (kthread) {
409 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); 433 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
410 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 434 __kthread_unpark(k, kthread);
411 wake_up_process(k); 435 wake_up_process(k);
412 wait_for_completion(&kthread->exited); 436 wait_for_completion(&kthread->exited);
413 } 437 }
414 ret = k->exit_code; 438 ret = k->exit_code;
415
416 put_task_struct(k); 439 put_task_struct(k);
417 trace_sched_kthread_stop_ret(ret);
418 440
441 trace_sched_kthread_stop_ret(ret);
419 return ret; 442 return ret;
420} 443}
421EXPORT_SYMBOL(kthread_stop); 444EXPORT_SYMBOL(kthread_stop);
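probe_kthread_data() deliberately goes through probe_kernel_read() so that a task that is not really a kthread, or whose struct kthread is going away, yields NULL instead of an oops. The same speculative-read idea in isolation (helper name is illustrative):

/* Copy a pointer-sized field without trusting the address: on an unmapped
 * or bogus pointer, probe_kernel_read() fails and val stays NULL. */
static void *peek_pointer(void **where)
{
	void *val = NULL;

	probe_kernel_read(&val, where, sizeof(val));
	return val;
}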
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 259db207b5d9..1f3186b37fd5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)
380unsigned long nr_stack_trace_entries; 380unsigned long nr_stack_trace_entries;
381static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; 381static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
382 382
383static void print_lockdep_off(const char *bug_msg)
384{
385 printk(KERN_DEBUG "%s\n", bug_msg);
386 printk(KERN_DEBUG "turning off the locking correctness validator.\n");
387 printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
388}
389
383static int save_trace(struct stack_trace *trace) 390static int save_trace(struct stack_trace *trace)
384{ 391{
385 trace->nr_entries = 0; 392 trace->nr_entries = 0;
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)
409 if (!debug_locks_off_graph_unlock()) 416 if (!debug_locks_off_graph_unlock())
410 return 0; 417 return 0;
411 418
412 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); 419 print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
413 printk("turning off the locking correctness validator.\n");
414 dump_stack(); 420 dump_stack();
415 421
416 return 0; 422 return 0;
@@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
763 } 769 }
764 raw_local_irq_restore(flags); 770 raw_local_irq_restore(flags);
765 771
766 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 772 print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
767 printk("turning off the locking correctness validator.\n");
768 dump_stack(); 773 dump_stack();
769 return NULL; 774 return NULL;
770 } 775 }
@@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)
834 if (!debug_locks_off_graph_unlock()) 839 if (!debug_locks_off_graph_unlock())
835 return NULL; 840 return NULL;
836 841
837 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); 842 print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
838 printk("turning off the locking correctness validator.\n");
839 dump_stack(); 843 dump_stack();
840 return NULL; 844 return NULL;
841 } 845 }
@@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
2000 struct lock_class *class = hlock_class(hlock); 2004 struct lock_class *class = hlock_class(hlock);
2001 struct list_head *hash_head = chainhashentry(chain_key); 2005 struct list_head *hash_head = chainhashentry(chain_key);
2002 struct lock_chain *chain; 2006 struct lock_chain *chain;
2003 struct held_lock *hlock_curr, *hlock_next; 2007 struct held_lock *hlock_curr;
2004 int i, j; 2008 int i, j;
2005 2009
2006 /* 2010 /*
@@ -2048,8 +2052,7 @@ cache_hit:
2048 if (!debug_locks_off_graph_unlock()) 2052 if (!debug_locks_off_graph_unlock())
2049 return 0; 2053 return 0;
2050 2054
2051 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); 2055 print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
2052 printk("turning off the locking correctness validator.\n");
2053 dump_stack(); 2056 dump_stack();
2054 return 0; 2057 return 0;
2055 } 2058 }
@@ -2057,12 +2060,10 @@ cache_hit:
2057 chain->chain_key = chain_key; 2060 chain->chain_key = chain_key;
2058 chain->irq_context = hlock->irq_context; 2061 chain->irq_context = hlock->irq_context;
2059 /* Find the first held_lock of current chain */ 2062 /* Find the first held_lock of current chain */
2060 hlock_next = hlock;
2061 for (i = curr->lockdep_depth - 1; i >= 0; i--) { 2063 for (i = curr->lockdep_depth - 1; i >= 0; i--) {
2062 hlock_curr = curr->held_locks + i; 2064 hlock_curr = curr->held_locks + i;
2063 if (hlock_curr->irq_context != hlock_next->irq_context) 2065 if (hlock_curr->irq_context != hlock->irq_context)
2064 break; 2066 break;
2065 hlock_next = hlock;
2066 } 2067 }
2067 i++; 2068 i++;
2068 chain->depth = curr->lockdep_depth + 1 - i; 2069 chain->depth = curr->lockdep_depth + 1 - i;
@@ -2997,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2997EXPORT_SYMBOL_GPL(lockdep_init_map); 2998EXPORT_SYMBOL_GPL(lockdep_init_map);
2998 2999
2999struct lock_class_key __lockdep_no_validate__; 3000struct lock_class_key __lockdep_no_validate__;
3001EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
3000 3002
3001static int 3003static int
3002print_lock_nested_lock_not_held(struct task_struct *curr, 3004print_lock_nested_lock_not_held(struct task_struct *curr,
@@ -3190,9 +3192,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3190#endif 3192#endif
3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 3193 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
3192 debug_locks_off(); 3194 debug_locks_off();
3193 printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", 3195 print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
3196 printk(KERN_DEBUG "depth: %i max: %lu!\n",
3194 curr->lockdep_depth, MAX_LOCK_DEPTH); 3197 curr->lockdep_depth, MAX_LOCK_DEPTH);
3195 printk("turning off the locking correctness validator.\n");
3196 3198
3197 lockdep_print_held_locks(current); 3199 lockdep_print_held_locks(current);
3198 debug_show_all_locks(); 3200 debug_show_all_locks();
@@ -4088,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
4088} 4090}
4089EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); 4091EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
4090 4092
4091static void print_held_locks_bug(void) 4093static void print_held_locks_bug(struct task_struct *curr)
4092{ 4094{
4093 if (!debug_locks_off()) 4095 if (!debug_locks_off())
4094 return; 4096 return;
@@ -4097,21 +4099,22 @@ static void print_held_locks_bug(void)
4097 4099
4098 printk("\n"); 4100 printk("\n");
4099 printk("=====================================\n"); 4101 printk("=====================================\n");
4100 printk("[ BUG: %s/%d still has locks held! ]\n", 4102 printk("[ BUG: lock held at task exit time! ]\n");
4101 current->comm, task_pid_nr(current));
4102 print_kernel_ident(); 4103 print_kernel_ident();
4103 printk("-------------------------------------\n"); 4104 printk("-------------------------------------\n");
4104 lockdep_print_held_locks(current); 4105 printk("%s/%d is exiting with locks still held!\n",
4106 curr->comm, task_pid_nr(curr));
4107 lockdep_print_held_locks(curr);
4108
4105 printk("\nstack backtrace:\n"); 4109 printk("\nstack backtrace:\n");
4106 dump_stack(); 4110 dump_stack();
4107} 4111}
4108 4112
4109void debug_check_no_locks_held(void) 4113void debug_check_no_locks_held(struct task_struct *task)
4110{ 4114{
4111 if (unlikely(current->lockdep_depth > 0)) 4115 if (unlikely(task->lockdep_depth > 0))
4112 print_held_locks_bug(); 4116 print_held_locks_bug(task);
4113} 4117}
4114EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4115 4118
4116void debug_show_all_locks(void) 4119void debug_show_all_locks(void)
4117{ 4120{
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
index 246b4c6e6135..4a9a86d12c8b 100644
--- a/kernel/modsign_certificate.S
+++ b/kernel/modsign_certificate.S
@@ -1,15 +1,8 @@
1/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ 1#include <linux/export.h>
2#ifndef SYMBOL_PREFIX
3#define ASM_SYMBOL(sym) sym
4#else
5#define PASTE2(x,y) x##y
6#define PASTE(x,y) PASTE2(x,y)
7#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
8#endif
9 2
10#define GLOBAL(name) \ 3#define GLOBAL(name) \
11 .globl ASM_SYMBOL(name); \ 4 .globl VMLINUX_SYMBOL(name); \
12 ASM_SYMBOL(name): 5 VMLINUX_SYMBOL(name):
13 6
14 .section ".init.data","aw" 7 .section ".init.data","aw"
15 8
diff --git a/kernel/module.c b/kernel/module.c
index 0925c9a71975..cab4bce49c23 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1209,10 +1209,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1209 1209
1210 /* Since this should be found in kernel (which can't be removed), 1210 /* Since this should be found in kernel (which can't be removed),
1211 * no locking is necessary. */ 1211 * no locking is necessary. */
1212 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 1212 if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
1213 &crc, true, false)) 1213 &crc, true, false))
1214 BUG(); 1214 BUG();
1215 return check_version(sechdrs, versindex, "module_layout", mod, crc, 1215 return check_version(sechdrs, versindex,
1216 VMLINUX_SYMBOL_STR(module_layout), mod, crc,
1216 NULL); 1217 NULL);
1217} 1218}
1218 1219
@@ -1861,12 +1862,12 @@ static void free_module(struct module *mod)
1861{ 1862{
1862 trace_module_free(mod); 1863 trace_module_free(mod);
1863 1864
1864 /* Delete from various lists */
1865 mutex_lock(&module_mutex);
1866 stop_machine(__unlink_module, mod, NULL);
1867 mutex_unlock(&module_mutex);
1868 mod_sysfs_teardown(mod); 1865 mod_sysfs_teardown(mod);
1869 1866
 1867 /* We leave it in the list to prevent duplicate loads, but make sure
 1868 * that no one uses it while it's being deconstructed. */
1869 mod->state = MODULE_STATE_UNFORMED;
1870
1870 /* Remove dynamic debug info */ 1871 /* Remove dynamic debug info */
1871 ddebug_remove_module(mod->name); 1872 ddebug_remove_module(mod->name);
1872 1873
@@ -1879,6 +1880,11 @@ static void free_module(struct module *mod)
1879 /* Free any allocated parameters. */ 1880 /* Free any allocated parameters. */
1880 destroy_params(mod->kp, mod->num_kp); 1881 destroy_params(mod->kp, mod->num_kp);
1881 1882
1883 /* Now we can delete it from the lists */
1884 mutex_lock(&module_mutex);
1885 stop_machine(__unlink_module, mod, NULL);
1886 mutex_unlock(&module_mutex);
1887
1882 /* This may be NULL, but that's OK */ 1888 /* This may be NULL, but that's OK */
1883 unset_module_init_ro_nx(mod); 1889 unset_module_init_ro_nx(mod);
1884 module_free(mod, mod->module_init); 1890 module_free(mod, mod->module_init);
@@ -2425,10 +2431,10 @@ static void kmemleak_load_module(const struct module *mod,
2425 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); 2431 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2426 2432
2427 for (i = 1; i < info->hdr->e_shnum; i++) { 2433 for (i = 1; i < info->hdr->e_shnum; i++) {
 2428 const char *name = info->secstrings + info->sechdrs[i].sh_name; 2434 /* Scan all writable sections that are not executable */
2429 if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) 2435 if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
2430 continue; 2436 !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
2431 if (!strstarts(name, ".data") && !strstarts(name, ".bss")) 2437 (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
2432 continue; 2438 continue;
2433 2439
2434 kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, 2440 kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
@@ -2763,24 +2769,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2763 mod->trace_events = section_objs(info, "_ftrace_events", 2769 mod->trace_events = section_objs(info, "_ftrace_events",
2764 sizeof(*mod->trace_events), 2770 sizeof(*mod->trace_events),
2765 &mod->num_trace_events); 2771 &mod->num_trace_events);
2766 /*
2767 * This section contains pointers to allocated objects in the trace
2768 * code and not scanning it leads to false positives.
2769 */
2770 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2771 mod->num_trace_events, GFP_KERNEL);
2772#endif 2772#endif
2773#ifdef CONFIG_TRACING 2773#ifdef CONFIG_TRACING
2774 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", 2774 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
2775 sizeof(*mod->trace_bprintk_fmt_start), 2775 sizeof(*mod->trace_bprintk_fmt_start),
2776 &mod->num_trace_bprintk_fmt); 2776 &mod->num_trace_bprintk_fmt);
2777 /*
2778 * This section contains pointers to allocated objects in the trace
2779 * code and not scanning it leads to false positives.
2780 */
2781 kmemleak_scan_area(mod->trace_bprintk_fmt_start,
2782 sizeof(*mod->trace_bprintk_fmt_start) *
2783 mod->num_trace_bprintk_fmt, GFP_KERNEL);
2784#endif 2777#endif
2785#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2778#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2786 /* sechdrs[0].sh_size is always zero */ 2779 /* sechdrs[0].sh_size is always zero */
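kmemleak_load_module() no longer matches sections by the .data/.bss name prefixes; it scans every allocated, writable, non-executable section, and the explicit scan areas for the trace sections are dropped in the same patch. The predicate, pulled out as a hypothetical helper:

#include <linux/types.h>
#include <linux/elf.h>

/* Only sections that are loaded, writable and not executable can hold
 * pointers to heap objects that kmemleak needs to track. */
static bool kmemleak_should_scan(const Elf_Shdr *shdr)
{
	return (shdr->sh_flags & SHF_ALLOC) &&
	       (shdr->sh_flags & SHF_WRITE) &&
	       !(shdr->sh_flags & SHF_EXECINSTR);
}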
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 52f23011b6e0..ad53a664f113 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -37,6 +37,12 @@
37# include <asm/mutex.h> 37# include <asm/mutex.h>
38#endif 38#endif
39 39
40/*
41 * A negative mutex count indicates that waiters are sleeping waiting for the
42 * mutex.
43 */
44#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
45
40void 46void
41__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 47__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
42{ 48{
@@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
44 spin_lock_init(&lock->wait_lock); 50 spin_lock_init(&lock->wait_lock);
45 INIT_LIST_HEAD(&lock->wait_list); 51 INIT_LIST_HEAD(&lock->wait_list);
46 mutex_clear_owner(lock); 52 mutex_clear_owner(lock);
53#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
54 lock->spin_mlock = NULL;
55#endif
47 56
48 debug_mutex_init(lock, name, key); 57 debug_mutex_init(lock, name, key);
49} 58}
@@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)
95EXPORT_SYMBOL(mutex_lock); 104EXPORT_SYMBOL(mutex_lock);
96#endif 105#endif
97 106
107#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
108/*
109 * In order to avoid a stampede of mutex spinners from acquiring the mutex
110 * more or less simultaneously, the spinners need to acquire a MCS lock
111 * first before spinning on the owner field.
112 *
113 * We don't inline mspin_lock() so that perf can correctly account for the
114 * time spent in this lock function.
115 */
116struct mspin_node {
 117	struct mspin_node *next;
118 int locked; /* 1 if lock acquired */
119};
120#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
121
122static noinline
123void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
124{
125 struct mspin_node *prev;
126
127 /* Init node */
128 node->locked = 0;
129 node->next = NULL;
130
131 prev = xchg(lock, node);
132 if (likely(prev == NULL)) {
133 /* Lock acquired */
134 node->locked = 1;
135 return;
136 }
137 ACCESS_ONCE(prev->next) = node;
138 smp_wmb();
139 /* Wait until the lock holder passes the lock down */
140 while (!ACCESS_ONCE(node->locked))
141 arch_mutex_cpu_relax();
142}
143
144static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
145{
146 struct mspin_node *next = ACCESS_ONCE(node->next);
147
148 if (likely(!next)) {
149 /*
150 * Release the lock by setting it to NULL
151 */
152 if (cmpxchg(lock, node, NULL) == node)
153 return;
154 /* Wait until the next pointer is set */
155 while (!(next = ACCESS_ONCE(node->next)))
156 arch_mutex_cpu_relax();
157 }
158 ACCESS_ONCE(next->locked) = 1;
159 smp_wmb();
160}
161
162/*
163 * Mutex spinning code migrated from kernel/sched/core.c
164 */
165
166static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
167{
168 if (lock->owner != owner)
169 return false;
170
171 /*
172 * Ensure we emit the owner->on_cpu, dereference _after_ checking
173 * lock->owner still matches owner, if that fails, owner might
174 * point to free()d memory, if it still matches, the rcu_read_lock()
175 * ensures the memory stays valid.
176 */
177 barrier();
178
179 return owner->on_cpu;
180}
181
182/*
183 * Look out! "owner" is an entirely speculative pointer
184 * access and not reliable.
185 */
186static noinline
187int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
188{
189 rcu_read_lock();
190 while (owner_running(lock, owner)) {
191 if (need_resched())
192 break;
193
194 arch_mutex_cpu_relax();
195 }
196 rcu_read_unlock();
197
198 /*
199 * We break out the loop above on need_resched() and when the
200 * owner changed, which is a sign for heavy contention. Return
201 * success only when lock->owner is NULL.
202 */
203 return lock->owner == NULL;
204}
205
206/*
207 * Initial check for entering the mutex spinning loop
208 */
209static inline int mutex_can_spin_on_owner(struct mutex *lock)
210{
211 int retval = 1;
212
213 rcu_read_lock();
214 if (lock->owner)
215 retval = lock->owner->on_cpu;
216 rcu_read_unlock();
217 /*
218 * if lock->owner is not set, the mutex owner may have just acquired
219 * it and not set the owner yet or the mutex has been released.
220 */
221 return retval;
222}
223#endif
224
98static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 225static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
99 226
100/** 227/**
@@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
158 * 285 *
159 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock 286 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
160 * to serialize everything. 287 * to serialize everything.
288 *
289 * The mutex spinners are queued up using MCS lock so that only one
290 * spinner can compete for the mutex. However, if mutex spinning isn't
291 * going to happen, there is no point in going through the lock/unlock
292 * overhead.
161 */ 293 */
294 if (!mutex_can_spin_on_owner(lock))
295 goto slowpath;
162 296
163 for (;;) { 297 for (;;) {
164 struct task_struct *owner; 298 struct task_struct *owner;
299 struct mspin_node node;
165 300
166 /* 301 /*
167 * If there's an owner, wait for it to either 302 * If there's an owner, wait for it to either
168 * release the lock or go to sleep. 303 * release the lock or go to sleep.
169 */ 304 */
305 mspin_lock(MLOCK(lock), &node);
170 owner = ACCESS_ONCE(lock->owner); 306 owner = ACCESS_ONCE(lock->owner);
171 if (owner && !mutex_spin_on_owner(lock, owner)) 307 if (owner && !mutex_spin_on_owner(lock, owner)) {
308 mspin_unlock(MLOCK(lock), &node);
172 break; 309 break;
310 }
173 311
174 if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { 312 if ((atomic_read(&lock->count) == 1) &&
313 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
175 lock_acquired(&lock->dep_map, ip); 314 lock_acquired(&lock->dep_map, ip);
176 mutex_set_owner(lock); 315 mutex_set_owner(lock);
316 mspin_unlock(MLOCK(lock), &node);
177 preempt_enable(); 317 preempt_enable();
178 return 0; 318 return 0;
179 } 319 }
320 mspin_unlock(MLOCK(lock), &node);
180 321
181 /* 322 /*
182 * When there's no owner, we might have preempted between the 323 * When there's no owner, we might have preempted between the
@@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
195 */ 336 */
196 arch_mutex_cpu_relax(); 337 arch_mutex_cpu_relax();
197 } 338 }
339slowpath:
198#endif 340#endif
199 spin_lock_mutex(&lock->wait_lock, flags); 341 spin_lock_mutex(&lock->wait_lock, flags);
200 342
@@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
205 list_add_tail(&waiter.list, &lock->wait_list); 347 list_add_tail(&waiter.list, &lock->wait_list);
206 waiter.task = task; 348 waiter.task = task;
207 349
208 if (atomic_xchg(&lock->count, -1) == 1) 350 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
209 goto done; 351 goto done;
210 352
211 lock_contended(&lock->dep_map, ip); 353 lock_contended(&lock->dep_map, ip);
@@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
220 * that when we release the lock, we properly wake up the 362 * that when we release the lock, we properly wake up the
221 * other waiters: 363 * other waiters:
222 */ 364 */
223 if (atomic_xchg(&lock->count, -1) == 1) 365 if (MUTEX_SHOW_NO_WAITER(lock) &&
366 (atomic_xchg(&lock->count, -1) == 1))
224 break; 367 break;
225 368
226 /* 369 /*
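The mspin_lock()/mspin_unlock() pair added above is an MCS queue lock: each spinner waits on its own stack-local node, so only the queue head goes on to spin on lock->owner and the "stampede" the comment mentions is avoided. A self-contained userspace rendering of the same queue with C11 atomics (busy-wait loops left bare for brevity; names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct mcs_node {
	_Atomic(struct mcs_node *) next;
	atomic_bool locked;		/* set by the predecessor on hand-over */
};

static void mcs_lock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
{
	struct mcs_node *prev;

	atomic_store(&node->next, NULL);
	atomic_store(&node->locked, false);

	prev = atomic_exchange(lock, node);	/* enqueue ourselves at the tail */
	if (!prev)
		return;				/* queue was empty: we own the lock */

	atomic_store(&prev->next, node);	/* let the predecessor find us */
	while (!atomic_load(&node->locked))
		;				/* spin on our own node only */
}

static void mcs_unlock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
{
	struct mcs_node *next = atomic_load(&node->next);

	if (!next) {
		struct mcs_node *expected = node;

		/* nobody queued behind us: try to reset the tail to empty */
		if (atomic_compare_exchange_strong(lock, &expected, NULL))
			return;
		/* a successor is between its exchange and its store to
		 * prev->next; wait until it has linked itself */
		while (!(next = atomic_load(&node->next)))
			;
	}
	atomic_store(&next->locked, true);	/* pass the lock on */
}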
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index afc0456f227a..364ceab15f0c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,7 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h> 25#include <linux/proc_ns.h>
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/syscalls.h> 27#include <linux/syscalls.h>
28 28
@@ -241,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
241 const struct proc_ns_operations *ops; 241 const struct proc_ns_operations *ops;
242 struct task_struct *tsk = current; 242 struct task_struct *tsk = current;
243 struct nsproxy *new_nsproxy; 243 struct nsproxy *new_nsproxy;
244 struct proc_inode *ei; 244 struct proc_ns *ei;
245 struct file *file; 245 struct file *file;
246 int err; 246 int err;
247 247
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
250 return PTR_ERR(file); 250 return PTR_ERR(file);
251 251
252 err = -EINVAL; 252 err = -EINVAL;
253 ei = PROC_I(file_inode(file)); 253 ei = get_proc_ns(file_inode(file));
254 ops = ei->ns_ops; 254 ops = ei->ns_ops;
255 if (nstype && (ops->type != nstype)) 255 if (nstype && (ops->type != nstype))
256 goto out; 256 goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index 7c57cc9eee2c..167ec097ce8b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -22,7 +22,6 @@
22#include <linux/sysrq.h> 22#include <linux/sysrq.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/nmi.h> 24#include <linux/nmi.h>
25#include <linux/dmi.h>
26 25
27#define PANIC_TIMER_STEP 100 26#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 27#define PANIC_BLINK_SPD 18
@@ -400,13 +399,8 @@ struct slowpath_args {
400static void warn_slowpath_common(const char *file, int line, void *caller, 399static void warn_slowpath_common(const char *file, int line, void *caller,
401 unsigned taint, struct slowpath_args *args) 400 unsigned taint, struct slowpath_args *args)
402{ 401{
403 const char *board;
404
405 printk(KERN_WARNING "------------[ cut here ]------------\n"); 402 printk(KERN_WARNING "------------[ cut here ]------------\n");
406 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); 403 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
407 board = dmi_get_system_info(DMI_PRODUCT_NAME);
408 if (board)
409 printk(KERN_WARNING "Hardware name: %s\n", board);
410 404
411 if (args) 405 if (args)
412 vprintk(args->fmt, args->args); 406 vprintk(args->fmt, args->args);
diff --git a/kernel/params.c b/kernel/params.c
index ed35345be536..53b958fcd639 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -613,10 +613,13 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
613 sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), 613 sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
614 GFP_KERNEL); 614 GFP_KERNEL);
615 if (!new) { 615 if (!new) {
616 kfree(mk->mp); 616 kfree(attrs);
617 err = -ENOMEM; 617 err = -ENOMEM;
618 goto fail; 618 goto fail;
619 } 619 }
620 /* Despite looking like the typical realloc() bug, this is safe.
621 * We *want* the old 'attrs' to be freed either way, and we'll store
622 * the new one in the success case. */
620 attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); 623 attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
621 if (!attrs) { 624 if (!attrs) {
622 err = -ENOMEM; 625 err = -ENOMEM;
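
The params.c hunk frees 'attrs' rather than mk->mp on the first error path and adds a comment defending the later "attrs = krealloc(attrs, ...)" against being mistaken for the classic realloc leak. The userspace sketch below illustrates the general pattern the comment refers to: overwriting the only copy of a pointer with realloc()'s return value loses the old block on failure unless something on the failure path still frees it. struct attr and grow_attrs() are made-up names; this is not the kernel code.

#include <stdlib.h>

struct attr { int value; };

static int grow_attrs(struct attr **attrs_p, size_t new_count)
{
        /*
         * "p = realloc(p, n)" normally leaks the old block when realloc()
         * fails; keeping a temporary and freeing explicitly on the error
         * path is the usual userspace-safe shape of the idiom.
         */
        struct attr *tmp = realloc(*attrs_p, new_count * sizeof(**attrs_p));

        if (!tmp) {
                free(*attrs_p);         /* old block is released either way */
                *attrs_p = NULL;
                return -1;
        }
        *attrs_p = tmp;
        return 0;
}
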
diff --git a/kernel/pid.c b/kernel/pid.c
index 047dc6264638..0db3e791a06d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_ns.h>
39#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
40 41
41#define pid_hashfn(nr, ns) \ 42#define pid_hashfn(nr, ns) \
@@ -51,9 +52,6 @@ int pid_max = PID_MAX_DEFAULT;
51int pid_max_min = RESERVED_PIDS + 1; 52int pid_max_min = RESERVED_PIDS + 1;
52int pid_max_max = PID_MAX_LIMIT; 53int pid_max_max = PID_MAX_LIMIT;
53 54
54#define BITS_PER_PAGE (PAGE_SIZE*8)
55#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
56
57static inline int mk_pid(struct pid_namespace *pid_ns, 55static inline int mk_pid(struct pid_namespace *pid_ns,
58 struct pidmap *map, int off) 56 struct pidmap *map, int off)
59{ 57{
@@ -183,15 +181,19 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
183 break; 181 break;
184 } 182 }
185 if (likely(atomic_read(&map->nr_free))) { 183 if (likely(atomic_read(&map->nr_free))) {
186 do { 184 for ( ; ; ) {
187 if (!test_and_set_bit(offset, map->page)) { 185 if (!test_and_set_bit(offset, map->page)) {
188 atomic_dec(&map->nr_free); 186 atomic_dec(&map->nr_free);
189 set_last_pid(pid_ns, last, pid); 187 set_last_pid(pid_ns, last, pid);
190 return pid; 188 return pid;
191 } 189 }
192 offset = find_next_offset(map, offset); 190 offset = find_next_offset(map, offset);
191 if (offset >= BITS_PER_PAGE)
192 break;
193 pid = mk_pid(pid_ns, map, offset); 193 pid = mk_pid(pid_ns, map, offset);
194 } while (offset < BITS_PER_PAGE && pid < pid_max); 194 if (pid >= pid_max)
195 break;
196 }
195 } 197 }
196 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 198 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
197 ++map; 199 ++map;
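
The alloc_pidmap() hunk replaces the do/while with a for(;;) so the offset and pid bounds are checked as soon as each new value is produced, instead of one iteration late. The userspace sketch below mirrors that control flow over a plain bitmap; test_and_set_bit(), find_next_offset() and scan_page() here are simplified stand-ins, not the kernel helpers.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_PAGE   4096
#define BITS_PER_WORD   (sizeof(unsigned long) * CHAR_BIT)

static bool test_and_set_bit(unsigned long *map, int bit)
{
        unsigned long mask = 1UL << (bit % BITS_PER_WORD);
        unsigned long *word = &map[bit / BITS_PER_WORD];
        bool old = *word & mask;

        *word |= mask;
        return old;
}

static int find_next_offset(const unsigned long *map, int off)
{
        /* next clear bit at or after off (stand-in for find_next_zero_bit) */
        for (; off < BITS_PER_PAGE; off++)
                if (!(map[off / BITS_PER_WORD] & (1UL << (off % BITS_PER_WORD))))
                        return off;
        return BITS_PER_PAGE;
}

static int scan_page(unsigned long *map, int offset, int base, int pid_max)
{
        int pid = base + offset;

        for ( ; ; ) {
                if (!test_and_set_bit(map, offset))
                        return pid;             /* claimed a free id */
                offset = find_next_offset(map, offset);
                if (offset >= BITS_PER_PAGE)
                        break;                  /* page exhausted */
                pid = base + offset;
                if (pid >= pid_max)
                        break;                  /* global limit reached */
        }
        return -1;
}

int main(void)
{
        unsigned long page[BITS_PER_PAGE / BITS_PER_WORD] = { 0 };

        page[0] = 0x7;                          /* pretend ids 0-2 are taken */
        printf("allocated id %d\n", scan_page(page, 0, 0, 300));
        return 0;
}
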
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index c1c3dc1c6023..6917e8edb48e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -15,12 +15,10 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/acct.h> 16#include <linux/acct.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/proc_fs.h> 18#include <linux/proc_ns.h>
19#include <linux/reboot.h> 19#include <linux/reboot.h>
20#include <linux/export.h> 20#include <linux/export.h>
21 21
22#define BITS_PER_PAGE (PAGE_SIZE*8)
23
24struct pid_cache { 22struct pid_cache {
25 int nr_ids; 23 int nr_ids;
26 char name[16]; 24 char name[16];
@@ -181,6 +179,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
181 int nr; 179 int nr;
182 int rc; 180 int rc;
183 struct task_struct *task, *me = current; 181 struct task_struct *task, *me = current;
182 int init_pids = thread_group_leader(me) ? 1 : 2;
184 183
185 /* Don't allow any more processes into the pid namespace */ 184 /* Don't allow any more processes into the pid namespace */
186 disable_pid_allocation(pid_ns); 185 disable_pid_allocation(pid_ns);
@@ -230,7 +229,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
230 */ 229 */
231 for (;;) { 230 for (;;) {
232 set_current_state(TASK_UNINTERRUPTIBLE); 231 set_current_state(TASK_UNINTERRUPTIBLE);
233 if (pid_ns->nr_hashed == 1) 232 if (pid_ns->nr_hashed == init_pids)
234 break; 233 break;
235 schedule(); 234 schedule();
236 } 235 }
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8fd709c9bb58..42670e9b44e0 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,6 +10,8 @@
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12#include <linux/random.h> 12#include <linux/random.h>
13#include <linux/tick.h>
14#include <linux/workqueue.h>
13 15
14/* 16/*
15 * Called after updating RLIMIT_CPU to run cpu timer and update 17 * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -153,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer,
153 } 155 }
154} 156}
155 157
158/**
159 * task_cputime_zero - Check a task_cputime struct for all zero fields.
160 *
161 * @cputime: The struct to compare.
162 *
163 * Checks @cputime to see if all fields are zero. Returns true if all fields
164 * are zero, false if any field is nonzero.
165 */
166static inline int task_cputime_zero(const struct task_cputime *cputime)
167{
168 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
169 return 1;
170 return 0;
171}
172
156static inline cputime_t prof_ticks(struct task_struct *p) 173static inline cputime_t prof_ticks(struct task_struct *p)
157{ 174{
158 cputime_t utime, stime; 175 cputime_t utime, stime;
@@ -636,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
636 return 0; 653 return 0;
637} 654}
638 655
656#ifdef CONFIG_NO_HZ_FULL
657static void nohz_kick_work_fn(struct work_struct *work)
658{
659 tick_nohz_full_kick_all();
660}
661
662static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
663
664/*
665 * We need the IPIs to be sent from sane process context.
666 * The posix cpu timers are always set with irqs disabled.
667 */
668static void posix_cpu_timer_kick_nohz(void)
669{
670 schedule_work(&nohz_kick_work);
671}
672
673bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
674{
675 if (!task_cputime_zero(&tsk->cputime_expires))
676 return false;
677
678 if (tsk->signal->cputimer.running)
679 return false;
680
681 return true;
682}
683#else
684static inline void posix_cpu_timer_kick_nohz(void) { }
685#endif
686
639/* 687/*
640 * Guts of sys_timer_settime for CPU timers. 688 * Guts of sys_timer_settime for CPU timers.
641 * This is called with the timer locked and interrupts disabled. 689 * This is called with the timer locked and interrupts disabled.
@@ -794,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
794 sample_to_timespec(timer->it_clock, 842 sample_to_timespec(timer->it_clock,
795 old_incr, &old->it_interval); 843 old_incr, &old->it_interval);
796 } 844 }
845 if (!ret)
846 posix_cpu_timer_kick_nohz();
797 return ret; 847 return ret;
798} 848}
799 849
@@ -1008,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1008 } 1058 }
1009} 1059}
1010 1060
1011/**
1012 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1013 *
1014 * @cputime: The struct to compare.
1015 *
1016 * Checks @cputime to see if all fields are zero. Returns true if all fields
1017 * are zero, false if any field is nonzero.
1018 */
1019static inline int task_cputime_zero(const struct task_cputime *cputime)
1020{
1021 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1022 return 1;
1023 return 0;
1024}
1025
1026/* 1061/*
1027 * Check for any per-thread CPU timers that have fired and move them 1062 * Check for any per-thread CPU timers that have fired and move them
1028 * off the tsk->*_timers list onto the firing list. Per-thread timers 1063 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1336,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1336 cpu_timer_fire(timer); 1371 cpu_timer_fire(timer);
1337 spin_unlock(&timer->it_lock); 1372 spin_unlock(&timer->it_lock);
1338 } 1373 }
1374
1375 /*
1376 * In case some timers were rescheduled after the queue got emptied,
1377 * wake up full dynticks CPUs.
1378 */
1379 if (tsk->signal->cputimer.running)
1380 posix_cpu_timer_kick_nohz();
1339} 1381}
1340 1382
1341/* 1383/*
@@ -1366,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1366 } 1408 }
1367 1409
1368 if (!*newval) 1410 if (!*newval)
1369 return; 1411 goto out;
1370 *newval += now.cpu; 1412 *newval += now.cpu;
1371 } 1413 }
1372 1414
@@ -1384,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1384 tsk->signal->cputime_expires.virt_exp = *newval; 1426 tsk->signal->cputime_expires.virt_exp = *newval;
1385 break; 1427 break;
1386 } 1428 }
1429out:
1430 posix_cpu_timer_kick_nohz();
1387} 1431}
1388 1432
1389static int do_cpu_nanosleep(const clockid_t which_clock, int flags, 1433static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
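
The posix-cpu-timers hunks add posix_cpu_timers_can_stop_tick() so the full-dynticks code can tell whether a task still has armed CPU timers, and they kick nohz CPUs from a work item because the timers themselves are armed with interrupts disabled. The fragment below restates just the predicate with minimal stand-in types; it is an illustration, not the kernel structures.

#include <stdbool.h>
#include <stdint.h>

struct task_cputime {
        uint64_t utime, stime, sum_exec_runtime;
};

struct task_sample {
        struct task_cputime cputime_expires;    /* per-task expiry times */
        bool group_cputimer_running;            /* signal->cputimer.running */
};

static bool task_cputime_zero(const struct task_cputime *c)
{
        return !c->utime && !c->stime && !c->sum_exec_runtime;
}

static bool posix_cpu_timers_can_stop_tick(const struct task_sample *t)
{
        /*
         * Any armed per-task expiry, or a running per-process cputimer,
         * means the scheduling-clock tick has to keep firing.
         */
        if (!task_cputime_zero(&t->cputime_expires))
                return false;
        if (t->group_cputimer_running)
                return false;
        return true;
}
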
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 6edbb2c55c22..424c2d4265c9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -40,38 +40,31 @@
40#include <linux/list.h> 40#include <linux/list.h>
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/hash.h>
44#include <linux/posix-clock.h> 44#include <linux/posix-clock.h>
45#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/export.h> 49#include <linux/export.h>
50#include <linux/hashtable.h>
50 51
51/* 52/*
52 * Management arrays for POSIX timers. Timers are kept in slab memory 53 * Management arrays for POSIX timers. Timers are now kept in static hash table
53 * Timer ids are allocated by an external routine that keeps track of the 54 * with 512 entries.
54 * id and the timer. The external interface is: 55 * Timer ids are allocated by local routine, which selects proper hash head by
55 * 56 * key, constructed from current->signal address and per signal struct counter.
56 * void *idr_find(struct idr *idp, int id); to find timer_id <id> 57 * This keeps timer ids unique per process, but now they can intersect between
57 * int idr_get_new(struct idr *idp, void *ptr); to get a new id and 58 * processes.
58 * related it to <ptr>
59 * void idr_remove(struct idr *idp, int id); to release <id>
60 * void idr_init(struct idr *idp); to initialize <idp>
61 * which we supply.
62 * The idr_get_new *may* call slab for more memory so it must not be
63 * called under a spin lock. Likewise idr_remore may release memory
64 * (but it may be ok to do this under a lock...).
65 * idr_find is just a memory look up and is quite fast. A -1 return
66 * indicates that the requested id does not exist.
67 */ 59 */
68 60
69/* 61/*
70 * Lets keep our timers in a slab cache :-) 62 * Lets keep our timers in a slab cache :-)
71 */ 63 */
72static struct kmem_cache *posix_timers_cache; 64static struct kmem_cache *posix_timers_cache;
73static struct idr posix_timers_id; 65
74static DEFINE_SPINLOCK(idr_lock); 66static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
67static DEFINE_SPINLOCK(hash_lock);
75 68
76/* 69/*
77 * we assume that the new SIGEV_THREAD_ID shares no bits with the other 70 * we assume that the new SIGEV_THREAD_ID shares no bits with the other
@@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
152 __timr; \ 145 __timr; \
153}) 146})
154 147
148static int hash(struct signal_struct *sig, unsigned int nr)
149{
150 return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
151}
152
153static struct k_itimer *__posix_timers_find(struct hlist_head *head,
154 struct signal_struct *sig,
155 timer_t id)
156{
157 struct k_itimer *timer;
158
159 hlist_for_each_entry_rcu(timer, head, t_hash) {
160 if ((timer->it_signal == sig) && (timer->it_id == id))
161 return timer;
162 }
163 return NULL;
164}
165
166static struct k_itimer *posix_timer_by_id(timer_t id)
167{
168 struct signal_struct *sig = current->signal;
169 struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
170
171 return __posix_timers_find(head, sig, id);
172}
173
174static int posix_timer_add(struct k_itimer *timer)
175{
176 struct signal_struct *sig = current->signal;
177 int first_free_id = sig->posix_timer_id;
178 struct hlist_head *head;
179 int ret = -ENOENT;
180
181 do {
182 spin_lock(&hash_lock);
183 head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
184 if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
185 hlist_add_head_rcu(&timer->t_hash, head);
186 ret = sig->posix_timer_id;
187 }
188 if (++sig->posix_timer_id < 0)
189 sig->posix_timer_id = 0;
190 if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
191 /* Loop over all possible ids completed */
192 ret = -EAGAIN;
193 spin_unlock(&hash_lock);
194 } while (ret == -ENOENT);
195 return ret;
196}
197
155static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 198static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
156{ 199{
157 spin_unlock_irqrestore(&timr->it_lock, flags); 200 spin_unlock_irqrestore(&timr->it_lock, flags);
@@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
221 return 0; 264 return 0;
222} 265}
223 266
267static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
268{
269 timekeeping_clocktai(tp);
270 return 0;
271}
224 272
225/* 273/*
226 * Initialize everything, well, just everything in Posix clocks/timers ;) 274 * Initialize everything, well, just everything in Posix clocks/timers ;)
@@ -261,6 +309,16 @@ static __init int init_posix_timers(void)
261 .clock_getres = posix_get_coarse_res, 309 .clock_getres = posix_get_coarse_res,
262 .clock_get = posix_get_monotonic_coarse, 310 .clock_get = posix_get_monotonic_coarse,
263 }; 311 };
312 struct k_clock clock_tai = {
313 .clock_getres = hrtimer_get_res,
314 .clock_get = posix_get_tai,
315 .nsleep = common_nsleep,
316 .nsleep_restart = hrtimer_nanosleep_restart,
317 .timer_create = common_timer_create,
318 .timer_set = common_timer_set,
319 .timer_get = common_timer_get,
320 .timer_del = common_timer_del,
321 };
264 struct k_clock clock_boottime = { 322 struct k_clock clock_boottime = {
265 .clock_getres = hrtimer_get_res, 323 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime, 324 .clock_get = posix_get_boottime,
@@ -278,11 +336,11 @@ static __init int init_posix_timers(void)
278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 336 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 337 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); 338 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
339 posix_timers_register_clock(CLOCK_TAI, &clock_tai);
281 340
282 posix_timers_cache = kmem_cache_create("posix_timers_cache", 341 posix_timers_cache = kmem_cache_create("posix_timers_cache",
283 sizeof (struct k_itimer), 0, SLAB_PANIC, 342 sizeof (struct k_itimer), 0, SLAB_PANIC,
284 NULL); 343 NULL);
285 idr_init(&posix_timers_id);
286 return 0; 344 return 0;
287} 345}
288 346
@@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
504{ 562{
505 if (it_id_set) { 563 if (it_id_set) {
506 unsigned long flags; 564 unsigned long flags;
507 spin_lock_irqsave(&idr_lock, flags); 565 spin_lock_irqsave(&hash_lock, flags);
508 idr_remove(&posix_timers_id, tmr->it_id); 566 hlist_del_rcu(&tmr->t_hash);
509 spin_unlock_irqrestore(&idr_lock, flags); 567 spin_unlock_irqrestore(&hash_lock, flags);
510 } 568 }
511 put_pid(tmr->it_pid); 569 put_pid(tmr->it_pid);
512 sigqueue_free(tmr->sigq); 570 sigqueue_free(tmr->sigq);
@@ -552,22 +610,11 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
552 return -EAGAIN; 610 return -EAGAIN;
553 611
554 spin_lock_init(&new_timer->it_lock); 612 spin_lock_init(&new_timer->it_lock);
555 613 new_timer_id = posix_timer_add(new_timer);
556 idr_preload(GFP_KERNEL); 614 if (new_timer_id < 0) {
557 spin_lock_irq(&idr_lock); 615 error = new_timer_id;
558 error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
559 spin_unlock_irq(&idr_lock);
560 idr_preload_end();
561 if (error < 0) {
562 /*
563 * Weird looking, but we return EAGAIN if the IDR is
564 * full (proper POSIX return value for this)
565 */
566 if (error == -ENOSPC)
567 error = -EAGAIN;
568 goto out; 616 goto out;
569 } 617 }
570 new_timer_id = error;
571 618
572 it_id_set = IT_ID_SET; 619 it_id_set = IT_ID_SET;
573 new_timer->it_id = (timer_t) new_timer_id; 620 new_timer->it_id = (timer_t) new_timer_id;
@@ -645,7 +692,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
645 return NULL; 692 return NULL;
646 693
647 rcu_read_lock(); 694 rcu_read_lock();
648 timr = idr_find(&posix_timers_id, (int)timer_id); 695 timr = posix_timer_by_id(timer_id);
649 if (timr) { 696 if (timr) {
650 spin_lock_irqsave(&timr->it_lock, *flags); 697 spin_lock_irqsave(&timr->it_lock, *flags);
651 if (timr->it_signal == current->signal) { 698 if (timr->it_signal == current->signal) {
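
The posix-timers rework above drops the global IDR and derives timer ids from a per-signal_struct counter hashed together with the signal_struct pointer into a 512-entry table. The userspace model below mirrors the control flow of posix_timer_add(): probe the candidate id, advance the counter with wraparound, and give up only once every id has been tried. It uses plain singly linked chains and a stand-in hash instead of hlist/RCU and hash_32(); all names are illustrative.

#include <limits.h>
#include <stddef.h>
#include <stdint.h>

#define HASH_BITS 9
#define HASH_SIZE (1u << HASH_BITS)

struct timer_node {
        struct timer_node *next;
        const void *owner;              /* plays the role of current->signal */
        int id;
};

static struct timer_node *table[HASH_SIZE];

static unsigned int hash_slot(const void *owner, unsigned int nr)
{
        /* cheap stand-in for hash_32(hash32_ptr(sig) ^ nr, 9) */
        uint32_t key = (uint32_t)(uintptr_t)owner ^ nr;

        return (key * 2654435761u) >> (32 - HASH_BITS);
}

static struct timer_node *find_timer(const void *owner, int id)
{
        struct timer_node *t = table[hash_slot(owner, (unsigned int)id)];

        for (; t; t = t->next)
                if (t->owner == owner && t->id == id)
                        return t;
        return NULL;
}

/* next_id plays the role of the new sig->posix_timer_id counter */
static int timer_add(struct timer_node *timer, const void *owner, int *next_id)
{
        int first_free_id = *next_id;

        for ( ; ; ) {
                int id = *next_id;
                unsigned int slot = hash_slot(owner, (unsigned int)id);

                /* advance the counter; wrap to 0 without signed overflow */
                *next_id = (*next_id == INT_MAX) ? 0 : *next_id + 1;

                if (!find_timer(owner, id)) {
                        timer->owner = owner;
                        timer->id = id;
                        timer->next = table[slot];
                        table[slot] = timer;
                        return id;              /* unique within this owner */
                }
                if (*next_id == first_free_id)
                        return -1;              /* every id in use: -EAGAIN */
        }
}

Because uniqueness is only enforced per owner, ids from different processes may now share a value, which is exactly the trade-off the new header comment in the patch describes.
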
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b1dc456474b5..463aa6736751 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -4,6 +4,7 @@
4 * Originally from swsusp. 4 * Originally from swsusp.
5 */ 5 */
6 6
7#include <linux/console.h>
7#include <linux/vt_kern.h> 8#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 9#include <linux/kbd_kern.h>
9#include <linux/vt.h> 10#include <linux/vt.h>
@@ -14,8 +15,120 @@
14 15
15static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
16 17
18static DEFINE_MUTEX(vt_switch_mutex);
19
20struct pm_vt_switch {
21 struct list_head head;
22 struct device *dev;
23 bool required;
24};
25
26static LIST_HEAD(pm_vt_switch_list);
27
28
29/**
30 * pm_vt_switch_required - indicate VT switch at suspend requirements
31 * @dev: device
32 * @required: if true, caller needs VT switch at suspend/resume time
33 *
34 * The different console drivers may or may not require VT switches across
35 * suspend/resume, depending on how they handle restoring video state and
36 * what may be running.
37 *
38 * Drivers can indicate support for switchless suspend/resume, which can
39 * save time and flicker, by using this routine and passing 'false' as
40 * the argument. If any loaded driver needs VT switching, or the
41 * no_console_suspend argument has been passed on the command line, VT
42 * switches will occur.
43 */
44void pm_vt_switch_required(struct device *dev, bool required)
45{
46 struct pm_vt_switch *entry, *tmp;
47
48 mutex_lock(&vt_switch_mutex);
49 list_for_each_entry(tmp, &pm_vt_switch_list, head) {
50 if (tmp->dev == dev) {
51 /* already registered, update requirement */
52 tmp->required = required;
53 goto out;
54 }
55 }
56
57 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
58 if (!entry)
59 goto out;
60
61 entry->required = required;
62 entry->dev = dev;
63
64 list_add(&entry->head, &pm_vt_switch_list);
65out:
66 mutex_unlock(&vt_switch_mutex);
67}
68EXPORT_SYMBOL(pm_vt_switch_required);
69
70/**
71 * pm_vt_switch_unregister - stop tracking a device's VT switching needs
72 * @dev: device
73 *
74 * Remove @dev from the vt switch list.
75 */
76void pm_vt_switch_unregister(struct device *dev)
77{
78 struct pm_vt_switch *tmp;
79
80 mutex_lock(&vt_switch_mutex);
81 list_for_each_entry(tmp, &pm_vt_switch_list, head) {
82 if (tmp->dev == dev) {
83 list_del(&tmp->head);
84 break;
85 }
86 }
87 mutex_unlock(&vt_switch_mutex);
88}
89EXPORT_SYMBOL(pm_vt_switch_unregister);
90
91/*
92 * There are three cases when a VT switch on suspend/resume are required:
93 * 1) no driver has indicated a requirement one way or another, so preserve
94 * the old behavior
95 * 2) console suspend is disabled, we want to see debug messages across
96 * suspend/resume
97 * 3) any registered driver indicates it needs a VT switch
98 *
99 * If none of these conditions is present, meaning we have at least one driver
100 * that doesn't need the switch, and none that do, we can avoid it to make
101 * resume look a little prettier (and suspend too, but that's usually hidden,
102 * e.g. when closing the lid on a laptop).
103 */
104static bool pm_vt_switch(void)
105{
106 struct pm_vt_switch *entry;
107 bool ret = true;
108
109 mutex_lock(&vt_switch_mutex);
110 if (list_empty(&pm_vt_switch_list))
111 goto out;
112
113 if (!console_suspend_enabled)
114 goto out;
115
116 list_for_each_entry(entry, &pm_vt_switch_list, head) {
117 if (entry->required)
118 goto out;
119 }
120
121 ret = false;
122out:
123 mutex_unlock(&vt_switch_mutex);
124 return ret;
125}
126
17int pm_prepare_console(void) 127int pm_prepare_console(void)
18{ 128{
129 if (!pm_vt_switch())
130 return 0;
131
19 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); 132 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
20 if (orig_fgconsole < 0) 133 if (orig_fgconsole < 0)
21 return 1; 134 return 1;
@@ -26,6 +139,9 @@ int pm_prepare_console(void)
26 139
27void pm_restore_console(void) 140void pm_restore_console(void)
28{ 141{
142 if (!pm_vt_switch())
143 return;
144
29 if (orig_fgconsole >= 0) { 145 if (orig_fgconsole >= 0) {
30 vt_move_to_console(orig_fgconsole, 0); 146 vt_move_to_console(orig_fgconsole, 0);
31 vt_kmsg_redirect(orig_kmsg); 147 vt_kmsg_redirect(orig_kmsg);
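
The new pm_vt_switch_required()/pm_vt_switch() machinery above lets console drivers declare whether they need a VT switch around suspend, and the switch is skipped only when at least one driver has registered, none of them requires it, and console suspend is enabled. The userspace model below keeps the same decision table with a simple linked list and caller-provided storage; it is a sketch of the policy, not the kernel code.

#include <stdbool.h>
#include <stddef.h>

struct vt_switch_entry {
        struct vt_switch_entry *next;
        const void *dev;
        bool required;
};

static struct vt_switch_entry *vt_switch_list;
static bool console_suspend_enabled = true;

static void vt_switch_required(struct vt_switch_entry *e, const void *dev,
                               bool required)
{
        struct vt_switch_entry *it;

        for (it = vt_switch_list; it; it = it->next) {
                if (it->dev == dev) {
                        it->required = required;   /* update existing entry */
                        return;
                }
        }
        e->dev = dev;
        e->required = required;
        e->next = vt_switch_list;
        vt_switch_list = e;
}

static bool need_vt_switch(void)
{
        struct vt_switch_entry *it;

        if (!vt_switch_list)
                return true;    /* nobody registered: keep the old behaviour */
        if (!console_suspend_enabled)
                return true;    /* debugging: want messages on the console */
        for (it = vt_switch_list; it; it = it->next)
                if (it->required)
                        return true;
        return false;           /* every registered driver is switchless */
}
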
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 68197a4e8fc9..7ef6866b521d 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -32,7 +32,7 @@ static void handle_poweroff(int key)
32 32
33static struct sysrq_key_op sysrq_poweroff_op = { 33static struct sysrq_key_op sysrq_poweroff_op = {
34 .handler = handle_poweroff, 34 .handler = handle_poweroff,
35 .help_msg = "powerOff", 35 .help_msg = "poweroff(o)",
36 .action_msg = "Power Off", 36 .action_msg = "Power Off",
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index d4feda084a3a..bef86d121eb2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -76,8 +76,20 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
76 76
77bool valid_state(suspend_state_t state) 77bool valid_state(suspend_state_t state)
78{ 78{
79 if (state == PM_SUSPEND_FREEZE) 79 if (state == PM_SUSPEND_FREEZE) {
80 return true; 80#ifdef CONFIG_PM_DEBUG
81 if (pm_test_level != TEST_NONE &&
82 pm_test_level != TEST_FREEZER &&
83 pm_test_level != TEST_DEVICES &&
84 pm_test_level != TEST_PLATFORM) {
85 printk(KERN_WARNING "Unsupported pm_test mode for "
86 "freeze state, please choose "
87 "none/freezer/devices/platform.\n");
88 return false;
89 }
90#endif
91 return true;
92 }
81 /* 93 /*
82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel 94 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
83 * support and need to be valid to the lowlevel 95 * support and need to be valid to the lowlevel
@@ -184,6 +196,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
184 goto Platform_wake; 196 goto Platform_wake;
185 } 197 }
186 198
199 if (suspend_test(TEST_PLATFORM))
200 goto Platform_wake;
201
187 /* 202 /*
188 * PM_SUSPEND_FREEZE equals 203 * PM_SUSPEND_FREEZE equals
189 * frozen processes + suspended devices + idle processors. 204 * frozen processes + suspended devices + idle processors.
@@ -195,9 +210,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
195 goto Platform_wake; 210 goto Platform_wake;
196 } 211 }
197 212
198 if (suspend_test(TEST_PLATFORM))
199 goto Platform_wake;
200
201 error = disable_nonboot_cpus(); 213 error = disable_nonboot_cpus();
202 if (error || suspend_test(TEST_CPUS)) 214 if (error || suspend_test(TEST_CPUS))
203 goto Enable_cpus; 215 goto Enable_cpus;
diff --git a/kernel/printk.c b/kernel/printk.c
index 0b31715f335a..fa36e1494420 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h> 34#include <linux/memblock.h>
35#include <linux/aio.h>
35#include <linux/syscalls.h> 36#include <linux/syscalls.h>
36#include <linux/kexec.h> 37#include <linux/kexec.h>
37#include <linux/kdb.h> 38#include <linux/kdb.h>
@@ -43,19 +44,13 @@
43#include <linux/rculist.h> 44#include <linux/rculist.h>
44#include <linux/poll.h> 45#include <linux/poll.h>
45#include <linux/irq_work.h> 46#include <linux/irq_work.h>
47#include <linux/utsname.h>
46 48
47#include <asm/uaccess.h> 49#include <asm/uaccess.h>
48 50
49#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
50#include <trace/events/printk.h> 52#include <trace/events/printk.h>
51 53
52/*
53 * Architectures can override it:
54 */
55void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
56{
57}
58
59/* printk's without a loglevel use this.. */ 54/* printk's without a loglevel use this.. */
60#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 55#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
61 56
@@ -63,8 +58,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
63#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 58#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
64#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ 59#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
65 60
66DECLARE_WAIT_QUEUE_HEAD(log_wait);
67
68int console_printk[4] = { 61int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ 62 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
70 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 63 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
@@ -224,6 +217,7 @@ struct log {
224static DEFINE_RAW_SPINLOCK(logbuf_lock); 217static DEFINE_RAW_SPINLOCK(logbuf_lock);
225 218
226#ifdef CONFIG_PRINTK 219#ifdef CONFIG_PRINTK
220DECLARE_WAIT_QUEUE_HEAD(log_wait);
227/* the next printk record to read by syslog(READ) or /proc/kmsg */ 221/* the next printk record to read by syslog(READ) or /proc/kmsg */
228static u64 syslog_seq; 222static u64 syslog_seq;
229static u32 syslog_idx; 223static u32 syslog_idx;
@@ -609,7 +603,8 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
609 /* return error when data has vanished underneath us */ 603 /* return error when data has vanished underneath us */
610 if (user->seq < log_first_seq) 604 if (user->seq < log_first_seq)
611 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; 605 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
612 ret = POLLIN|POLLRDNORM; 606 else
607 ret = POLLIN|POLLRDNORM;
613 } 608 }
614 raw_spin_unlock_irq(&logbuf_lock); 609 raw_spin_unlock_irq(&logbuf_lock);
615 610
@@ -1266,7 +1261,7 @@ static void call_console_drivers(int level, const char *text, size_t len)
1266{ 1261{
1267 struct console *con; 1262 struct console *con;
1268 1263
1269 trace_console(text, 0, len, len); 1264 trace_console(text, len);
1270 1265
1271 if (level >= console_loglevel && !ignore_loglevel) 1266 if (level >= console_loglevel && !ignore_loglevel)
1272 return; 1267 return;
@@ -1724,6 +1719,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; }
1724 1719
1725#endif /* CONFIG_PRINTK */ 1720#endif /* CONFIG_PRINTK */
1726 1721
1722#ifdef CONFIG_EARLY_PRINTK
1723struct console *early_console;
1724
1725void early_vprintk(const char *fmt, va_list ap)
1726{
1727 if (early_console) {
1728 char buf[512];
1729 int n = vscnprintf(buf, sizeof(buf), fmt, ap);
1730
1731 early_console->write(early_console, buf, n);
1732 }
1733}
1734
1735asmlinkage void early_printk(const char *fmt, ...)
1736{
1737 va_list ap;
1738
1739 va_start(ap, fmt);
1740 early_vprintk(fmt, ap);
1741 va_end(ap);
1742}
1743#endif
1744
1727static int __add_preferred_console(char *name, int idx, char *options, 1745static int __add_preferred_console(char *name, int idx, char *options,
1728 char *brl_options) 1746 char *brl_options)
1729{ 1747{
@@ -1957,45 +1975,6 @@ int is_console_locked(void)
1957 return console_locked; 1975 return console_locked;
1958} 1976}
1959 1977
1960/*
1961 * Delayed printk version, for scheduler-internal messages:
1962 */
1963#define PRINTK_BUF_SIZE 512
1964
1965#define PRINTK_PENDING_WAKEUP 0x01
1966#define PRINTK_PENDING_SCHED 0x02
1967
1968static DEFINE_PER_CPU(int, printk_pending);
1969static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1970
1971static void wake_up_klogd_work_func(struct irq_work *irq_work)
1972{
1973 int pending = __this_cpu_xchg(printk_pending, 0);
1974
1975 if (pending & PRINTK_PENDING_SCHED) {
1976 char *buf = __get_cpu_var(printk_sched_buf);
1977 printk(KERN_WARNING "[sched_delayed] %s", buf);
1978 }
1979
1980 if (pending & PRINTK_PENDING_WAKEUP)
1981 wake_up_interruptible(&log_wait);
1982}
1983
1984static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
1985 .func = wake_up_klogd_work_func,
1986 .flags = IRQ_WORK_LAZY,
1987};
1988
1989void wake_up_klogd(void)
1990{
1991 preempt_disable();
1992 if (waitqueue_active(&log_wait)) {
1993 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1994 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
1995 }
1996 preempt_enable();
1997}
1998
1999static void console_cont_flush(char *text, size_t size) 1978static void console_cont_flush(char *text, size_t size)
2000{ 1979{
2001 unsigned long flags; 1980 unsigned long flags;
@@ -2458,6 +2437,44 @@ static int __init printk_late_init(void)
2458late_initcall(printk_late_init); 2437late_initcall(printk_late_init);
2459 2438
2460#if defined CONFIG_PRINTK 2439#if defined CONFIG_PRINTK
2440/*
2441 * Delayed printk version, for scheduler-internal messages:
2442 */
2443#define PRINTK_BUF_SIZE 512
2444
2445#define PRINTK_PENDING_WAKEUP 0x01
2446#define PRINTK_PENDING_SCHED 0x02
2447
2448static DEFINE_PER_CPU(int, printk_pending);
2449static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
2450
2451static void wake_up_klogd_work_func(struct irq_work *irq_work)
2452{
2453 int pending = __this_cpu_xchg(printk_pending, 0);
2454
2455 if (pending & PRINTK_PENDING_SCHED) {
2456 char *buf = __get_cpu_var(printk_sched_buf);
2457 printk(KERN_WARNING "[sched_delayed] %s", buf);
2458 }
2459
2460 if (pending & PRINTK_PENDING_WAKEUP)
2461 wake_up_interruptible(&log_wait);
2462}
2463
2464static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
2465 .func = wake_up_klogd_work_func,
2466 .flags = IRQ_WORK_LAZY,
2467};
2468
2469void wake_up_klogd(void)
2470{
2471 preempt_disable();
2472 if (waitqueue_active(&log_wait)) {
2473 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
2474 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2475 }
2476 preempt_enable();
2477}
2461 2478
2462int printk_sched(const char *fmt, ...) 2479int printk_sched(const char *fmt, ...)
2463{ 2480{
@@ -2834,4 +2851,65 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2834 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2851 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2835} 2852}
2836EXPORT_SYMBOL_GPL(kmsg_dump_rewind); 2853EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
2854
2855static char dump_stack_arch_desc_str[128];
2856
2857/**
2858 * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
2859 * @fmt: printf-style format string
2860 * @...: arguments for the format string
2861 *
2862 * The configured string will be printed right after utsname during task
2863 * dumps. Usually used to add arch-specific system identifiers. If an
2864 * arch wants to make use of such an ID string, it should initialize this
2865 * as soon as possible during boot.
2866 */
2867void __init dump_stack_set_arch_desc(const char *fmt, ...)
2868{
2869 va_list args;
2870
2871 va_start(args, fmt);
2872 vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
2873 fmt, args);
2874 va_end(args);
2875}
2876
2877/**
2878 * dump_stack_print_info - print generic debug info for dump_stack()
2879 * @log_lvl: log level
2880 *
2881 * Arch-specific dump_stack() implementations can use this function to
2882 * print out the same debug information as the generic dump_stack().
2883 */
2884void dump_stack_print_info(const char *log_lvl)
2885{
2886 printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
2887 log_lvl, raw_smp_processor_id(), current->pid, current->comm,
2888 print_tainted(), init_utsname()->release,
2889 (int)strcspn(init_utsname()->version, " "),
2890 init_utsname()->version);
2891
2892 if (dump_stack_arch_desc_str[0] != '\0')
2893 printk("%sHardware name: %s\n",
2894 log_lvl, dump_stack_arch_desc_str);
2895
2896 print_worker_info(log_lvl, current);
2897}
2898
2899/**
2900 * show_regs_print_info - print generic debug info for show_regs()
2901 * @log_lvl: log level
2902 *
2903 * show_regs() implementations can use this function to print out generic
2904 * debug information.
2905 */
2906void show_regs_print_info(const char *log_lvl)
2907{
2908 dump_stack_print_info(log_lvl);
2909
2910 printk("%stask: %p ti: %p task.ti: %p\n",
2911 log_lvl, current, current_thread_info(),
2912 task_thread_info(current));
2913}
2914
2837#endif 2915#endif
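
The printk.c changes drop the weak early_printk() stub and add a generic early_vprintk()/early_printk() pair that formats into a small fixed buffer and hands it to early_console->write(). The userspace analogue below keeps that shape, with write(2) to stderr standing in for the early-console hook and vsnprintf() plus a manual clamp standing in for vscnprintf(); it only illustrates the buffering pattern, not the kernel interface.

#include <stdarg.h>
#include <stdio.h>
#include <unistd.h>

static void early_write(const char *buf, size_t n)
{
        /* stand-in for early_console->write(early_console, buf, n) */
        if (write(STDERR_FILENO, buf, n) < 0)
                return;                         /* best effort only */
}

static void early_vprintk(const char *fmt, va_list ap)
{
        char buf[512];
        int n = vsnprintf(buf, sizeof(buf), fmt, ap);

        if (n < 0)
                return;
        if ((size_t)n >= sizeof(buf))
                n = sizeof(buf) - 1;            /* vscnprintf-style clamp */
        early_write(buf, (size_t)n);
}

static void early_printk(const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        early_vprintk(fmt, ap);
        va_end(ap);
}

int main(void)
{
        early_printk("early console up, pid %d\n", (int)getpid());
        return 0;
}
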
diff --git a/kernel/profile.c b/kernel/profile.c
index dc3384ee874e..0bf400737660 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -462,10 +462,10 @@ static const struct file_operations prof_cpu_mask_proc_fops = {
462 .write = prof_cpu_mask_proc_write, 462 .write = prof_cpu_mask_proc_write,
463}; 463};
464 464
465void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 465void create_prof_cpu_mask(void)
466{ 466{
467 /* create /proc/irq/prof_cpu_mask */ 467 /* create /proc/irq/prof_cpu_mask */
468 proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); 468 proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops);
469} 469}
470 470
471/* 471/*
@@ -600,7 +600,7 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
600 NULL, &proc_profile_operations); 600 NULL, &proc_profile_operations);
601 if (!entry) 601 if (!entry)
602 return 0; 602 return 0;
603 entry->size = (1+prof_len) * sizeof(atomic_t); 603 proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
604 hotcpu_notifier(profile_cpu_callback, 0); 604 hotcpu_notifier(profile_cpu_callback, 0);
605 return 0; 605 return 0;
606} 606}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index acbd28424d81..aed981a3f69c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -17,6 +17,7 @@
17#include <linux/ptrace.h> 17#include <linux/ptrace.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/signal.h> 19#include <linux/signal.h>
20#include <linux/uio.h>
20#include <linux/audit.h> 21#include <linux/audit.h>
21#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
22#include <linux/syscalls.h> 23#include <linux/syscalls.h>
@@ -24,6 +25,7 @@
24#include <linux/regset.h> 25#include <linux/regset.h>
25#include <linux/hw_breakpoint.h> 26#include <linux/hw_breakpoint.h>
26#include <linux/cn_proc.h> 27#include <linux/cn_proc.h>
28#include <linux/compat.h>
27 29
28 30
29static int ptrace_trapping_sleep_fn(void *flags) 31static int ptrace_trapping_sleep_fn(void *flags)
@@ -618,6 +620,81 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
618 return error; 620 return error;
619} 621}
620 622
623static int ptrace_peek_siginfo(struct task_struct *child,
624 unsigned long addr,
625 unsigned long data)
626{
627 struct ptrace_peeksiginfo_args arg;
628 struct sigpending *pending;
629 struct sigqueue *q;
630 int ret, i;
631
632 ret = copy_from_user(&arg, (void __user *) addr,
633 sizeof(struct ptrace_peeksiginfo_args));
634 if (ret)
635 return -EFAULT;
636
637 if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED)
638 return -EINVAL; /* unknown flags */
639
640 if (arg.nr < 0)
641 return -EINVAL;
642
643 if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
644 pending = &child->signal->shared_pending;
645 else
646 pending = &child->pending;
647
648 for (i = 0; i < arg.nr; ) {
649 siginfo_t info;
650 s32 off = arg.off + i;
651
652 spin_lock_irq(&child->sighand->siglock);
653 list_for_each_entry(q, &pending->list, list) {
654 if (!off--) {
655 copy_siginfo(&info, &q->info);
656 break;
657 }
658 }
659 spin_unlock_irq(&child->sighand->siglock);
660
661 if (off >= 0) /* beyond the end of the list */
662 break;
663
664#ifdef CONFIG_COMPAT
665 if (unlikely(is_compat_task())) {
666 compat_siginfo_t __user *uinfo = compat_ptr(data);
667
668 ret = copy_siginfo_to_user32(uinfo, &info);
669 ret |= __put_user(info.si_code, &uinfo->si_code);
670 } else
671#endif
672 {
673 siginfo_t __user *uinfo = (siginfo_t __user *) data;
674
675 ret = copy_siginfo_to_user(uinfo, &info);
676 ret |= __put_user(info.si_code, &uinfo->si_code);
677 }
678
679 if (ret) {
680 ret = -EFAULT;
681 break;
682 }
683
684 data += sizeof(siginfo_t);
685 i++;
686
687 if (signal_pending(current))
688 break;
689
690 cond_resched();
691 }
692
693 if (i > 0)
694 return i;
695
696 return ret;
697}
621 698
622#ifdef PTRACE_SINGLESTEP 699#ifdef PTRACE_SINGLESTEP
623#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) 700#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
@@ -748,6 +825,10 @@ int ptrace_request(struct task_struct *child, long request,
748 ret = put_user(child->ptrace_message, datalp); 825 ret = put_user(child->ptrace_message, datalp);
749 break; 826 break;
750 827
828 case PTRACE_PEEKSIGINFO:
829 ret = ptrace_peek_siginfo(child, addr, data);
830 break;
831
751 case PTRACE_GETSIGINFO: 832 case PTRACE_GETSIGINFO:
752 ret = ptrace_getsiginfo(child, &siginfo); 833 ret = ptrace_getsiginfo(child, &siginfo);
753 if (!ret) 834 if (!ret)
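
The ptrace.c hunks introduce PTRACE_PEEKSIGINFO, which copies entries of a stopped tracee's pending-signal queue (per-thread, or process-wide with PTRACE_PEEKSIGINFO_SHARED) into a user buffer. The program below is a hedged usage sketch: a child queues a few blocked real-time signals to itself and stops, and the parent reads the shared queue. The request number, flag and argument struct are taken from the patch; the fallback definitions are only for toolchains whose headers predate them, and error handling is deliberately minimal.

#include <stdio.h>
#include <stdint.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef PTRACE_PEEKSIGINFO
#define PTRACE_PEEKSIGINFO 0x4209
#define PTRACE_PEEKSIGINFO_SHARED (1 << 0)
struct __ptrace_peeksiginfo_args {
        uint64_t off;    /* queue position to start from */
        uint32_t flags;  /* PTRACE_PEEKSIGINFO_SHARED or 0 */
        int32_t  nr;     /* how many siginfo_t entries to copy */
};
#endif

int main(void)
{
        pid_t child = fork();

        if (child == 0) {
                sigset_t set;
                union sigval v;

                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                sigemptyset(&set);
                sigaddset(&set, SIGRTMIN);
                sigprocmask(SIG_BLOCK, &set, NULL);  /* keep them pending */
                for (int i = 0; i < 3; i++) {
                        v.sival_int = i;
                        sigqueue(getpid(), SIGRTMIN, v);
                }
                raise(SIGSTOP);                      /* report to the tracer */
                _exit(0);
        }

        waitpid(child, NULL, 0);                     /* child is now stopped */

        struct __ptrace_peeksiginfo_args args = {
                .off = 0, .flags = PTRACE_PEEKSIGINFO_SHARED, .nr = 8,
        };
        siginfo_t info[8];
        long n = ptrace(PTRACE_PEEKSIGINFO, child, &args, info);

        for (long i = 0; i < n; i++)
                printf("pending: signo=%d code=%d value=%d\n",
                       info[i].si_signo, info[i].si_code,
                       info[i].si_value.sival_int);

        kill(child, SIGKILL);
        waitpid(child, NULL, 0);
        return n < 0 ? 1 : 0;
}
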
diff --git a/kernel/range.c b/kernel/range.c
index 9b8ae2d6ed68..eb911dbce267 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -48,9 +48,11 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
48 final_start = min(range[i].start, start); 48 final_start = min(range[i].start, start);
49 final_end = max(range[i].end, end); 49 final_end = max(range[i].end, end);
50 50
51 range[i].start = final_start; 51 /* clear it and add it back for further merge */
52 range[i].end = final_end; 52 range[i].start = 0;
53 return nr_range; 53 range[i].end = 0;
54 return add_range_with_merge(range, az, nr_range,
55 final_start, final_end);
54 } 56 }
55 57
56 /* Need to add it: */ 58 /* Need to add it: */
@@ -97,7 +99,8 @@ void subtract_range(struct range *range, int az, u64 start, u64 end)
97 range[i].end = range[j].end; 99 range[i].end = range[j].end;
98 range[i].start = end; 100 range[i].start = end;
99 } else { 101 } else {
100 printk(KERN_ERR "run of slot in ranges\n"); 102 pr_err("%s: run out of slot in ranges\n",
103 __func__);
101 } 104 }
102 range[j].end = start; 105 range[j].end = start;
103 continue; 106 continue;
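
The range.c fix clears the slot it merged into and re-inserts the widened range, so an insertion that bridges two existing entries now collapses both instead of leaving a stale one behind. The userspace sketch below reproduces that recursive merge on simplified types; the empty-slot convention (end == 0) follows the kernel's, everything else is illustrative.

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };

static int add_range_with_merge(struct range *r, int az, int nr,
                                uint64_t start, uint64_t end)
{
        int i;

        if (start >= end)
                return nr;

        for (i = 0; i < nr; i++) {
                uint64_t s, e;

                if (!r[i].end)
                        continue;               /* empty slot */
                if (start > r[i].end || r[i].start > end)
                        continue;               /* no overlap or adjacency */

                s = r[i].start < start ? r[i].start : start;
                e = r[i].end   > end   ? r[i].end   : end;

                /* clear it and add it back so further merges can happen */
                r[i].start = r[i].end = 0;
                return add_range_with_merge(r, az, nr, s, e);
        }

        if (nr >= az)
                return nr;                      /* out of slots */
        r[nr].start = start;
        r[nr].end = end;
        return nr + 1;
}

int main(void)
{
        struct range r[8] = { { 0, 0 } };
        int nr = 0;

        nr = add_range_with_merge(r, 8, nr, 10, 20);
        nr = add_range_with_merge(r, 8, nr, 30, 40);
        nr = add_range_with_merge(r, 8, nr, 15, 35);    /* bridges both */

        for (int i = 0; i < nr; i++)
                if (r[i].end)
                        printf("%llu-%llu\n",
                               (unsigned long long)r[i].start,
                               (unsigned long long)r[i].end);
        return 0;
}
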
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 5b8ad827fd86..16ea67925015 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -64,7 +64,7 @@
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 66
67#define RCU_STATE_INITIALIZER(sname, cr) { \ 67#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
68 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 69 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 .name = #sname, \ 78 .name = #sname, \
79 .abbr = sabbr, \
79} 80}
80 81
81struct rcu_state rcu_sched_state = 82struct rcu_state rcu_sched_state =
82 RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); 83 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 84DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
84 85
85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); 86struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 87DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
87 88
88static struct rcu_state *rcu_state; 89static struct rcu_state *rcu_state;
@@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
223module_param(jiffies_till_first_fqs, ulong, 0644); 224module_param(jiffies_till_first_fqs, ulong, 0644);
224module_param(jiffies_till_next_fqs, ulong, 0644); 225module_param(jiffies_till_next_fqs, ulong, 0644);
225 226
227static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
228 struct rcu_data *rdp);
226static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); 229static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
227static void force_quiescent_state(struct rcu_state *rsp); 230static void force_quiescent_state(struct rcu_state *rsp);
228static int rcu_pending(int cpu); 231static int rcu_pending(int cpu);
@@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
310 313
311 if (rcu_gp_in_progress(rsp)) 314 if (rcu_gp_in_progress(rsp))
312 return 0; /* No, a grace period is already in progress. */ 315 return 0; /* No, a grace period is already in progress. */
316 if (rcu_nocb_needs_gp(rsp))
317 return 1; /* Yes, a no-CBs CPU needs one. */
313 if (!rdp->nxttail[RCU_NEXT_TAIL]) 318 if (!rdp->nxttail[RCU_NEXT_TAIL])
314 return 0; /* No, this is a no-CBs (or offline) CPU. */ 319 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 320 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
@@ -794,6 +799,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
794 rdp->offline_fqs++; 799 rdp->offline_fqs++;
795 return 1; 800 return 1;
796 } 801 }
802
803 /*
804 * There is a possibility that a CPU in adaptive-ticks state
805 * might run in the kernel with the scheduling-clock tick disabled
806 * for an extended time period. Invoke rcu_kick_nohz_cpu() to
 807 * force the CPU to restart the scheduling-clock tick if this
808 * CPU is in this state.
809 */
810 rcu_kick_nohz_cpu(rdp->cpu);
811
797 return 0; 812 return 0;
798} 813}
799 814
@@ -1035,10 +1050,11 @@ static void init_callback_list(struct rcu_data *rdp)
1035{ 1050{
1036 int i; 1051 int i;
1037 1052
1053 if (init_nocb_callback_list(rdp))
1054 return;
1038 rdp->nxtlist = NULL; 1055 rdp->nxtlist = NULL;
1039 for (i = 0; i < RCU_NEXT_SIZE; i++) 1056 for (i = 0; i < RCU_NEXT_SIZE; i++)
1040 rdp->nxttail[i] = &rdp->nxtlist; 1057 rdp->nxttail[i] = &rdp->nxtlist;
1041 init_nocb_callback_list(rdp);
1042} 1058}
1043 1059
1044/* 1060/*
@@ -1071,6 +1087,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1071} 1087}
1072 1088
1073/* 1089/*
1090 * Trace-event helper function for rcu_start_future_gp() and
1091 * rcu_nocb_wait_gp().
1092 */
1093static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1094 unsigned long c, char *s)
1095{
1096 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
1097 rnp->completed, c, rnp->level,
1098 rnp->grplo, rnp->grphi, s);
1099}
1100
1101/*
1102 * Start some future grace period, as needed to handle newly arrived
1103 * callbacks. The required future grace periods are recorded in each
1104 * rcu_node structure's ->need_future_gp field.
1105 *
1106 * The caller must hold the specified rcu_node structure's ->lock.
1107 */
1108static unsigned long __maybe_unused
1109rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1110{
1111 unsigned long c;
1112 int i;
1113 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1114
1115 /*
1116 * Pick up grace-period number for new callbacks. If this
1117 * grace period is already marked as needed, return to the caller.
1118 */
1119 c = rcu_cbs_completed(rdp->rsp, rnp);
1120 trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
1121 if (rnp->need_future_gp[c & 0x1]) {
1122 trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
1123 return c;
1124 }
1125
1126 /*
1127 * If either this rcu_node structure or the root rcu_node structure
1128 * believe that a grace period is in progress, then we must wait
1129 * for the one following, which is in "c". Because our request
1130 * will be noticed at the end of the current grace period, we don't
1131 * need to explicitly start one.
1132 */
1133 if (rnp->gpnum != rnp->completed ||
1134 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1135 rnp->need_future_gp[c & 0x1]++;
1136 trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
1137 return c;
1138 }
1139
1140 /*
1141 * There might be no grace period in progress. If we don't already
1142 * hold it, acquire the root rcu_node structure's lock in order to
1143 * start one (if needed).
1144 */
1145 if (rnp != rnp_root)
1146 raw_spin_lock(&rnp_root->lock);
1147
1148 /*
1149 * Get a new grace-period number. If there really is no grace
1150 * period in progress, it will be smaller than the one we obtained
1151 * earlier. Adjust callbacks as needed. Note that even no-CBs
1152 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
1153 */
1154 c = rcu_cbs_completed(rdp->rsp, rnp_root);
1155 for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
1156 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
1157 rdp->nxtcompleted[i] = c;
1158
1159 /*
 1160 * If the need for the required grace period is already
1161 * recorded, trace and leave.
1162 */
1163 if (rnp_root->need_future_gp[c & 0x1]) {
1164 trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
1165 goto unlock_out;
1166 }
1167
1168 /* Record the need for the future grace period. */
1169 rnp_root->need_future_gp[c & 0x1]++;
1170
1171 /* If a grace period is not already in progress, start one. */
1172 if (rnp_root->gpnum != rnp_root->completed) {
1173 trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
1174 } else {
1175 trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
1176 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1177 }
1178unlock_out:
1179 if (rnp != rnp_root)
1180 raw_spin_unlock(&rnp_root->lock);
1181 return c;
1182}
1183
1184/*
1185 * Clean up any old requests for the just-ended grace period. Also return
1186 * whether any additional grace periods have been requested. Also invoke
1187 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
1188 * waiting for this grace period to complete.
1189 */
1190static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1191{
1192 int c = rnp->completed;
1193 int needmore;
1194 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1195
1196 rcu_nocb_gp_cleanup(rsp, rnp);
1197 rnp->need_future_gp[c & 0x1] = 0;
1198 needmore = rnp->need_future_gp[(c + 1) & 0x1];
1199 trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
1200 return needmore;
1201}
1202
1203/*
1074 * If there is room, assign a ->completed number to any callbacks on 1204 * If there is room, assign a ->completed number to any callbacks on
1075 * this CPU that have not already been assigned. Also accelerate any 1205 * this CPU that have not already been assigned. Also accelerate any
1076 * callbacks that were previously assigned a ->completed number that has 1206 * callbacks that were previously assigned a ->completed number that has
@@ -1129,6 +1259,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1129 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1259 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 rdp->nxtcompleted[i] = c; 1260 rdp->nxtcompleted[i] = c;
1131 } 1261 }
1262 /* Record any needed additional grace periods. */
1263 rcu_start_future_gp(rnp, rdp);
1132 1264
1133 /* Trace depending on how much we were able to accelerate. */ 1265 /* Trace depending on how much we were able to accelerate. */
1134 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1266 if (!*rdp->nxttail[RCU_WAIT_TAIL])
@@ -1308,9 +1440,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
1308 rdp = this_cpu_ptr(rsp->rda); 1440 rdp = this_cpu_ptr(rsp->rda);
1309 rcu_preempt_check_blocked_tasks(rnp); 1441 rcu_preempt_check_blocked_tasks(rnp);
1310 rnp->qsmask = rnp->qsmaskinit; 1442 rnp->qsmask = rnp->qsmaskinit;
1311 rnp->gpnum = rsp->gpnum; 1443 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
1312 WARN_ON_ONCE(rnp->completed != rsp->completed); 1444 WARN_ON_ONCE(rnp->completed != rsp->completed);
1313 rnp->completed = rsp->completed; 1445 ACCESS_ONCE(rnp->completed) = rsp->completed;
1314 if (rnp == rdp->mynode) 1446 if (rnp == rdp->mynode)
1315 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1447 rcu_start_gp_per_cpu(rsp, rnp, rdp);
1316 rcu_preempt_boost_start_gp(rnp); 1448 rcu_preempt_boost_start_gp(rnp);
@@ -1319,7 +1451,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
1319 rnp->grphi, rnp->qsmask); 1451 rnp->grphi, rnp->qsmask);
1320 raw_spin_unlock_irq(&rnp->lock); 1452 raw_spin_unlock_irq(&rnp->lock);
1321#ifdef CONFIG_PROVE_RCU_DELAY 1453#ifdef CONFIG_PROVE_RCU_DELAY
1322 if ((random32() % (rcu_num_nodes * 8)) == 0) 1454 if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
1455 system_state == SYSTEM_RUNNING)
1323 schedule_timeout_uninterruptible(2); 1456 schedule_timeout_uninterruptible(2);
1324#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ 1457#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1325 cond_resched(); 1458 cond_resched();
@@ -1361,6 +1494,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1361static void rcu_gp_cleanup(struct rcu_state *rsp) 1494static void rcu_gp_cleanup(struct rcu_state *rsp)
1362{ 1495{
1363 unsigned long gp_duration; 1496 unsigned long gp_duration;
1497 int nocb = 0;
1364 struct rcu_data *rdp; 1498 struct rcu_data *rdp;
1365 struct rcu_node *rnp = rcu_get_root(rsp); 1499 struct rcu_node *rnp = rcu_get_root(rsp);
1366 1500
@@ -1390,17 +1524,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1390 */ 1524 */
1391 rcu_for_each_node_breadth_first(rsp, rnp) { 1525 rcu_for_each_node_breadth_first(rsp, rnp) {
1392 raw_spin_lock_irq(&rnp->lock); 1526 raw_spin_lock_irq(&rnp->lock);
1393 rnp->completed = rsp->gpnum; 1527 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1528 rdp = this_cpu_ptr(rsp->rda);
1529 if (rnp == rdp->mynode)
1530 __rcu_process_gp_end(rsp, rnp, rdp);
1531 nocb += rcu_future_gp_cleanup(rsp, rnp);
1394 raw_spin_unlock_irq(&rnp->lock); 1532 raw_spin_unlock_irq(&rnp->lock);
1395 cond_resched(); 1533 cond_resched();
1396 } 1534 }
1397 rnp = rcu_get_root(rsp); 1535 rnp = rcu_get_root(rsp);
1398 raw_spin_lock_irq(&rnp->lock); 1536 raw_spin_lock_irq(&rnp->lock);
1537 rcu_nocb_gp_set(rnp, nocb);
1399 1538
1400 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1539 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1401 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1540 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1402 rsp->fqs_state = RCU_GP_IDLE; 1541 rsp->fqs_state = RCU_GP_IDLE;
1403 rdp = this_cpu_ptr(rsp->rda); 1542 rdp = this_cpu_ptr(rsp->rda);
1543 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1404 if (cpu_needs_another_gp(rsp, rdp)) 1544 if (cpu_needs_another_gp(rsp, rdp))
1405 rsp->gp_flags = 1; 1545 rsp->gp_flags = 1;
1406 raw_spin_unlock_irq(&rnp->lock); 1546 raw_spin_unlock_irq(&rnp->lock);
@@ -1476,57 +1616,62 @@ static int __noreturn rcu_gp_kthread(void *arg)
1476/* 1616/*
1477 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1617 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1478 * in preparation for detecting the next grace period. The caller must hold 1618 * in preparation for detecting the next grace period. The caller must hold
1479 * the root node's ->lock, which is released before return. Hard irqs must 1619 * the root node's ->lock and hard irqs must be disabled.
1480 * be disabled.
1481 * 1620 *
1482 * Note that it is legal for a dying CPU (which is marked as offline) to 1621 * Note that it is legal for a dying CPU (which is marked as offline) to
1483 * invoke this function. This can happen when the dying CPU reports its 1622 * invoke this function. This can happen when the dying CPU reports its
1484 * quiescent state. 1623 * quiescent state.
1485 */ 1624 */
1486static void 1625static void
1487rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 1626rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1488 __releases(rcu_get_root(rsp)->lock) 1627 struct rcu_data *rdp)
1489{ 1628{
1490 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1629 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
1491 struct rcu_node *rnp = rcu_get_root(rsp);
1492
1493 if (!rsp->gp_kthread ||
1494 !cpu_needs_another_gp(rsp, rdp)) {
1495 /* 1630 /*
1496 * Either we have not yet spawned the grace-period 1631 * Either we have not yet spawned the grace-period
1497 * task, this CPU does not need another grace period, 1632 * task, this CPU does not need another grace period,
1498 * or a grace period is already in progress. 1633 * or a grace period is already in progress.
1499 * Either way, don't start a new grace period. 1634 * Either way, don't start a new grace period.
1500 */ 1635 */
1501 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1502 return; 1636 return;
1503 } 1637 }
1504
1505 /*
1506 * Because there is no grace period in progress right now,
1507 * any callbacks we have up to this point will be satisfied
1508 * by the next grace period. So this is a good place to
1509 * assign a grace period number to recently posted callbacks.
1510 */
1511 rcu_accelerate_cbs(rsp, rnp, rdp);
1512
1513 rsp->gp_flags = RCU_GP_FLAG_INIT; 1638 rsp->gp_flags = RCU_GP_FLAG_INIT;
1514 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1515
1516 /* Ensure that CPU is aware of completion of last grace period. */
1517 rcu_process_gp_end(rsp, rdp);
1518 local_irq_restore(flags);
1519 1639
1520 /* Wake up rcu_gp_kthread() to start the grace period. */ 1640 /* Wake up rcu_gp_kthread() to start the grace period. */
1521 wake_up(&rsp->gp_wq); 1641 wake_up(&rsp->gp_wq);
1522} 1642}
1523 1643
1524/* 1644/*
1645 * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
1646 * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
1647 * is invoked indirectly from rcu_advance_cbs(), which would result in
1648 * endless recursion -- or would do so if it wasn't for the self-deadlock
1649 * that is encountered beforehand.
1650 */
1651static void
1652rcu_start_gp(struct rcu_state *rsp)
1653{
1654 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1655 struct rcu_node *rnp = rcu_get_root(rsp);
1656
1657 /*
1658 * If there is no grace period in progress right now, any
1659 * callbacks we have up to this point will be satisfied by the
1660 * next grace period. Also, advancing the callbacks reduces the
1661 * probability of false positives from cpu_needs_another_gp()
1662 * resulting in pointless grace periods. So, advance callbacks
1663 * then start the grace period!
1664 */
1665 rcu_advance_cbs(rsp, rnp, rdp);
1666 rcu_start_gp_advanced(rsp, rnp, rdp);
1667}
1668
1669/*
1525 * Report a full set of quiescent states to the specified rcu_state 1670 * Report a full set of quiescent states to the specified rcu_state
1526 * data structure. This involves cleaning up after the prior grace 1671 * data structure. This involves cleaning up after the prior grace
1527 * period and letting rcu_start_gp() start up the next grace period 1672 * period and letting rcu_start_gp() start up the next grace period
1528 * if one is needed. Note that the caller must hold rnp->lock, as 1673 * if one is needed. Note that the caller must hold rnp->lock, which
1529 * required by rcu_start_gp(), which will release it. 1674 * is released before return.
1530 */ 1675 */
1531static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 1676static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1532 __releases(rcu_get_root(rsp)->lock) 1677 __releases(rcu_get_root(rsp)->lock)
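
The hunk above splits grace-period startup in two: rcu_start_gp_advanced() only sets RCU_GP_FLAG_INIT and wakes the grace-period kthread, while rcu_start_gp() first advances the calling CPU's callbacks and then calls it, avoiding the recursion that advancing callbacks from the inner path would cause. It also inverts the locking contract: the caller now holds the root rcu_node ->lock across the call instead of having the callee drop it. Below is a minimal userspace sketch of that caller-holds-the-lock contract only; the names (gp_state, start_gp) are invented for illustration.

/* Illustrative sketch, not kernel code: the helper assumes the caller
 * already holds the lock and never releases it itself. */
#include <pthread.h>
#include <stdio.h>

struct gp_state {
	pthread_mutex_t lock;
	int gp_flags;
};

static void start_gp(struct gp_state *gs)
{
	gs->gp_flags = 1;	/* request a new grace period */
}

int main(void)
{
	struct gp_state gs = { .gp_flags = 0 };

	pthread_mutex_init(&gs.lock, NULL);
	pthread_mutex_lock(&gs.lock);	/* caller acquires ...                  */
	start_gp(&gs);			/* ... helper leaves the lock alone ... */
	pthread_mutex_unlock(&gs.lock);	/* ... and caller releases.             */
	printf("gp_flags=%d\n", gs.gp_flags);
	return 0;
}
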
@@ -1685,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1685 struct rcu_node *rnp, struct rcu_data *rdp) 1830 struct rcu_node *rnp, struct rcu_data *rdp)
1686{ 1831{
1687 /* No-CBs CPUs do not have orphanable callbacks. */ 1832 /* No-CBs CPUs do not have orphanable callbacks. */
1688 if (is_nocb_cpu(rdp->cpu)) 1833 if (rcu_is_nocb_cpu(rdp->cpu))
1689 return; 1834 return;
1690 1835
1691 /* 1836 /*
@@ -2124,7 +2269,8 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2124 local_irq_save(flags); 2269 local_irq_save(flags);
2125 if (cpu_needs_another_gp(rsp, rdp)) { 2270 if (cpu_needs_another_gp(rsp, rdp)) {
2126 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2271 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2127 rcu_start_gp(rsp, flags); /* releases above lock */ 2272 rcu_start_gp(rsp);
2273 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2128 } else { 2274 } else {
2129 local_irq_restore(flags); 2275 local_irq_restore(flags);
2130 } 2276 }
@@ -2169,7 +2315,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
2169 2315
2170static void invoke_rcu_core(void) 2316static void invoke_rcu_core(void)
2171{ 2317{
2172 raise_softirq(RCU_SOFTIRQ); 2318 if (cpu_online(smp_processor_id()))
2319 raise_softirq(RCU_SOFTIRQ);
2173} 2320}
2174 2321
2175/* 2322/*
@@ -2204,11 +2351,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2204 2351
2205 /* Start a new grace period if one not already started. */ 2352 /* Start a new grace period if one not already started. */
2206 if (!rcu_gp_in_progress(rsp)) { 2353 if (!rcu_gp_in_progress(rsp)) {
2207 unsigned long nestflag;
2208 struct rcu_node *rnp_root = rcu_get_root(rsp); 2354 struct rcu_node *rnp_root = rcu_get_root(rsp);
2209 2355
2210 raw_spin_lock_irqsave(&rnp_root->lock, nestflag); 2356 raw_spin_lock(&rnp_root->lock);
2211 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ 2357 rcu_start_gp(rsp);
2358 raw_spin_unlock(&rnp_root->lock);
2212 } else { 2359 } else {
2213 /* Give the grace period a kick. */ 2360 /* Give the grace period a kick. */
2214 rdp->blimit = LONG_MAX; 2361 rdp->blimit = LONG_MAX;
@@ -2628,19 +2775,27 @@ static int rcu_pending(int cpu)
2628} 2775}
2629 2776
2630/* 2777/*
2631 * Check to see if any future RCU-related work will need to be done 2778 * Return true if the specified CPU has any callback. If all_lazy is
2632 * by the current CPU, even if none need be done immediately, returning 2779 * non-NULL, store an indication of whether all callbacks are lazy.
2633 * 1 if so. 2780 * (If there are no callbacks, all of them are deemed to be lazy.)
2634 */ 2781 */
2635static int rcu_cpu_has_callbacks(int cpu) 2782static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2636{ 2783{
2784 bool al = true;
2785 bool hc = false;
2786 struct rcu_data *rdp;
2637 struct rcu_state *rsp; 2787 struct rcu_state *rsp;
2638 2788
2639 /* RCU callbacks either ready or pending? */ 2789 for_each_rcu_flavor(rsp) {
2640 for_each_rcu_flavor(rsp) 2790 rdp = per_cpu_ptr(rsp->rda, cpu);
2641 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) 2791 if (rdp->qlen != rdp->qlen_lazy)
2642 return 1; 2792 al = false;
2643 return 0; 2793 if (rdp->nxtlist)
2794 hc = true;
2795 }
2796 if (all_lazy)
2797 *all_lazy = al;
2798 return hc;
2644} 2799}
2645 2800
2646/* 2801/*
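
rcu_cpu_has_callbacks() now answers two questions in one pass: does the CPU have any callbacks at all, and, via the optional all_lazy out-parameter, are all of them lazy. A simplified userspace sketch of that shape follows; the fake_rdp layout and the use of qlen for the "has callbacks" test are inventions for the example (the kernel checks ->nxtlist for the latter).

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct fake_rdp {
	long qlen;	/* total callbacks queued */
	long qlen_lazy;	/* lazy callbacks queued  */
};

static bool cpu_has_callbacks(const struct fake_rdp *rdp, size_t nflavors,
			      bool *all_lazy)
{
	bool al = true;		/* vacuously true when nothing is queued */
	bool hc = false;
	size_t i;

	for (i = 0; i < nflavors; i++) {
		if (rdp[i].qlen != rdp[i].qlen_lazy)
			al = false;	/* at least one non-lazy callback */
		if (rdp[i].qlen)
			hc = true;
	}
	if (all_lazy)
		*all_lazy = al;
	return hc;
}

int main(void)
{
	struct fake_rdp rdp[2] = { { 3, 3 }, { 2, 1 } };
	bool lazy;
	bool has = cpu_has_callbacks(rdp, 2, &lazy);

	printf("has=%d all_lazy=%d\n", has, lazy);	/* prints: has=1 all_lazy=0 */
	return 0;
}
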
@@ -2747,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
2747 * corresponding CPU's preceding callbacks have been invoked. 2902 * corresponding CPU's preceding callbacks have been invoked.
2748 */ 2903 */
2749 for_each_possible_cpu(cpu) { 2904 for_each_possible_cpu(cpu) {
2750 if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) 2905 if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
2751 continue; 2906 continue;
2752 rdp = per_cpu_ptr(rsp->rda, cpu); 2907 rdp = per_cpu_ptr(rsp->rda, cpu);
2753 if (is_nocb_cpu(cpu)) { 2908 if (rcu_is_nocb_cpu(cpu)) {
2754 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 2909 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2755 rsp->n_barrier_done); 2910 rsp->n_barrier_done);
2756 atomic_inc(&rsp->barrier_cpu_count); 2911 atomic_inc(&rsp->barrier_cpu_count);
@@ -2859,7 +3014,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2859 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3014 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2860 atomic_set(&rdp->dynticks->dynticks, 3015 atomic_set(&rdp->dynticks->dynticks,
2861 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 3016 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2862 rcu_prepare_for_idle_init(cpu);
2863 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 3017 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2864 3018
2865 /* Add CPU to rcu_node bitmasks. */ 3019 /* Add CPU to rcu_node bitmasks. */
@@ -2909,7 +3063,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2909 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 3063 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2910 struct rcu_node *rnp = rdp->mynode; 3064 struct rcu_node *rnp = rdp->mynode;
2911 struct rcu_state *rsp; 3065 struct rcu_state *rsp;
2912 int ret = NOTIFY_OK;
2913 3066
2914 trace_rcu_utilization("Start CPU hotplug"); 3067 trace_rcu_utilization("Start CPU hotplug");
2915 switch (action) { 3068 switch (action) {
@@ -2923,21 +3076,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2923 rcu_boost_kthread_setaffinity(rnp, -1); 3076 rcu_boost_kthread_setaffinity(rnp, -1);
2924 break; 3077 break;
2925 case CPU_DOWN_PREPARE: 3078 case CPU_DOWN_PREPARE:
2926 if (nocb_cpu_expendable(cpu)) 3079 rcu_boost_kthread_setaffinity(rnp, cpu);
2927 rcu_boost_kthread_setaffinity(rnp, cpu);
2928 else
2929 ret = NOTIFY_BAD;
2930 break; 3080 break;
2931 case CPU_DYING: 3081 case CPU_DYING:
2932 case CPU_DYING_FROZEN: 3082 case CPU_DYING_FROZEN:
2933 /*
2934 * The whole machine is "stopped" except this CPU, so we can
2935 * touch any data without introducing corruption. We send the
2936 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2937 */
2938 for_each_rcu_flavor(rsp) 3083 for_each_rcu_flavor(rsp)
2939 rcu_cleanup_dying_cpu(rsp); 3084 rcu_cleanup_dying_cpu(rsp);
2940 rcu_cleanup_after_idle(cpu);
2941 break; 3085 break;
2942 case CPU_DEAD: 3086 case CPU_DEAD:
2943 case CPU_DEAD_FROZEN: 3087 case CPU_DEAD_FROZEN:
@@ -2950,7 +3094,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2950 break; 3094 break;
2951 } 3095 }
2952 trace_rcu_utilization("End CPU hotplug"); 3096 trace_rcu_utilization("End CPU hotplug");
2953 return ret; 3097 return NOTIFY_OK;
2954} 3098}
2955 3099
2956/* 3100/*
@@ -3085,6 +3229,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3085 } 3229 }
3086 rnp->level = i; 3230 rnp->level = i;
3087 INIT_LIST_HEAD(&rnp->blkd_tasks); 3231 INIT_LIST_HEAD(&rnp->blkd_tasks);
3232 rcu_init_one_nocb(rnp);
3088 } 3233 }
3089 } 3234 }
3090 3235
@@ -3170,8 +3315,7 @@ void __init rcu_init(void)
3170 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3315 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3171 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3316 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3172 __rcu_init_preempt(); 3317 __rcu_init_preempt();
3173 rcu_init_nocb(); 3318 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3174 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3175 3319
3176 /* 3320 /*
3177 * We don't need protection against CPU-hotplug here because 3321 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index c896b5045d9d..da77a8f57ff9 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -88,18 +88,13 @@ struct rcu_dynticks {
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 atomic_t dynticks; /* Even value for idle, else odd. */ 89 atomic_t dynticks; /* Even value for idle, else odd. */
90#ifdef CONFIG_RCU_FAST_NO_HZ 90#ifdef CONFIG_RCU_FAST_NO_HZ
91 int dyntick_drain; /* Prepare-for-idle state variable. */ 91 bool all_lazy; /* Are all CPU's CBs lazy? */
92 unsigned long dyntick_holdoff;
93 /* No retries for the jiffy of failure. */
94 struct timer_list idle_gp_timer;
95 /* Wake up CPU sleeping with callbacks. */
96 unsigned long idle_gp_timer_expires;
97 /* When to wake up CPU (for repost). */
98 bool idle_first_pass; /* First pass of attempt to go idle? */
99 unsigned long nonlazy_posted; 92 unsigned long nonlazy_posted;
100 /* # times non-lazy CBs posted to CPU. */ 93 /* # times non-lazy CBs posted to CPU. */
101 unsigned long nonlazy_posted_snap; 94 unsigned long nonlazy_posted_snap;
102 /* idle-period nonlazy_posted snapshot. */ 95 /* idle-period nonlazy_posted snapshot. */
96 unsigned long last_accelerate;
97 /* Last jiffy CBs were accelerated. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 98 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 99#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105}; 100};
@@ -134,9 +129,6 @@ struct rcu_node {
134 /* elements that need to drain to allow the */ 129 /* elements that need to drain to allow the */
135 /* current expedited grace period to */ 130 /* current expedited grace period to */
136 /* complete (only for TREE_PREEMPT_RCU). */ 131 /* complete (only for TREE_PREEMPT_RCU). */
137 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
138 /* Since this has meaning only for leaf */
139 /* rcu_node structures, 32 bits suffices. */
140 unsigned long qsmaskinit; 132 unsigned long qsmaskinit;
141 /* Per-GP initial value for qsmask & expmask. */ 133 /* Per-GP initial value for qsmask & expmask. */
142 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 134 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -196,6 +188,12 @@ struct rcu_node {
196 /* Refused to boost: not sure why, though. */ 188 /* Refused to boost: not sure why, though. */
197 /* This can happen due to race conditions. */ 189 /* This can happen due to race conditions. */
198#endif /* #ifdef CONFIG_RCU_BOOST */ 190#endif /* #ifdef CONFIG_RCU_BOOST */
191#ifdef CONFIG_RCU_NOCB_CPU
192 wait_queue_head_t nocb_gp_wq[2];
193 /* Place for rcu_nocb_kthread() to wait GP. */
194#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
195 int need_future_gp[2];
196 /* Counts of upcoming no-CB GP requests. */
199 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; 197 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
200} ____cacheline_internodealigned_in_smp; 198} ____cacheline_internodealigned_in_smp;
201 199
@@ -328,6 +326,11 @@ struct rcu_data {
328 struct task_struct *nocb_kthread; 326 struct task_struct *nocb_kthread;
329#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 327#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
330 328
329 /* 8) RCU CPU stall data. */
330#ifdef CONFIG_RCU_CPU_STALL_INFO
331 unsigned int softirq_snap; /* Snapshot of softirq activity. */
332#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
333
331 int cpu; 334 int cpu;
332 struct rcu_state *rsp; 335 struct rcu_state *rsp;
333}; 336};
@@ -375,12 +378,6 @@ struct rcu_state {
375 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 378 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
376 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 379 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
377 void (*func)(struct rcu_head *head)); 380 void (*func)(struct rcu_head *head));
378#ifdef CONFIG_RCU_NOCB_CPU
379 void (*call_remote)(struct rcu_head *head,
380 void (*func)(struct rcu_head *head));
381 /* call_rcu() flavor, but for */
382 /* placing on remote CPU. */
383#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
384 381
385 /* The following fields are guarded by the root rcu_node's lock. */ 382 /* The following fields are guarded by the root rcu_node's lock. */
386 383
@@ -443,6 +440,7 @@ struct rcu_state {
443 unsigned long gp_max; /* Maximum GP duration in */ 440 unsigned long gp_max; /* Maximum GP duration in */
444 /* jiffies. */ 441 /* jiffies. */
445 char *name; /* Name of structure. */ 442 char *name; /* Name of structure. */
443 char abbr; /* Abbreviated name. */
446 struct list_head flavors; /* List of RCU flavors. */ 444 struct list_head flavors; /* List of RCU flavors. */
447}; 445};
448 446
@@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
520 struct rcu_node *rnp); 518 struct rcu_node *rnp);
521#endif /* #ifdef CONFIG_RCU_BOOST */ 519#endif /* #ifdef CONFIG_RCU_BOOST */
522static void __cpuinit rcu_prepare_kthreads(int cpu); 520static void __cpuinit rcu_prepare_kthreads(int cpu);
523static void rcu_prepare_for_idle_init(int cpu);
524static void rcu_cleanup_after_idle(int cpu); 521static void rcu_cleanup_after_idle(int cpu);
525static void rcu_prepare_for_idle(int cpu); 522static void rcu_prepare_for_idle(int cpu);
526static void rcu_idle_count_callbacks_posted(void); 523static void rcu_idle_count_callbacks_posted(void);
@@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
529static void print_cpu_stall_info_end(void); 526static void print_cpu_stall_info_end(void);
530static void zero_cpu_stall_ticks(struct rcu_data *rdp); 527static void zero_cpu_stall_ticks(struct rcu_data *rdp);
531static void increment_cpu_stall_ticks(void); 528static void increment_cpu_stall_ticks(void);
532static bool is_nocb_cpu(int cpu); 529static int rcu_nocb_needs_gp(struct rcu_state *rsp);
530static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
531static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
532static void rcu_init_one_nocb(struct rcu_node *rnp);
533static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 533static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
534 bool lazy); 534 bool lazy);
535static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 535static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
536 struct rcu_data *rdp); 536 struct rcu_data *rdp);
537static bool nocb_cpu_expendable(int cpu);
538static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 537static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
539static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 538static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
540static void init_nocb_callback_list(struct rcu_data *rdp); 539static void rcu_kick_nohz_cpu(int cpu);
541static void __init rcu_init_nocb(void); 540static bool init_nocb_callback_list(struct rcu_data *rdp);
542 541
543#endif /* #ifndef RCU_TREE_NONCORE */ 542#endif /* #ifndef RCU_TREE_NONCORE */
544 543
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c1cc7e17ff9d..3db5a375d8dd 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,6 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include <linux/tick.h>
31 32
32#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
33 34
@@ -85,11 +86,21 @@ static void __init rcu_bootup_announce_oddness(void)
85 if (nr_cpu_ids != NR_CPUS) 86 if (nr_cpu_ids != NR_CPUS)
86 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
87#ifdef CONFIG_RCU_NOCB_CPU 88#ifdef CONFIG_RCU_NOCB_CPU
89#ifndef CONFIG_RCU_NOCB_CPU_NONE
90 if (!have_rcu_nocb_mask) {
91 zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
92 have_rcu_nocb_mask = true;
93 }
94#ifdef CONFIG_RCU_NOCB_CPU_ZERO
95 pr_info("\tExperimental no-CBs CPU 0\n");
96 cpumask_set_cpu(0, rcu_nocb_mask);
97#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
98#ifdef CONFIG_RCU_NOCB_CPU_ALL
99 pr_info("\tExperimental no-CBs for all CPUs\n");
100 cpumask_setall(rcu_nocb_mask);
101#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
102#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
88 if (have_rcu_nocb_mask) { 103 if (have_rcu_nocb_mask) {
89 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
90 cpumask_clear_cpu(0, rcu_nocb_mask);
91 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
92 }
93 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 104 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
94 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); 105 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
95 if (rcu_nocb_poll) 106 if (rcu_nocb_poll)
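
The boot-time setup added above builds rcu_nocb_mask when none was supplied on the command line: allocate an empty mask, then mark either CPU 0 or every CPU as no-CBs depending on the Kconfig choice. A toy sketch of that selection, with plain bit operations standing in for the kernel's cpumask API and an invented NR_FAKE_CPUS:

#include <stdio.h>

#define NR_FAKE_CPUS 8

int main(void)
{
	unsigned int nocb_mask = 0;	/* ~ zalloc_cpumask_var()          */
	int use_all = 0;		/* ~ CONFIG_RCU_NOCB_CPU_ALL chosen */

	if (use_all)
		nocb_mask = (1u << NR_FAKE_CPUS) - 1;	/* ~ cpumask_setall()       */
	else
		nocb_mask |= 1u << 0;			/* ~ cpumask_set_cpu(0, ...) */

	printf("no-CBs CPUs mask: 0x%02x\n", nocb_mask);
	return 0;
}
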
@@ -101,7 +112,7 @@ static void __init rcu_bootup_announce_oddness(void)
101#ifdef CONFIG_TREE_PREEMPT_RCU 112#ifdef CONFIG_TREE_PREEMPT_RCU
102 113
103struct rcu_state rcu_preempt_state = 114struct rcu_state rcu_preempt_state =
104 RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); 115 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
105DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 116DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
106static struct rcu_state *rcu_state = &rcu_preempt_state; 117static struct rcu_state *rcu_state = &rcu_preempt_state;
107 118
@@ -1533,14 +1544,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1533int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1544int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1534{ 1545{
1535 *delta_jiffies = ULONG_MAX; 1546 *delta_jiffies = ULONG_MAX;
1536 return rcu_cpu_has_callbacks(cpu); 1547 return rcu_cpu_has_callbacks(cpu, NULL);
1537}
1538
1539/*
1540 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1541 */
1542static void rcu_prepare_for_idle_init(int cpu)
1543{
1544} 1548}
1545 1549
1546/* 1550/*
@@ -1577,16 +1581,6 @@ static void rcu_idle_count_callbacks_posted(void)
1577 * 1581 *
1578 * The following three preprocessor symbols control this state machine: 1582 *
1579 * 1583 *
1580 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
1581 * to satisfy RCU. Beyond this point, it is better to incur a periodic
1582 * scheduling-clock interrupt than to loop through the state machine
1583 * at full power.
1584 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
1585 * optional if RCU does not need anything immediately from this
1586 * CPU, even if this CPU still has RCU callbacks queued. The first
1587 * times through the state machine are mandatory: we need to give
1588 * the state machine a chance to communicate a quiescent state
1589 * to the RCU core.
1590 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted 1584 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1591 * to sleep in dyntick-idle mode with RCU callbacks pending. This 1585 * to sleep in dyntick-idle mode with RCU callbacks pending. This
1592 * is sized to be roughly one RCU grace period. Those energy-efficiency 1586 * is sized to be roughly one RCU grace period. Those energy-efficiency
@@ -1602,186 +1596,108 @@ static void rcu_idle_count_callbacks_posted(void)
1602 * adjustment, they can be converted into kernel config parameters, though 1596 * adjustment, they can be converted into kernel config parameters, though
1603 * making the state machine smarter might be a better option. 1597 * making the state machine smarter might be a better option.
1604 */ 1598 */
1605#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
1606#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
1607#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ 1599#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1608#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1600#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1609 1601
1610extern int tick_nohz_enabled; 1602static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
1611 1603module_param(rcu_idle_gp_delay, int, 0644);
1612/* 1604static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1613 * Does the specified flavor of RCU have non-lazy callbacks pending on 1605module_param(rcu_idle_lazy_gp_delay, int, 0644);
1614 * the specified CPU? Both RCU flavor and CPU are specified by the
1615 * rcu_data structure.
1616 */
1617static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
1618{
1619 return rdp->qlen != rdp->qlen_lazy;
1620}
1621 1606
1622#ifdef CONFIG_TREE_PREEMPT_RCU 1607extern int tick_nohz_enabled;
1623 1608
1624/* 1609/*
1625 * Are there non-lazy RCU-preempt callbacks? (There cannot be if there 1610 * Try to advance callbacks for all flavors of RCU on the current CPU.
1626 * is no RCU-preempt in the kernel.) 1611 * Afterwards, if there are any callbacks ready for immediate invocation,
1612 * return true.
1627 */ 1613 */
1628static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1614static bool rcu_try_advance_all_cbs(void)
1629{ 1615{
1630 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 1616 bool cbs_ready = false;
1631 1617 struct rcu_data *rdp;
1632 return __rcu_cpu_has_nonlazy_callbacks(rdp); 1618 struct rcu_node *rnp;
1633} 1619 struct rcu_state *rsp;
1634
1635#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1636 1620
1637static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1621 for_each_rcu_flavor(rsp) {
1638{ 1622 rdp = this_cpu_ptr(rsp->rda);
1639 return 0; 1623 rnp = rdp->mynode;
1640}
1641 1624
1642#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1625 /*
1626 * Don't bother checking unless a grace period has
1627 * completed since we last checked and there are
1628 * callbacks not yet ready to invoke.
1629 */
1630 if (rdp->completed != rnp->completed &&
1631 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1632 rcu_process_gp_end(rsp, rdp);
1643 1633
1644/* 1634 if (cpu_has_callbacks_ready_to_invoke(rdp))
1645 * Does any flavor of RCU have non-lazy callbacks on the specified CPU? 1635 cbs_ready = true;
1646 */ 1636 }
1647static bool rcu_cpu_has_nonlazy_callbacks(int cpu) 1637 return cbs_ready;
1648{
1649 return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
1650 __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
1651 rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
1652} 1638}
1653 1639
1654/* 1640/*
1655 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1641 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
1656 * callbacks on this CPU, (2) this CPU has not yet attempted to enter 1642 * to invoke. If the CPU has callbacks, try to advance them. Tell the
1657 * dyntick-idle mode, or (3) this CPU is in the process of attempting to 1643 * caller to set the timeout based on whether or not there are non-lazy
1658 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed 1644 * callbacks.
1659 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1660 * it is better to incur scheduling-clock interrupts than to spin
1661 * continuously for the same time duration!
1662 * 1645 *
1663 * The delta_jiffies argument is used to store the time when RCU is 1646 * The caller must have disabled interrupts.
1664 * going to need the CPU again if it still has callbacks. The reason
1665 * for this is that rcu_prepare_for_idle() might need to post a timer,
1666 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
1667 * the wakeup time for this CPU. This means that RCU's timer can be
1668 * delayed until the wakeup time, which defeats the purpose of posting
1669 * a timer.
1670 */ 1647 */
1671int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1648int rcu_needs_cpu(int cpu, unsigned long *dj)
1672{ 1649{
1673 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1650 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1674 1651
1675 /* Flag a new idle sojourn to the idle-entry state machine. */ 1652 /* Snapshot to detect later posting of non-lazy callback. */
1676 rdtp->idle_first_pass = 1; 1653 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1654
1677 /* If no callbacks, RCU doesn't need the CPU. */ 1655 /* If no callbacks, RCU doesn't need the CPU. */
1678 if (!rcu_cpu_has_callbacks(cpu)) { 1656 if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
1679 *delta_jiffies = ULONG_MAX; 1657 *dj = ULONG_MAX;
1680 return 0; 1658 return 0;
1681 } 1659 }
1682 if (rdtp->dyntick_holdoff == jiffies) { 1660
1683 /* RCU recently tried and failed, so don't try again. */ 1661 /* Attempt to advance callbacks. */
1684 *delta_jiffies = 1; 1662 if (rcu_try_advance_all_cbs()) {
1663 /* Some ready to invoke, so initiate later invocation. */
1664 invoke_rcu_core();
1685 return 1; 1665 return 1;
1686 } 1666 }
1687 /* Set up for the possibility that RCU will post a timer. */ 1667 rdtp->last_accelerate = jiffies;
1688 if (rcu_cpu_has_nonlazy_callbacks(cpu)) { 1668
1689 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, 1669 /* Request timer delay depending on laziness, and round. */
1690 RCU_IDLE_GP_DELAY) - jiffies; 1670 if (!rdtp->all_lazy) {
1671 *dj = round_up(rcu_idle_gp_delay + jiffies,
1672 rcu_idle_gp_delay) - jiffies;
1691 } else { 1673 } else {
1692 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; 1674 *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
1693 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1694 } 1675 }
1695 return 0; 1676 return 0;
1696} 1677}
1697 1678
1698/* 1679/*
1699 * Handler for smp_call_function_single(). The only point of this 1680 * Prepare a CPU for idle from an RCU perspective. The first major task
1700 * handler is to wake the CPU up, so the handler does only tracing. 1681 * is to sense whether nohz mode has been enabled or disabled via sysfs.
1701 */ 1682 * The second major task is to check to see if a non-lazy callback has
1702void rcu_idle_demigrate(void *unused) 1683 * arrived at a CPU that previously had only lazy callbacks. The third
1703{ 1684 * major task is to accelerate (that is, assign grace-period numbers to)
1704 trace_rcu_prep_idle("Demigrate"); 1685 * any recently arrived callbacks.
1705}
1706
1707/*
1708 * Timer handler used to force CPU to start pushing its remaining RCU
1709 * callbacks in the case where it entered dyntick-idle mode with callbacks
1710 * pending. The handler doesn't really need to do anything because the
1711 * real work is done upon re-entry to idle, or by the next scheduling-clock
1712 * interrupt should idle not be re-entered.
1713 *
1714 * One special case: the timer gets migrated without awakening the CPU
1715 * on which the timer was scheduled on. In this case, we must wake up
1716 * that CPU. We do so with smp_call_function_single().
1717 */
1718static void rcu_idle_gp_timer_func(unsigned long cpu_in)
1719{
1720 int cpu = (int)cpu_in;
1721
1722 trace_rcu_prep_idle("Timer");
1723 if (cpu != smp_processor_id())
1724 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
1725 else
1726 WARN_ON_ONCE(1); /* Getting here can hang the system... */
1727}
1728
1729/*
1730 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
1731 */
1732static void rcu_prepare_for_idle_init(int cpu)
1733{
1734 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1735
1736 rdtp->dyntick_holdoff = jiffies - 1;
1737 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
1738 rdtp->idle_gp_timer_expires = jiffies - 1;
1739 rdtp->idle_first_pass = 1;
1740}
1741
1742/*
1743 * Clean up for exit from idle. Because we are exiting from idle, there
1744 * is no longer any point to ->idle_gp_timer, so cancel it. This will
1745 * do nothing if this timer is not active, so just cancel it unconditionally.
1746 */
1747static void rcu_cleanup_after_idle(int cpu)
1748{
1749 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1750
1751 del_timer(&rdtp->idle_gp_timer);
1752 trace_rcu_prep_idle("Cleanup after idle");
1753 rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
1754}
1755
1756/*
1757 * Check to see if any RCU-related work can be done by the current CPU,
1758 * and if so, schedule a softirq to get it done. This function is part
1759 * of the RCU implementation; it is -not- an exported member of the RCU API.
1760 *
1761 * The idea is for the current CPU to clear out all work required by the
1762 * RCU core for the current grace period, so that this CPU can be permitted
1763 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1764 * at the end of the grace period by whatever CPU ends the grace period.
1765 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
1766 * number of wakeups by a modest integer factor.
1767 *
1768 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1769 * disabled, we do one pass of force_quiescent_state(), then do a
1770 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1771 * later. The ->dyntick_drain field controls the sequencing.
1772 * 1686 *
1773 * The caller must have disabled interrupts. 1687 * The caller must have disabled interrupts.
1774 */ 1688 */
1775static void rcu_prepare_for_idle(int cpu) 1689static void rcu_prepare_for_idle(int cpu)
1776{ 1690{
1777 struct timer_list *tp; 1691 struct rcu_data *rdp;
1778 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1692 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1693 struct rcu_node *rnp;
1694 struct rcu_state *rsp;
1779 int tne; 1695 int tne;
1780 1696
1781 /* Handle nohz enablement switches conservatively. */ 1697 /* Handle nohz enablement switches conservatively. */
1782 tne = ACCESS_ONCE(tick_nohz_enabled); 1698 tne = ACCESS_ONCE(tick_nohz_enabled);
1783 if (tne != rdtp->tick_nohz_enabled_snap) { 1699 if (tne != rdtp->tick_nohz_enabled_snap) {
1784 if (rcu_cpu_has_callbacks(cpu)) 1700 if (rcu_cpu_has_callbacks(cpu, NULL))
1785 invoke_rcu_core(); /* force nohz to see update. */ 1701 invoke_rcu_core(); /* force nohz to see update. */
1786 rdtp->tick_nohz_enabled_snap = tne; 1702 rdtp->tick_nohz_enabled_snap = tne;
1787 return; 1703 return;
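
In the new rcu_needs_cpu(), a CPU with non-lazy callbacks asks to be woken after roughly rcu_idle_gp_delay jiffies, with the wakeup rounded up to a multiple of that delay so that many idle CPUs wake on the same tick; a CPU with only lazy callbacks applies round_jiffies() to the much longer rcu_idle_lazy_gp_delay for the same batching effect. A worked example of the non-lazy computation; the jiffies values are made up, and ROUND_UP mirrors the kernel's power-of-two round_up().

#include <stdio.h>

#define ROUND_UP(x, y)	((((x) - 1) | ((y) - 1)) + 1)	/* y must be a power of 2 */

int main(void)
{
	unsigned long rcu_idle_gp_delay = 4;	/* jiffies, non-lazy case */
	unsigned long jiffies;

	for (jiffies = 1000; jiffies < 1004; jiffies++) {
		unsigned long dj = ROUND_UP(rcu_idle_gp_delay + jiffies,
					    rcu_idle_gp_delay) - jiffies;
		/* The wakeup always lands on a multiple of 4 jiffies, so idle
		 * CPUs with non-lazy callbacks tend to wake together. */
		printf("jiffies=%lu -> dj=%lu (wakeup at %lu)\n",
		       jiffies, dj, jiffies + dj);
	}
	return 0;
}
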
@@ -1789,125 +1705,56 @@ static void rcu_prepare_for_idle(int cpu)
1789 if (!tne) 1705 if (!tne)
1790 return; 1706 return;
1791 1707
1792 /* Adaptive-tick mode, where usermode execution is idle to RCU. */ 1708 /* If this is a no-CBs CPU, no callbacks, just return. */
1793 if (!is_idle_task(current)) { 1709 if (rcu_is_nocb_cpu(cpu))
1794 rdtp->dyntick_holdoff = jiffies - 1;
1795 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1796 trace_rcu_prep_idle("User dyntick with callbacks");
1797 rdtp->idle_gp_timer_expires =
1798 round_up(jiffies + RCU_IDLE_GP_DELAY,
1799 RCU_IDLE_GP_DELAY);
1800 } else if (rcu_cpu_has_callbacks(cpu)) {
1801 rdtp->idle_gp_timer_expires =
1802 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1803 trace_rcu_prep_idle("User dyntick with lazy callbacks");
1804 } else {
1805 return;
1806 }
1807 tp = &rdtp->idle_gp_timer;
1808 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1809 return; 1710 return;
1810 }
1811 1711
1812 /* 1712 /*
1813 * If this is an idle re-entry, for example, due to use of 1713 * If a non-lazy callback arrived at a CPU having only lazy
1814 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 1714 * callbacks, invoke RCU core for the side-effect of recalculating
1815 * loop, then don't take any state-machine actions, unless the 1715 * idle duration on re-entry to idle.
1816 * momentary exit from idle queued additional non-lazy callbacks.
1817 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
1818 * pending.
1819 */ 1716 */
1820 if (!rdtp->idle_first_pass && 1717 if (rdtp->all_lazy &&
1821 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { 1718 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1822 if (rcu_cpu_has_callbacks(cpu)) { 1719 invoke_rcu_core();
1823 tp = &rdtp->idle_gp_timer;
1824 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1825 }
1826 return; 1720 return;
1827 } 1721 }
1828 rdtp->idle_first_pass = 0;
1829 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
1830 1722
1831 /* 1723 /*
1832 * If there are no callbacks on this CPU, enter dyntick-idle mode. 1724 * If we have not yet accelerated this jiffy, accelerate all
1833 * Also reset state to avoid prejudicing later attempts. 1725 * callbacks on this CPU.
1834 */ 1726 */
1835 if (!rcu_cpu_has_callbacks(cpu)) { 1727 if (rdtp->last_accelerate == jiffies)
1836 rdtp->dyntick_holdoff = jiffies - 1;
1837 rdtp->dyntick_drain = 0;
1838 trace_rcu_prep_idle("No callbacks");
1839 return; 1728 return;
1729 rdtp->last_accelerate = jiffies;
1730 for_each_rcu_flavor(rsp) {
1731 rdp = per_cpu_ptr(rsp->rda, cpu);
1732 if (!*rdp->nxttail[RCU_DONE_TAIL])
1733 continue;
1734 rnp = rdp->mynode;
1735 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1736 rcu_accelerate_cbs(rsp, rnp, rdp);
1737 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1840 } 1738 }
1739}
1841 1740
1842 /* 1741/*
1843 * If in holdoff mode, just return. We will presumably have 1742 * Clean up for exit from idle. Attempt to advance callbacks based on
1844 * refrained from disabling the scheduling-clock tick. 1743 * any grace periods that elapsed while the CPU was idle, and if any
1845 */ 1744 * callbacks are now ready to invoke, initiate invocation.
1846 if (rdtp->dyntick_holdoff == jiffies) { 1745 */
1847 trace_rcu_prep_idle("In holdoff"); 1746static void rcu_cleanup_after_idle(int cpu)
1848 return; 1747{
1849 } 1748 struct rcu_data *rdp;
1749 struct rcu_state *rsp;
1850 1750
1851 /* Check and update the ->dyntick_drain sequencing. */ 1751 if (rcu_is_nocb_cpu(cpu))
1852 if (rdtp->dyntick_drain <= 0) {
1853 /* First time through, initialize the counter. */
1854 rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
1855 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
1856 !rcu_pending(cpu) &&
1857 !local_softirq_pending()) {
1858 /* Can we go dyntick-idle despite still having callbacks? */
1859 rdtp->dyntick_drain = 0;
1860 rdtp->dyntick_holdoff = jiffies;
1861 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1862 trace_rcu_prep_idle("Dyntick with callbacks");
1863 rdtp->idle_gp_timer_expires =
1864 round_up(jiffies + RCU_IDLE_GP_DELAY,
1865 RCU_IDLE_GP_DELAY);
1866 } else {
1867 rdtp->idle_gp_timer_expires =
1868 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1869 trace_rcu_prep_idle("Dyntick with lazy callbacks");
1870 }
1871 tp = &rdtp->idle_gp_timer;
1872 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1873 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1874 return; /* Nothing more to do immediately. */
1875 } else if (--(rdtp->dyntick_drain) <= 0) {
1876 /* We have hit the limit, so time to give up. */
1877 rdtp->dyntick_holdoff = jiffies;
1878 trace_rcu_prep_idle("Begin holdoff");
1879 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
1880 return; 1752 return;
1881 } 1753 rcu_try_advance_all_cbs();
1882 1754 for_each_rcu_flavor(rsp) {
1883 /* 1755 rdp = per_cpu_ptr(rsp->rda, cpu);
1884 * Do one step of pushing the remaining RCU callbacks through 1756 if (cpu_has_callbacks_ready_to_invoke(rdp))
1885 * the RCU core state machine. 1757 invoke_rcu_core();
1886 */
1887#ifdef CONFIG_TREE_PREEMPT_RCU
1888 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
1889 rcu_preempt_qs(cpu);
1890 force_quiescent_state(&rcu_preempt_state);
1891 }
1892#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1893 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1894 rcu_sched_qs(cpu);
1895 force_quiescent_state(&rcu_sched_state);
1896 }
1897 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1898 rcu_bh_qs(cpu);
1899 force_quiescent_state(&rcu_bh_state);
1900 }
1901
1902 /*
1903 * If RCU callbacks are still pending, RCU still needs this CPU.
1904 * So try forcing the callbacks through the grace period.
1905 */
1906 if (rcu_cpu_has_callbacks(cpu)) {
1907 trace_rcu_prep_idle("More callbacks");
1908 invoke_rcu_core();
1909 } else {
1910 trace_rcu_prep_idle("Callbacks drained");
1911 } 1758 }
1912} 1759}
1913 1760
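
rcu_prepare_for_idle() now throttles its callback-acceleration pass to once per jiffy via ->last_accelerate. The pattern in isolation, with a simulated jiffies counter:

#include <stdio.h>

static unsigned long jiffies;		/* simulated tick counter */
static unsigned long last_accelerate;

static void maybe_accelerate(void)
{
	if (last_accelerate == jiffies)
		return;			/* already done during this tick */
	last_accelerate = jiffies;
	printf("accelerating callbacks at jiffy %lu\n", jiffies);
}

int main(void)
{
	jiffies = 100;
	maybe_accelerate();	/* does the work           */
	maybe_accelerate();	/* skipped: same jiffy     */
	jiffies++;
	maybe_accelerate();	/* does the work again     */
	return 0;
}
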
@@ -2015,16 +1862,13 @@ early_initcall(rcu_register_oom_notifier);
2015static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1862static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2016{ 1863{
2017 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1864 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2018 struct timer_list *tltp = &rdtp->idle_gp_timer; 1865 unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
2019 char c;
2020 1866
2021 c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; 1867 sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
2022 if (timer_pending(tltp)) 1868 rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
2023 sprintf(cp, "drain=%d %c timer=%lu", 1869 ulong2long(nlpd),
2024 rdtp->dyntick_drain, c, tltp->expires - jiffies); 1870 rdtp->all_lazy ? 'L' : '.',
2025 else 1871 rdtp->tick_nohz_enabled_snap ? '.' : 'D');
2026 sprintf(cp, "drain=%d %c timer not pending",
2027 rdtp->dyntick_drain, c);
2028} 1872}
2029 1873
2030#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 1874#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
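
The reworked stall-warning helper prints only the low 16 bits of ->last_accelerate and of jiffies, which is enough to judge how long ago acceleration last ran without widening the line. A toy illustration of that masking; the timestamp values below are invented.

#include <stdio.h>

int main(void)
{
	unsigned long last_accelerate = 0x12345678, jiffies = 0x123456ff;

	printf("last_accelerate: %04lx/%04lx\n",
	       last_accelerate & 0xffff, jiffies & 0xffff);	/* prints 5678/56ff */
	return 0;
}
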
@@ -2070,10 +1914,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2070 ticks_value = rsp->gpnum - rdp->gpnum; 1914 ticks_value = rsp->gpnum - rdp->gpnum;
2071 } 1915 }
2072 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1916 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2073 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", 1917 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
2074 cpu, ticks_value, ticks_title, 1918 cpu, ticks_value, ticks_title,
2075 atomic_read(&rdtp->dynticks) & 0xfff, 1919 atomic_read(&rdtp->dynticks) & 0xfff,
2076 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1920 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1921 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
2077 fast_no_hz); 1922 fast_no_hz);
2078} 1923}
2079 1924
@@ -2087,6 +1932,7 @@ static void print_cpu_stall_info_end(void)
2087static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1932static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2088{ 1933{
2089 rdp->ticks_this_gp = 0; 1934 rdp->ticks_this_gp = 0;
1935 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
2090} 1936}
2091 1937
2092/* Increment ->ticks_this_gp for all flavors of RCU. */ 1938/* Increment ->ticks_this_gp for all flavors of RCU. */
@@ -2165,8 +2011,49 @@ static int __init parse_rcu_nocb_poll(char *arg)
2165} 2011}
2166early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 2012early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2167 2013
2014/*
2015 * Do any no-CBs CPUs need another grace period?
2016 *
2017 * Interrupts must be disabled. If the caller does not hold the root
2018 * rcu_node structure's ->lock, the results are advisory only.
2019 */
2020static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2021{
2022 struct rcu_node *rnp = rcu_get_root(rsp);
2023
2024 return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
2025}
2026
2027/*
2028 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
2029 * grace period.
2030 */
2031static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2032{
2033 wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
2034}
2035
2036/*
2037 * Set the root rcu_node structure's ->need_future_gp field
2038 * based on the sum of those of all rcu_node structures. This does
2039 * double-count the root rcu_node structure's requests, but this
2040 * is necessary to handle the possibility of a rcu_nocb_kthread()
2041 * having awakened during the time that the rcu_node structures
2042 * were being updated for the end of the previous grace period.
2043 */
2044static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2045{
2046 rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
2047}
2048
2049static void rcu_init_one_nocb(struct rcu_node *rnp)
2050{
2051 init_waitqueue_head(&rnp->nocb_gp_wq[0]);
2052 init_waitqueue_head(&rnp->nocb_gp_wq[1]);
2053}
2054
2168/* Is the specified CPU a no-CPUs CPU? */ 2055/* Is the specified CPU a no-CPUs CPU? */
2169static bool is_nocb_cpu(int cpu) 2056bool rcu_is_nocb_cpu(int cpu)
2170{ 2057{
2171 if (have_rcu_nocb_mask) 2058 if (have_rcu_nocb_mask)
2172 return cpumask_test_cpu(cpu, rcu_nocb_mask); 2059 return cpumask_test_cpu(cpu, rcu_nocb_mask);
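
The helpers added in the hunk above track requests for a future grace period in a two-element array indexed by grace-period parity: requests for the next grace period go into slot (completed + 1) & 0x1, and when a grace period ends, the slot completed & 0x1 is drained and the matching nocb_gp_wq wait queue is woken. The userspace model below shows only that indexing; it deliberately simplifies the kernel's actual flow.

#include <stdio.h>

struct fake_rnp {
	unsigned long completed;	/* number of the last completed GP */
	int need_future_gp[2];		/* requests for upcoming GPs       */
};

static void request_future_gp(struct fake_rnp *rnp)
{
	/* In this model, requests are only ever for the GP after the last
	 * completed one, so parity is all the index that is needed. */
	rnp->need_future_gp[(rnp->completed + 1) & 0x1]++;
}

static int needs_gp(const struct fake_rnp *rnp)
{
	return rnp->need_future_gp[(rnp->completed + 1) & 0x1];
}

static void gp_cleanup(struct fake_rnp *rnp)
{
	rnp->completed++;				/* a grace period ends          */
	rnp->need_future_gp[rnp->completed & 0x1] = 0;	/* its requests are satisfied   */
	/* roughly where the kernel also wakes nocb_gp_wq[completed & 0x1] */
}

int main(void)
{
	struct fake_rnp rnp = { .completed = 4 };

	request_future_gp(&rnp);			/* wants GP number 5 */
	printf("needs_gp=%d\n", needs_gp(&rnp));	/* prints 1 */
	gp_cleanup(&rnp);				/* GP 5 completes    */
	printf("needs_gp=%d\n", needs_gp(&rnp));	/* prints 0 */
	return 0;
}
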
@@ -2224,9 +2111,16 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2224 bool lazy) 2111 bool lazy)
2225{ 2112{
2226 2113
2227 if (!is_nocb_cpu(rdp->cpu)) 2114 if (!rcu_is_nocb_cpu(rdp->cpu))
2228 return 0; 2115 return 0;
2229 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); 2116 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2117 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2118 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2119 (unsigned long)rhp->func,
2120 rdp->qlen_lazy, rdp->qlen);
2121 else
2122 trace_rcu_callback(rdp->rsp->name, rhp,
2123 rdp->qlen_lazy, rdp->qlen);
2230 return 1; 2124 return 1;
2231} 2125}
2232 2126
@@ -2241,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2241 long qll = rsp->qlen_lazy; 2135 long qll = rsp->qlen_lazy;
2242 2136
2243 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 2137 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2244 if (!is_nocb_cpu(smp_processor_id())) 2138 if (!rcu_is_nocb_cpu(smp_processor_id()))
2245 return 0; 2139 return 0;
2246 rsp->qlen = 0; 2140 rsp->qlen = 0;
2247 rsp->qlen_lazy = 0; 2141 rsp->qlen_lazy = 0;
@@ -2265,95 +2159,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2265} 2159}
2266 2160
2267/* 2161/*
2268 * There must be at least one non-no-CBs CPU in operation at any given 2162 * If necessary, kick off a new grace period, and either way wait
2269 * time, because no-CBs CPUs are not capable of initiating grace periods 2163 * for a subsequent grace period to complete.
2270 * independently. This function therefore complains if the specified
2271 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2272 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2273 * but you have to have a base case!)
2274 */ 2164 */
2275static bool nocb_cpu_expendable(int cpu) 2165static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2276{ 2166{
2277 cpumask_var_t non_nocb_cpus; 2167 unsigned long c;
2278 int ret; 2168 bool d;
2169 unsigned long flags;
2170 struct rcu_node *rnp = rdp->mynode;
2171
2172 raw_spin_lock_irqsave(&rnp->lock, flags);
2173 c = rcu_start_future_gp(rnp, rdp);
2174 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2279 2175
2280 /* 2176 /*
2281 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, 2177 * Wait for the grace period. Do so interruptibly to avoid messing
2282 * then offlining this CPU is harmless. Let it happen. 2178 * up the load average.
2283 */ 2179 */
2284 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) 2180 trace_rcu_future_gp(rnp, rdp, c, "StartWait");
2285 return 1; 2181 for (;;) {
2286 2182 wait_event_interruptible(
2287 /* If no memory, play it safe and keep the CPU around. */ 2183 rnp->nocb_gp_wq[c & 0x1],
2288 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) 2184 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
2289 return 0; 2185 if (likely(d))
2290 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); 2186 break;
2291 cpumask_clear_cpu(cpu, non_nocb_cpus); 2187 flush_signals(current);
2292 ret = !cpumask_empty(non_nocb_cpus); 2188 trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
2293 free_cpumask_var(non_nocb_cpus); 2189 }
2294 return ret; 2190 trace_rcu_future_gp(rnp, rdp, c, "EndWait");
2295} 2191 smp_mb(); /* Ensure that CB invocation happens after GP end. */
2296
2297/*
2298 * Helper structure for remote registry of RCU callbacks.
2299 * This is needed for when a no-CBs CPU needs to start a grace period.
2300 * If it just invokes call_rcu(), the resulting callback will be queued,
2301 * which can result in deadlock.
2302 */
2303struct rcu_head_remote {
2304 struct rcu_head *rhp;
2305 call_rcu_func_t *crf;
2306 void (*func)(struct rcu_head *rhp);
2307};
2308
2309/*
2310 * Register a callback as specified by the rcu_head_remote struct.
2311 * This function is intended to be invoked via smp_call_function_single().
2312 */
2313static void call_rcu_local(void *arg)
2314{
2315 struct rcu_head_remote *rhrp =
2316 container_of(arg, struct rcu_head_remote, rhp);
2317
2318 rhrp->crf(rhrp->rhp, rhrp->func);
2319}
2320
2321/*
2322 * Set up an rcu_head_remote structure and the invoke call_rcu_local()
2323 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2324 * smp_call_function_single().
2325 */
2326static void invoke_crf_remote(struct rcu_head *rhp,
2327 void (*func)(struct rcu_head *rhp),
2328 call_rcu_func_t crf)
2329{
2330 struct rcu_head_remote rhr;
2331
2332 rhr.rhp = rhp;
2333 rhr.crf = crf;
2334 rhr.func = func;
2335 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2336}
2337
2338/*
2339 * Helper functions to be passed to wait_rcu_gp(), each of which
2340 * invokes invoke_crf_remote() to register a callback appropriately.
2341 */
2342static void __maybe_unused
2343call_rcu_preempt_remote(struct rcu_head *rhp,
2344 void (*func)(struct rcu_head *rhp))
2345{
2346 invoke_crf_remote(rhp, func, call_rcu);
2347}
2348static void call_rcu_bh_remote(struct rcu_head *rhp,
2349 void (*func)(struct rcu_head *rhp))
2350{
2351 invoke_crf_remote(rhp, func, call_rcu_bh);
2352}
2353static void call_rcu_sched_remote(struct rcu_head *rhp,
2354 void (*func)(struct rcu_head *rhp))
2355{
2356 invoke_crf_remote(rhp, func, call_rcu_sched);
2357} 2192}
2358 2193
2359/* 2194/*
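
rcu_nocb_wait_gp() above is the usual "sleep until a wrapping counter reaches a target" loop: it re-checks the ULONG_CMP_GE() predicate after every wakeup, so spurious wakeups (signals in the kernel case, hence the flush_signals()) are harmless. A userspace analogue using a condition variable, reusing the kernel's wrap-safe comparison macro; the thread and function names are stand-ins.

#include <limits.h>
#include <pthread.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))	/* wrap-safe >= */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static unsigned long completed;

static void wait_for_gp(unsigned long c)
{
	pthread_mutex_lock(&lock);
	while (!ULONG_CMP_GE(completed, c))	/* re-check after every wakeup */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

static void *gp_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	completed++;				/* a grace period ends           */
	pthread_cond_broadcast(&cond);		/* ~ wake_up_all(&nocb_gp_wq[])  */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	unsigned long c = completed + 1;	/* the GP we are waiting for */

	pthread_create(&t, NULL, gp_thread, NULL);
	wait_for_gp(c);
	pthread_join(t, NULL);
	printf("grace period %lu completed\n", c);
	return 0;
}
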
@@ -2390,7 +2225,7 @@ static int rcu_nocb_kthread(void *arg)
2390 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); 2225 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2391 ACCESS_ONCE(rdp->nocb_p_count) += c; 2226 ACCESS_ONCE(rdp->nocb_p_count) += c;
2392 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; 2227 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2393 wait_rcu_gp(rdp->rsp->call_remote); 2228 rcu_nocb_wait_gp(rdp);
2394 2229
2395 /* Each pass through the following loop invokes a callback. */ 2230 /* Each pass through the following loop invokes a callback. */
2396 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2231 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2436,36 +2271,40 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2436 return; 2271 return;
2437 for_each_cpu(cpu, rcu_nocb_mask) { 2272 for_each_cpu(cpu, rcu_nocb_mask) {
2438 rdp = per_cpu_ptr(rsp->rda, cpu); 2273 rdp = per_cpu_ptr(rsp->rda, cpu);
2439 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); 2274 t = kthread_run(rcu_nocb_kthread, rdp,
2275 "rcuo%c/%d", rsp->abbr, cpu);
2440 BUG_ON(IS_ERR(t)); 2276 BUG_ON(IS_ERR(t));
2441 ACCESS_ONCE(rdp->nocb_kthread) = t; 2277 ACCESS_ONCE(rdp->nocb_kthread) = t;
2442 } 2278 }
2443} 2279}
2444 2280
2445/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ 2281/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2446static void init_nocb_callback_list(struct rcu_data *rdp) 2282static bool init_nocb_callback_list(struct rcu_data *rdp)
2447{ 2283{
2448 if (rcu_nocb_mask == NULL || 2284 if (rcu_nocb_mask == NULL ||
2449 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) 2285 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2450 return; 2286 return false;
2451 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2287 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2288 return true;
2452} 2289}
2453 2290
2454/* Initialize the ->call_remote fields in the rcu_state structures. */ 2291#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2455static void __init rcu_init_nocb(void) 2292
2293static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2456{ 2294{
2457#ifdef CONFIG_PREEMPT_RCU 2295 return 0;
2458 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2459#endif /* #ifdef CONFIG_PREEMPT_RCU */
2460 rcu_bh_state.call_remote = call_rcu_bh_remote;
2461 rcu_sched_state.call_remote = call_rcu_sched_remote;
2462} 2296}
2463 2297
2464#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2298static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2299{
2300}
2465 2301
2466static bool is_nocb_cpu(int cpu) 2302static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2303{
2304}
2305
2306static void rcu_init_one_nocb(struct rcu_node *rnp)
2467{ 2307{
2468 return false;
2469} 2308}
2470 2309
2471static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2310static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
@@ -2480,11 +2319,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2480 return 0; 2319 return 0;
2481} 2320}
2482 2321
2483static bool nocb_cpu_expendable(int cpu)
2484{
2485 return 1;
2486}
2487
2488static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2322static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2489{ 2323{
2490} 2324}
@@ -2493,12 +2327,26 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2493{ 2327{
2494} 2328}
2495 2329
2496static void init_nocb_callback_list(struct rcu_data *rdp) 2330static bool init_nocb_callback_list(struct rcu_data *rdp)
2497{ 2331{
2332 return false;
2498} 2333}
2499 2334
2500static void __init rcu_init_nocb(void) 2335#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
2336
2337/*
2338 * An adaptive-ticks CPU can potentially execute in kernel mode for an
2339 * arbitrarily long period of time with the scheduling-clock tick turned
2340 * off. RCU will be paying attention to this CPU because it is in the
2341 * kernel, but the CPU cannot be guaranteed to be executing the RCU state
2342 * machine because the scheduling-clock tick has been disabled. Therefore,
2343 * if an adaptive-ticks CPU is failing to respond to the current grace
2344 * period and has not been idle from an RCU perspective, kick it.
2345 */
2346static void rcu_kick_nohz_cpu(int cpu)
2501{ 2347{
2348#ifdef CONFIG_NO_HZ_FULL
2349 if (tick_nohz_full_cpu(cpu))
2350 smp_send_reschedule(cpu);
2351#endif /* #ifdef CONFIG_NO_HZ_FULL */
2502} 2352}
2503
2504#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0d095dcaa670..cf6c17412932 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,8 +46,6 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file, 49static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op) 50 const struct seq_operations *op)
53{ 51{
@@ -97,7 +95,7 @@ static const struct file_operations rcubarrier_fops = {
97 .open = rcubarrier_open, 95 .open = rcubarrier_open,
98 .read = seq_read, 96 .read = seq_read,
99 .llseek = no_llseek, 97 .llseek = no_llseek,
100 .release = seq_release, 98 .release = single_release,
101}; 99};
102 100
103#ifdef CONFIG_RCU_BOOST 101#ifdef CONFIG_RCU_BOOST
@@ -208,7 +206,7 @@ static const struct file_operations rcuexp_fops = {
208 .open = rcuexp_open, 206 .open = rcuexp_open,
209 .read = seq_read, 207 .read = seq_read,
210 .llseek = no_llseek, 208 .llseek = no_llseek,
211 .release = seq_release, 209 .release = single_release,
212}; 210};
213 211
214#ifdef CONFIG_RCU_BOOST 212#ifdef CONFIG_RCU_BOOST
@@ -308,7 +306,7 @@ static const struct file_operations rcuhier_fops = {
308 .open = rcuhier_open, 306 .open = rcuhier_open,
309 .read = seq_read, 307 .read = seq_read,
310 .llseek = no_llseek, 308 .llseek = no_llseek,
311 .release = seq_release, 309 .release = single_release,
312}; 310};
313 311
314static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 312static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -350,7 +348,7 @@ static const struct file_operations rcugp_fops = {
350 .open = rcugp_open, 348 .open = rcugp_open,
351 .read = seq_read, 349 .read = seq_read,
352 .llseek = no_llseek, 350 .llseek = no_llseek,
353 .release = seq_release, 351 .release = single_release,
354}; 352};
355 353
356static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 354static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
diff --git a/kernel/relay.c b/kernel/relay.c
index 01ab081ac53a..b91488ba2e5a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -234,7 +234,6 @@ static void relay_destroy_buf(struct rchan_buf *buf)
234static void relay_remove_buf(struct kref *kref) 234static void relay_remove_buf(struct kref *kref)
235{ 235{
236 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); 236 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
237 buf->chan->cb->remove_buf_file(buf->dentry);
238 relay_destroy_buf(buf); 237 relay_destroy_buf(buf);
239} 238}
240 239
@@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf)
484{ 483{
485 buf->finalized = 1; 484 buf->finalized = 1;
486 del_timer_sync(&buf->timer); 485 del_timer_sync(&buf->timer);
486 buf->chan->cb->remove_buf_file(buf->dentry);
487 kref_put(&buf->kref, relay_remove_buf); 487 kref_put(&buf->kref, relay_remove_buf);
488} 488}
489 489
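
The relay change above moves remove_buf_file() out of the kref release function and into relay_close_buf(), so the externally visible file is torn down while the buffer is still known to be alive, and the release callback is left to do nothing but free memory. A plain-C sketch of that ordering; the struct and function names are stand-ins for the relay/kref machinery.

#include <stdio.h>
#include <stdlib.h>

struct buf {
	int refcount;
	int file_created;	/* stands in for the relayfs dentry */
};

static void buf_release(struct buf *b)		/* ~ relay_remove_buf() */
{
	free(b);				/* memory only; no file work here */
}

static void buf_put(struct buf *b)
{
	if (--b->refcount == 0)
		buf_release(b);
}

static void buf_close(struct buf *b)		/* ~ relay_close_buf() */
{
	b->file_created = 0;			/* remove_buf_file() equivalent, done first */
	printf("buf file removed before final put\n");
	buf_put(b);				/* may or may not free, depending on readers */
}

int main(void)
{
	struct buf *b = calloc(1, sizeof(*b));

	if (!b)
		return 1;
	b->refcount = 1;
	b->file_created = 1;
	buf_close(b);
	return 0;
}
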
@@ -588,7 +588,7 @@ struct rchan *relay_open(const char *base_filename,
588 chan->version = RELAYFS_CHANNEL_VERSION; 588 chan->version = RELAYFS_CHANNEL_VERSION;
589 chan->n_subbufs = n_subbufs; 589 chan->n_subbufs = n_subbufs;
590 chan->subbuf_size = subbuf_size; 590 chan->subbuf_size = subbuf_size;
591 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); 591 chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
592 chan->parent = parent; 592 chan->parent = parent;
593 chan->private_data = private_data; 593 chan->private_data = private_data;
594 if (base_filename) { 594 if (base_filename) {
@@ -1099,8 +1099,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
1099static int subbuf_read_actor(size_t read_start, 1099static int subbuf_read_actor(size_t read_start,
1100 struct rchan_buf *buf, 1100 struct rchan_buf *buf,
1101 size_t avail, 1101 size_t avail,
1102 read_descriptor_t *desc, 1102 read_descriptor_t *desc)
1103 read_actor_t actor)
1104{ 1103{
1105 void *from; 1104 void *from;
1106 int ret = 0; 1105 int ret = 0;
@@ -1121,15 +1120,13 @@ static int subbuf_read_actor(size_t read_start,
1121typedef int (*subbuf_actor_t) (size_t read_start, 1120typedef int (*subbuf_actor_t) (size_t read_start,
1122 struct rchan_buf *buf, 1121 struct rchan_buf *buf,
1123 size_t avail, 1122 size_t avail,
1124 read_descriptor_t *desc, 1123 read_descriptor_t *desc);
1125 read_actor_t actor);
1126 1124
1127/* 1125/*
1128 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries 1126 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
1129 */ 1127 */
1130static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, 1128static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1131 subbuf_actor_t subbuf_actor, 1129 subbuf_actor_t subbuf_actor,
1132 read_actor_t actor,
1133 read_descriptor_t *desc) 1130 read_descriptor_t *desc)
1134{ 1131{
1135 struct rchan_buf *buf = filp->private_data; 1132 struct rchan_buf *buf = filp->private_data;
@@ -1150,7 +1147,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1150 break; 1147 break;
1151 1148
1152 avail = min(desc->count, avail); 1149 avail = min(desc->count, avail);
1153 ret = subbuf_actor(read_start, buf, avail, desc, actor); 1150 ret = subbuf_actor(read_start, buf, avail, desc);
1154 if (desc->error < 0) 1151 if (desc->error < 0)
1155 break; 1152 break;
1156 1153
@@ -1174,8 +1171,7 @@ static ssize_t relay_file_read(struct file *filp,
1174 desc.count = count; 1171 desc.count = count;
1175 desc.arg.buf = buffer; 1172 desc.arg.buf = buffer;
1176 desc.error = 0; 1173 desc.error = 0;
1177 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, 1174 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
1178 NULL, &desc);
1179} 1175}
1180 1176
1181static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) 1177static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
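Note on the relay_open() hunk above: it swaps the driver-local FIX_SIZE() for the generic PAGE_ALIGN() when sizing the channel buffer. A minimal userspace sketch of that round-up-to-a-page arithmetic, assuming a 4096-byte page and hypothetical DEMO_* names (the kernel macro uses the architecture's PAGE_SIZE):

#include <stdio.h>
#include <stddef.h>

/* Assumed page size for this sketch; the kernel uses the arch's PAGE_SIZE. */
#define DEMO_PAGE_SIZE  4096UL

/* Round x up to the next multiple of the (power-of-two) page size. */
#define DEMO_PAGE_ALIGN(x)  (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))

int main(void)
{
        size_t subbuf_size = 10000;     /* hypothetical relay sub-buffer size */
        size_t n_subbufs = 4;           /* hypothetical sub-buffer count */
        size_t alloc_size = DEMO_PAGE_ALIGN(subbuf_size * n_subbufs);

        /* 40000 bytes rounds up to 40960 (10 pages of 4096). */
        printf("requested %zu bytes -> allocating %zu bytes\n",
               subbuf_size * n_subbufs, alloc_size);
        return 0;
}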
diff --git a/kernel/resource.c b/kernel/resource.c
index 73f35d4b30b9..d7386986e10e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -21,6 +21,7 @@
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/pfn.h> 23#include <linux/pfn.h>
24#include <linux/mm.h>
24#include <asm/io.h> 25#include <asm/io.h>
25 26
26 27
@@ -50,6 +51,14 @@ struct resource_constraint {
50 51
51static DEFINE_RWLOCK(resource_lock); 52static DEFINE_RWLOCK(resource_lock);
52 53
54/*
55 * For memory hotplug, there is no way to free resource entries allocated
56 * by boot mem after the system is up. So for reusing the resource entry
57 * we need to remember the resource.
58 */
59static struct resource *bootmem_resource_free;
60static DEFINE_SPINLOCK(bootmem_resource_lock);
61
53static void *r_next(struct seq_file *m, void *v, loff_t *pos) 62static void *r_next(struct seq_file *m, void *v, loff_t *pos)
54{ 63{
55 struct resource *p = v; 64 struct resource *p = v;
@@ -151,6 +160,40 @@ __initcall(ioresources_init);
151 160
152#endif /* CONFIG_PROC_FS */ 161#endif /* CONFIG_PROC_FS */
153 162
163static void free_resource(struct resource *res)
164{
165 if (!res)
166 return;
167
168 if (!PageSlab(virt_to_head_page(res))) {
169 spin_lock(&bootmem_resource_lock);
170 res->sibling = bootmem_resource_free;
171 bootmem_resource_free = res;
172 spin_unlock(&bootmem_resource_lock);
173 } else {
174 kfree(res);
175 }
176}
177
178static struct resource *alloc_resource(gfp_t flags)
179{
180 struct resource *res = NULL;
181
182 spin_lock(&bootmem_resource_lock);
183 if (bootmem_resource_free) {
184 res = bootmem_resource_free;
185 bootmem_resource_free = res->sibling;
186 }
187 spin_unlock(&bootmem_resource_lock);
188
189 if (res)
190 memset(res, 0, sizeof(struct resource));
191 else
192 res = kzalloc(sizeof(struct resource), flags);
193
194 return res;
195}
196
154/* Return the conflict entry if you can't request it */ 197/* Return the conflict entry if you can't request it */
155static struct resource * __request_resource(struct resource *root, struct resource *new) 198static struct resource * __request_resource(struct resource *root, struct resource *new)
156{ 199{
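The new alloc_resource()/free_resource() pair above recycles entries that came from boot memory (which kfree() cannot take back) through a private free list, and only falls back to the slab for the rest. A rough userspace sketch of the same recycle-or-allocate pattern, with a from_heap flag standing in for the PageSlab() test, malloc()/free() in place of the kernel allocators, and the bootmem_resource_lock spinlock left out:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct res {
        unsigned long start, end;
        int from_heap;          /* stands in for the PageSlab() test */
        struct res *sibling;    /* reused as the free-list link */
};

static struct res *free_list;   /* entries we may not hand back to the heap */

static void res_free(struct res *r)
{
        if (!r)
                return;
        if (!r->from_heap) {
                /* Can't free it for real: park it on the free list. */
                r->sibling = free_list;
                free_list = r;
        } else {
                free(r);
        }
}

static struct res *res_alloc(void)
{
        struct res *r = free_list;

        if (r)
                free_list = r->sibling;         /* recycle a parked entry */
        else
                r = malloc(sizeof(*r));         /* otherwise take it from the heap */
        if (r)
                memset(r, 0, sizeof(*r));
        return r;
}

int main(void)
{
        static struct res boot_entry = { .from_heap = 0 };      /* "boot mem" entry */

        res_free(&boot_entry);                  /* goes onto the free list */
        struct res *r = res_alloc();            /* comes back recycled */
        printf("recycled boot entry: %s\n", r == &boot_entry ? "yes" : "no");
        return 0;
}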
@@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
706 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
707} 750}
708 751
709/** 752static int __adjust_resource(struct resource *res, resource_size_t start,
710 * adjust_resource - modify a resource's start and size 753 resource_size_t size)
711 * @res: resource to modify
712 * @start: new start value
713 * @size: new size
714 *
715 * Given an existing resource, change its start and size to match the
716 * arguments. Returns 0 on success, -EBUSY if it can't fit.
717 * Existing children of the resource are assumed to be immutable.
718 */
719int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
720{ 754{
721 struct resource *tmp, *parent = res->parent; 755 struct resource *tmp, *parent = res->parent;
722 resource_size_t end = start + size - 1; 756 resource_size_t end = start + size - 1;
723 int result = -EBUSY; 757 int result = -EBUSY;
724 758
725 write_lock(&resource_lock);
726
727 if (!parent) 759 if (!parent)
728 goto skip; 760 goto skip;
729 761
@@ -751,6 +783,26 @@ skip:
751 result = 0; 783 result = 0;
752 784
753 out: 785 out:
786 return result;
787}
788
789/**
790 * adjust_resource - modify a resource's start and size
791 * @res: resource to modify
792 * @start: new start value
793 * @size: new size
794 *
795 * Given an existing resource, change its start and size to match the
796 * arguments. Returns 0 on success, -EBUSY if it can't fit.
797 * Existing children of the resource are assumed to be immutable.
798 */
799int adjust_resource(struct resource *res, resource_size_t start,
800 resource_size_t size)
801{
802 int result;
803
804 write_lock(&resource_lock);
805 result = __adjust_resource(res, start, size);
754 write_unlock(&resource_lock); 806 write_unlock(&resource_lock);
755 return result; 807 return result;
756} 808}
@@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root,
762{ 814{
763 struct resource *parent = root; 815 struct resource *parent = root;
764 struct resource *conflict; 816 struct resource *conflict;
765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); 817 struct resource *res = alloc_resource(GFP_ATOMIC);
766 struct resource *next_res = NULL; 818 struct resource *next_res = NULL;
767 819
768 if (!res) 820 if (!res)
@@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root,
787 /* conflict covered whole area */ 839 /* conflict covered whole area */
788 if (conflict->start <= res->start && 840 if (conflict->start <= res->start &&
789 conflict->end >= res->end) { 841 conflict->end >= res->end) {
790 kfree(res); 842 free_resource(res);
791 WARN_ON(next_res); 843 WARN_ON(next_res);
792 break; 844 break;
793 } 845 }
@@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root,
797 end = res->end; 849 end = res->end;
798 res->end = conflict->start - 1; 850 res->end = conflict->start - 1;
799 if (conflict->end < end) { 851 if (conflict->end < end) {
800 next_res = kzalloc(sizeof(*next_res), 852 next_res = alloc_resource(GFP_ATOMIC);
801 GFP_ATOMIC);
802 if (!next_res) { 853 if (!next_res) {
803 kfree(res); 854 free_resource(res);
804 break; 855 break;
805 } 856 }
806 next_res->name = name; 857 next_res->name = name;
@@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent,
890 const char *name, int flags) 941 const char *name, int flags)
891{ 942{
892 DECLARE_WAITQUEUE(wait, current); 943 DECLARE_WAITQUEUE(wait, current);
893 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 944 struct resource *res = alloc_resource(GFP_KERNEL);
894 945
895 if (!res) 946 if (!res)
896 return NULL; 947 return NULL;
@@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent,
924 continue; 975 continue;
925 } 976 }
926 /* Uhhuh, that didn't work out.. */ 977 /* Uhhuh, that didn't work out.. */
927 kfree(res); 978 free_resource(res);
928 res = NULL; 979 res = NULL;
929 break; 980 break;
930 } 981 }
@@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start,
958 return -EBUSY; 1009 return -EBUSY;
959 1010
960 release_resource(res); 1011 release_resource(res);
961 kfree(res); 1012 free_resource(res);
962 return 0; 1013 return 0;
963} 1014}
964EXPORT_SYMBOL(__check_region); 1015EXPORT_SYMBOL(__check_region);
@@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start,
998 write_unlock(&resource_lock); 1049 write_unlock(&resource_lock);
999 if (res->flags & IORESOURCE_MUXED) 1050 if (res->flags & IORESOURCE_MUXED)
1000 wake_up(&muxed_resource_wait); 1051 wake_up(&muxed_resource_wait);
1001 kfree(res); 1052 free_resource(res);
1002 return; 1053 return;
1003 } 1054 }
1004 p = &res->sibling; 1055 p = &res->sibling;
@@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start,
1012} 1063}
1013EXPORT_SYMBOL(__release_region); 1064EXPORT_SYMBOL(__release_region);
1014 1065
1066#ifdef CONFIG_MEMORY_HOTREMOVE
1067/**
1068 * release_mem_region_adjustable - release a previously reserved memory region
1069 * @parent: parent resource descriptor
1070 * @start: resource start address
1071 * @size: resource region size
1072 *
1073 * This interface is intended for memory hot-delete. The requested region
1074 * is released from a currently busy memory resource. The requested region
1075 * must either match exactly or fit into a single busy resource entry. In
1076 * the latter case, the remaining resource is adjusted accordingly.
1077 * Existing children of the busy memory resource must be immutable in the
1078 * request.
1079 *
1080 * Note:
1081 * - Additional release conditions, such as overlapping region, can be
1082 * supported after they are confirmed as valid cases.
1083 * - When a busy memory resource gets split into two entries, the code
1084 * assumes that all children remain in the lower address entry for
1085 * simplicity. Enhance this logic when necessary.
1086 */
1087int release_mem_region_adjustable(struct resource *parent,
1088 resource_size_t start, resource_size_t size)
1089{
1090 struct resource **p;
1091 struct resource *res;
1092 struct resource *new_res;
1093 resource_size_t end;
1094 int ret = -EINVAL;
1095
1096 end = start + size - 1;
1097 if ((start < parent->start) || (end > parent->end))
1098 return ret;
1099
1100 /* The alloc_resource() result gets checked later */
1101 new_res = alloc_resource(GFP_KERNEL);
1102
1103 p = &parent->child;
1104 write_lock(&resource_lock);
1105
1106 while ((res = *p)) {
1107 if (res->start >= end)
1108 break;
1109
 1110 /* look for the next resource if the requested range does not fit into this one */
1111 if (res->start > start || res->end < end) {
1112 p = &res->sibling;
1113 continue;
1114 }
1115
1116 if (!(res->flags & IORESOURCE_MEM))
1117 break;
1118
1119 if (!(res->flags & IORESOURCE_BUSY)) {
1120 p = &res->child;
1121 continue;
1122 }
1123
1124 /* found the target resource; let's adjust accordingly */
1125 if (res->start == start && res->end == end) {
1126 /* free the whole entry */
1127 *p = res->sibling;
1128 free_resource(res);
1129 ret = 0;
1130 } else if (res->start == start && res->end != end) {
1131 /* adjust the start */
1132 ret = __adjust_resource(res, end + 1,
1133 res->end - end);
1134 } else if (res->start != start && res->end == end) {
1135 /* adjust the end */
1136 ret = __adjust_resource(res, res->start,
1137 start - res->start);
1138 } else {
1139 /* split into two entries */
1140 if (!new_res) {
1141 ret = -ENOMEM;
1142 break;
1143 }
1144 new_res->name = res->name;
1145 new_res->start = end + 1;
1146 new_res->end = res->end;
1147 new_res->flags = res->flags;
1148 new_res->parent = res->parent;
1149 new_res->sibling = res->sibling;
1150 new_res->child = NULL;
1151
1152 ret = __adjust_resource(res, res->start,
1153 start - res->start);
1154 if (ret)
1155 break;
1156 res->sibling = new_res;
1157 new_res = NULL;
1158 }
1159
1160 break;
1161 }
1162
1163 write_unlock(&resource_lock);
1164 free_resource(new_res);
1165 return ret;
1166}
1167#endif /* CONFIG_MEMORY_HOTREMOVE */
1168
1015/* 1169/*
1016 * Managed region resource 1170 * Managed region resource
1017 */ 1171 */
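The kernel-doc for release_mem_region_adjustable() above spells out four cases once the requested range fits inside a single busy entry: free the whole entry on an exact match, trim it from the front, trim it from the back, or split it in two. A simplified userspace sketch of just that case analysis on a plain interval (names and types are illustrative, and the parent/child/sibling bookkeeping is omitted):

#include <stdio.h>

struct range { unsigned long start, end; };

/*
 * Remove [start, end] from *res, assuming it fits inside *res. Returns the
 * number of surviving pieces (0, 1 or 2); on a split the upper piece is
 * written to *hi.
 */
static int release_range(struct range *res, unsigned long start,
                         unsigned long end, struct range *hi)
{
        if (res->start == start && res->end == end)
                return 0;                       /* exact match: whole entry goes away */

        if (res->start == start) {              /* trim from the front */
                res->start = end + 1;
                return 1;
        }
        if (res->end == end) {                  /* trim from the back */
                res->end = start - 1;
                return 1;
        }

        /* middle of the entry: split into two */
        hi->start = end + 1;
        hi->end = res->end;
        res->end = start - 1;
        return 2;
}

int main(void)
{
        struct range res = { 0x1000, 0x8fff }, hi;
        int n = release_range(&res, 0x3000, 0x4fff, &hi);

        printf("pieces: %d, low [%#lx-%#lx]", n, res.start, res.end);
        if (n == 2)
                printf(", high [%#lx-%#lx]", hi.start, hi.end);
        printf("\n");
        return 0;
}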
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 7890b10084a7..1d96dd0d93c1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -14,6 +14,7 @@
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/stat.h>
17 18
18#include "rtmutex.h" 19#include "rtmutex.h"
19 20
@@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
366 return curr - buf; 367 return curr - buf;
367} 368}
368 369
369static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); 370static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
370static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); 371static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
371 372
372static struct bus_type rttest_subsys = { 373static struct bus_type rttest_subsys = {
373 .name = "rttest", 374 .name = "rttest",
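The rtmutex-tester hunk narrows the sysfs modes from the blanket 0600 to S_IRUSR for the read-only status attribute and S_IWUSR for the write-only command attribute. A tiny userspace check, assuming a POSIX <sys/stat.h>, of how those symbolic macros map onto the octal bits:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        /* S_IRUSR == 0400, S_IWUSR == 0200; 0600 is both together. */
        printf("S_IRUSR         = %04o\n", S_IRUSR);
        printf("S_IWUSR         = %04o\n", S_IWUSR);
        printf("S_IRUSR|S_IWUSR = %04o\n", S_IRUSR | S_IWUSR);
        return 0;
}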
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b3c6c3fcd847..cfff1435bdfb 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
126 126
127EXPORT_SYMBOL(_down_write_nest_lock); 127EXPORT_SYMBOL(_down_write_nest_lock);
128 128
129void down_read_non_owner(struct rw_semaphore *sem)
130{
131 might_sleep();
132
133 __down_read(sem);
134}
135
136EXPORT_SYMBOL(down_read_non_owner);
137
129void down_write_nested(struct rw_semaphore *sem, int subclass) 138void down_write_nested(struct rw_semaphore *sem, int subclass)
130{ 139{
131 might_sleep(); 140 might_sleep();
@@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
136 145
137EXPORT_SYMBOL(down_write_nested); 146EXPORT_SYMBOL(down_write_nested);
138 147
148void up_read_non_owner(struct rw_semaphore *sem)
149{
150 __up_read(sem);
151}
152
153EXPORT_SYMBOL(up_read_non_owner);
154
139#endif 155#endif
140 156
141 157
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103b..deaf90e4a1de 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o 18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
176 u64 this_clock, remote_clock; 176 u64 this_clock, remote_clock;
177 u64 *ptr, old_val, val; 177 u64 *ptr, old_val, val;
178 178
179#if BITS_PER_LONG != 64
180again:
181 /*
182 * Careful here: The local and the remote clock values need to
183 * be read out atomic as we need to compare the values and
184 * then update either the local or the remote side. So the
185 * cmpxchg64 below only protects one readout.
186 *
187 * We must reread via sched_clock_local() in the retry case on
188 * 32bit as an NMI could use sched_clock_local() via the
189 * tracer and hit between the readout of
 190 * the low 32bit and the high 32bit portion.
191 */
192 this_clock = sched_clock_local(my_scd);
193 /*
194 * We must enforce atomic readout on 32bit, otherwise the
 195 * update on the remote cpu can hit in between the readout of
 196 * the low 32bit and the high 32bit portion.
197 */
198 remote_clock = cmpxchg64(&scd->clock, 0, 0);
199#else
200 /*
201 * On 64bit the read of [my]scd->clock is atomic versus the
202 * update, so we can avoid the above 32bit dance.
203 */
179 sched_clock_local(my_scd); 204 sched_clock_local(my_scd);
180again: 205again:
181 this_clock = my_scd->clock; 206 this_clock = my_scd->clock;
182 remote_clock = scd->clock; 207 remote_clock = scd->clock;
208#endif
183 209
184 /* 210 /*
185 * Use the opportunity that we have both locks 211 * Use the opportunity that we have both locks
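The sched/clock.c comment above explains why, on 32-bit, the remote clock is read with cmpxchg64(&scd->clock, 0, 0): a compare-exchange either swaps 0 for 0 (no visible change) or fails and hands back the current 64-bit contents, so the readout can never be torn between the low and high halves. A userspace sketch of the same trick with C11 atomics (function and variable names are illustrative):

#include <stdio.h>
#include <stdatomic.h>
#include <stdint.h>

/* Shared 64-bit clock; on a 32-bit target a plain load could be torn. */
static _Atomic uint64_t remote_clock;

/*
 * Read the full 64-bit value atomically. cmpxchg(&v, 0, 0) either swaps
 * 0 for 0 (no visible change) or fails and returns the current value,
 * so either way we get an untorn snapshot.
 */
static uint64_t read_remote_clock(void)
{
        uint64_t expected = 0;

        if (!atomic_compare_exchange_strong(&remote_clock, &expected, 0))
                return expected;        /* failure path: 'expected' now holds the value */
        return 0;                       /* it really was zero */
}

int main(void)
{
        atomic_store(&remote_clock, 0x123456789abcdef0ULL);
        printf("remote clock: %#llx\n",
               (unsigned long long)read_remote_clock());
        return 0;
}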
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624a393c..58453b8272fd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -512,11 +512,6 @@ static inline void init_hrtick(void)
512 * the target CPU. 512 * the target CPU.
513 */ 513 */
514#ifdef CONFIG_SMP 514#ifdef CONFIG_SMP
515
516#ifndef tsk_is_polling
517#define tsk_is_polling(t) 0
518#endif
519
520void resched_task(struct task_struct *p) 515void resched_task(struct task_struct *p)
521{ 516{
522 int cpu; 517 int cpu;
@@ -549,7 +544,7 @@ void resched_cpu(int cpu)
549 raw_spin_unlock_irqrestore(&rq->lock, flags); 544 raw_spin_unlock_irqrestore(&rq->lock, flags);
550} 545}
551 546
552#ifdef CONFIG_NO_HZ 547#ifdef CONFIG_NO_HZ_COMMON
553/* 548/*
554 * In the semi idle case, use the nearest busy cpu for migrating timers 549 * In the semi idle case, use the nearest busy cpu for migrating timers
555 * from an idle cpu. This is good for power-savings. 550 * from an idle cpu. This is good for power-savings.
@@ -587,7 +582,7 @@ unlock:
587 * account when the CPU goes back to idle and evaluates the timer 582 * account when the CPU goes back to idle and evaluates the timer
588 * wheel for the next timer event. 583 * wheel for the next timer event.
589 */ 584 */
590void wake_up_idle_cpu(int cpu) 585static void wake_up_idle_cpu(int cpu)
591{ 586{
592 struct rq *rq = cpu_rq(cpu); 587 struct rq *rq = cpu_rq(cpu);
593 588
@@ -617,20 +612,56 @@ void wake_up_idle_cpu(int cpu)
617 smp_send_reschedule(cpu); 612 smp_send_reschedule(cpu);
618} 613}
619 614
615static bool wake_up_full_nohz_cpu(int cpu)
616{
617 if (tick_nohz_full_cpu(cpu)) {
618 if (cpu != smp_processor_id() ||
619 tick_nohz_tick_stopped())
620 smp_send_reschedule(cpu);
621 return true;
622 }
623
624 return false;
625}
626
627void wake_up_nohz_cpu(int cpu)
628{
629 if (!wake_up_full_nohz_cpu(cpu))
630 wake_up_idle_cpu(cpu);
631}
632
620static inline bool got_nohz_idle_kick(void) 633static inline bool got_nohz_idle_kick(void)
621{ 634{
622 int cpu = smp_processor_id(); 635 int cpu = smp_processor_id();
623 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 636 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
624} 637}
625 638
626#else /* CONFIG_NO_HZ */ 639#else /* CONFIG_NO_HZ_COMMON */
627 640
628static inline bool got_nohz_idle_kick(void) 641static inline bool got_nohz_idle_kick(void)
629{ 642{
630 return false; 643 return false;
631} 644}
632 645
633#endif /* CONFIG_NO_HZ */ 646#endif /* CONFIG_NO_HZ_COMMON */
647
648#ifdef CONFIG_NO_HZ_FULL
649bool sched_can_stop_tick(void)
650{
651 struct rq *rq;
652
653 rq = this_rq();
654
655 /* Make sure rq->nr_running update is visible after the IPI */
656 smp_rmb();
657
 658 /* More than one running task needs preemption */
659 if (rq->nr_running > 1)
660 return false;
661
662 return true;
663}
664#endif /* CONFIG_NO_HZ_FULL */
634 665
635void sched_avg_update(struct rq *rq) 666void sched_avg_update(struct rq *rq)
636{ 667{
@@ -1288,8 +1319,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1288static void 1319static void
1289ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1320ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1290{ 1321{
1291 trace_sched_wakeup(p, true);
1292 check_preempt_curr(rq, p, wake_flags); 1322 check_preempt_curr(rq, p, wake_flags);
1323 trace_sched_wakeup(p, true);
1293 1324
1294 p->state = TASK_RUNNING; 1325 p->state = TASK_RUNNING;
1295#ifdef CONFIG_SMP 1326#ifdef CONFIG_SMP
@@ -1362,7 +1393,8 @@ static void sched_ttwu_pending(void)
1362 1393
1363void scheduler_ipi(void) 1394void scheduler_ipi(void)
1364{ 1395{
1365 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 1396 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
1397 && !tick_nohz_full_cpu(smp_processor_id()))
1366 return; 1398 return;
1367 1399
1368 /* 1400 /*
@@ -1379,6 +1411,7 @@ void scheduler_ipi(void)
1379 * somewhat pessimize the simple resched case. 1411 * somewhat pessimize the simple resched case.
1380 */ 1412 */
1381 irq_enter(); 1413 irq_enter();
1414 tick_nohz_full_check();
1382 sched_ttwu_pending(); 1415 sched_ttwu_pending();
1383 1416
1384 /* 1417 /*
@@ -1498,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p)
1498{ 1531{
1499 struct rq *rq = task_rq(p); 1532 struct rq *rq = task_rq(p);
1500 1533
1501 BUG_ON(rq != this_rq()); 1534 if (WARN_ON_ONCE(rq != this_rq()) ||
1502 BUG_ON(p == current); 1535 WARN_ON_ONCE(p == current))
1536 return;
1537
1503 lockdep_assert_held(&rq->lock); 1538 lockdep_assert_held(&rq->lock);
1504 1539
1505 if (!raw_spin_trylock(&p->pi_lock)) { 1540 if (!raw_spin_trylock(&p->pi_lock)) {
@@ -1858,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1858 kprobe_flush_task(prev); 1893 kprobe_flush_task(prev);
1859 put_task_struct(prev); 1894 put_task_struct(prev);
1860 } 1895 }
1896
1897 tick_nohz_task_switch(current);
1861} 1898}
1862 1899
1863#ifdef CONFIG_SMP 1900#ifdef CONFIG_SMP
@@ -2121,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2121 return load >> FSHIFT; 2158 return load >> FSHIFT;
2122} 2159}
2123 2160
2124#ifdef CONFIG_NO_HZ 2161#ifdef CONFIG_NO_HZ_COMMON
2125/* 2162/*
2126 * Handle NO_HZ for the global load-average. 2163 * Handle NO_HZ for the global load-average.
2127 * 2164 *
@@ -2347,12 +2384,12 @@ static void calc_global_nohz(void)
2347 smp_wmb(); 2384 smp_wmb();
2348 calc_load_idx++; 2385 calc_load_idx++;
2349} 2386}
2350#else /* !CONFIG_NO_HZ */ 2387#else /* !CONFIG_NO_HZ_COMMON */
2351 2388
2352static inline long calc_load_fold_idle(void) { return 0; } 2389static inline long calc_load_fold_idle(void) { return 0; }
2353static inline void calc_global_nohz(void) { } 2390static inline void calc_global_nohz(void) { }
2354 2391
2355#endif /* CONFIG_NO_HZ */ 2392#endif /* CONFIG_NO_HZ_COMMON */
2356 2393
2357/* 2394/*
2358 * calc_load - update the avenrun load estimates 10 ticks after the 2395 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2512,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2512 sched_avg_update(this_rq); 2549 sched_avg_update(this_rq);
2513} 2550}
2514 2551
2515#ifdef CONFIG_NO_HZ 2552#ifdef CONFIG_NO_HZ_COMMON
2516/* 2553/*
2517 * There is no sane way to deal with nohz on smp when using jiffies because the 2554 * There is no sane way to deal with nohz on smp when using jiffies because the
2518 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 2555 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -2572,7 +2609,7 @@ void update_cpu_load_nohz(void)
2572 } 2609 }
2573 raw_spin_unlock(&this_rq->lock); 2610 raw_spin_unlock(&this_rq->lock);
2574} 2611}
2575#endif /* CONFIG_NO_HZ */ 2612#endif /* CONFIG_NO_HZ_COMMON */
2576 2613
2577/* 2614/*
2578 * Called from scheduler_tick() 2615 * Called from scheduler_tick()
@@ -2699,8 +2736,35 @@ void scheduler_tick(void)
2699 rq->idle_balance = idle_cpu(cpu); 2736 rq->idle_balance = idle_cpu(cpu);
2700 trigger_load_balance(rq, cpu); 2737 trigger_load_balance(rq, cpu);
2701#endif 2738#endif
2739 rq_last_tick_reset(rq);
2702} 2740}
2703 2741
2742#ifdef CONFIG_NO_HZ_FULL
2743/**
2744 * scheduler_tick_max_deferment
2745 *
2746 * Keep at least one tick per second when a single
2747 * active task is running because the scheduler doesn't
2748 * yet completely support full dynticks environment.
2749 *
2750 * This makes sure that uptime, CFS vruntime, load
2751 * balancing, etc... continue to move forward, even
2752 * with a very low granularity.
2753 */
2754u64 scheduler_tick_max_deferment(void)
2755{
2756 struct rq *rq = this_rq();
2757 unsigned long next, now = ACCESS_ONCE(jiffies);
2758
2759 next = rq->last_sched_tick + HZ;
2760
2761 if (time_before_eq(next, now))
2762 return 0;
2763
2764 return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
2765}
2766#endif
2767
2704notrace unsigned long get_parent_ip(unsigned long addr) 2768notrace unsigned long get_parent_ip(unsigned long addr)
2705{ 2769{
2706 if (in_lock_functions(addr)) { 2770 if (in_lock_functions(addr)) {
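scheduler_tick_max_deferment() above keeps at least one tick per second under full dynticks: it adds HZ jiffies to the last tick and, if that deadline has not passed yet, converts the remaining jiffies to nanoseconds. A userspace sketch of the same arithmetic, assuming HZ=1000 for the demo and spelling out the wrap-safe time_before_eq() comparison (all DEMO_/demo_ names are illustrative):

#include <stdio.h>
#include <stdint.h>

#define DEMO_HZ         1000UL          /* assumed tick rate for the sketch */
#define NSEC_PER_USEC   1000ULL

/* Wrap-safe "a <= b" on jiffies counters, as in the kernel's time_before_eq(). */
static int demo_time_before_eq(unsigned long a, unsigned long b)
{
        return (long)(a - b) <= 0;
}

static uint64_t max_deferment_ns(unsigned long jiffies_now,
                                 unsigned long last_sched_tick)
{
        unsigned long next = last_sched_tick + DEMO_HZ; /* one second after the last tick */

        if (demo_time_before_eq(next, jiffies_now))
                return 0;                               /* already overdue: tick now */

        /* jiffies -> usecs -> nsecs; with HZ=1000 one jiffy is 1000 usecs */
        return (uint64_t)(next - jiffies_now) * (1000000UL / DEMO_HZ) * NSEC_PER_USEC;
}

int main(void)
{
        /* 400 jiffies since the last tick: 600 jiffies (600 ms) of deferment left */
        printf("defer up to %llu ns\n",
               (unsigned long long)max_deferment_ns(10400, 10000));
        return 0;
}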
@@ -2997,51 +3061,6 @@ void __sched schedule_preempt_disabled(void)
2997 preempt_disable(); 3061 preempt_disable();
2998} 3062}
2999 3063
3000#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3001
3002static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3003{
3004 if (lock->owner != owner)
3005 return false;
3006
3007 /*
3008 * Ensure we emit the owner->on_cpu, dereference _after_ checking
3009 * lock->owner still matches owner, if that fails, owner might
3010 * point to free()d memory, if it still matches, the rcu_read_lock()
3011 * ensures the memory stays valid.
3012 */
3013 barrier();
3014
3015 return owner->on_cpu;
3016}
3017
3018/*
3019 * Look out! "owner" is an entirely speculative pointer
3020 * access and not reliable.
3021 */
3022int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3023{
3024 if (!sched_feat(OWNER_SPIN))
3025 return 0;
3026
3027 rcu_read_lock();
3028 while (owner_running(lock, owner)) {
3029 if (need_resched())
3030 break;
3031
3032 arch_mutex_cpu_relax();
3033 }
3034 rcu_read_unlock();
3035
3036 /*
3037 * We break out the loop above on need_resched() and when the
3038 * owner changed, which is a sign for heavy contention. Return
3039 * success only when lock->owner is NULL.
3040 */
3041 return lock->owner == NULL;
3042}
3043#endif
3044
3045#ifdef CONFIG_PREEMPT 3064#ifdef CONFIG_PREEMPT
3046/* 3065/*
3047 * this is the entry point to schedule() from in-kernel preemption 3066 * this is the entry point to schedule() from in-kernel preemption
@@ -3082,11 +3101,13 @@ EXPORT_SYMBOL(preempt_schedule);
3082asmlinkage void __sched preempt_schedule_irq(void) 3101asmlinkage void __sched preempt_schedule_irq(void)
3083{ 3102{
3084 struct thread_info *ti = current_thread_info(); 3103 struct thread_info *ti = current_thread_info();
3104 enum ctx_state prev_state;
3085 3105
3086 /* Catch callers which need to be fixed */ 3106 /* Catch callers which need to be fixed */
3087 BUG_ON(ti->preempt_count || !irqs_disabled()); 3107 BUG_ON(ti->preempt_count || !irqs_disabled());
3088 3108
3089 user_exit(); 3109 prev_state = exception_enter();
3110
3090 do { 3111 do {
3091 add_preempt_count(PREEMPT_ACTIVE); 3112 add_preempt_count(PREEMPT_ACTIVE);
3092 local_irq_enable(); 3113 local_irq_enable();
@@ -3100,6 +3121,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
3100 */ 3121 */
3101 barrier(); 3122 barrier();
3102 } while (need_resched()); 3123 } while (need_resched());
3124
3125 exception_exit(prev_state);
3103} 3126}
3104 3127
3105#endif /* CONFIG_PREEMPT */ 3128#endif /* CONFIG_PREEMPT */
@@ -4126,6 +4149,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4126 get_task_struct(p); 4149 get_task_struct(p);
4127 rcu_read_unlock(); 4150 rcu_read_unlock();
4128 4151
4152 if (p->flags & PF_NO_SETAFFINITY) {
4153 retval = -EINVAL;
4154 goto out_put_task;
4155 }
4129 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4156 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4130 retval = -ENOMEM; 4157 retval = -ENOMEM;
4131 goto out_put_task; 4158 goto out_put_task;
@@ -4626,6 +4653,7 @@ void sched_show_task(struct task_struct *p)
4626 task_pid_nr(p), ppid, 4653 task_pid_nr(p), ppid,
4627 (unsigned long)task_thread_info(p)->flags); 4654 (unsigned long)task_thread_info(p)->flags);
4628 4655
4656 print_worker_info(KERN_INFO, p);
4629 show_stack(p, NULL); 4657 show_stack(p, NULL);
4630} 4658}
4631 4659
@@ -4773,11 +4801,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4773 goto out; 4801 goto out;
4774 } 4802 }
4775 4803
4776 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
4777 ret = -EINVAL;
4778 goto out;
4779 }
4780
4781 do_set_cpus_allowed(p, new_mask); 4804 do_set_cpus_allowed(p, new_mask);
4782 4805
4783 /* Can the task run on the task's current CPU? If so, we're done */ 4806 /* Can the task run on the task's current CPU? If so, we're done */
@@ -4999,7 +5022,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
4999} 5022}
5000 5023
5001static int min_load_idx = 0; 5024static int min_load_idx = 0;
5002static int max_load_idx = CPU_LOAD_IDX_MAX; 5025static int max_load_idx = CPU_LOAD_IDX_MAX-1;
5003 5026
5004static void 5027static void
5005set_table_entry(struct ctl_table *entry, 5028set_table_entry(struct ctl_table *entry,
@@ -6248,7 +6271,7 @@ static void sched_init_numa(void)
6248 * 'level' contains the number of unique distances, excluding the 6271 * 'level' contains the number of unique distances, excluding the
6249 * identity distance node_distance(i,i). 6272 * identity distance node_distance(i,i).
6250 * 6273 *
6251 * The sched_domains_nume_distance[] array includes the actual distance 6274 * The sched_domains_numa_distance[] array includes the actual distance
6252 * numbers. 6275 * numbers.
6253 */ 6276 */
6254 6277
@@ -6861,11 +6884,15 @@ int in_sched_functions(unsigned long addr)
6861} 6884}
6862 6885
6863#ifdef CONFIG_CGROUP_SCHED 6886#ifdef CONFIG_CGROUP_SCHED
6887/*
6888 * Default task group.
6889 * Every task in system belongs to this group at bootup.
6890 */
6864struct task_group root_task_group; 6891struct task_group root_task_group;
6865LIST_HEAD(task_groups); 6892LIST_HEAD(task_groups);
6866#endif 6893#endif
6867 6894
6868DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6895DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6869 6896
6870void __init sched_init(void) 6897void __init sched_init(void)
6871{ 6898{
@@ -6902,7 +6929,7 @@ void __init sched_init(void)
6902#endif /* CONFIG_RT_GROUP_SCHED */ 6929#endif /* CONFIG_RT_GROUP_SCHED */
6903#ifdef CONFIG_CPUMASK_OFFSTACK 6930#ifdef CONFIG_CPUMASK_OFFSTACK
6904 for_each_possible_cpu(i) { 6931 for_each_possible_cpu(i) {
6905 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 6932 per_cpu(load_balance_mask, i) = (void *)ptr;
6906 ptr += cpumask_size(); 6933 ptr += cpumask_size();
6907 } 6934 }
6908#endif /* CONFIG_CPUMASK_OFFSTACK */ 6935#endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6928,12 +6955,6 @@ void __init sched_init(void)
6928 6955
6929#endif /* CONFIG_CGROUP_SCHED */ 6956#endif /* CONFIG_CGROUP_SCHED */
6930 6957
6931#ifdef CONFIG_CGROUP_CPUACCT
6932 root_cpuacct.cpustat = &kernel_cpustat;
6933 root_cpuacct.cpuusage = alloc_percpu(u64);
6934 /* Too early, not expected to fail */
6935 BUG_ON(!root_cpuacct.cpuusage);
6936#endif
6937 for_each_possible_cpu(i) { 6958 for_each_possible_cpu(i) {
6938 struct rq *rq; 6959 struct rq *rq;
6939 6960
@@ -6997,9 +7018,12 @@ void __init sched_init(void)
6997 INIT_LIST_HEAD(&rq->cfs_tasks); 7018 INIT_LIST_HEAD(&rq->cfs_tasks);
6998 7019
6999 rq_attach_root(rq, &def_root_domain); 7020 rq_attach_root(rq, &def_root_domain);
7000#ifdef CONFIG_NO_HZ 7021#ifdef CONFIG_NO_HZ_COMMON
7001 rq->nohz_flags = 0; 7022 rq->nohz_flags = 0;
7002#endif 7023#endif
7024#ifdef CONFIG_NO_HZ_FULL
7025 rq->last_sched_tick = 0;
7026#endif
7003#endif 7027#endif
7004 init_rq_hrtick(rq); 7028 init_rq_hrtick(rq);
7005 atomic_set(&rq->nr_iowait, 0); 7029 atomic_set(&rq->nr_iowait, 0);
@@ -7455,7 +7479,7 @@ unlock:
7455 return err; 7479 return err;
7456} 7480}
7457 7481
7458int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7482static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7459{ 7483{
7460 u64 rt_runtime, rt_period; 7484 u64 rt_runtime, rt_period;
7461 7485
@@ -7467,7 +7491,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7467 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7491 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7468} 7492}
7469 7493
7470long sched_group_rt_runtime(struct task_group *tg) 7494static long sched_group_rt_runtime(struct task_group *tg)
7471{ 7495{
7472 u64 rt_runtime_us; 7496 u64 rt_runtime_us;
7473 7497
@@ -7479,7 +7503,7 @@ long sched_group_rt_runtime(struct task_group *tg)
7479 return rt_runtime_us; 7503 return rt_runtime_us;
7480} 7504}
7481 7505
7482int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7506static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7483{ 7507{
7484 u64 rt_runtime, rt_period; 7508 u64 rt_runtime, rt_period;
7485 7509
@@ -7492,7 +7516,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7492 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7516 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7493} 7517}
7494 7518
7495long sched_group_rt_period(struct task_group *tg) 7519static long sched_group_rt_period(struct task_group *tg)
7496{ 7520{
7497 u64 rt_period_us; 7521 u64 rt_period_us;
7498 7522
@@ -7527,7 +7551,7 @@ static int sched_rt_global_constraints(void)
7527 return ret; 7551 return ret;
7528} 7552}
7529 7553
7530int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7554static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7531{ 7555{
7532 /* Don't accept realtime tasks when there is no way for them to run */ 7556 /* Don't accept realtime tasks when there is no way for them to run */
7533 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7557 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -8035,226 +8059,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8035 8059
8036#endif /* CONFIG_CGROUP_SCHED */ 8060#endif /* CONFIG_CGROUP_SCHED */
8037 8061
8038#ifdef CONFIG_CGROUP_CPUACCT
8039
8040/*
8041 * CPU accounting code for task groups.
8042 *
8043 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
8044 * (balbir@in.ibm.com).
8045 */
8046
8047struct cpuacct root_cpuacct;
8048
8049/* create a new cpu accounting group */
8050static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
8051{
8052 struct cpuacct *ca;
8053
8054 if (!cgrp->parent)
8055 return &root_cpuacct.css;
8056
8057 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8058 if (!ca)
8059 goto out;
8060
8061 ca->cpuusage = alloc_percpu(u64);
8062 if (!ca->cpuusage)
8063 goto out_free_ca;
8064
8065 ca->cpustat = alloc_percpu(struct kernel_cpustat);
8066 if (!ca->cpustat)
8067 goto out_free_cpuusage;
8068
8069 return &ca->css;
8070
8071out_free_cpuusage:
8072 free_percpu(ca->cpuusage);
8073out_free_ca:
8074 kfree(ca);
8075out:
8076 return ERR_PTR(-ENOMEM);
8077}
8078
8079/* destroy an existing cpu accounting group */
8080static void cpuacct_css_free(struct cgroup *cgrp)
8081{
8082 struct cpuacct *ca = cgroup_ca(cgrp);
8083
8084 free_percpu(ca->cpustat);
8085 free_percpu(ca->cpuusage);
8086 kfree(ca);
8087}
8088
8089static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8090{
8091 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8092 u64 data;
8093
8094#ifndef CONFIG_64BIT
8095 /*
8096 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8097 */
8098 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8099 data = *cpuusage;
8100 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8101#else
8102 data = *cpuusage;
8103#endif
8104
8105 return data;
8106}
8107
8108static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8109{
8110 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8111
8112#ifndef CONFIG_64BIT
8113 /*
8114 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8115 */
8116 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8117 *cpuusage = val;
8118 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8119#else
8120 *cpuusage = val;
8121#endif
8122}
8123
8124/* return total cpu usage (in nanoseconds) of a group */
8125static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8126{
8127 struct cpuacct *ca = cgroup_ca(cgrp);
8128 u64 totalcpuusage = 0;
8129 int i;
8130
8131 for_each_present_cpu(i)
8132 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8133
8134 return totalcpuusage;
8135}
8136
8137static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8138 u64 reset)
8139{
8140 struct cpuacct *ca = cgroup_ca(cgrp);
8141 int err = 0;
8142 int i;
8143
8144 if (reset) {
8145 err = -EINVAL;
8146 goto out;
8147 }
8148
8149 for_each_present_cpu(i)
8150 cpuacct_cpuusage_write(ca, i, 0);
8151
8152out:
8153 return err;
8154}
8155
8156static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8157 struct seq_file *m)
8158{
8159 struct cpuacct *ca = cgroup_ca(cgroup);
8160 u64 percpu;
8161 int i;
8162
8163 for_each_present_cpu(i) {
8164 percpu = cpuacct_cpuusage_read(ca, i);
8165 seq_printf(m, "%llu ", (unsigned long long) percpu);
8166 }
8167 seq_printf(m, "\n");
8168 return 0;
8169}
8170
8171static const char *cpuacct_stat_desc[] = {
8172 [CPUACCT_STAT_USER] = "user",
8173 [CPUACCT_STAT_SYSTEM] = "system",
8174};
8175
8176static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8177 struct cgroup_map_cb *cb)
8178{
8179 struct cpuacct *ca = cgroup_ca(cgrp);
8180 int cpu;
8181 s64 val = 0;
8182
8183 for_each_online_cpu(cpu) {
8184 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8185 val += kcpustat->cpustat[CPUTIME_USER];
8186 val += kcpustat->cpustat[CPUTIME_NICE];
8187 }
8188 val = cputime64_to_clock_t(val);
8189 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8190
8191 val = 0;
8192 for_each_online_cpu(cpu) {
8193 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8194 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8195 val += kcpustat->cpustat[CPUTIME_IRQ];
8196 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8197 }
8198
8199 val = cputime64_to_clock_t(val);
8200 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8201
8202 return 0;
8203}
8204
8205static struct cftype files[] = {
8206 {
8207 .name = "usage",
8208 .read_u64 = cpuusage_read,
8209 .write_u64 = cpuusage_write,
8210 },
8211 {
8212 .name = "usage_percpu",
8213 .read_seq_string = cpuacct_percpu_seq_read,
8214 },
8215 {
8216 .name = "stat",
8217 .read_map = cpuacct_stats_show,
8218 },
8219 { } /* terminate */
8220};
8221
8222/*
8223 * charge this task's execution time to its accounting group.
8224 *
8225 * called with rq->lock held.
8226 */
8227void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8228{
8229 struct cpuacct *ca;
8230 int cpu;
8231
8232 if (unlikely(!cpuacct_subsys.active))
8233 return;
8234
8235 cpu = task_cpu(tsk);
8236
8237 rcu_read_lock();
8238
8239 ca = task_ca(tsk);
8240
8241 for (; ca; ca = parent_ca(ca)) {
8242 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8243 *cpuusage += cputime;
8244 }
8245
8246 rcu_read_unlock();
8247}
8248
8249struct cgroup_subsys cpuacct_subsys = {
8250 .name = "cpuacct",
8251 .css_alloc = cpuacct_css_alloc,
8252 .css_free = cpuacct_css_free,
8253 .subsys_id = cpuacct_subsys_id,
8254 .base_cftypes = files,
8255};
8256#endif /* CONFIG_CGROUP_CPUACCT */
8257
8258void dump_cpu_task(int cpu) 8062void dump_cpu_task(int cpu)
8259{ 8063{
8260 pr_info("Task dump for CPU %d:\n", cpu); 8064 pr_info("Task dump for CPU %d:\n", cpu);
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000000..dbb7e2cd95eb
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@
1#include <linux/cgroup.h>
2#include <linux/slab.h>
3#include <linux/percpu.h>
4#include <linux/spinlock.h>
5#include <linux/cpumask.h>
6#include <linux/seq_file.h>
7#include <linux/rcupdate.h>
8#include <linux/kernel_stat.h>
9#include <linux/err.h>
10
11#include "sched.h"
12
13/*
14 * CPU accounting code for task groups.
15 *
16 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
17 * (balbir@in.ibm.com).
18 */
19
20/* Time spent by the tasks of the cpu accounting group executing in ... */
21enum cpuacct_stat_index {
22 CPUACCT_STAT_USER, /* ... user mode */
23 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
24
25 CPUACCT_STAT_NSTATS,
26};
27
28/* track cpu usage of a group of tasks and its child groups */
29struct cpuacct {
30 struct cgroup_subsys_state css;
31 /* cpuusage holds pointer to a u64-type object on every cpu */
32 u64 __percpu *cpuusage;
33 struct kernel_cpustat __percpu *cpustat;
34};
35
36/* return cpu accounting group corresponding to this container */
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
40 struct cpuacct, css);
41}
42
43/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53}
54
55static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{
57 if (!ca->css.cgroup->parent)
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60}
61
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
63static struct cpuacct root_cpuacct = {
64 .cpustat = &kernel_cpustat,
65 .cpuusage = &root_cpuacct_cpuusage,
66};
67
68/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
70{
71 struct cpuacct *ca;
72
73 if (!cgrp->parent)
74 return &root_cpuacct.css;
75
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
77 if (!ca)
78 goto out;
79
80 ca->cpuusage = alloc_percpu(u64);
81 if (!ca->cpuusage)
82 goto out_free_ca;
83
84 ca->cpustat = alloc_percpu(struct kernel_cpustat);
85 if (!ca->cpustat)
86 goto out_free_cpuusage;
87
88 return &ca->css;
89
90out_free_cpuusage:
91 free_percpu(ca->cpuusage);
92out_free_ca:
93 kfree(ca);
94out:
95 return ERR_PTR(-ENOMEM);
96}
97
98/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp)
100{
101 struct cpuacct *ca = cgroup_ca(cgrp);
102
103 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage);
105 kfree(ca);
106}
107
108static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
109{
110 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
111 u64 data;
112
113#ifndef CONFIG_64BIT
114 /*
115 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
116 */
117 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
118 data = *cpuusage;
119 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
120#else
121 data = *cpuusage;
122#endif
123
124 return data;
125}
126
127static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
128{
129 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
130
131#ifndef CONFIG_64BIT
132 /*
133 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
134 */
135 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
136 *cpuusage = val;
137 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
138#else
139 *cpuusage = val;
140#endif
141}
142
143/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
145{
146 struct cpuacct *ca = cgroup_ca(cgrp);
147 u64 totalcpuusage = 0;
148 int i;
149
150 for_each_present_cpu(i)
151 totalcpuusage += cpuacct_cpuusage_read(ca, i);
152
153 return totalcpuusage;
154}
155
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
157 u64 reset)
158{
159 struct cpuacct *ca = cgroup_ca(cgrp);
160 int err = 0;
161 int i;
162
163 if (reset) {
164 err = -EINVAL;
165 goto out;
166 }
167
168 for_each_present_cpu(i)
169 cpuacct_cpuusage_write(ca, i, 0);
170
171out:
172 return err;
173}
174
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
176 struct seq_file *m)
177{
178 struct cpuacct *ca = cgroup_ca(cgroup);
179 u64 percpu;
180 int i;
181
182 for_each_present_cpu(i) {
183 percpu = cpuacct_cpuusage_read(ca, i);
184 seq_printf(m, "%llu ", (unsigned long long) percpu);
185 }
186 seq_printf(m, "\n");
187 return 0;
188}
189
190static const char * const cpuacct_stat_desc[] = {
191 [CPUACCT_STAT_USER] = "user",
192 [CPUACCT_STAT_SYSTEM] = "system",
193};
194
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
196 struct cgroup_map_cb *cb)
197{
198 struct cpuacct *ca = cgroup_ca(cgrp);
199 int cpu;
200 s64 val = 0;
201
202 for_each_online_cpu(cpu) {
203 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
204 val += kcpustat->cpustat[CPUTIME_USER];
205 val += kcpustat->cpustat[CPUTIME_NICE];
206 }
207 val = cputime64_to_clock_t(val);
208 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
209
210 val = 0;
211 for_each_online_cpu(cpu) {
212 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
213 val += kcpustat->cpustat[CPUTIME_SYSTEM];
214 val += kcpustat->cpustat[CPUTIME_IRQ];
215 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
216 }
217
218 val = cputime64_to_clock_t(val);
219 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
220
221 return 0;
222}
223
224static struct cftype files[] = {
225 {
226 .name = "usage",
227 .read_u64 = cpuusage_read,
228 .write_u64 = cpuusage_write,
229 },
230 {
231 .name = "usage_percpu",
232 .read_seq_string = cpuacct_percpu_seq_read,
233 },
234 {
235 .name = "stat",
236 .read_map = cpuacct_stats_show,
237 },
238 { } /* terminate */
239};
240
241/*
242 * charge this task's execution time to its accounting group.
243 *
244 * called with rq->lock held.
245 */
246void cpuacct_charge(struct task_struct *tsk, u64 cputime)
247{
248 struct cpuacct *ca;
249 int cpu;
250
251 cpu = task_cpu(tsk);
252
253 rcu_read_lock();
254
255 ca = task_ca(tsk);
256
257 while (true) {
258 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
259 *cpuusage += cputime;
260
261 ca = parent_ca(ca);
262 if (!ca)
263 break;
264 }
265
266 rcu_read_unlock();
267}
268
269/*
270 * Add user/system time to cpuacct.
271 *
272 * Note: it's the caller that updates the account of the root cgroup.
273 */
274void cpuacct_account_field(struct task_struct *p, int index, u64 val)
275{
276 struct kernel_cpustat *kcpustat;
277 struct cpuacct *ca;
278
279 rcu_read_lock();
280 ca = task_ca(p);
281 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca);
285 }
286 rcu_read_unlock();
287}
288
289struct cgroup_subsys cpuacct_subsys = {
290 .name = "cpuacct",
291 .css_alloc = cpuacct_css_alloc,
292 .css_free = cpuacct_css_free,
293 .subsys_id = cpuacct_subsys_id,
294 .base_cftypes = files,
295 .early_init = 1,
296};
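cpuacct_charge() in the new file walks from the task's accounting group up through every ancestor and adds the run time at each level, so a parent group's usage always includes its children's. A condensed userspace sketch of that upward walk over a parent-linked hierarchy (one plain counter per group instead of a per-cpu counter, and no RCU):

#include <stdio.h>
#include <stdint.h>

struct group {
        const char *name;
        uint64_t cpuusage;      /* stands in for the per-cpu counter */
        struct group *parent;   /* NULL at the root */
};

/* Charge 'cputime' ns to the task's group and every ancestor. */
static void charge(struct group *task_group, uint64_t cputime)
{
        for (struct group *g = task_group; g; g = g->parent)
                g->cpuusage += cputime;
}

int main(void)
{
        struct group root  = { "root",  0, NULL   };
        struct group users = { "users", 0, &root  };
        struct group alice = { "alice", 0, &users };

        charge(&alice, 1000000);        /* 1 ms of CPU time in alice's group */
        charge(&users, 500000);         /* 0.5 ms directly in the parent */

        printf("%s=%llu %s=%llu %s=%llu\n",
               alice.name, (unsigned long long)alice.cpuusage,
               users.name, (unsigned long long)users.cpuusage,
               root.name,  (unsigned long long)root.cpuusage);
        return 0;
}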
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000000..ed605624a5e7
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
1#ifdef CONFIG_CGROUP_CPUACCT
2
3extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
4extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
5
6#else
7
8static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9{
10}
11
12static inline void
13cpuacct_account_field(struct task_struct *p, int index, u64 val)
14{
15}
16
17#endif
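The new cpuacct.h uses the usual kernel pattern for optional features: real declarations when the config option is set, static inline no-ops otherwise, so callers compile either way without #ifdefs at every call site. A tiny standalone illustration of the pattern, with a hypothetical DEMO_FEATURE macro standing in for CONFIG_CGROUP_CPUACCT:

#include <stdio.h>

/* Toggle the "config option" at build time: cc -DDEMO_FEATURE demo.c */
#ifdef DEMO_FEATURE
static void feature_charge(unsigned long ns)
{
        printf("charging %lu ns\n", ns);
}
#else
/* Feature compiled out: the call collapses to nothing. */
static inline void feature_charge(unsigned long ns) { (void)ns; }
#endif

int main(void)
{
        /* The caller never needs to know whether the feature is built in. */
        feature_charge(1000);
        return 0;
}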
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ed12cbb135f4..cc2dc3eea8a3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void)
115static inline void task_group_account_field(struct task_struct *p, int index, 115static inline void task_group_account_field(struct task_struct *p, int index,
116 u64 tmp) 116 u64 tmp)
117{ 117{
118#ifdef CONFIG_CGROUP_CPUACCT
119 struct kernel_cpustat *kcpustat;
120 struct cpuacct *ca;
121#endif
122 /* 118 /*
123 * Since all updates are sure to touch the root cgroup, we 119 * Since all updates are sure to touch the root cgroup, we
124 * get ourselves ahead and touch it first. If the root cgroup 120 * get ourselves ahead and touch it first. If the root cgroup
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
127 */ 123 */
128 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 124 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
129 125
130#ifdef CONFIG_CGROUP_CPUACCT 126 cpuacct_account_field(p, index, tmp);
131 if (unlikely(!cpuacct_subsys.active))
132 return;
133
134 rcu_read_lock();
135 ca = task_ca(p);
136 while (ca && (ca != &root_cpuacct)) {
137 kcpustat = this_cpu_ptr(ca->cpustat);
138 kcpustat->cpustat[index] += tmp;
139 ca = parent_ca(ca);
140 }
141 rcu_read_unlock();
142#endif
143} 127}
144 128
145/* 129/*
@@ -310,7 +294,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
310 294
311 t = tsk; 295 t = tsk;
312 do { 296 do {
313 task_cputime(tsk, &utime, &stime); 297 task_cputime(t, &utime, &stime);
314 times->utime += utime; 298 times->utime += utime;
315 times->stime += stime; 299 times->stime += stime;
316 times->sum_exec_runtime += task_sched_runtime(t); 300 times->sum_exec_runtime += task_sched_runtime(t);
@@ -388,7 +372,84 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
388 struct rq *rq) {} 372 struct rq *rq) {}
389#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 373#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
390 374
391#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 375/*
376 * Use precise platform statistics if available:
377 */
378#ifdef CONFIG_VIRT_CPU_ACCOUNTING
379
380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
381void vtime_task_switch(struct task_struct *prev)
382{
383 if (!vtime_accounting_enabled())
384 return;
385
386 if (is_idle_task(prev))
387 vtime_account_idle(prev);
388 else
389 vtime_account_system(prev);
390
391#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
392 vtime_account_user(prev);
393#endif
394 arch_vtime_task_switch(prev);
395}
396#endif
397
398/*
399 * Archs that account the whole time spent in the idle task
400 * (outside irq) as idle time can rely on this and just implement
401 * vtime_account_system() and vtime_account_idle(). Archs that
402 * have other meaning of the idle time (s390 only includes the
403 * time spent by the CPU when it's in low power mode) must override
404 * vtime_account().
405 */
406#ifndef __ARCH_HAS_VTIME_ACCOUNT
407void vtime_account_irq_enter(struct task_struct *tsk)
408{
409 if (!vtime_accounting_enabled())
410 return;
411
412 if (!in_interrupt()) {
413 /*
414 * If we interrupted user, context_tracking_in_user()
415 * is 1 because the context tracking don't hook
416 * on irq entry/exit. This way we know if
417 * we need to flush user time on kernel entry.
418 */
419 if (context_tracking_in_user()) {
420 vtime_account_user(tsk);
421 return;
422 }
423
424 if (is_idle_task(tsk)) {
425 vtime_account_idle(tsk);
426 return;
427 }
428 }
429 vtime_account_system(tsk);
430}
431EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
432#endif /* __ARCH_HAS_VTIME_ACCOUNT */
433#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
434
435
436#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
437void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
438{
439 *ut = p->utime;
440 *st = p->stime;
441}
442
443void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
444{
445 struct task_cputime cputime;
446
447 thread_group_cputime(p, &cputime);
448
449 *ut = cputime.utime;
450 *st = cputime.stime;
451}
452#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
392/* 453/*
393 * Account a single tick of cpu time. 454 * Account a single tick of cpu time.
394 * @p: the process that the cpu time gets accounted to 455 * @p: the process that the cpu time gets accounted to
@@ -443,96 +504,50 @@ void account_idle_ticks(unsigned long ticks)
443 504
444 account_idle_time(jiffies_to_cputime(ticks)); 505 account_idle_time(jiffies_to_cputime(ticks));
445} 506}
446#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
447
448/*
449 * Use precise platform statistics if available:
450 */
451#ifdef CONFIG_VIRT_CPU_ACCOUNTING
452void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
453{
454 *ut = p->utime;
455 *st = p->stime;
456}
457
458void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
459{
460 struct task_cputime cputime;
461
462 thread_group_cputime(p, &cputime);
463
464 *ut = cputime.utime;
465 *st = cputime.stime;
466}
467
468#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
469void vtime_task_switch(struct task_struct *prev)
470{
471 if (!vtime_accounting_enabled())
472 return;
473
474 if (is_idle_task(prev))
475 vtime_account_idle(prev);
476 else
477 vtime_account_system(prev);
478
479#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
480 vtime_account_user(prev);
481#endif
482 arch_vtime_task_switch(prev);
483}
484#endif
485 507
486/* 508/*
487 * Archs that account the whole time spent in the idle task 509 * Perform (stime * rtime) / total, but avoid multiplication overflow by
 488 * (outside irq) as idle time can rely on this and just implement 510 * losing precision when the numbers are big.
489 * vtime_account_system() and vtime_account_idle(). Archs that
490 * have other meaning of the idle time (s390 only includes the
491 * time spent by the CPU when it's in low power mode) must override
492 * vtime_account().
493 */ 511 */
494#ifndef __ARCH_HAS_VTIME_ACCOUNT 512static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
495void vtime_account_irq_enter(struct task_struct *tsk)
496{ 513{
497 if (!vtime_accounting_enabled()) 514 u64 scaled;
498 return;
499 515
500 if (!in_interrupt()) { 516 for (;;) {
501 /* 517 /* Make sure "rtime" is the bigger of stime/rtime */
502 * If we interrupted user, context_tracking_in_user() 518 if (stime > rtime) {
503 * is 1 because the context tracking don't hook 519 u64 tmp = rtime; rtime = stime; stime = tmp;
504 * on irq entry/exit. This way we know if
505 * we need to flush user time on kernel entry.
506 */
507 if (context_tracking_in_user()) {
508 vtime_account_user(tsk);
509 return;
510 } 520 }
511 521
512 if (is_idle_task(tsk)) { 522 /* Make sure 'total' fits in 32 bits */
513 vtime_account_idle(tsk); 523 if (total >> 32)
514 return; 524 goto drop_precision;
515 }
516 }
517 vtime_account_system(tsk);
518}
519EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
520#endif /* __ARCH_HAS_VTIME_ACCOUNT */
521 525
522#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ 526 /* Does rtime (and thus stime) fit in 32 bits? */
527 if (!(rtime >> 32))
528 break;
523 529
524static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) 530 /* Can we just balance rtime/stime rather than dropping bits? */
525{ 531 if (stime >> 31)
526 u64 temp = (__force u64) rtime; 532 goto drop_precision;
527 533
528 temp *= (__force u64) stime; 534 /* We can grow stime and shrink rtime and try to make them both fit */
535 stime <<= 1;
536 rtime >>= 1;
537 continue;
529 538
530 if (sizeof(cputime_t) == 4) 539drop_precision:
531 temp = div_u64(temp, (__force u32) total); 540 /* We drop from rtime, it has more bits than stime */
532 else 541 rtime >>= 1;
533 temp = div64_u64(temp, (__force u64) total); 542 total >>= 1;
543 }
534 544
535 return (__force cputime_t) temp; 545 /*
546 * Make sure gcc understands that this is a 32x32->64 multiply,
547 * followed by a 64/32->64 divide.
548 */
549 scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
550 return (__force cputime_t) scaled;
536} 551}
537 552
538/* 553/*
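The rewritten scale_stime() above computes stime * rtime / total without a 128-bit intermediate: it keeps shifting precision out of rtime/total, or rebalancing stime against rtime, until a 32x32->64 multiply followed by a 64/32 divide is enough. A userspace copy of the loop as a sketch (cputime_t flattened to u64, div_u64() replaced by plain division, demo inputs chosen so total stays nonzero as the kernel's callers guarantee):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;
typedef uint32_t u32;

/* stime * rtime / total without overflowing 64 bits in the multiply. */
static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
        for (;;) {
                /* Make sure "rtime" is the bigger of stime/rtime */
                if (stime > rtime) {
                        u64 tmp = rtime; rtime = stime; stime = tmp;
                }

                /* Make sure 'total' fits in 32 bits */
                if (total >> 32)
                        goto drop_precision;

                /* Does rtime (and thus stime) fit in 32 bits? */
                if (!(rtime >> 32))
                        break;

                /* Can we just balance rtime/stime rather than dropping bits? */
                if (stime >> 31)
                        goto drop_precision;

                /* Grow stime and shrink rtime to try to make both fit */
                stime <<= 1;
                rtime >>= 1;
                continue;

drop_precision:
                /* Drop bits from rtime (it has more of them) and from total */
                rtime >>= 1;
                total >>= 1;
        }

        /* 32x32->64 multiply, then 64/32 divide */
        return (u64)(u32)stime * (u64)(u32)rtime / (u32)total;
}

int main(void)
{
        u64 stime = 123456789ULL, rtime = 987654321012ULL, total = 333333333333ULL;

        printf("scaled stime = %llu (approximates stime*rtime/total)\n",
               (unsigned long long)scale_stime(stime, rtime, total));
        return 0;
}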
@@ -543,7 +558,13 @@ static void cputime_adjust(struct task_cputime *curr,
543 struct cputime *prev, 558 struct cputime *prev,
544 cputime_t *ut, cputime_t *st) 559 cputime_t *ut, cputime_t *st)
545{ 560{
546 cputime_t rtime, stime, total; 561 cputime_t rtime, stime, utime, total;
562
563 if (vtime_accounting_enabled()) {
564 *ut = curr->utime;
565 *st = curr->stime;
566 return;
567 }
547 568
548 stime = curr->stime; 569 stime = curr->stime;
549 total = stime + curr->utime; 570 total = stime + curr->utime;
@@ -560,10 +581,22 @@ static void cputime_adjust(struct task_cputime *curr,
560 */ 581 */
561 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 582 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
562 583
563 if (total) 584 /*
564 stime = scale_stime(stime, rtime, total); 585 * Update userspace visible utime/stime values only if actual execution
 565 else 586 * time is bigger than already exported. Note that this can happen: we
 587 * may have provided bigger values due to scaling inaccuracy on big numbers.
588 */
589 if (prev->stime + prev->utime >= rtime)
590 goto out;
591
592 if (total) {
593 stime = scale_stime((__force u64)stime,
594 (__force u64)rtime, (__force u64)total);
595 utime = rtime - stime;
596 } else {
566 stime = rtime; 597 stime = rtime;
598 utime = 0;
599 }
567 600
568 /* 601 /*
569 * If the tick based count grows faster than the scheduler one, 602 * If the tick based count grows faster than the scheduler one,
@@ -571,8 +604,9 @@ static void cputime_adjust(struct task_cputime *curr,
571 * Let's enforce monotonicity. 604 * Let's enforce monotonicity.
572 */ 605 */
573 prev->stime = max(prev->stime, stime); 606 prev->stime = max(prev->stime, stime);
574 prev->utime = max(prev->utime, rtime - prev->stime); 607 prev->utime = max(prev->utime, utime);
575 608
609out:
576 *ut = prev->utime; 610 *ut = prev->utime;
577 *st = prev->stime; 611 *st = prev->stime;
578} 612}
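cputime_adjust() above now publishes new utime/stime only once the scheduler runtime has grown past what was already exported, and clamps each component with max() so the user-visible numbers never move backwards. A small userspace sketch of that monotonic publishing step (types flattened to u64 and the scaling itself left out):

#include <stdio.h>
#include <stdint.h>

struct prev_cputime {
        uint64_t utime, stime;  /* last values shown to userspace */
};

/* Fold a new (possibly jittery) sample into the monotonic published pair. */
static void cputime_publish(struct prev_cputime *prev,
                            uint64_t stime, uint64_t utime, uint64_t rtime)
{
        /* Nothing ran since the last report: keep the old values. */
        if (prev->stime + prev->utime >= rtime)
                return;

        /* Never let either component go backwards. */
        if (stime > prev->stime)
                prev->stime = stime;
        if (utime > prev->utime)
                prev->utime = utime;
}

int main(void)
{
        struct prev_cputime prev = { .utime = 700, .stime = 300 };

        /* New sample claims less total runtime: ignored, values stay put. */
        cputime_publish(&prev, 250, 650, 900);
        printf("after stale sample: utime=%llu stime=%llu\n",
               (unsigned long long)prev.utime, (unsigned long long)prev.stime);

        /* New sample with more runtime: values only move forward. */
        cputime_publish(&prev, 280, 920, 1200);
        printf("after fresh sample: utime=%llu stime=%llu\n",
               (unsigned long long)prev.utime, (unsigned long long)prev.stime);
        return 0;
}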
@@ -597,7 +631,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
597 thread_group_cputime(p, &cputime); 631 thread_group_cputime(p, &cputime);
598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 632 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
599} 633}
600#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ 634#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
601 635
602#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 636#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
603static unsigned long long vtime_delta(struct task_struct *tsk) 637static unsigned long long vtime_delta(struct task_struct *tsk)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e5986fc5..c61a614465c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
431 * Scheduling class tree data structure manipulation methods: 431 * Scheduling class tree data structure manipulation methods:
432 */ 432 */
433 433
434static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) 434static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
435{ 435{
436 s64 delta = (s64)(vruntime - min_vruntime); 436 s64 delta = (s64)(vruntime - max_vruntime);
437 if (delta > 0) 437 if (delta > 0)
438 min_vruntime = vruntime; 438 max_vruntime = vruntime;
439 439
440 return min_vruntime; 440 return max_vruntime;
441} 441}
442 442
443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) 443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
@@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
473 vruntime = min_vruntime(vruntime, se->vruntime); 473 vruntime = min_vruntime(vruntime, se->vruntime);
474 } 474 }
475 475
476 /* ensure we never gain time by being placed backwards. */
476 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 477 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477#ifndef CONFIG_64BIT 478#ifndef CONFIG_64BIT
478 smp_wmb(); 479 smp_wmb();
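The signed-delta comparison in max_vruntime()/min_vruntime() above keeps working when the unsigned vruntime counters wrap, which a plain unsigned compare would not. A small self-contained illustration with assumed values:

#include <assert.h>
#include <stdint.h>

static uint64_t max_vruntime_sketch(uint64_t max_vruntime, uint64_t vruntime)
{
	int64_t delta = (int64_t)(vruntime - max_vruntime);

	if (delta > 0)
		max_vruntime = vruntime;
	return max_vruntime;
}

int main(void)
{
	uint64_t before_wrap = UINT64_MAX - 5;	/* just below the wrap point */
	uint64_t after_wrap  = 10;		/* logically later, numerically smaller */

	/* The signed delta sees after_wrap as ahead of before_wrap ... */
	assert(max_vruntime_sketch(before_wrap, after_wrap) == after_wrap);
	/* ... whereas a naive unsigned compare would keep before_wrap. */
	assert(!(after_wrap > before_wrap));
	return 0;
}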
@@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
652} 653}
653 654
654/* 655/*
655 * We calculate the vruntime slice of a to be inserted task 656 * We calculate the vruntime slice of a to-be-inserted task.
656 * 657 *
657 * vs = s/w 658 * vs = s/w
658 */ 659 */
@@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 1563 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */ 1564 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564} 1565}
1566
1567/*
1568 * Update the rq's load with the elapsed running time before entering
 1569 * idle. If the last scheduled task is not a CFS task, idle_enter will
1570 * be the only way to update the runnable statistic.
1571 */
1572void idle_enter_fair(struct rq *this_rq)
1573{
1574 update_rq_runnable_avg(this_rq, 1);
1575}
1576
1577/*
1578 * Update the rq's load with the elapsed idle time before a task is
 1579 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
1580 * be the only way to update the runnable statistic.
1581 */
1582void idle_exit_fair(struct rq *this_rq)
1583{
1584 update_rq_runnable_avg(this_rq, 0);
1585}
1586
1565#else 1587#else
1566static inline void update_entity_load_avg(struct sched_entity *se, 1588static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {} 1589 int update_cfs_rq) {}
@@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3874 int tsk_cache_hot = 0; 3896 int tsk_cache_hot = 0;
3875 /* 3897 /*
3876 * We do not migrate tasks that are: 3898 * We do not migrate tasks that are:
3877 * 1) running (obviously), or 3899 * 1) throttled_lb_pair, or
3878 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3900 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3879 * 3) are cache-hot on their current CPU. 3901 * 3) running (obviously), or
3902 * 4) are cache-hot on their current CPU.
3880 */ 3903 */
3904 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3905 return 0;
3906
3881 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3907 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3882 int new_dst_cpu; 3908 int cpu;
3883 3909
3884 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3910 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3885 3911
@@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3894 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 3920 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3895 return 0; 3921 return 0;
3896 3922
 3897			new_dst_cpu = cpumask_first_and(env->dst_grpmask, 3923		/* Prevent re-selecting dst_cpu via env's cpus */
3898 tsk_cpus_allowed(p)); 3924 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3899 if (new_dst_cpu < nr_cpu_ids) { 3925 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
3900 env->flags |= LBF_SOME_PINNED; 3926 env->flags |= LBF_SOME_PINNED;
3901 env->new_dst_cpu = new_dst_cpu; 3927 env->new_dst_cpu = cpu;
3928 break;
3929 }
3902 } 3930 }
3931
3903 return 0; 3932 return 0;
3904 } 3933 }
3905 3934
@@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3920 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); 3949 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3921 if (!tsk_cache_hot || 3950 if (!tsk_cache_hot ||
3922 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 3951 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3923#ifdef CONFIG_SCHEDSTATS 3952
3924 if (tsk_cache_hot) { 3953 if (tsk_cache_hot) {
3925 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 3954 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3926 schedstat_inc(p, se.statistics.nr_forced_migrations); 3955 schedstat_inc(p, se.statistics.nr_forced_migrations);
3927 } 3956 }
3928#endif 3957
3929 return 1; 3958 return 1;
3930 } 3959 }
3931 3960
3932 if (tsk_cache_hot) { 3961 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3933 schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 3962 return 0;
3934 return 0;
3935 }
3936 return 1;
3937} 3963}
3938 3964
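The reordering above runs the cheap, group-wide throttling test first and drops the separate CONFIG_SCHEDSTATS block around the forced-migration accounting. A self-contained sketch of the resulting filter order, using hypothetical stand-in fields rather than real rq/task state:

#include <stdbool.h>

struct task_sketch {
	bool throttled_with_dst;	/* stand-in for throttled_lb_pair() */
	bool allowed_on_dst;		/* stand-in for the cpus_allowed test */
	bool running;			/* stand-in for task_running() */
	bool cache_hot;			/* stand-in for task_hot() */
};

static int can_migrate_task_sketch(const struct task_sketch *p,
				   int nr_balance_failed, int cache_nice_tries)
{
	if (p->throttled_with_dst)		/* 1) throttled_lb_pair */
		return 0;
	if (!p->allowed_on_dst)			/* 2) cpus_allowed (the real code
						 *    may record an alternate dst_cpu) */
		return 0;
	if (p->running)				/* 3) running */
		return 0;
	if (p->cache_hot &&			/* 4) cache-hot, and balancing has not */
	    nr_balance_failed <= cache_nice_tries)	/* failed often enough to force it */
		return 0;
	return 1;
}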
3939/* 3965/*
@@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env)
3948 struct task_struct *p, *n; 3974 struct task_struct *p, *n;
3949 3975
3950 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 3976 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3951 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3952 continue;
3953
3954 if (!can_migrate_task(p, env)) 3977 if (!can_migrate_task(p, env))
3955 continue; 3978 continue;
3956 3979
@@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env)
4002 break; 4025 break;
4003 } 4026 }
4004 4027
4005 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 4028 if (!can_migrate_task(p, env))
4006 goto next; 4029 goto next;
4007 4030
4008 load = task_h_load(p); 4031 load = task_h_load(p);
@@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env)
4013 if ((load / 2) > env->imbalance) 4036 if ((load / 2) > env->imbalance)
4014 goto next; 4037 goto next;
4015 4038
4016 if (!can_migrate_task(p, env))
4017 goto next;
4018
4019 move_task(p, env); 4039 move_task(p, env);
4020 pulled++; 4040 pulled++;
4021 env->imbalance -= load; 4041 env->imbalance -= load;
@@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
4245 return load_idx; 4265 return load_idx;
4246} 4266}
4247 4267
4248unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 4268static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4249{ 4269{
4250 return SCHED_POWER_SCALE; 4270 return SCHED_POWER_SCALE;
4251} 4271}
@@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4255 return default_scale_freq_power(sd, cpu); 4275 return default_scale_freq_power(sd, cpu);
4256} 4276}
4257 4277
4258unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 4278static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4259{ 4279{
4260 unsigned long weight = sd->span_weight; 4280 unsigned long weight = sd->span_weight;
4261 unsigned long smt_gain = sd->smt_gain; 4281 unsigned long smt_gain = sd->smt_gain;
@@ -4270,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4270 return default_scale_smt_power(sd, cpu); 4290 return default_scale_smt_power(sd, cpu);
4271} 4291}
4272 4292
4273unsigned long scale_rt_power(int cpu) 4293static unsigned long scale_rt_power(int cpu)
4274{ 4294{
4275 struct rq *rq = cpu_rq(cpu); 4295 struct rq *rq = cpu_rq(cpu);
4276 u64 total, available, age_stamp, avg; 4296 u64 total, available, age_stamp, avg;
@@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4960#define MAX_PINNED_INTERVAL 512 4980#define MAX_PINNED_INTERVAL 512
4961 4981
4962/* Working cpumask for load_balance and load_balance_newidle. */ 4982/* Working cpumask for load_balance and load_balance_newidle. */
4963DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4983DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4964 4984
4965static int need_active_balance(struct lb_env *env) 4985static int need_active_balance(struct lb_env *env)
4966{ 4986{
@@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4991 int *balance) 5011 int *balance)
4992{ 5012{
4993 int ld_moved, cur_ld_moved, active_balance = 0; 5013 int ld_moved, cur_ld_moved, active_balance = 0;
4994 int lb_iterations, max_lb_iterations;
4995 struct sched_group *group; 5014 struct sched_group *group;
4996 struct rq *busiest; 5015 struct rq *busiest;
4997 unsigned long flags; 5016 unsigned long flags;
4998 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 5017 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
4999 5018
5000 struct lb_env env = { 5019 struct lb_env env = {
5001 .sd = sd, 5020 .sd = sd,
@@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5007 .cpus = cpus, 5026 .cpus = cpus,
5008 }; 5027 };
5009 5028
5029 /*
5030 * For NEWLY_IDLE load_balancing, we don't need to consider
5031 * other cpus in our group
5032 */
5033 if (idle == CPU_NEWLY_IDLE)
5034 env.dst_grpmask = NULL;
5035
5010 cpumask_copy(cpus, cpu_active_mask); 5036 cpumask_copy(cpus, cpu_active_mask);
5011 max_lb_iterations = cpumask_weight(env.dst_grpmask);
5012 5037
5013 schedstat_inc(sd, lb_count[idle]); 5038 schedstat_inc(sd, lb_count[idle]);
5014 5039
@@ -5034,7 +5059,6 @@ redo:
5034 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 5059 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5035 5060
5036 ld_moved = 0; 5061 ld_moved = 0;
5037 lb_iterations = 1;
5038 if (busiest->nr_running > 1) { 5062 if (busiest->nr_running > 1) {
5039 /* 5063 /*
5040 * Attempt to move tasks. If find_busiest_group has found 5064 * Attempt to move tasks. If find_busiest_group has found
@@ -5061,17 +5085,17 @@ more_balance:
5061 double_rq_unlock(env.dst_rq, busiest); 5085 double_rq_unlock(env.dst_rq, busiest);
5062 local_irq_restore(flags); 5086 local_irq_restore(flags);
5063 5087
5064 if (env.flags & LBF_NEED_BREAK) {
5065 env.flags &= ~LBF_NEED_BREAK;
5066 goto more_balance;
5067 }
5068
5069 /* 5088 /*
5070 * some other cpu did the load balance for us. 5089 * some other cpu did the load balance for us.
5071 */ 5090 */
5072 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 5091 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5073 resched_cpu(env.dst_cpu); 5092 resched_cpu(env.dst_cpu);
5074 5093
5094 if (env.flags & LBF_NEED_BREAK) {
5095 env.flags &= ~LBF_NEED_BREAK;
5096 goto more_balance;
5097 }
5098
5075 /* 5099 /*
5076 * Revisit (affine) tasks on src_cpu that couldn't be moved to 5100 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5077 * us and move them to an alternate dst_cpu in our sched_group 5101 * us and move them to an alternate dst_cpu in our sched_group
@@ -5091,14 +5115,17 @@ more_balance:
5091 * moreover subsequent load balance cycles should correct the 5115 * moreover subsequent load balance cycles should correct the
5092 * excess load moved. 5116 * excess load moved.
5093 */ 5117 */
5094 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && 5118 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
5095 lb_iterations++ < max_lb_iterations) {
5096 5119
5097 env.dst_rq = cpu_rq(env.new_dst_cpu); 5120 env.dst_rq = cpu_rq(env.new_dst_cpu);
5098 env.dst_cpu = env.new_dst_cpu; 5121 env.dst_cpu = env.new_dst_cpu;
5099 env.flags &= ~LBF_SOME_PINNED; 5122 env.flags &= ~LBF_SOME_PINNED;
5100 env.loop = 0; 5123 env.loop = 0;
5101 env.loop_break = sched_nr_migrate_break; 5124 env.loop_break = sched_nr_migrate_break;
5125
 5126			/* Prevent re-selecting dst_cpu via env's cpus */
5127 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5128
5102 /* 5129 /*
5103 * Go back to "more_balance" rather than "redo" since we 5130 * Go back to "more_balance" rather than "redo" since we
5104 * need to continue with same src_cpu. 5131 * need to continue with same src_cpu.
@@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5219 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5246 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5220 return; 5247 return;
5221 5248
5222 update_rq_runnable_avg(this_rq, 1);
5223
5224 /* 5249 /*
5225 * Drop the rq->lock, but keep IRQ/preempt disabled. 5250 * Drop the rq->lock, but keep IRQ/preempt disabled.
5226 */ 5251 */
@@ -5330,7 +5355,7 @@ out_unlock:
5330 return 0; 5355 return 0;
5331} 5356}
5332 5357
5333#ifdef CONFIG_NO_HZ 5358#ifdef CONFIG_NO_HZ_COMMON
5334/* 5359/*
5335 * idle load balancing details 5360 * idle load balancing details
 5336 * - When one of the busy CPUs notices that there may be an idle rebalancing 5361 * - When one of the busy CPUs notices that there may be an idle rebalancing
@@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
5395 struct sched_domain *sd; 5420 struct sched_domain *sd;
5396 int cpu = smp_processor_id(); 5421 int cpu = smp_processor_id();
5397 5422
5398 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5399 return;
5400 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5401
5402 rcu_read_lock(); 5423 rcu_read_lock();
5403 for_each_domain(cpu, sd) 5424 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5425
5426 if (!sd || !sd->nohz_idle)
5427 goto unlock;
5428 sd->nohz_idle = 0;
5429
5430 for (; sd; sd = sd->parent)
5404 atomic_inc(&sd->groups->sgp->nr_busy_cpus); 5431 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5432unlock:
5405 rcu_read_unlock(); 5433 rcu_read_unlock();
5406} 5434}
5407 5435
@@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
5410 struct sched_domain *sd; 5438 struct sched_domain *sd;
5411 int cpu = smp_processor_id(); 5439 int cpu = smp_processor_id();
5412 5440
5413 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5414 return;
5415 set_bit(NOHZ_IDLE, nohz_flags(cpu));
5416
5417 rcu_read_lock(); 5441 rcu_read_lock();
5418 for_each_domain(cpu, sd) 5442 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5443
5444 if (!sd || sd->nohz_idle)
5445 goto unlock;
5446 sd->nohz_idle = 1;
5447
5448 for (; sd; sd = sd->parent)
5419 atomic_dec(&sd->groups->sgp->nr_busy_cpus); 5449 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5450unlock:
5420 rcu_read_unlock(); 5451 rcu_read_unlock();
5421} 5452}
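Both helpers above now keep the idle/busy state on the lowest sched_domain (sd->nohz_idle) instead of a per-cpu NOHZ_IDLE bit, and walk the parent chain only on a real transition. A simplified sketch of that pattern; the types are hypothetical and the counter really lives in sd->groups->sgp rather than in the domain itself:

#include <stdatomic.h>

struct domain_sketch {
	struct domain_sketch *parent;
	int nohz_idle;			/* tracked on the lowest domain only */
	_Atomic int nr_busy_cpus;	/* per-group counter in the real code */
};

static void set_cpu_busy_sketch(struct domain_sketch *sd)
{
	if (!sd || !sd->nohz_idle)
		return;			/* already accounted as busy */
	sd->nohz_idle = 0;

	for (; sd; sd = sd->parent)
		atomic_fetch_add(&sd->nr_busy_cpus, 1);
}

static void set_cpu_idle_sketch(struct domain_sketch *sd)
{
	if (!sd || sd->nohz_idle)
		return;			/* already accounted as idle */
	sd->nohz_idle = 1;

	for (; sd; sd = sd->parent)
		atomic_fetch_sub(&sd->nr_busy_cpus, 1);
}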
5422 5453
@@ -5468,7 +5499,7 @@ void update_max_interval(void)
5468 * It checks each scheduling domain to see if it is due to be balanced, 5499 * It checks each scheduling domain to see if it is due to be balanced,
5469 * and initiates a balancing operation if so. 5500 * and initiates a balancing operation if so.
5470 * 5501 *
5471 * Balancing parameters are set up in arch_init_sched_domains. 5502 * Balancing parameters are set up in init_sched_domains.
5472 */ 5503 */
5473static void rebalance_domains(int cpu, enum cpu_idle_type idle) 5504static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5474{ 5505{
@@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5506 if (time_after_eq(jiffies, sd->last_balance + interval)) { 5537 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5507 if (load_balance(cpu, rq, sd, idle, &balance)) { 5538 if (load_balance(cpu, rq, sd, idle, &balance)) {
5508 /* 5539 /*
5509 * We've pulled tasks over so either we're no 5540 * The LBF_SOME_PINNED logic could have changed
5510 * longer idle. 5541 * env->dst_cpu, so we can't know our idle
5542 * state even if we migrated tasks. Update it.
5511 */ 5543 */
5512 idle = CPU_NOT_IDLE; 5544 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
5513 } 5545 }
5514 sd->last_balance = jiffies; 5546 sd->last_balance = jiffies;
5515 } 5547 }
@@ -5540,9 +5572,9 @@ out:
5540 rq->next_balance = next_balance; 5572 rq->next_balance = next_balance;
5541} 5573}
5542 5574
5543#ifdef CONFIG_NO_HZ 5575#ifdef CONFIG_NO_HZ_COMMON
5544/* 5576/*
5545 * In CONFIG_NO_HZ case, the idle balance kickee will do the 5577 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
5546 * rebalancing for all the cpus for whom scheduler ticks are stopped. 5578 * rebalancing for all the cpus for whom scheduler ticks are stopped.
5547 */ 5579 */
5548static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) 5580static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
@@ -5685,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu)
5685 if (time_after_eq(jiffies, rq->next_balance) && 5717 if (time_after_eq(jiffies, rq->next_balance) &&
5686 likely(!on_null_domain(cpu))) 5718 likely(!on_null_domain(cpu)))
5687 raise_softirq(SCHED_SOFTIRQ); 5719 raise_softirq(SCHED_SOFTIRQ);
5688#ifdef CONFIG_NO_HZ 5720#ifdef CONFIG_NO_HZ_COMMON
5689 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5721 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5690 nohz_balancer_kick(cpu); 5722 nohz_balancer_kick(cpu);
5691#endif 5723#endif
@@ -6155,7 +6187,7 @@ __init void init_sched_fair_class(void)
6155#ifdef CONFIG_SMP 6187#ifdef CONFIG_SMP
6156 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 6188 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6157 6189
6158#ifdef CONFIG_NO_HZ 6190#ifdef CONFIG_NO_HZ_COMMON
6159 nohz.next_balance = jiffies; 6191 nohz.next_balance = jiffies;
6160 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 6192 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
6161 cpu_notifier(sched_ilb_notifier, 0); 6193 cpu_notifier(sched_ilb_notifier, 0);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395f..99399f8e4799 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true) 46SCHED_FEAT(LB_BIAS, true)
47 47
48/* 48/*
49 * Spin-wait on mutex acquisition when the mutex owner is running on
50 * another cpu -- assumes that when the owner is running, it will soon
51 * release the lock. Decreases scheduling overhead.
52 */
53SCHED_FEAT(OWNER_SPIN, true)
54
55/*
56 * Decrement CPU power based on time not spent running tasks 49 * Decrement CPU power based on time not spent running tasks
57 */ 50 */
58SCHED_FEAT(NONTASK_POWER, true) 51SCHED_FEAT(NONTASK_POWER, true)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..d8da01008d39 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,17 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
13{ 13{
 14	return task_cpu(p); /* IDLE tasks are never migrated */ 14	return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
16#endif /* CONFIG_SMP */ 27#endif /* CONFIG_SMP */
17/* 28/*
18 * Idle tasks are unconditionally rescheduled: 29 * Idle tasks are unconditionally rescheduled:
@@ -25,6 +36,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 36static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 37{
27 schedstat_inc(rq, sched_goidle); 38 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */
41 rq->post_schedule = 1;
42#endif
28 return rq->idle; 43 return rq->idle;
29} 44}
30 45
@@ -86,6 +101,8 @@ const struct sched_class idle_sched_class = {
86 101
87#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
88 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
89#endif 106#endif
90 107
91 .set_curr_task = set_curr_task_idle, 108 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfdf469f..ce39224d6155 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,8 +5,10 @@
5#include <linux/mutex.h> 5#include <linux/mutex.h>
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h>
8 9
9#include "cpupri.h" 10#include "cpupri.h"
11#include "cpuacct.h"
10 12
11extern __read_mostly int scheduler_running; 13extern __read_mostly int scheduler_running;
12 14
@@ -33,6 +35,31 @@ extern __read_mostly int scheduler_running;
33 */ 35 */
34#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 36#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
35 37
38/*
39 * Increase resolution of nice-level calculations for 64-bit architectures.
40 * The extra resolution improves shares distribution and load balancing of
 41 * low-weight task groups (e.g. nice +19 on an autogroup), deeper taskgroup
42 * hierarchies, especially on larger systems. This is not a user-visible change
43 * and does not change the user-interface for setting shares/weights.
44 *
45 * We increase resolution only if we have enough bits to allow this increased
46 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
47 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
48 * increased costs.
49 */
50#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
51# define SCHED_LOAD_RESOLUTION 10
52# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
53# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
54#else
55# define SCHED_LOAD_RESOLUTION 0
56# define scale_load(w) (w)
57# define scale_load_down(w) (w)
58#endif
59
60#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
61#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
62
36#define NICE_0_LOAD SCHED_LOAD_SCALE 63#define NICE_0_LOAD SCHED_LOAD_SCALE
37#define NICE_0_SHIFT SCHED_LOAD_SHIFT 64#define NICE_0_SHIFT SCHED_LOAD_SHIFT
38 65
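A small worked example of the scale_load()/scale_load_down() pair defined above, assuming the extra resolution were enabled and starting from the conventional nice-0 weight of 1024; the extra bits are internal only and cancel out before anything becomes user visible:

#include <assert.h>

#define RESOLUTION_SKETCH		10
#define scale_load_sketch(w)		((unsigned long)(w) << RESOLUTION_SKETCH)
#define scale_load_down_sketch(w)	((unsigned long)(w) >> RESOLUTION_SKETCH)

int main(void)
{
	unsigned long nice0 = 1024;	/* NICE_0_LOAD without extra resolution */

	/* Internally the weight carries 10 extra bits of resolution ... */
	assert(scale_load_sketch(nice0) == 1024UL * 1024UL);
	/* ... but it scales back to the user-visible weight unchanged. */
	assert(scale_load_down_sketch(scale_load_sketch(nice0)) == nice0);
	return 0;
}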
@@ -154,11 +181,6 @@ struct task_group {
154#define MAX_SHARES (1UL << 18) 181#define MAX_SHARES (1UL << 18)
155#endif 182#endif
156 183
157/* Default task group.
158 * Every task in system belong to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *); 184typedef int (*tg_visitor)(struct task_group *, void *);
163 185
164extern int walk_tg_tree_from(struct task_group *from, 186extern int walk_tg_tree_from(struct task_group *from,
@@ -196,6 +218,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu, 218 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent); 219 struct sched_rt_entity *parent);
198 220
221extern struct task_group *sched_create_group(struct task_group *parent);
222extern void sched_online_group(struct task_group *tg,
223 struct task_group *parent);
224extern void sched_destroy_group(struct task_group *tg);
225extern void sched_offline_group(struct task_group *tg);
226
227extern void sched_move_task(struct task_struct *tsk);
228
229#ifdef CONFIG_FAIR_GROUP_SCHED
230extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
231#endif
232
199#else /* CONFIG_CGROUP_SCHED */ 233#else /* CONFIG_CGROUP_SCHED */
200 234
201struct cfs_bandwidth { }; 235struct cfs_bandwidth { };
@@ -372,10 +406,13 @@ struct rq {
372 #define CPU_LOAD_IDX_MAX 5 406 #define CPU_LOAD_IDX_MAX 5
373 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 407 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
374 unsigned long last_load_update_tick; 408 unsigned long last_load_update_tick;
375#ifdef CONFIG_NO_HZ 409#ifdef CONFIG_NO_HZ_COMMON
376 u64 nohz_stamp; 410 u64 nohz_stamp;
377 unsigned long nohz_flags; 411 unsigned long nohz_flags;
378#endif 412#endif
413#ifdef CONFIG_NO_HZ_FULL
414 unsigned long last_sched_tick;
415#endif
379 int skip_clock_update; 416 int skip_clock_update;
380 417
381 /* capture load from *all* tasks on this cpu: */ 418 /* capture load from *all* tasks on this cpu: */
@@ -547,6 +584,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
547DECLARE_PER_CPU(struct sched_domain *, sd_llc); 584DECLARE_PER_CPU(struct sched_domain *, sd_llc);
548DECLARE_PER_CPU(int, sd_llc_id); 585DECLARE_PER_CPU(int, sd_llc_id);
549 586
587struct sched_group_power {
588 atomic_t ref;
589 /*
590 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
591 * single CPU.
592 */
593 unsigned int power, power_orig;
594 unsigned long next_update;
595 /*
596 * Number of busy cpus in this group.
597 */
598 atomic_t nr_busy_cpus;
599
600 unsigned long cpumask[0]; /* iteration mask */
601};
602
603struct sched_group {
604 struct sched_group *next; /* Must be a circular list */
605 atomic_t ref;
606
607 unsigned int group_weight;
608 struct sched_group_power *sgp;
609
610 /*
611 * The CPUs this group covers.
612 *
613 * NOTE: this field is variable length. (Allocated dynamically
614 * by attaching extra space to the end of the structure,
615 * depending on how many CPUs the kernel has booted up with)
616 */
617 unsigned long cpumask[0];
618};
619
620static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
621{
622 return to_cpumask(sg->cpumask);
623}
624
625/*
626 * cpumask masking which cpus in the group are allowed to iterate up the domain
627 * tree.
628 */
629static inline struct cpumask *sched_group_mask(struct sched_group *sg)
630{
631 return to_cpumask(sg->sgp->cpumask);
632}
633
634/**
635 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
636 * @group: The group whose first cpu is to be returned.
637 */
638static inline unsigned int group_first_cpu(struct sched_group *group)
639{
640 return cpumask_first(sched_group_cpus(group));
641}
642
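Both struct sched_group_power and struct sched_group above end in a zero-length cpumask[] whose storage is attached when the object is allocated, as the NOTE in the struct explains. A minimal user-space sketch of that allocation idiom with hypothetical names:

#include <stdlib.h>

struct group_sketch {
	int group_weight;
	unsigned long cpumask[];	/* storage attached at allocation time */
};

static struct group_sketch *alloc_group_sketch(size_t cpumask_longs)
{
	/* One allocation covers the struct plus however many longs the
	 * bitmap needs for the CPUs the system booted with. */
	return calloc(1, sizeof(struct group_sketch) +
			 cpumask_longs * sizeof(unsigned long));
}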
550extern int group_balance_cpu(struct sched_group *sg); 643extern int group_balance_cpu(struct sched_group *sg);
551 644
552#endif /* CONFIG_SMP */ 645#endif /* CONFIG_SMP */
@@ -784,6 +877,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
784} 877}
785#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 878#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
786 879
880/*
881 * wake flags
882 */
883#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
884#define WF_FORK 0x02 /* child wakeup after fork */
885#define WF_MIGRATED 0x4 /* internal use, task got migrated */
787 886
788static inline void update_load_add(struct load_weight *lw, unsigned long inc) 887static inline void update_load_add(struct load_weight *lw, unsigned long inc)
789{ 888{
@@ -856,14 +955,61 @@ static const u32 prio_to_wmult[40] = {
856 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 955 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
857}; 956};
858 957
859/* Time spent by the tasks of the cpu accounting group executing in ... */ 958#define ENQUEUE_WAKEUP 1
860enum cpuacct_stat_index { 959#define ENQUEUE_HEAD 2
861 CPUACCT_STAT_USER, /* ... user mode */ 960#ifdef CONFIG_SMP
862 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 961#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
962#else
963#define ENQUEUE_WAKING 0
964#endif
863 965
864 CPUACCT_STAT_NSTATS, 966#define DEQUEUE_SLEEP 1
865};
866 967
968struct sched_class {
969 const struct sched_class *next;
970
971 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
972 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
973 void (*yield_task) (struct rq *rq);
974 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
975
976 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
977
978 struct task_struct * (*pick_next_task) (struct rq *rq);
979 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
980
981#ifdef CONFIG_SMP
982 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
983 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
984
985 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
986 void (*post_schedule) (struct rq *this_rq);
987 void (*task_waking) (struct task_struct *task);
988 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
989
990 void (*set_cpus_allowed)(struct task_struct *p,
991 const struct cpumask *newmask);
992
993 void (*rq_online)(struct rq *rq);
994 void (*rq_offline)(struct rq *rq);
995#endif
996
997 void (*set_curr_task) (struct rq *rq);
998 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
999 void (*task_fork) (struct task_struct *p);
1000
1001 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1002 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1003 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1004 int oldprio);
1005
1006 unsigned int (*get_rr_interval) (struct rq *rq,
1007 struct task_struct *task);
1008
1009#ifdef CONFIG_FAIR_GROUP_SCHED
1010 void (*task_move_group) (struct task_struct *p, int on_rq);
1011#endif
1012};
867 1013
868#define sched_class_highest (&stop_sched_class) 1014#define sched_class_highest (&stop_sched_class)
869#define for_each_class(class) \ 1015#define for_each_class(class) \
@@ -877,9 +1023,23 @@ extern const struct sched_class idle_sched_class;
877 1023
878#ifdef CONFIG_SMP 1024#ifdef CONFIG_SMP
879 1025
1026extern void update_group_power(struct sched_domain *sd, int cpu);
1027
880extern void trigger_load_balance(struct rq *rq, int cpu); 1028extern void trigger_load_balance(struct rq *rq, int cpu);
881extern void idle_balance(int this_cpu, struct rq *this_rq); 1029extern void idle_balance(int this_cpu, struct rq *this_rq);
882 1030
1031/*
1032 * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
1033 * becomes useful in lb
1034 */
1035#if defined(CONFIG_FAIR_GROUP_SCHED)
1036extern void idle_enter_fair(struct rq *this_rq);
1037extern void idle_exit_fair(struct rq *this_rq);
1038#else
1039static inline void idle_enter_fair(struct rq *this_rq) {}
1040static inline void idle_exit_fair(struct rq *this_rq) {}
1041#endif
1042
883#else /* CONFIG_SMP */ 1043#else /* CONFIG_SMP */
884 1044
885static inline void idle_balance(int cpu, struct rq *rq) 1045static inline void idle_balance(int cpu, struct rq *rq)
@@ -891,7 +1051,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
891extern void sysrq_sched_debug_show(void); 1051extern void sysrq_sched_debug_show(void);
892extern void sched_init_granularity(void); 1052extern void sched_init_granularity(void);
893extern void update_max_interval(void); 1053extern void update_max_interval(void);
894extern void update_group_power(struct sched_domain *sd, int cpu);
895extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); 1054extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
896extern void init_sched_rt_class(void); 1055extern void init_sched_rt_class(void);
897extern void init_sched_fair_class(void); 1056extern void init_sched_fair_class(void);
@@ -904,45 +1063,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
904 1063
905extern void update_idle_cpu_load(struct rq *this_rq); 1064extern void update_idle_cpu_load(struct rq *this_rq);
906 1065
907#ifdef CONFIG_CGROUP_CPUACCT
908#include <linux/cgroup.h>
909/* track cpu usage of a group of tasks and its child groups */
910struct cpuacct {
911 struct cgroup_subsys_state css;
912 /* cpuusage holds pointer to a u64-type object on every cpu */
913 u64 __percpu *cpuusage;
914 struct kernel_cpustat __percpu *cpustat;
915};
916
917extern struct cgroup_subsys cpuacct_subsys;
918extern struct cpuacct root_cpuacct;
919
920/* return cpu accounting group corresponding to this container */
921static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
922{
923 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
924 struct cpuacct, css);
925}
926
927/* return cpu accounting group to which this task belongs */
928static inline struct cpuacct *task_ca(struct task_struct *tsk)
929{
930 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
931 struct cpuacct, css);
932}
933
934static inline struct cpuacct *parent_ca(struct cpuacct *ca)
935{
936 if (!ca || !ca->css.cgroup->parent)
937 return NULL;
938 return cgroup_ca(ca->css.cgroup->parent);
939}
940
941extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
942#else
943static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
944#endif
945
946#ifdef CONFIG_PARAVIRT 1066#ifdef CONFIG_PARAVIRT
947static inline u64 steal_ticks(u64 steal) 1067static inline u64 steal_ticks(u64 steal)
948{ 1068{
@@ -956,6 +1076,16 @@ static inline u64 steal_ticks(u64 steal)
956static inline void inc_nr_running(struct rq *rq) 1076static inline void inc_nr_running(struct rq *rq)
957{ 1077{
958 rq->nr_running++; 1078 rq->nr_running++;
1079
1080#ifdef CONFIG_NO_HZ_FULL
1081 if (rq->nr_running == 2) {
1082 if (tick_nohz_full_cpu(rq->cpu)) {
1083 /* Order rq->nr_running write against the IPI */
1084 smp_wmb();
1085 smp_send_reschedule(rq->cpu);
1086 }
1087 }
1088#endif
959} 1089}
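inc_nr_running() above kicks a full-nohz CPU with an IPI once it goes from one to two runnable tasks, and the smp_wmb() orders the nr_running store before that kick so the remote CPU sees the new count when it reacts. A rough user-space analogue of the publish-then-kick pattern, with hypothetical stubs and C11 atomics standing in for the kernel primitives:

#include <stdatomic.h>
#include <stdbool.h>

struct rq_sketch { _Atomic int nr_running; };

static bool full_nohz_cpu_sketch(int cpu) { (void)cpu; return true; }
static void kick_cpu_sketch(int cpu)      { (void)cpu; /* smp_send_reschedule() */ }

static void inc_nr_running_sketch(struct rq_sketch *rq, int cpu)
{
	int now = atomic_fetch_add_explicit(&rq->nr_running, 1,
					    memory_order_relaxed) + 1;

	if (now == 2 && full_nohz_cpu_sketch(cpu)) {
		/* Publish nr_running before the kick (the smp_wmb() above). */
		atomic_thread_fence(memory_order_release);
		kick_cpu_sketch(cpu);
	}
}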
960 1090
961static inline void dec_nr_running(struct rq *rq) 1091static inline void dec_nr_running(struct rq *rq)
@@ -963,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq)
963 rq->nr_running--; 1093 rq->nr_running--;
964} 1094}
965 1095
1096static inline void rq_last_tick_reset(struct rq *rq)
1097{
1098#ifdef CONFIG_NO_HZ_FULL
1099 rq->last_sched_tick = jiffies;
1100#endif
1101}
1102
966extern void update_rq_clock(struct rq *rq); 1103extern void update_rq_clock(struct rq *rq);
967 1104
968extern void activate_task(struct rq *rq, struct task_struct *p, int flags); 1105extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1183,11 +1320,10 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1183 1320
1184extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1321extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1185 1322
1186#ifdef CONFIG_NO_HZ 1323#ifdef CONFIG_NO_HZ_COMMON
1187enum rq_nohz_flag_bits { 1324enum rq_nohz_flag_bits {
1188 NOHZ_TICK_STOPPED, 1325 NOHZ_TICK_STOPPED,
1189 NOHZ_BALANCE_KICK, 1326 NOHZ_BALANCE_KICK,
1190 NOHZ_IDLE,
1191}; 1327};
1192 1328
1193#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 1329#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index e036eda1a9c9..da98af347e8b 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -130,16 +130,11 @@ static int schedstat_open(struct inode *inode, struct file *file)
130 return seq_open(file, &schedstat_sops); 130 return seq_open(file, &schedstat_sops);
131} 131}
132 132
133static int schedstat_release(struct inode *inode, struct file *file)
134{
135 return 0;
136};
137
138static const struct file_operations proc_schedstat_operations = { 133static const struct file_operations proc_schedstat_operations = {
139 .open = schedstat_open, 134 .open = schedstat_open,
140 .read = seq_read, 135 .read = seq_read,
141 .llseek = seq_lseek, 136 .llseek = seq_lseek,
142 .release = schedstat_release, 137 .release = seq_release,
143}; 138};
144 139
145static int __init proc_schedstat_init(void) 140static int __init proc_schedstat_init(void)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5af44b593770..b7a10048a32c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -160,6 +160,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
160 case BPF_S_ALU_AND_X: 160 case BPF_S_ALU_AND_X:
161 case BPF_S_ALU_OR_K: 161 case BPF_S_ALU_OR_K:
162 case BPF_S_ALU_OR_X: 162 case BPF_S_ALU_OR_X:
163 case BPF_S_ALU_XOR_K:
164 case BPF_S_ALU_XOR_X:
163 case BPF_S_ALU_LSH_K: 165 case BPF_S_ALU_LSH_K:
164 case BPF_S_ALU_LSH_X: 166 case BPF_S_ALU_LSH_X:
165 case BPF_S_ALU_RSH_K: 167 case BPF_S_ALU_RSH_K:
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 4567fc020fe3..6815171a4fff 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(up);
193struct semaphore_waiter { 193struct semaphore_waiter {
194 struct list_head list; 194 struct list_head list;
195 struct task_struct *task; 195 struct task_struct *task;
196 int up; 196 bool up;
197}; 197};
198 198
199/* 199/*
@@ -209,12 +209,12 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
209 209
210 list_add_tail(&waiter.list, &sem->wait_list); 210 list_add_tail(&waiter.list, &sem->wait_list);
211 waiter.task = task; 211 waiter.task = task;
212 waiter.up = 0; 212 waiter.up = false;
213 213
214 for (;;) { 214 for (;;) {
215 if (signal_pending_state(state, task)) 215 if (signal_pending_state(state, task))
216 goto interrupted; 216 goto interrupted;
217 if (timeout <= 0) 217 if (unlikely(timeout <= 0))
218 goto timed_out; 218 goto timed_out;
219 __set_task_state(task, state); 219 __set_task_state(task, state);
220 raw_spin_unlock_irq(&sem->lock); 220 raw_spin_unlock_irq(&sem->lock);
@@ -258,6 +258,6 @@ static noinline void __sched __up(struct semaphore *sem)
258 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, 258 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
259 struct semaphore_waiter, list); 259 struct semaphore_waiter, list);
260 list_del(&waiter->list); 260 list_del(&waiter->list);
261 waiter->up = 1; 261 waiter->up = true;
262 wake_up_process(waiter->task); 262 wake_up_process(waiter->task);
263} 263}
diff --git a/kernel/signal.c b/kernel/signal.c
index dd72567767d9..113411bfe8b1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -32,6 +32,7 @@
32#include <linux/user_namespace.h> 32#include <linux/user_namespace.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/compat.h> 34#include <linux/compat.h>
35#include <linux/cn_proc.h>
35#define CREATE_TRACE_POINTS 36#define CREATE_TRACE_POINTS
36#include <trace/events/signal.h> 37#include <trace/events/signal.h>
37 38
@@ -854,12 +855,14 @@ static void ptrace_trap_notify(struct task_struct *t)
854 * Returns true if the signal should be actually delivered, otherwise 855 * Returns true if the signal should be actually delivered, otherwise
855 * it should be dropped. 856 * it should be dropped.
856 */ 857 */
857static int prepare_signal(int sig, struct task_struct *p, bool force) 858static bool prepare_signal(int sig, struct task_struct *p, bool force)
858{ 859{
859 struct signal_struct *signal = p->signal; 860 struct signal_struct *signal = p->signal;
860 struct task_struct *t; 861 struct task_struct *t;
861 862
862 if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { 863 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
864 if (signal->flags & SIGNAL_GROUP_COREDUMP)
865 return sig == SIGKILL;
863 /* 866 /*
864 * The process is in the middle of dying, nothing to do. 867 * The process is in the middle of dying, nothing to do.
865 */ 868 */
@@ -1160,8 +1163,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1160static void print_fatal_signal(int signr) 1163static void print_fatal_signal(int signr)
1161{ 1164{
1162 struct pt_regs *regs = signal_pt_regs(); 1165 struct pt_regs *regs = signal_pt_regs();
1163 printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n", 1166 printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr);
1164 current->comm, task_pid_nr(current), signr);
1165 1167
1166#if defined(__i386__) && !defined(__arch_um__) 1168#if defined(__i386__) && !defined(__arch_um__)
1167 printk(KERN_INFO "code at %08lx: ", regs->ip); 1169 printk(KERN_INFO "code at %08lx: ", regs->ip);
@@ -2350,6 +2352,7 @@ relock:
2350 if (sig_kernel_coredump(signr)) { 2352 if (sig_kernel_coredump(signr)) {
2351 if (print_fatal_signals) 2353 if (print_fatal_signals)
2352 print_fatal_signal(info->si_signo); 2354 print_fatal_signal(info->si_signo);
2355 proc_coredump_connector(current);
2353 /* 2356 /*
2354 * If it was able to dump core, this kills all 2357 * If it was able to dump core, this kills all
2355 * other threads in the group and synchronizes with 2358 * other threads in the group and synchronizes with
@@ -2948,7 +2951,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2948 2951
2949static int do_tkill(pid_t tgid, pid_t pid, int sig) 2952static int do_tkill(pid_t tgid, pid_t pid, int sig)
2950{ 2953{
2951 struct siginfo info; 2954 struct siginfo info = {};
2952 2955
2953 info.si_signo = sig; 2956 info.si_signo = sig;
2954 info.si_errno = 0; 2957 info.si_errno = 0;
diff --git a/kernel/smp.c b/kernel/smp.c
index 8e451f3ff51b..4dba0f7b72ad 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -100,16 +100,16 @@ void __init call_function_init(void)
100 * previous function call. For multi-cpu calls its even more interesting 100 * previous function call. For multi-cpu calls its even more interesting
101 * as we'll have to ensure no other cpu is observing our csd. 101 * as we'll have to ensure no other cpu is observing our csd.
102 */ 102 */
103static void csd_lock_wait(struct call_single_data *data) 103static void csd_lock_wait(struct call_single_data *csd)
104{ 104{
105 while (data->flags & CSD_FLAG_LOCK) 105 while (csd->flags & CSD_FLAG_LOCK)
106 cpu_relax(); 106 cpu_relax();
107} 107}
108 108
109static void csd_lock(struct call_single_data *data) 109static void csd_lock(struct call_single_data *csd)
110{ 110{
111 csd_lock_wait(data); 111 csd_lock_wait(csd);
112 data->flags = CSD_FLAG_LOCK; 112 csd->flags |= CSD_FLAG_LOCK;
113 113
114 /* 114 /*
115 * prevent CPU from reordering the above assignment 115 * prevent CPU from reordering the above assignment
@@ -119,16 +119,16 @@ static void csd_lock(struct call_single_data *data)
119 smp_mb(); 119 smp_mb();
120} 120}
121 121
122static void csd_unlock(struct call_single_data *data) 122static void csd_unlock(struct call_single_data *csd)
123{ 123{
124 WARN_ON(!(data->flags & CSD_FLAG_LOCK)); 124 WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
125 125
126 /* 126 /*
127 * ensure we're all done before releasing data: 127 * ensure we're all done before releasing data:
128 */ 128 */
129 smp_mb(); 129 smp_mb();
130 130
131 data->flags &= ~CSD_FLAG_LOCK; 131 csd->flags &= ~CSD_FLAG_LOCK;
132} 132}
133 133
134/* 134/*
@@ -137,7 +137,7 @@ static void csd_unlock(struct call_single_data *data)
137 * ->func, ->info, and ->flags set. 137 * ->func, ->info, and ->flags set.
138 */ 138 */
139static 139static
140void generic_exec_single(int cpu, struct call_single_data *data, int wait) 140void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
141{ 141{
142 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 142 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
143 unsigned long flags; 143 unsigned long flags;
@@ -145,7 +145,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
145 145
146 raw_spin_lock_irqsave(&dst->lock, flags); 146 raw_spin_lock_irqsave(&dst->lock, flags);
147 ipi = list_empty(&dst->list); 147 ipi = list_empty(&dst->list);
148 list_add_tail(&data->list, &dst->list); 148 list_add_tail(&csd->list, &dst->list);
149 raw_spin_unlock_irqrestore(&dst->lock, flags); 149 raw_spin_unlock_irqrestore(&dst->lock, flags);
150 150
151 /* 151 /*
@@ -163,7 +163,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
163 arch_send_call_function_single_ipi(cpu); 163 arch_send_call_function_single_ipi(cpu);
164 164
165 if (wait) 165 if (wait)
166 csd_lock_wait(data); 166 csd_lock_wait(csd);
167} 167}
168 168
169/* 169/*
@@ -173,7 +173,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
173void generic_smp_call_function_single_interrupt(void) 173void generic_smp_call_function_single_interrupt(void)
174{ 174{
175 struct call_single_queue *q = &__get_cpu_var(call_single_queue); 175 struct call_single_queue *q = &__get_cpu_var(call_single_queue);
176 unsigned int data_flags;
177 LIST_HEAD(list); 176 LIST_HEAD(list);
178 177
179 /* 178 /*
@@ -186,25 +185,26 @@ void generic_smp_call_function_single_interrupt(void)
186 raw_spin_unlock(&q->lock); 185 raw_spin_unlock(&q->lock);
187 186
188 while (!list_empty(&list)) { 187 while (!list_empty(&list)) {
189 struct call_single_data *data; 188 struct call_single_data *csd;
189 unsigned int csd_flags;
190 190
191 data = list_entry(list.next, struct call_single_data, list); 191 csd = list_entry(list.next, struct call_single_data, list);
192 list_del(&data->list); 192 list_del(&csd->list);
193 193
194 /* 194 /*
195 * 'data' can be invalid after this call if flags == 0 195 * 'csd' can be invalid after this call if flags == 0
196 * (when called through generic_exec_single()), 196 * (when called through generic_exec_single()),
197 * so save them away before making the call: 197 * so save them away before making the call:
198 */ 198 */
199 data_flags = data->flags; 199 csd_flags = csd->flags;
200 200
201 data->func(data->info); 201 csd->func(csd->info);
202 202
203 /* 203 /*
204 * Unlocked CSDs are valid through generic_exec_single(): 204 * Unlocked CSDs are valid through generic_exec_single():
205 */ 205 */
206 if (data_flags & CSD_FLAG_LOCK) 206 if (csd_flags & CSD_FLAG_LOCK)
207 csd_unlock(data); 207 csd_unlock(csd);
208 } 208 }
209} 209}
210 210
@@ -249,16 +249,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
249 local_irq_restore(flags); 249 local_irq_restore(flags);
250 } else { 250 } else {
251 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { 251 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
252 struct call_single_data *data = &d; 252 struct call_single_data *csd = &d;
253 253
254 if (!wait) 254 if (!wait)
255 data = &__get_cpu_var(csd_data); 255 csd = &__get_cpu_var(csd_data);
256 256
257 csd_lock(data); 257 csd_lock(csd);
258 258
259 data->func = func; 259 csd->func = func;
260 data->info = info; 260 csd->info = info;
261 generic_exec_single(cpu, data, wait); 261 generic_exec_single(cpu, csd, wait);
262 } else { 262 } else {
263 err = -ENXIO; /* CPU not online */ 263 err = -ENXIO; /* CPU not online */
264 } 264 }
@@ -325,7 +325,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
325 * pre-allocated data structure. Useful for embedding @data inside 325 * pre-allocated data structure. Useful for embedding @data inside
326 * other structures, for instance. 326 * other structures, for instance.
327 */ 327 */
328void __smp_call_function_single(int cpu, struct call_single_data *data, 328void __smp_call_function_single(int cpu, struct call_single_data *csd,
329 int wait) 329 int wait)
330{ 330{
331 unsigned int this_cpu; 331 unsigned int this_cpu;
@@ -343,11 +343,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
343 343
344 if (cpu == this_cpu) { 344 if (cpu == this_cpu) {
345 local_irq_save(flags); 345 local_irq_save(flags);
346 data->func(data->info); 346 csd->func(csd->info);
347 local_irq_restore(flags); 347 local_irq_restore(flags);
348 } else { 348 } else {
349 csd_lock(data); 349 csd_lock(csd);
350 generic_exec_single(cpu, data, wait); 350 generic_exec_single(cpu, csd, wait);
351 } 351 }
352 put_cpu(); 352 put_cpu();
353} 353}
@@ -369,7 +369,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
369void smp_call_function_many(const struct cpumask *mask, 369void smp_call_function_many(const struct cpumask *mask,
370 smp_call_func_t func, void *info, bool wait) 370 smp_call_func_t func, void *info, bool wait)
371{ 371{
372 struct call_function_data *data; 372 struct call_function_data *cfd;
373 int cpu, next_cpu, this_cpu = smp_processor_id(); 373 int cpu, next_cpu, this_cpu = smp_processor_id();
374 374
375 /* 375 /*
@@ -401,24 +401,24 @@ void smp_call_function_many(const struct cpumask *mask,
401 return; 401 return;
402 } 402 }
403 403
404 data = &__get_cpu_var(cfd_data); 404 cfd = &__get_cpu_var(cfd_data);
405 405
406 cpumask_and(data->cpumask, mask, cpu_online_mask); 406 cpumask_and(cfd->cpumask, mask, cpu_online_mask);
407 cpumask_clear_cpu(this_cpu, data->cpumask); 407 cpumask_clear_cpu(this_cpu, cfd->cpumask);
408 408
409 /* Some callers race with other cpus changing the passed mask */ 409 /* Some callers race with other cpus changing the passed mask */
410 if (unlikely(!cpumask_weight(data->cpumask))) 410 if (unlikely(!cpumask_weight(cfd->cpumask)))
411 return; 411 return;
412 412
413 /* 413 /*
414 * After we put an entry into the list, data->cpumask 414 * After we put an entry into the list, cfd->cpumask may be cleared
415 * may be cleared again when another CPU sends another IPI for 415 * again when another CPU sends another IPI for a SMP function call, so
416 * a SMP function call, so data->cpumask will be zero. 416 * cfd->cpumask will be zero.
417 */ 417 */
418 cpumask_copy(data->cpumask_ipi, data->cpumask); 418 cpumask_copy(cfd->cpumask_ipi, cfd->cpumask);
419 419
420 for_each_cpu(cpu, data->cpumask) { 420 for_each_cpu(cpu, cfd->cpumask) {
421 struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); 421 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
422 struct call_single_queue *dst = 422 struct call_single_queue *dst =
423 &per_cpu(call_single_queue, cpu); 423 &per_cpu(call_single_queue, cpu);
424 unsigned long flags; 424 unsigned long flags;
@@ -433,12 +433,13 @@ void smp_call_function_many(const struct cpumask *mask,
433 } 433 }
434 434
435 /* Send a message to all CPUs in the map */ 435 /* Send a message to all CPUs in the map */
436 arch_send_call_function_ipi_mask(data->cpumask_ipi); 436 arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
437 437
438 if (wait) { 438 if (wait) {
439 for_each_cpu(cpu, data->cpumask) { 439 for_each_cpu(cpu, cfd->cpumask) {
440 struct call_single_data *csd = 440 struct call_single_data *csd;
441 per_cpu_ptr(data->csd, cpu); 441
442 csd = per_cpu_ptr(cfd->csd, cpu);
442 csd_lock_wait(csd); 443 csd_lock_wait(csd);
443 } 444 }
444 } 445 }
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 8eaed9aa9cf0..02fc5c933673 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
185 } 185 }
186 get_task_struct(tsk); 186 get_task_struct(tsk);
187 *per_cpu_ptr(ht->store, cpu) = tsk; 187 *per_cpu_ptr(ht->store, cpu) = tsk;
188 if (ht->create) 188 if (ht->create) {
189 ht->create(cpu); 189 /*
190 * Make sure that the task has actually scheduled out
191 * into park position, before calling the create
192 * callback. At least the migration thread callback
193 * requires that the task is off the runqueue.
194 */
195 if (!wait_task_inactive(tsk, TASK_PARKED))
196 WARN_ON(1);
197 else
198 ht->create(cpu);
199 }
190 return 0; 200 return 0;
191} 201}
192 202
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 14d7758074aa..b5197dcb0dad 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -329,6 +329,19 @@ static inline void invoke_softirq(void)
329 wakeup_softirqd(); 329 wakeup_softirqd();
330} 330}
331 331
332static inline void tick_irq_exit(void)
333{
334#ifdef CONFIG_NO_HZ_COMMON
335 int cpu = smp_processor_id();
336
337 /* Make sure that timer wheel updates are propagated */
338 if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
339 if (!in_interrupt())
340 tick_nohz_irq_exit();
341 }
342#endif
343}
344
332/* 345/*
333 * Exit an interrupt context. Process softirqs if needed and possible: 346 * Exit an interrupt context. Process softirqs if needed and possible:
334 */ 347 */
@@ -346,11 +359,7 @@ void irq_exit(void)
346 if (!in_interrupt() && local_softirq_pending()) 359 if (!in_interrupt() && local_softirq_pending())
347 invoke_softirq(); 360 invoke_softirq();
348 361
349#ifdef CONFIG_NO_HZ 362 tick_irq_exit();
350 /* Make sure that timer wheel updates are propagated */
351 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
352 tick_nohz_irq_exit();
353#endif
354 rcu_irq_exit(); 363 rcu_irq_exit();
355} 364}
356 365
@@ -620,8 +629,7 @@ static void remote_softirq_receive(void *data)
620 unsigned long flags; 629 unsigned long flags;
621 int softirq; 630 int softirq;
622 631
623 softirq = cp->priv; 632 softirq = *(int *)cp->info;
624
625 local_irq_save(flags); 633 local_irq_save(flags);
626 __local_trigger(cp, softirq); 634 __local_trigger(cp, softirq);
627 local_irq_restore(flags); 635 local_irq_restore(flags);
@@ -631,9 +639,8 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
631{ 639{
632 if (cpu_online(cpu)) { 640 if (cpu_online(cpu)) {
633 cp->func = remote_softirq_receive; 641 cp->func = remote_softirq_receive;
634 cp->info = cp; 642 cp->info = &softirq;
635 cp->flags = 0; 643 cp->flags = 0;
636 cp->priv = softirq;
637 644
638 __smp_call_function_single(cpu, cp, 0); 645 __smp_call_function_single(cpu, cp, 0);
639 return 0; 646 return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 81f56445fba9..b95d3c72ba21 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -49,6 +49,11 @@
49#include <linux/user_namespace.h> 49#include <linux/user_namespace.h>
50#include <linux/binfmts.h> 50#include <linux/binfmts.h>
51 51
52#include <linux/sched.h>
53#include <linux/rcupdate.h>
54#include <linux/uidgid.h>
55#include <linux/cred.h>
56
52#include <linux/kmsg_dump.h> 57#include <linux/kmsg_dump.h>
53/* Move somewhere else to avoid recompiling? */ 58/* Move somewhere else to avoid recompiling? */
54#include <generated/utsrelease.h> 59#include <generated/utsrelease.h>
@@ -324,7 +329,6 @@ void kernel_restart_prepare(char *cmd)
324 system_state = SYSTEM_RESTART; 329 system_state = SYSTEM_RESTART;
325 usermodehelper_disable(); 330 usermodehelper_disable();
326 device_shutdown(); 331 device_shutdown();
327 syscore_shutdown();
328} 332}
329 333
330/** 334/**
@@ -370,6 +374,7 @@ void kernel_restart(char *cmd)
370{ 374{
371 kernel_restart_prepare(cmd); 375 kernel_restart_prepare(cmd);
372 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
377 syscore_shutdown();
373 if (!cmd) 378 if (!cmd)
374 printk(KERN_EMERG "Restarting system.\n"); 379 printk(KERN_EMERG "Restarting system.\n");
375 else 380 else
@@ -395,6 +400,7 @@ static void kernel_shutdown_prepare(enum system_states state)
395void kernel_halt(void) 400void kernel_halt(void)
396{ 401{
397 kernel_shutdown_prepare(SYSTEM_HALT); 402 kernel_shutdown_prepare(SYSTEM_HALT);
403 disable_nonboot_cpus();
398 syscore_shutdown(); 404 syscore_shutdown();
399 printk(KERN_EMERG "System halted.\n"); 405 printk(KERN_EMERG "System halted.\n");
400 kmsg_dump(KMSG_DUMP_HALT); 406 kmsg_dump(KMSG_DUMP_HALT);
@@ -1043,6 +1049,67 @@ change_okay:
1043 return old_fsgid; 1049 return old_fsgid;
1044} 1050}
1045 1051
1052/**
1053 * sys_getpid - return the thread group id of the current process
1054 *
1055 * Note, despite the name, this returns the tgid not the pid. The tgid and
1056 * the pid are identical unless CLONE_THREAD was specified on clone() in
1057 * which case the tgid is the same in all threads of the same group.
1058 *
1059 * This is SMP safe as current->tgid does not change.
1060 */
1061SYSCALL_DEFINE0(getpid)
1062{
1063 return task_tgid_vnr(current);
1064}
1065
1066/* Thread ID - the internal kernel "pid" */
1067SYSCALL_DEFINE0(gettid)
1068{
1069 return task_pid_vnr(current);
1070}
1071
1072/*
1073 * Accessing ->real_parent is not SMP-safe, it could
1074 * change from under us. However, we can use a stale
1075 * value of ->real_parent under rcu_read_lock(), see
1076 * release_task()->call_rcu(delayed_put_task_struct).
1077 */
1078SYSCALL_DEFINE0(getppid)
1079{
1080 int pid;
1081
1082 rcu_read_lock();
1083 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1084 rcu_read_unlock();
1085
1086 return pid;
1087}
1088
1089SYSCALL_DEFINE0(getuid)
1090{
1091 /* Only we change this so SMP safe */
1092 return from_kuid_munged(current_user_ns(), current_uid());
1093}
1094
1095SYSCALL_DEFINE0(geteuid)
1096{
1097 /* Only we change this so SMP safe */
1098 return from_kuid_munged(current_user_ns(), current_euid());
1099}
1100
1101SYSCALL_DEFINE0(getgid)
1102{
1103 /* Only we change this so SMP safe */
1104 return from_kgid_munged(current_user_ns(), current_gid());
1105}
1106
1107SYSCALL_DEFINE0(getegid)
1108{
1109 /* Only we change this so SMP safe */
1110 return from_kgid_munged(current_user_ns(), current_egid());
1111}
1112
1046void do_sys_times(struct tms *tms) 1113void do_sys_times(struct tms *tms)
1047{ 1114{
1048 cputime_t tgutime, tgstime, cutime, cstime; 1115 cputime_t tgutime, tgstime, cutime, cstime;
@@ -1784,13 +1851,26 @@ SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1784 return getrusage(current, who, ru); 1851 return getrusage(current, who, ru);
1785} 1852}
1786 1853
1854#ifdef CONFIG_COMPAT
1855COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
1856{
1857 struct rusage r;
1858
1859 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1860 who != RUSAGE_THREAD)
1861 return -EINVAL;
1862
1863 k_getrusage(current, who, &r);
1864 return put_compat_rusage(&r, ru);
1865}
1866#endif
1867
1787SYSCALL_DEFINE1(umask, int, mask) 1868SYSCALL_DEFINE1(umask, int, mask)
1788{ 1869{
1789 mask = xchg(&current->fs->umask, mask & S_IRWXUGO); 1870 mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1790 return mask; 1871 return mask;
1791} 1872}
1792 1873
1793#ifdef CONFIG_CHECKPOINT_RESTORE
1794static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1874static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1795{ 1875{
1796 struct fd exe; 1876 struct fd exe;
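
The new COMPAT_SYSCALL_DEFINE2(getrusage) above validates the who argument and converts the result with put_compat_rusage(), so 32-bit callers on a 64-bit kernel get the same semantics as the native path. For reference, a standard native usage sketch (not part of the patch):

/* Plain getrusage(2) usage; RUSAGE_THREAD is also accepted, as the
 * compat wrapper's validation above reflects. */
#include <stdio.h>
#include <sys/time.h>
#include <sys/resource.h>

int main(void)
{
    struct rusage ru;

    if (getrusage(RUSAGE_SELF, &ru) != 0) {
        perror("getrusage");
        return 1;
    }
    printf("user %ld.%06lds  sys %ld.%06lds  maxrss %ld kB\n",
           (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
           (long)ru.ru_stime.tv_sec, (long)ru.ru_stime.tv_usec,
           ru.ru_maxrss);
    return 0;
}
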
@@ -1984,17 +2064,12 @@ out:
1984 return error; 2064 return error;
1985} 2065}
1986 2066
2067#ifdef CONFIG_CHECKPOINT_RESTORE
1987static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 2068static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1988{ 2069{
1989 return put_user(me->clear_child_tid, tid_addr); 2070 return put_user(me->clear_child_tid, tid_addr);
1990} 2071}
1991 2072#else
1992#else /* CONFIG_CHECKPOINT_RESTORE */
1993static int prctl_set_mm(int opt, unsigned long addr,
1994 unsigned long arg4, unsigned long arg5)
1995{
1996 return -EINVAL;
1997}
1998static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 2073static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1999{ 2074{
2000 return -EINVAL; 2075 return -EINVAL;
@@ -2185,9 +2260,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2185 2260
2186char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 2261char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2187 2262
2188static int __orderly_poweroff(void) 2263static int __orderly_poweroff(bool force)
2189{ 2264{
2190 int argc;
2191 char **argv; 2265 char **argv;
2192 static char *envp[] = { 2266 static char *envp[] = {
2193 "HOME=/", 2267 "HOME=/",
@@ -2196,20 +2270,40 @@ static int __orderly_poweroff(void)
2196 }; 2270 };
2197 int ret; 2271 int ret;
2198 2272
2199 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2273 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
2200 if (argv == NULL) { 2274 if (argv) {
2275 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2276 argv_free(argv);
2277 } else {
2201 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2278 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2202 __func__, poweroff_cmd); 2279 __func__, poweroff_cmd);
2203 return -ENOMEM; 2280 ret = -ENOMEM;
2204 } 2281 }
2205 2282
2206 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, 2283 if (ret && force) {
2207 NULL, NULL, NULL); 2284 printk(KERN_WARNING "Failed to start orderly shutdown: "
2208 argv_free(argv); 2285 "forcing the issue\n");
2286 /*
2287 * I guess this should try to kick off some daemon to sync and
2288 * poweroff asap. Or not even bother syncing if we're doing an
2289 * emergency shutdown?
2290 */
2291 emergency_sync();
2292 kernel_power_off();
2293 }
2209 2294
2210 return ret; 2295 return ret;
2211} 2296}
2212 2297
2298static bool poweroff_force;
2299
2300static void poweroff_work_func(struct work_struct *work)
2301{
2302 __orderly_poweroff(poweroff_force);
2303}
2304
2305static DECLARE_WORK(poweroff_work, poweroff_work_func);
2306
2213/** 2307/**
2214 * orderly_poweroff - Trigger an orderly system poweroff 2308 * orderly_poweroff - Trigger an orderly system poweroff
2215 * @force: force poweroff if command execution fails 2309 * @force: force poweroff if command execution fails
@@ -2219,21 +2313,154 @@ static int __orderly_poweroff(void)
2219 */ 2313 */
2220int orderly_poweroff(bool force) 2314int orderly_poweroff(bool force)
2221{ 2315{
2222 int ret = __orderly_poweroff(); 2316 if (force) /* do not override the pending "true" */
2317 poweroff_force = true;
2318 schedule_work(&poweroff_work);
2319 return 0;
2320}
2321EXPORT_SYMBOL_GPL(orderly_poweroff);
2223 2322
2224 if (ret && force) { 2323/**
2225 printk(KERN_WARNING "Failed to start orderly shutdown: " 2324 * do_sysinfo - fill in sysinfo struct
2226 "forcing the issue\n"); 2325 * @info: pointer to buffer to fill
2326 */
2327static int do_sysinfo(struct sysinfo *info)
2328{
2329 unsigned long mem_total, sav_total;
2330 unsigned int mem_unit, bitcount;
2331 struct timespec tp;
2227 2332
2228 /* 2333 memset(info, 0, sizeof(struct sysinfo));
2229 * I guess this should try to kick off some daemon to sync and 2334
2230 * poweroff asap. Or not even bother syncing if we're doing an 2335 ktime_get_ts(&tp);
2231 * emergency shutdown? 2336 monotonic_to_bootbased(&tp);
2232 */ 2337 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
2233 emergency_sync(); 2338
2234 kernel_power_off(); 2339 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
2340
2341 info->procs = nr_threads;
2342
2343 si_meminfo(info);
2344 si_swapinfo(info);
2345
2346 /*
2347 * If the sum of all the available memory (i.e. ram + swap)
2348 * is less than can be stored in a 32 bit unsigned long then
2349 * we can be binary compatible with 2.2.x kernels. If not,
2350 * well, in that case 2.2.x was broken anyways...
2351 *
2352 * -Erik Andersen <andersee@debian.org>
2353 */
2354
2355 mem_total = info->totalram + info->totalswap;
2356 if (mem_total < info->totalram || mem_total < info->totalswap)
2357 goto out;
2358 bitcount = 0;
2359 mem_unit = info->mem_unit;
2360 while (mem_unit > 1) {
2361 bitcount++;
2362 mem_unit >>= 1;
2363 sav_total = mem_total;
2364 mem_total <<= 1;
2365 if (mem_total < sav_total)
2366 goto out;
2235 } 2367 }
2236 2368
2237 return ret; 2369 /*
2370 * If mem_total did not overflow, multiply all memory values by
2371 * info->mem_unit and set it to 1. This leaves things compatible
2372 * with 2.2.x, and also retains compatibility with earlier 2.4.x
2373 * kernels...
2374 */
2375
2376 info->mem_unit = 1;
2377 info->totalram <<= bitcount;
2378 info->freeram <<= bitcount;
2379 info->sharedram <<= bitcount;
2380 info->bufferram <<= bitcount;
2381 info->totalswap <<= bitcount;
2382 info->freeswap <<= bitcount;
2383 info->totalhigh <<= bitcount;
2384 info->freehigh <<= bitcount;
2385
2386out:
2387 return 0;
2238} 2388}
2239EXPORT_SYMBOL_GPL(orderly_poweroff); 2389
2390SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
2391{
2392 struct sysinfo val;
2393
2394 do_sysinfo(&val);
2395
2396 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
2397 return -EFAULT;
2398
2399 return 0;
2400}
2401
2402#ifdef CONFIG_COMPAT
2403struct compat_sysinfo {
2404 s32 uptime;
2405 u32 loads[3];
2406 u32 totalram;
2407 u32 freeram;
2408 u32 sharedram;
2409 u32 bufferram;
2410 u32 totalswap;
2411 u32 freeswap;
2412 u16 procs;
2413 u16 pad;
2414 u32 totalhigh;
2415 u32 freehigh;
2416 u32 mem_unit;
2417 char _f[20-2*sizeof(u32)-sizeof(int)];
2418};
2419
2420COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
2421{
2422 struct sysinfo s;
2423
2424 do_sysinfo(&s);
2425
2426 /* Check to see if any memory value is too large for 32-bit and scale
2427 * down if needed
2428 */
2429 if ((s.totalram >> 32) || (s.totalswap >> 32)) {
2430 int bitcount = 0;
2431
2432 while (s.mem_unit < PAGE_SIZE) {
2433 s.mem_unit <<= 1;
2434 bitcount++;
2435 }
2436
2437 s.totalram >>= bitcount;
2438 s.freeram >>= bitcount;
2439 s.sharedram >>= bitcount;
2440 s.bufferram >>= bitcount;
2441 s.totalswap >>= bitcount;
2442 s.freeswap >>= bitcount;
2443 s.totalhigh >>= bitcount;
2444 s.freehigh >>= bitcount;
2445 }
2446
2447 if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
2448 __put_user(s.uptime, &info->uptime) ||
2449 __put_user(s.loads[0], &info->loads[0]) ||
2450 __put_user(s.loads[1], &info->loads[1]) ||
2451 __put_user(s.loads[2], &info->loads[2]) ||
2452 __put_user(s.totalram, &info->totalram) ||
2453 __put_user(s.freeram, &info->freeram) ||
2454 __put_user(s.sharedram, &info->sharedram) ||
2455 __put_user(s.bufferram, &info->bufferram) ||
2456 __put_user(s.totalswap, &info->totalswap) ||
2457 __put_user(s.freeswap, &info->freeswap) ||
2458 __put_user(s.procs, &info->procs) ||
2459 __put_user(s.totalhigh, &info->totalhigh) ||
2460 __put_user(s.freehigh, &info->freehigh) ||
2461 __put_user(s.mem_unit, &info->mem_unit))
2462 return -EFAULT;
2463
2464 return 0;
2465}
2466#endif /* CONFIG_COMPAT */
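
do_sysinfo() above reports memory in units of info->mem_unit (scaled so the totals fit an unsigned long, and rescaled again in the compat path so the 32-bit fields do not overflow), so consumers must always multiply by mem_unit. A minimal usage sketch, illustrative rather than part of the patch:

/* sysinfo(2) consumer: scale every memory field by mem_unit. */
#include <stdio.h>
#include <sys/sysinfo.h>

int main(void)
{
    struct sysinfo si;

    if (sysinfo(&si) != 0) {
        perror("sysinfo");
        return 1;
    }
    printf("uptime   %ld s\n", (long)si.uptime);
    printf("procs    %u\n", (unsigned int)si.procs);
    printf("totalram %llu MiB\n",
           (unsigned long long)si.totalram * si.mem_unit >> 20);
    printf("freeram  %llu MiB\n",
           (unsigned long long)si.freeram * si.mem_unit >> 20);
    return 0;
}
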
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 395084d4ce16..7078052284fd 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -20,6 +20,7 @@ cond_syscall(sys_quotactl);
20cond_syscall(sys32_quotactl); 20cond_syscall(sys32_quotactl);
21cond_syscall(sys_acct); 21cond_syscall(sys_acct);
22cond_syscall(sys_lookup_dcookie); 22cond_syscall(sys_lookup_dcookie);
23cond_syscall(compat_sys_lookup_dcookie);
23cond_syscall(sys_swapon); 24cond_syscall(sys_swapon);
24cond_syscall(sys_swapoff); 25cond_syscall(sys_swapoff);
25cond_syscall(sys_kexec_load); 26cond_syscall(sys_kexec_load);
@@ -155,7 +156,7 @@ cond_syscall(compat_sys_process_vm_writev);
155cond_syscall(sys_pciconfig_read); 156cond_syscall(sys_pciconfig_read);
156cond_syscall(sys_pciconfig_write); 157cond_syscall(sys_pciconfig_write);
157cond_syscall(sys_pciconfig_iobase); 158cond_syscall(sys_pciconfig_iobase);
158cond_syscall(sys32_ipc); 159cond_syscall(compat_sys_s390_ipc);
159cond_syscall(ppc_rtas); 160cond_syscall(ppc_rtas);
160cond_syscall(sys_spu_run); 161cond_syscall(sys_spu_run);
161cond_syscall(sys_spu_create); 162cond_syscall(sys_spu_create);
@@ -199,6 +200,7 @@ cond_syscall(sys_perf_event_open);
199/* fanotify! */ 200/* fanotify! */
200cond_syscall(sys_fanotify_init); 201cond_syscall(sys_fanotify_init);
201cond_syscall(sys_fanotify_mark); 202cond_syscall(sys_fanotify_mark);
203cond_syscall(compat_sys_fanotify_mark);
202 204
203/* open by handle */ 205/* open by handle */
204cond_syscall(sys_name_to_handle_at); 206cond_syscall(sys_name_to_handle_at);
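
The cond_syscall() additions above give the new compat entry points a default stub that returns -ENOSYS whenever the corresponding feature is compiled out, so userspace can probe for them safely. A hedged sketch of such a probe (fanotify is used purely as an example of a cond_syscall-guarded entry point):

/* Probe a conditionally-built syscall: a cond_syscall() stub answers
 * with ENOSYS instead of the call simply not existing. */
#include <errno.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
#ifdef SYS_fanotify_init
    long fd = syscall(SYS_fanotify_init, 0, 0);

    if (fd < 0 && errno == ENOSYS)
        printf("fanotify is not compiled into this kernel\n");
    else if (fd < 0)
        perror("fanotify_init");   /* e.g. EPERM without CAP_SYS_ADMIN */
    else
        close((int)fd);
#else
    printf("SYS_fanotify_init is unknown to this libc\n");
#endif
    return 0;
}
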
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index afc1dc60f3f8..9edcf456e0fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit;
106#endif 106#endif
107extern int pid_max; 107extern int pid_max;
108extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
109extern int sysctl_drop_caches;
110extern int percpu_pagelist_fraction; 109extern int percpu_pagelist_fraction;
111extern int compat_log; 110extern int compat_log;
112extern int latencytop_enabled; 111extern int latencytop_enabled;
@@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = {
1430 .extra2 = &one, 1429 .extra2 = &one,
1431 }, 1430 },
1432#endif 1431#endif
1432 {
1433 .procname = "user_reserve_kbytes",
1434 .data = &sysctl_user_reserve_kbytes,
1435 .maxlen = sizeof(sysctl_user_reserve_kbytes),
1436 .mode = 0644,
1437 .proc_handler = proc_doulongvec_minmax,
1438 },
1439 {
1440 .procname = "admin_reserve_kbytes",
1441 .data = &sysctl_admin_reserve_kbytes,
1442 .maxlen = sizeof(sysctl_admin_reserve_kbytes),
1443 .mode = 0644,
1444 .proc_handler = proc_doulongvec_minmax,
1445 },
1433 { } 1446 { }
1434}; 1447};
1435 1448
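
The two new vm_table entries export sysctl_user_reserve_kbytes and sysctl_admin_reserve_kbytes through /proc/sys/vm/ with proc_doulongvec_minmax() as the handler. A small read-only sketch (assumes procfs is mounted at /proc):

/* Print the two vm reserve knobs added above, if this kernel has them. */
#include <stdio.h>

static void show(const char *path)
{
    char buf[64];
    FILE *f = fopen(path, "r");

    if (f && fgets(buf, sizeof(buf), f))
        printf("%-36s %s", path, buf);
    else
        printf("%-36s (not available on this kernel)\n", path);
    if (f)
        fclose(f);
}

int main(void)
{
    show("/proc/sys/vm/user_reserve_kbytes");
    show("/proc/sys/vm/admin_reserve_kbytes");
    return 0;
}
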
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index ebf72358e86a..aea4a9ea6fc8 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -15,6 +15,7 @@
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/compat.h>
18 19
19#ifdef CONFIG_SYSCTL_SYSCALL 20#ifdef CONFIG_SYSCTL_SYSCALL
20 21
@@ -1447,7 +1448,6 @@ SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1447 1448
1448 1449
1449#ifdef CONFIG_COMPAT 1450#ifdef CONFIG_COMPAT
1450#include <asm/compat.h>
1451 1451
1452struct compat_sysctl_args { 1452struct compat_sysctl_args {
1453 compat_uptr_t name; 1453 compat_uptr_t name;
@@ -1459,7 +1459,7 @@ struct compat_sysctl_args {
1459 compat_ulong_t __unused[4]; 1459 compat_ulong_t __unused[4];
1460}; 1460};
1461 1461
1462asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args) 1462COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args)
1463{ 1463{
1464 struct compat_sysctl_args tmp; 1464 struct compat_sysctl_args tmp;
1465 compat_size_t __user *compat_oldlenp; 1465 compat_size_t __user *compat_oldlenp;
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index f8b11a283171..12d6ebbfdd83 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -365,7 +365,7 @@ int init_test_probes(void)
365 target2 = kprobe_target2; 365 target2 = kprobe_target2;
366 366
367 do { 367 do {
368 rand1 = random32(); 368 rand1 = prandom_u32();
369 } while (rand1 <= div_factor); 369 } while (rand1 <= div_factor);
370 370
371 printk(KERN_INFO "Kprobe smoke test started\n"); 371 printk(KERN_INFO "Kprobe smoke test started\n");
diff --git a/kernel/time.c b/kernel/time.c
index f8342a41efa6..d3617dbd3dca 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -138,13 +138,14 @@ int persistent_clock_is_local;
138 */ 138 */
139static inline void warp_clock(void) 139static inline void warp_clock(void)
140{ 140{
141 struct timespec adjust; 141 if (sys_tz.tz_minuteswest != 0) {
142 struct timespec adjust;
142 143
143 adjust = current_kernel_time();
144 if (sys_tz.tz_minuteswest != 0)
145 persistent_clock_is_local = 1; 144 persistent_clock_is_local = 1;
146 adjust.tv_sec += sys_tz.tz_minuteswest * 60; 145 adjust.tv_sec = sys_tz.tz_minuteswest * 60;
147 do_settimeofday(&adjust); 146 adjust.tv_nsec = 0;
147 timekeeping_inject_offset(&adjust);
148 }
148} 149}
149 150
150/* 151/*
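
The reworked warp_clock() above now acts only when settimeofday() supplied a non-zero tz_minuteswest, and it injects tz_minuteswest * 60 seconds through timekeeping_inject_offset() instead of re-reading and re-setting the clock, which closes the read-modify-write window of the old code. The sketch below merely computes the offset that would be injected and changes nothing; it assumes the libc fills the (obsolete) timezone argument from the kernel's sys_tz:

/* Show the offset warp_clock() would inject for the current sys_tz. */
#include <stdio.h>
#include <sys/time.h>

int main(void)
{
    struct timeval tv;
    struct timezone tz;

    if (gettimeofday(&tv, &tz) != 0) {
        perror("gettimeofday");
        return 1;
    }
    printf("tz_minuteswest = %d\n", tz.tz_minuteswest);
    printf("warp_clock() would add %d seconds via timekeeping_inject_offset()\n",
           tz.tz_minuteswest * 60);
    return 0;
}
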
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 24510d84efd7..70f27e89012b 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,11 +12,6 @@ config CLOCKSOURCE_WATCHDOG
12config ARCH_CLOCKSOURCE_DATA 12config ARCH_CLOCKSOURCE_DATA
13 bool 13 bool
14 14
15# Platforms has a persistent clock
16config ALWAYS_USE_PERSISTENT_CLOCK
17 bool
18 default n
19
20# Timekeeping vsyscall support 15# Timekeeping vsyscall support
21config GENERIC_TIME_VSYSCALL 16config GENERIC_TIME_VSYSCALL
22 bool 17 bool
@@ -64,20 +59,88 @@ config GENERIC_CMOS_UPDATE
64if GENERIC_CLOCKEVENTS 59if GENERIC_CLOCKEVENTS
65menu "Timers subsystem" 60menu "Timers subsystem"
66 61
67# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is 62# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
68# only related to the tick functionality. Oneshot clockevent devices 63# only related to the tick functionality. Oneshot clockevent devices
69# are supported independent of this. 64# are supported independent of this.
70config TICK_ONESHOT 65config TICK_ONESHOT
71 bool 66 bool
72 67
73config NO_HZ 68config NO_HZ_COMMON
74 bool "Tickless System (Dynamic Ticks)" 69 bool
75 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 70 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
76 select TICK_ONESHOT 71 select TICK_ONESHOT
72
73choice
74 prompt "Timer tick handling"
75 default NO_HZ_IDLE if NO_HZ
76
77config HZ_PERIODIC
78 bool "Periodic timer ticks (constant rate, no dynticks)"
79 help
80 This option keeps the tick running periodically at a constant
81 rate, even when the CPU doesn't need it.
82
83config NO_HZ_IDLE
84 bool "Idle dynticks system (tickless idle)"
85 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
86 select NO_HZ_COMMON
87 help
88 This option enables a tickless idle system: timer interrupts
89 will only trigger on an as-needed basis when the system is idle.
90 This is usually interesting for energy saving.
91
92 Most of the time you want to say Y here.
93
94config NO_HZ_FULL
95 bool "Full dynticks system (tickless)"
96 # NO_HZ_COMMON dependency
97 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
98 # We need at least one periodic CPU for timekeeping
99 depends on SMP
100 # RCU_USER_QS dependency
101 depends on HAVE_CONTEXT_TRACKING
102 # VIRT_CPU_ACCOUNTING_GEN dependency
103 depends on 64BIT
104 select NO_HZ_COMMON
105 select RCU_USER_QS
106 select RCU_NOCB_CPU
107 select VIRT_CPU_ACCOUNTING_GEN
108 select CONTEXT_TRACKING_FORCE
109 select IRQ_WORK
110 help
111	 Adaptively try to shut down the tick whenever possible, even when
112 the CPU is running tasks. Typically this requires running a single
113 task on the CPU. Chances for running tickless are maximized when
114	 the task mostly runs in userspace and has little kernel activity.
115
116 You need to fill up the nohz_full boot parameter with the
117 desired range of dynticks CPUs.
118
119 This is implemented at the expense of some overhead in user <-> kernel
120 transitions: syscalls, exceptions and interrupts. Even when it's
121 dynamically off.
122
123 Say N.
124
125endchoice
126
127config NO_HZ_FULL_ALL
128 bool "Full dynticks system on all CPUs by default"
129 depends on NO_HZ_FULL
130 help
131 If the user doesn't pass the nohz_full boot option to
132 define the range of full dynticks CPUs, consider that all
133 CPUs in the system are full dynticks by default.
134 Note the boot CPU will still be kept outside the range to
135 handle the timekeeping duty.
136
137config NO_HZ
138 bool "Old Idle dynticks config"
139 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
77 help 140 help
78 This option enables a tickless system: timer interrupts will 141 This is the old config entry that enables dynticks idle.
79 only trigger on an as-needed basis both when the system is 142 We keep it around for a little while to enforce backward
80 busy and when the system is idle. 143 compatibility with older config files.
81 144
82config HIGH_RES_TIMERS 145config HIGH_RES_TIMERS
83 bool "High Resolution Timer Support" 146 bool "High Resolution Timer Support"
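
The NO_HZ_FULL help text above says the desired range of full dynticks CPUs must be passed with the nohz_full= boot parameter. A small sketch that only checks whether the running kernel was booted with such a range (it just scans /proc/cmdline; nothing else is assumed about the kernel):

/* Report whether a nohz_full= CPU range was given on the command line. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char cmdline[4096];
    FILE *f = fopen("/proc/cmdline", "r");

    if (!f || !fgets(cmdline, sizeof(cmdline), f)) {
        perror("/proc/cmdline");
        if (f)
            fclose(f);
        return 1;
    }
    fclose(f);

    if (strstr(cmdline, "nohz_full="))
        printf("full dynticks range requested: %s", cmdline);
    else
        printf("no nohz_full= range on the kernel command line\n");
    return 0;
}
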
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 072bb066bb7d..8f5b3b98577b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -18,13 +18,14 @@
18#include <linux/rtc.h> 18#include <linux/rtc.h>
19 19
20#include "tick-internal.h" 20#include "tick-internal.h"
21#include "ntp_internal.h"
21 22
22/* 23/*
23 * NTP timekeeping variables: 24 * NTP timekeeping variables:
25 *
26 * Note: All of the NTP state is protected by the timekeeping locks.
24 */ 27 */
25 28
26DEFINE_RAW_SPINLOCK(ntp_lock);
27
28 29
29/* USER_HZ period (usecs): */ 30/* USER_HZ period (usecs): */
30unsigned long tick_usec = TICK_USEC; 31unsigned long tick_usec = TICK_USEC;
@@ -53,9 +54,6 @@ static int time_state = TIME_OK;
53/* clock status bits: */ 54/* clock status bits: */
54static int time_status = STA_UNSYNC; 55static int time_status = STA_UNSYNC;
55 56
56/* TAI offset (secs): */
57static long time_tai;
58
59/* time adjustment (nsecs): */ 57/* time adjustment (nsecs): */
60static s64 time_offset; 58static s64 time_offset;
61 59
@@ -134,8 +132,6 @@ static inline void pps_reset_freq_interval(void)
134 132
135/** 133/**
136 * pps_clear - Clears the PPS state variables 134 * pps_clear - Clears the PPS state variables
137 *
138 * Must be called while holding a write on the ntp_lock
139 */ 135 */
140static inline void pps_clear(void) 136static inline void pps_clear(void)
141{ 137{
@@ -150,8 +146,6 @@ static inline void pps_clear(void)
150/* Decrease pps_valid to indicate that another second has passed since 146/* Decrease pps_valid to indicate that another second has passed since
151 * the last PPS signal. When it reaches 0, indicate that PPS signal is 147 * the last PPS signal. When it reaches 0, indicate that PPS signal is
152 * missing. 148 * missing.
153 *
154 * Must be called while holding a write on the ntp_lock
155 */ 149 */
156static inline void pps_dec_valid(void) 150static inline void pps_dec_valid(void)
157{ 151{
@@ -346,10 +340,6 @@ static void ntp_update_offset(long offset)
346 */ 340 */
347void ntp_clear(void) 341void ntp_clear(void)
348{ 342{
349 unsigned long flags;
350
351 raw_spin_lock_irqsave(&ntp_lock, flags);
352
353 time_adjust = 0; /* stop active adjtime() */ 343 time_adjust = 0; /* stop active adjtime() */
354 time_status |= STA_UNSYNC; 344 time_status |= STA_UNSYNC;
355 time_maxerror = NTP_PHASE_LIMIT; 345 time_maxerror = NTP_PHASE_LIMIT;
@@ -362,20 +352,12 @@ void ntp_clear(void)
362 352
363 /* Clear PPS state variables */ 353 /* Clear PPS state variables */
364 pps_clear(); 354 pps_clear();
365 raw_spin_unlock_irqrestore(&ntp_lock, flags);
366
367} 355}
368 356
369 357
370u64 ntp_tick_length(void) 358u64 ntp_tick_length(void)
371{ 359{
372 unsigned long flags; 360 return tick_length;
373 s64 ret;
374
375 raw_spin_lock_irqsave(&ntp_lock, flags);
376 ret = tick_length;
377 raw_spin_unlock_irqrestore(&ntp_lock, flags);
378 return ret;
379} 361}
380 362
381 363
@@ -393,9 +375,6 @@ int second_overflow(unsigned long secs)
393{ 375{
394 s64 delta; 376 s64 delta;
395 int leap = 0; 377 int leap = 0;
396 unsigned long flags;
397
398 raw_spin_lock_irqsave(&ntp_lock, flags);
399 378
400 /* 379 /*
401 * Leap second processing. If in leap-insert state at the end of the 380 * Leap second processing. If in leap-insert state at the end of the
@@ -415,7 +394,6 @@ int second_overflow(unsigned long secs)
415 else if (secs % 86400 == 0) { 394 else if (secs % 86400 == 0) {
416 leap = -1; 395 leap = -1;
417 time_state = TIME_OOP; 396 time_state = TIME_OOP;
418 time_tai++;
419 printk(KERN_NOTICE 397 printk(KERN_NOTICE
420 "Clock: inserting leap second 23:59:60 UTC\n"); 398 "Clock: inserting leap second 23:59:60 UTC\n");
421 } 399 }
@@ -425,7 +403,6 @@ int second_overflow(unsigned long secs)
425 time_state = TIME_OK; 403 time_state = TIME_OK;
426 else if ((secs + 1) % 86400 == 0) { 404 else if ((secs + 1) % 86400 == 0) {
427 leap = 1; 405 leap = 1;
428 time_tai--;
429 time_state = TIME_WAIT; 406 time_state = TIME_WAIT;
430 printk(KERN_NOTICE 407 printk(KERN_NOTICE
431 "Clock: deleting leap second 23:59:59 UTC\n"); 408 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -479,8 +456,6 @@ int second_overflow(unsigned long secs)
479 time_adjust = 0; 456 time_adjust = 0;
480 457
481out: 458out:
482 raw_spin_unlock_irqrestore(&ntp_lock, flags);
483
484 return leap; 459 return leap;
485} 460}
486 461
@@ -575,11 +550,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
575 time_status |= txc->status & ~STA_RONLY; 550 time_status |= txc->status & ~STA_RONLY;
576} 551}
577 552
578/* 553
579 * Called with ntp_lock held, so we can access and modify 554static inline void process_adjtimex_modes(struct timex *txc,
580 * all the global NTP state: 555 struct timespec *ts,
581 */ 556 s32 *time_tai)
582static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
583{ 557{
584 if (txc->modes & ADJ_STATUS) 558 if (txc->modes & ADJ_STATUS)
585 process_adj_status(txc, ts); 559 process_adj_status(txc, ts);
@@ -613,7 +587,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
613 } 587 }
614 588
615 if (txc->modes & ADJ_TAI && txc->constant > 0) 589 if (txc->modes & ADJ_TAI && txc->constant > 0)
616 time_tai = txc->constant; 590 *time_tai = txc->constant;
617 591
618 if (txc->modes & ADJ_OFFSET) 592 if (txc->modes & ADJ_OFFSET)
619 ntp_update_offset(txc->offset); 593 ntp_update_offset(txc->offset);
@@ -625,16 +599,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
625 ntp_update_frequency(); 599 ntp_update_frequency();
626} 600}
627 601
628/* 602
629 * adjtimex mainly allows reading (and writing, if superuser) of 603
630 * kernel time-keeping variables. used by xntpd. 604/**
605 * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
631 */ 606 */
632int do_adjtimex(struct timex *txc) 607int ntp_validate_timex(struct timex *txc)
633{ 608{
634 struct timespec ts;
635 int result;
636
637 /* Validate the data before disabling interrupts */
638 if (txc->modes & ADJ_ADJTIME) { 609 if (txc->modes & ADJ_ADJTIME) {
639 /* singleshot must not be used with any other mode bits */ 610 /* singleshot must not be used with any other mode bits */
640 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) 611 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
@@ -646,7 +617,6 @@ int do_adjtimex(struct timex *txc)
646 /* In order to modify anything, you gotta be super-user! */ 617 /* In order to modify anything, you gotta be super-user! */
647 if (txc->modes && !capable(CAP_SYS_TIME)) 618 if (txc->modes && !capable(CAP_SYS_TIME))
648 return -EPERM; 619 return -EPERM;
649
650 /* 620 /*
651 * if the quartz is off by more than 10% then 621 * if the quartz is off by more than 10% then
652 * something is VERY wrong! 622 * something is VERY wrong!
@@ -657,22 +627,20 @@ int do_adjtimex(struct timex *txc)
657 return -EINVAL; 627 return -EINVAL;
658 } 628 }
659 629
660 if (txc->modes & ADJ_SETOFFSET) { 630 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
661 struct timespec delta; 631 return -EPERM;
662 delta.tv_sec = txc->time.tv_sec; 632
663 delta.tv_nsec = txc->time.tv_usec; 633 return 0;
664 if (!capable(CAP_SYS_TIME)) 634}
665 return -EPERM;
666 if (!(txc->modes & ADJ_NANO))
667 delta.tv_nsec *= 1000;
668 result = timekeeping_inject_offset(&delta);
669 if (result)
670 return result;
671 }
672 635
673 getnstimeofday(&ts);
674 636
675 raw_spin_lock_irq(&ntp_lock); 637/*
638 * adjtimex mainly allows reading (and writing, if superuser) of
639 * kernel time-keeping variables. used by xntpd.
640 */
641int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
642{
643 int result;
676 644
677 if (txc->modes & ADJ_ADJTIME) { 645 if (txc->modes & ADJ_ADJTIME) {
678 long save_adjust = time_adjust; 646 long save_adjust = time_adjust;
@@ -687,7 +655,7 @@ int do_adjtimex(struct timex *txc)
687 655
688 /* If there are input parameters, then process them: */ 656 /* If there are input parameters, then process them: */
689 if (txc->modes) 657 if (txc->modes)
690 process_adjtimex_modes(txc, &ts); 658 process_adjtimex_modes(txc, ts, time_tai);
691 659
692 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, 660 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
693 NTP_SCALE_SHIFT); 661 NTP_SCALE_SHIFT);
@@ -709,15 +677,13 @@ int do_adjtimex(struct timex *txc)
709 txc->precision = 1; 677 txc->precision = 1;
710 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; 678 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
711 txc->tick = tick_usec; 679 txc->tick = tick_usec;
712 txc->tai = time_tai; 680 txc->tai = *time_tai;
713 681
714 /* fill PPS status fields */ 682 /* fill PPS status fields */
715 pps_fill_timex(txc); 683 pps_fill_timex(txc);
716 684
717 raw_spin_unlock_irq(&ntp_lock); 685 txc->time.tv_sec = ts->tv_sec;
718 686 txc->time.tv_usec = ts->tv_nsec;
719 txc->time.tv_sec = ts.tv_sec;
720 txc->time.tv_usec = ts.tv_nsec;
721 if (!(time_status & STA_NANO)) 687 if (!(time_status & STA_NANO))
722 txc->time.tv_usec /= NSEC_PER_USEC; 688 txc->time.tv_usec /= NSEC_PER_USEC;
723 689
@@ -894,7 +860,7 @@ static void hardpps_update_phase(long error)
894} 860}
895 861
896/* 862/*
897 * hardpps() - discipline CPU clock oscillator to external PPS signal 863 * __hardpps() - discipline CPU clock oscillator to external PPS signal
898 * 864 *
899 * This routine is called at each PPS signal arrival in order to 865 * This routine is called at each PPS signal arrival in order to
900 * discipline the CPU clock oscillator to the PPS signal. It takes two 866 * discipline the CPU clock oscillator to the PPS signal. It takes two
@@ -905,15 +871,12 @@ static void hardpps_update_phase(long error)
905 * This code is based on David Mills's reference nanokernel 871 * This code is based on David Mills's reference nanokernel
906 * implementation. It was mostly rewritten but keeps the same idea. 872 * implementation. It was mostly rewritten but keeps the same idea.
907 */ 873 */
908void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) 874void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
909{ 875{
910 struct pps_normtime pts_norm, freq_norm; 876 struct pps_normtime pts_norm, freq_norm;
911 unsigned long flags;
912 877
913 pts_norm = pps_normalize_ts(*phase_ts); 878 pts_norm = pps_normalize_ts(*phase_ts);
914 879
915 raw_spin_lock_irqsave(&ntp_lock, flags);
916
917 /* clear the error bits, they will be set again if needed */ 880 /* clear the error bits, they will be set again if needed */
918 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 881 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
919 882
@@ -925,7 +888,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
925 * just start the frequency interval */ 888 * just start the frequency interval */
926 if (unlikely(pps_fbase.tv_sec == 0)) { 889 if (unlikely(pps_fbase.tv_sec == 0)) {
927 pps_fbase = *raw_ts; 890 pps_fbase = *raw_ts;
928 raw_spin_unlock_irqrestore(&ntp_lock, flags);
929 return; 891 return;
930 } 892 }
931 893
@@ -940,7 +902,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
940 time_status |= STA_PPSJITTER; 902 time_status |= STA_PPSJITTER;
941 /* restart the frequency calibration interval */ 903 /* restart the frequency calibration interval */
942 pps_fbase = *raw_ts; 904 pps_fbase = *raw_ts;
943 raw_spin_unlock_irqrestore(&ntp_lock, flags);
944 pr_err("hardpps: PPSJITTER: bad pulse\n"); 905 pr_err("hardpps: PPSJITTER: bad pulse\n");
945 return; 906 return;
946 } 907 }
@@ -957,10 +918,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
957 918
958 hardpps_update_phase(pts_norm.nsec); 919 hardpps_update_phase(pts_norm.nsec);
959 920
960 raw_spin_unlock_irqrestore(&ntp_lock, flags);
961} 921}
962EXPORT_SYMBOL(hardpps);
963
964#endif /* CONFIG_NTP_PPS */ 922#endif /* CONFIG_NTP_PPS */
965 923
966static int __init ntp_tick_adj_setup(char *str) 924static int __init ntp_tick_adj_setup(char *str)
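
The ntp.c changes above drop the private ntp_lock and rely on the timekeeping core for serialization: do_adjtimex() is split into ntp_validate_timex() plus __do_adjtimex(), which now receives the timespec and TAI offset from its caller, and hardpps() becomes __hardpps(), with all NTP state now protected by the timekeeping locks (see the new comment at the top of the file). From userspace the entry point is still adjtimex(2); a read-only call (modes = 0, so no CAP_SYS_TIME needed) shows the fields __do_adjtimex() fills in. Illustrative sketch using the standard interface:

/* Read-only adjtimex(): dumps offset, freq, status and the TAI offset. */
#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
    struct timex tx = { .modes = 0 };   /* no modification requested */
    int state = adjtimex(&tx);

    if (state < 0) {
        perror("adjtimex");
        return 1;
    }
    printf("clock state %d%s\n", state, state == TIME_OK ? " (TIME_OK)" : "");
    printf("offset %ld  freq %ld  status 0x%x  tai %d\n",
           (long)tx.offset, (long)tx.freq, (unsigned int)tx.status, tx.tai);
    return 0;
}
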
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
new file mode 100644
index 000000000000..1950cb4ca2a4
--- /dev/null
+++ b/kernel/time/ntp_internal.h
@@ -0,0 +1,12 @@
1#ifndef _LINUX_NTP_INTERNAL_H
2#define _LINUX_NTP_INTERNAL_H
3
4extern void ntp_init(void);
5extern void ntp_clear(void);
6/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
7extern u64 ntp_tick_length(void);
8extern int second_overflow(unsigned long secs);
9extern int ntp_validate_timex(struct timex *);
10extern int __do_adjtimex(struct timex *, struct timespec *, s32 *);
11extern void __hardpps(const struct timespec *, const struct timespec *);
12#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2fb8cb88df8d..0c739423b0f9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,9 +28,8 @@
28 */ 28 */
29 29
30static struct tick_device tick_broadcast_device; 30static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31static cpumask_var_t tick_broadcast_mask;
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static cpumask_var_t tmpmask;
33static DECLARE_BITMAP(tmpmask, NR_CPUS);
34static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); 33static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
35static int tick_broadcast_force; 34static int tick_broadcast_force;
36 35
@@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void)
50 49
51struct cpumask *tick_get_broadcast_mask(void) 50struct cpumask *tick_get_broadcast_mask(void)
52{ 51{
53 return to_cpumask(tick_broadcast_mask); 52 return tick_broadcast_mask;
54} 53}
55 54
56/* 55/*
@@ -67,15 +66,30 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
67 */ 66 */
68int tick_check_broadcast_device(struct clock_event_device *dev) 67int tick_check_broadcast_device(struct clock_event_device *dev)
69{ 68{
70 if ((tick_broadcast_device.evtdev && 69 struct clock_event_device *cur = tick_broadcast_device.evtdev;
70
71 if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
72 (tick_broadcast_device.evtdev &&
71 tick_broadcast_device.evtdev->rating >= dev->rating) || 73 tick_broadcast_device.evtdev->rating >= dev->rating) ||
72 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 74 (dev->features & CLOCK_EVT_FEAT_C3STOP))
73 return 0; 75 return 0;
74 76
75 clockevents_exchange_device(tick_broadcast_device.evtdev, dev); 77 clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
78 if (cur)
79 cur->event_handler = clockevents_handle_noop;
76 tick_broadcast_device.evtdev = dev; 80 tick_broadcast_device.evtdev = dev;
77 if (!cpumask_empty(tick_get_broadcast_mask())) 81 if (!cpumask_empty(tick_broadcast_mask))
78 tick_broadcast_start_periodic(dev); 82 tick_broadcast_start_periodic(dev);
83 /*
84 * Inform all cpus about this. We might be in a situation
85 * where we did not switch to oneshot mode because the per cpu
86 * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack
87 * of a oneshot capable broadcast device. Without that
89 * notification the system stays stuck in periodic mode
89 * forever.
90 */
91 if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
92 tick_clock_notify();
79 return 1; 93 return 1;
80} 94}
81 95
@@ -123,7 +137,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
123 if (!tick_device_is_functional(dev)) { 137 if (!tick_device_is_functional(dev)) {
124 dev->event_handler = tick_handle_periodic; 138 dev->event_handler = tick_handle_periodic;
125 tick_device_setup_broadcast_func(dev); 139 tick_device_setup_broadcast_func(dev);
126 cpumask_set_cpu(cpu, tick_get_broadcast_mask()); 140 cpumask_set_cpu(cpu, tick_broadcast_mask);
127 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 141 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
128 ret = 1; 142 ret = 1;
129 } else { 143 } else {
@@ -134,7 +148,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
134 */ 148 */
135 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 149 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
136 int cpu = smp_processor_id(); 150 int cpu = smp_processor_id();
137 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 151 cpumask_clear_cpu(cpu, tick_broadcast_mask);
138 tick_broadcast_clear_oneshot(cpu); 152 tick_broadcast_clear_oneshot(cpu);
139 } else { 153 } else {
140 tick_device_setup_broadcast_func(dev); 154 tick_device_setup_broadcast_func(dev);
@@ -198,9 +212,8 @@ static void tick_do_periodic_broadcast(void)
198{ 212{
199 raw_spin_lock(&tick_broadcast_lock); 213 raw_spin_lock(&tick_broadcast_lock);
200 214
201 cpumask_and(to_cpumask(tmpmask), 215 cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
202 cpu_online_mask, tick_get_broadcast_mask()); 216 tick_do_broadcast(tmpmask);
203 tick_do_broadcast(to_cpumask(tmpmask));
204 217
205 raw_spin_unlock(&tick_broadcast_lock); 218 raw_spin_unlock(&tick_broadcast_lock);
206} 219}
@@ -263,13 +276,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
263 if (!tick_device_is_functional(dev)) 276 if (!tick_device_is_functional(dev))
264 goto out; 277 goto out;
265 278
266 bc_stopped = cpumask_empty(tick_get_broadcast_mask()); 279 bc_stopped = cpumask_empty(tick_broadcast_mask);
267 280
268 switch (*reason) { 281 switch (*reason) {
269 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 282 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
270 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 283 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
271 if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { 284 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
272 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
273 if (tick_broadcast_device.mode == 285 if (tick_broadcast_device.mode ==
274 TICKDEV_MODE_PERIODIC) 286 TICKDEV_MODE_PERIODIC)
275 clockevents_shutdown(dev); 287 clockevents_shutdown(dev);
@@ -279,8 +291,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
279 break; 291 break;
280 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 292 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
281 if (!tick_broadcast_force && 293 if (!tick_broadcast_force &&
282 cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { 294 cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
283 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
284 if (tick_broadcast_device.mode == 295 if (tick_broadcast_device.mode ==
285 TICKDEV_MODE_PERIODIC) 296 TICKDEV_MODE_PERIODIC)
286 tick_setup_periodic(dev, 0); 297 tick_setup_periodic(dev, 0);
@@ -288,7 +299,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
288 break; 299 break;
289 } 300 }
290 301
291 if (cpumask_empty(tick_get_broadcast_mask())) { 302 if (cpumask_empty(tick_broadcast_mask)) {
292 if (!bc_stopped) 303 if (!bc_stopped)
293 clockevents_shutdown(bc); 304 clockevents_shutdown(bc);
294 } else if (bc_stopped) { 305 } else if (bc_stopped) {
@@ -337,10 +348,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
337 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 348 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
338 349
339 bc = tick_broadcast_device.evtdev; 350 bc = tick_broadcast_device.evtdev;
340 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 351 cpumask_clear_cpu(cpu, tick_broadcast_mask);
341 352
342 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 353 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
343 if (bc && cpumask_empty(tick_get_broadcast_mask())) 354 if (bc && cpumask_empty(tick_broadcast_mask))
344 clockevents_shutdown(bc); 355 clockevents_shutdown(bc);
345 } 356 }
346 357
@@ -376,13 +387,13 @@ int tick_resume_broadcast(void)
376 387
377 switch (tick_broadcast_device.mode) { 388 switch (tick_broadcast_device.mode) {
378 case TICKDEV_MODE_PERIODIC: 389 case TICKDEV_MODE_PERIODIC:
379 if (!cpumask_empty(tick_get_broadcast_mask())) 390 if (!cpumask_empty(tick_broadcast_mask))
380 tick_broadcast_start_periodic(bc); 391 tick_broadcast_start_periodic(bc);
381 broadcast = cpumask_test_cpu(smp_processor_id(), 392 broadcast = cpumask_test_cpu(smp_processor_id(),
382 tick_get_broadcast_mask()); 393 tick_broadcast_mask);
383 break; 394 break;
384 case TICKDEV_MODE_ONESHOT: 395 case TICKDEV_MODE_ONESHOT:
385 if (!cpumask_empty(tick_get_broadcast_mask())) 396 if (!cpumask_empty(tick_broadcast_mask))
386 broadcast = tick_resume_broadcast_oneshot(bc); 397 broadcast = tick_resume_broadcast_oneshot(bc);
387 break; 398 break;
388 } 399 }
@@ -395,25 +406,58 @@ int tick_resume_broadcast(void)
395 406
396#ifdef CONFIG_TICK_ONESHOT 407#ifdef CONFIG_TICK_ONESHOT
397 408
398/* FIXME: use cpumask_var_t. */ 409static cpumask_var_t tick_broadcast_oneshot_mask;
399static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); 410static cpumask_var_t tick_broadcast_pending_mask;
411static cpumask_var_t tick_broadcast_force_mask;
400 412
401/* 413/*
402 * Exposed for debugging: see timer_list.c 414 * Exposed for debugging: see timer_list.c
403 */ 415 */
404struct cpumask *tick_get_broadcast_oneshot_mask(void) 416struct cpumask *tick_get_broadcast_oneshot_mask(void)
405{ 417{
406 return to_cpumask(tick_broadcast_oneshot_mask); 418 return tick_broadcast_oneshot_mask;
407} 419}
408 420
409static int tick_broadcast_set_event(ktime_t expires, int force) 421/*
422 * Called before going idle with interrupts disabled. Checks whether a
423 * broadcast event from the other core is about to happen. We detected
424 * that in tick_broadcast_oneshot_control(). The callsite can use this
425 * to avoid a deep idle transition as we are about to get the
426 * broadcast IPI right away.
427 */
428int tick_check_broadcast_expired(void)
410{ 429{
411 struct clock_event_device *bc = tick_broadcast_device.evtdev; 430 return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
431}
432
433/*
434 * Set broadcast interrupt affinity
435 */
436static void tick_broadcast_set_affinity(struct clock_event_device *bc,
437 const struct cpumask *cpumask)
438{
439 if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ))
440 return;
441
442 if (cpumask_equal(bc->cpumask, cpumask))
443 return;
444
445 bc->cpumask = cpumask;
446 irq_set_affinity(bc->irq, bc->cpumask);
447}
448
449static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
450 ktime_t expires, int force)
451{
452 int ret;
412 453
413 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) 454 if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
414 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 455 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
415 456
416 return clockevents_program_event(bc, expires, force); 457 ret = clockevents_program_event(bc, expires, force);
458 if (!ret)
459 tick_broadcast_set_affinity(bc, cpumask_of(cpu));
460 return ret;
417} 461}
418 462
419int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 463int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -428,7 +472,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
428 */ 472 */
429void tick_check_oneshot_broadcast(int cpu) 473void tick_check_oneshot_broadcast(int cpu)
430{ 474{
431 if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { 475 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
432 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 476 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
433 477
434 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 478 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -442,27 +486,45 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
442{ 486{
443 struct tick_device *td; 487 struct tick_device *td;
444 ktime_t now, next_event; 488 ktime_t now, next_event;
445 int cpu; 489 int cpu, next_cpu = 0;
446 490
447 raw_spin_lock(&tick_broadcast_lock); 491 raw_spin_lock(&tick_broadcast_lock);
448again: 492again:
449 dev->next_event.tv64 = KTIME_MAX; 493 dev->next_event.tv64 = KTIME_MAX;
450 next_event.tv64 = KTIME_MAX; 494 next_event.tv64 = KTIME_MAX;
451 cpumask_clear(to_cpumask(tmpmask)); 495 cpumask_clear(tmpmask);
452 now = ktime_get(); 496 now = ktime_get();
453 /* Find all expired events */ 497 /* Find all expired events */
454 for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { 498 for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
455 td = &per_cpu(tick_cpu_device, cpu); 499 td = &per_cpu(tick_cpu_device, cpu);
456 if (td->evtdev->next_event.tv64 <= now.tv64) 500 if (td->evtdev->next_event.tv64 <= now.tv64) {
457 cpumask_set_cpu(cpu, to_cpumask(tmpmask)); 501 cpumask_set_cpu(cpu, tmpmask);
458 else if (td->evtdev->next_event.tv64 < next_event.tv64) 502 /*
503 * Mark the remote cpu in the pending mask, so
504 * it can avoid reprogramming the cpu local
505 * timer in tick_broadcast_oneshot_control().
506 */
507 cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
508 } else if (td->evtdev->next_event.tv64 < next_event.tv64) {
459 next_event.tv64 = td->evtdev->next_event.tv64; 509 next_event.tv64 = td->evtdev->next_event.tv64;
510 next_cpu = cpu;
511 }
460 } 512 }
461 513
462 /* 514 /*
515 * Remove the current cpu from the pending mask. The event is
516 * delivered immediately in tick_do_broadcast() !
517 */
518 cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask);
519
520 /* Take care of enforced broadcast requests */
521 cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
522 cpumask_clear(tick_broadcast_force_mask);
523
524 /*
463 * Wakeup the cpus which have an expired event. 525 * Wakeup the cpus which have an expired event.
464 */ 526 */
465 tick_do_broadcast(to_cpumask(tmpmask)); 527 tick_do_broadcast(tmpmask);
466 528
467 /* 529 /*
468 * Two reasons for reprogram: 530 * Two reasons for reprogram:
@@ -479,7 +541,7 @@ again:
479 * Rearm the broadcast device. If event expired, 541 * Rearm the broadcast device. If event expired,
480 * repeat the above 542 * repeat the above
481 */ 543 */
482 if (tick_broadcast_set_event(next_event, 0)) 544 if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
483 goto again; 545 goto again;
484 } 546 }
485 raw_spin_unlock(&tick_broadcast_lock); 547 raw_spin_unlock(&tick_broadcast_lock);
@@ -494,6 +556,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
494 struct clock_event_device *bc, *dev; 556 struct clock_event_device *bc, *dev;
495 struct tick_device *td; 557 struct tick_device *td;
496 unsigned long flags; 558 unsigned long flags;
559 ktime_t now;
497 int cpu; 560 int cpu;
498 561
499 /* 562 /*
@@ -518,21 +581,84 @@ void tick_broadcast_oneshot_control(unsigned long reason)
518 581
519 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 582 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
520 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 583 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
521 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 584 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
522 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); 585 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
523 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 586 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
524 if (dev->next_event.tv64 < bc->next_event.tv64) 587 /*
525 tick_broadcast_set_event(dev->next_event, 1); 588 * We only reprogram the broadcast timer if we
589 * did not mark ourself in the force mask and
590 * if the cpu local event is earlier than the
591 * broadcast event. If the current CPU is in
592 * the force mask, then we are going to be
593 * woken by the IPI right away.
594 */
595 if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
596 dev->next_event.tv64 < bc->next_event.tv64)
597 tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
526 } 598 }
527 } else { 599 } else {
528 if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 600 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
529 cpumask_clear_cpu(cpu,
530 tick_get_broadcast_oneshot_mask());
531 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 601 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
532 if (dev->next_event.tv64 != KTIME_MAX) 602 if (dev->next_event.tv64 == KTIME_MAX)
533 tick_program_event(dev->next_event, 1); 603 goto out;
604 /*
605 * The cpu which was handling the broadcast
606 * timer marked this cpu in the broadcast
607 * pending mask and fired the broadcast
608 * IPI. So we are going to handle the expired
609 * event anyway via the broadcast IPI
610 * handler. No need to reprogram the timer
611 * with an already expired event.
612 */
613 if (cpumask_test_and_clear_cpu(cpu,
614 tick_broadcast_pending_mask))
615 goto out;
616
617 /*
618 * If the pending bit is not set, then we are
619 * either the CPU handling the broadcast
620 * interrupt or we got woken by something else.
621 *
622 * We are no longer in the broadcast mask, so
623 * if the cpu local expiry time is already
624 * reached, we would reprogram the cpu local
625 * timer with an already expired event.
626 *
627 * This can lead to a ping-pong when we return
628 * to idle and therefore rearm the broadcast
629 * timer before the cpu local timer was able
630 * to fire. This happens because the forced
631 * reprogramming makes sure that the event
632 * will happen in the future and depending on
633 * the min_delta setting this might be far
634 * enough out that the ping-pong starts.
635 *
636 * If the cpu local next_event has expired
637 * then we know that the broadcast timer
638 * next_event has expired as well and
639 * broadcast is about to be handled. So we
640 * avoid reprogramming and enforce that the
641 * broadcast handler, which did not run yet,
642 * will invoke the cpu local handler.
643 *
644 * We cannot call the handler directly from
645 * here, because we might be in a NOHZ phase
646 * and we did not go through the irq_enter()
647 * nohz fixups.
648 */
649 now = ktime_get();
650 if (dev->next_event.tv64 <= now.tv64) {
651 cpumask_set_cpu(cpu, tick_broadcast_force_mask);
652 goto out;
653 }
654 /*
655 * We got woken by something else. Reprogram
656 * the cpu local timer device.
657 */
658 tick_program_event(dev->next_event, 1);
534 } 659 }
535 } 660 }
661out:
536 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 662 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
537} 663}
538 664
@@ -543,7 +669,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
543 */ 669 */
544static void tick_broadcast_clear_oneshot(int cpu) 670static void tick_broadcast_clear_oneshot(int cpu)
545{ 671{
546 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 672 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
547} 673}
548 674
549static void tick_broadcast_init_next_event(struct cpumask *mask, 675static void tick_broadcast_init_next_event(struct cpumask *mask,
@@ -573,7 +699,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
573 bc->event_handler = tick_handle_oneshot_broadcast; 699 bc->event_handler = tick_handle_oneshot_broadcast;
574 700
575 /* Take the do_timer update */ 701 /* Take the do_timer update */
576 tick_do_timer_cpu = cpu; 702 if (!tick_nohz_full_cpu(cpu))
703 tick_do_timer_cpu = cpu;
577 704
578 /* 705 /*
579 * We must be careful here. There might be other CPUs 706 * We must be careful here. There might be other CPUs
@@ -581,17 +708,16 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
581 * oneshot_mask bits for those and program the 708 * oneshot_mask bits for those and program the
582 * broadcast device to fire. 709 * broadcast device to fire.
583 */ 710 */
584 cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); 711 cpumask_copy(tmpmask, tick_broadcast_mask);
585 cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); 712 cpumask_clear_cpu(cpu, tmpmask);
586 cpumask_or(tick_get_broadcast_oneshot_mask(), 713 cpumask_or(tick_broadcast_oneshot_mask,
587 tick_get_broadcast_oneshot_mask(), 714 tick_broadcast_oneshot_mask, tmpmask);
588 to_cpumask(tmpmask));
589 715
590 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { 716 if (was_periodic && !cpumask_empty(tmpmask)) {
591 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 717 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
592 tick_broadcast_init_next_event(to_cpumask(tmpmask), 718 tick_broadcast_init_next_event(tmpmask,
593 tick_next_period); 719 tick_next_period);
594 tick_broadcast_set_event(tick_next_period, 1); 720 tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
595 } else 721 } else
596 bc->next_event.tv64 = KTIME_MAX; 722 bc->next_event.tv64 = KTIME_MAX;
597 } else { 723 } else {
@@ -639,7 +765,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
639 * Clear the broadcast mask flag for the dead cpu, but do not 765 * Clear the broadcast mask flag for the dead cpu, but do not
640 * stop the broadcast device! 766 * stop the broadcast device!
641 */ 767 */
642 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 768 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
643 769
644 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 770 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
645} 771}
@@ -663,3 +789,14 @@ bool tick_broadcast_oneshot_available(void)
663} 789}
664 790
665#endif 791#endif
792
793void __init tick_broadcast_init(void)
794{
795 zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
796 zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
797#ifdef CONFIG_TICK_ONESHOT
798 zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
799 zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
800 zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
801#endif
802}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b1600a6973f4..5d3fb100bc06 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td,
163 * this cpu: 163 * this cpu:
164 */ 164 */
165 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { 165 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
166 tick_do_timer_cpu = cpu; 166 if (!tick_nohz_full_cpu(cpu))
167 tick_do_timer_cpu = cpu;
168 else
169 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
167 tick_next_period = ktime_get(); 170 tick_next_period = ktime_get();
168 tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 171 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
169 } 172 }
@@ -323,6 +326,7 @@ static void tick_shutdown(unsigned int *cpup)
323 */ 326 */
324 dev->mode = CLOCK_EVT_MODE_UNUSED; 327 dev->mode = CLOCK_EVT_MODE_UNUSED;
325 clockevents_exchange_device(dev, NULL); 328 clockevents_exchange_device(dev, NULL);
329 dev->event_handler = clockevents_handle_noop;
326 td->evtdev = NULL; 330 td->evtdev = NULL;
327 } 331 }
328 raw_spin_unlock_irqrestore(&tick_device_lock, flags); 332 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
@@ -416,4 +420,5 @@ static struct notifier_block tick_notifier = {
416void __init tick_init(void) 420void __init tick_init(void)
417{ 421{
418 clockevents_register_notifier(&tick_notifier); 422 clockevents_register_notifier(&tick_notifier);
423 tick_broadcast_init();
419} 424}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index cf3e59ed6dc0..f0299eae4602 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,8 @@
4#include <linux/hrtimer.h> 4#include <linux/hrtimer.h>
5#include <linux/tick.h> 5#include <linux/tick.h>
6 6
7extern seqlock_t jiffies_lock;
8
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD 9#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
8 10
9#define TICK_DO_TIMER_NONE -1 11#define TICK_DO_TIMER_NONE -1
@@ -94,7 +96,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
94extern void tick_shutdown_broadcast(unsigned int *cpup); 96extern void tick_shutdown_broadcast(unsigned int *cpup);
95extern void tick_suspend_broadcast(void); 97extern void tick_suspend_broadcast(void);
96extern int tick_resume_broadcast(void); 98extern int tick_resume_broadcast(void);
97 99extern void tick_broadcast_init(void);
98extern void 100extern void
99tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); 101tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
100 102
@@ -119,6 +121,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
119static inline void tick_shutdown_broadcast(unsigned int *cpup) { } 121static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
120static inline void tick_suspend_broadcast(void) { } 122static inline void tick_suspend_broadcast(void) { }
121static inline int tick_resume_broadcast(void) { return 0; } 123static inline int tick_resume_broadcast(void) { return 0; }
124static inline void tick_broadcast_init(void) { }
122 125
123/* 126/*
124 * Set the periodic handler in non broadcast mode 127 * Set the periodic handler in non broadcast mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a19a39952c1b..f4208138fbf4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -21,11 +21,15 @@
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/irq_work.h> 23#include <linux/irq_work.h>
24#include <linux/posix-timers.h>
25#include <linux/perf_event.h>
24 26
25#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
26 28
27#include "tick-internal.h" 29#include "tick-internal.h"
28 30
31#include <trace/events/timer.h>
32
29/* 33/*
30 * Per cpu nohz control structure 34 * Per cpu nohz control structure
31 */ 35 */
@@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now)
104{ 108{
105 int cpu = smp_processor_id(); 109 int cpu = smp_processor_id();
106 110
107#ifdef CONFIG_NO_HZ 111#ifdef CONFIG_NO_HZ_COMMON
108 /* 112 /*
109 * Check if the do_timer duty was dropped. We don't care about 113 * Check if the do_timer duty was dropped. We don't care about
110 * concurrency: This happens only when the cpu in charge went 114 * concurrency: This happens only when the cpu in charge went
@@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
112 * this duty, then the jiffies update is still serialized by 116 * this duty, then the jiffies update is still serialized by
113 * jiffies_lock. 117 * jiffies_lock.
114 */ 118 */
115 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) 119 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
120 && !tick_nohz_full_cpu(cpu))
116 tick_do_timer_cpu = cpu; 121 tick_do_timer_cpu = cpu;
117#endif 122#endif
118 123
@@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now)
123 128
124static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) 129static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
125{ 130{
126#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ_COMMON
127 /* 132 /*
128 * When we are idle and the tick is stopped, we have to touch 133 * When we are idle and the tick is stopped, we have to touch
129 * the watchdog as we might not schedule for a really long 134 * the watchdog as we might not schedule for a really long
@@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
142 profile_tick(CPU_PROFILING); 147 profile_tick(CPU_PROFILING);
143} 148}
144 149
150#ifdef CONFIG_NO_HZ_FULL
151static cpumask_var_t nohz_full_mask;
152bool have_nohz_full_mask;
153
154static bool can_stop_full_tick(void)
155{
156 WARN_ON_ONCE(!irqs_disabled());
157
158 if (!sched_can_stop_tick()) {
159 trace_tick_stop(0, "more than 1 task in runqueue\n");
160 return false;
161 }
162
163 if (!posix_cpu_timers_can_stop_tick(current)) {
164 trace_tick_stop(0, "posix timers running\n");
165 return false;
166 }
167
168 if (!perf_event_can_stop_tick()) {
169 trace_tick_stop(0, "perf events running\n");
170 return false;
171 }
172
173 /* sched_clock_tick() needs us? */
174#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
175 /*
176 * TODO: kick full dynticks CPUs when
177 * sched_clock_stable is set.
178 */
179 if (!sched_clock_stable) {
180 trace_tick_stop(0, "unstable sched clock\n");
181 return false;
182 }
183#endif
184
185 return true;
186}
187
188static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
189
190/*
191 * Re-evaluate the need for the tick on the current CPU
192 * and restart it if necessary.
193 */
194void tick_nohz_full_check(void)
195{
196 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
197
198 if (tick_nohz_full_cpu(smp_processor_id())) {
199 if (ts->tick_stopped && !is_idle_task(current)) {
200 if (!can_stop_full_tick())
201 tick_nohz_restart_sched_tick(ts, ktime_get());
202 }
203 }
204}
205
206static void nohz_full_kick_work_func(struct irq_work *work)
207{
208 tick_nohz_full_check();
209}
210
211static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
212 .func = nohz_full_kick_work_func,
213};
214
215/*
216 * Kick the current CPU if it's full dynticks in order to force it to
217 * re-evaluate its dependency on the tick and restart it if necessary.
218 */
219void tick_nohz_full_kick(void)
220{
221 if (tick_nohz_full_cpu(smp_processor_id()))
222 irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
223}
224
225static void nohz_full_kick_ipi(void *info)
226{
227 tick_nohz_full_check();
228}
229
230/*
231 * Kick all full dynticks CPUs in order to force these to re-evaluate
232 * their dependency on the tick and restart it if necessary.
233 */
234void tick_nohz_full_kick_all(void)
235{
236 if (!have_nohz_full_mask)
237 return;
238
239 preempt_disable();
240 smp_call_function_many(nohz_full_mask,
241 nohz_full_kick_ipi, NULL, false);
242 preempt_enable();
243}
244
245/*
246 * Re-evaluate the need for the tick as we switch the current task.
247 * It might need the tick due to per task/process properties:
248 * perf events, posix cpu timers, ...
249 */
250void tick_nohz_task_switch(struct task_struct *tsk)
251{
252 unsigned long flags;
253
254 local_irq_save(flags);
255
256 if (!tick_nohz_full_cpu(smp_processor_id()))
257 goto out;
258
259 if (tick_nohz_tick_stopped() && !can_stop_full_tick())
260 tick_nohz_full_kick();
261
262out:
263 local_irq_restore(flags);
264}
265
266int tick_nohz_full_cpu(int cpu)
267{
268 if (!have_nohz_full_mask)
269 return 0;
270
271 return cpumask_test_cpu(cpu, nohz_full_mask);
272}
273
274/* Parse the boot-time nohz CPU list from the kernel parameters. */
275static int __init tick_nohz_full_setup(char *str)
276{
277 int cpu;
278
279 alloc_bootmem_cpumask_var(&nohz_full_mask);
280 if (cpulist_parse(str, nohz_full_mask) < 0) {
281 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
282 return 1;
283 }
284
285 cpu = smp_processor_id();
286 if (cpumask_test_cpu(cpu, nohz_full_mask)) {
287 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
288 cpumask_clear_cpu(cpu, nohz_full_mask);
289 }
290 have_nohz_full_mask = true;
291
292 return 1;
293}
294__setup("nohz_full=", tick_nohz_full_setup);
295
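For context, a minimal usage sketch of this boot parameter (the CPU numbers are illustrative, not taken from the patch): the full-dynticks set is given as a cpulist, the boot CPU is cleared from it so one CPU keeps the timekeeping duty, and the same CPUs should normally also be RCU callback-offloaded, e.g.:

	nohz_full=1-7 rcu_nocbs=1-7

With CONFIG_NO_HZ_FULL_ALL=y, tick_nohz_init_all() below builds the mask automatically as every CPU except the boot CPU, so no command-line option is needed.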
296static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
297 unsigned long action,
298 void *hcpu)
299{
300 unsigned int cpu = (unsigned long)hcpu;
301
302 switch (action & ~CPU_TASKS_FROZEN) {
303 case CPU_DOWN_PREPARE:
304 /*
305 * If we handle the timekeeping duty for full dynticks CPUs,
306 * we can't safely shutdown that CPU.
307 */
308 if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
309 return -EINVAL;
310 break;
311 }
312 return NOTIFY_OK;
313}
314
315/*
316 * Worst case string length: a CPU list with 2-step separations
317 * (0,2,4,6,...), which needs at most NR_CPUS characters plus a
318 * trailing '\0'.
319 */
320static char __initdata nohz_full_buf[NR_CPUS + 1];
321
322static int tick_nohz_init_all(void)
323{
324 int err = -1;
325
326#ifdef CONFIG_NO_HZ_FULL_ALL
327 if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
328 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
329 return err;
330 }
331 err = 0;
332 cpumask_setall(nohz_full_mask);
333 cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
334 have_nohz_full_mask = true;
335#endif
336 return err;
337}
338
339void __init tick_nohz_init(void)
340{
341 int cpu;
342
343 if (!have_nohz_full_mask) {
344 if (tick_nohz_init_all() < 0)
345 return;
346 }
347
348 cpu_notifier(tick_nohz_cpu_down_callback, 0);
349
350 /* Make sure full dynticks CPUs are also RCU nocbs */
351 for_each_cpu(cpu, nohz_full_mask) {
352 if (!rcu_is_nocb_cpu(cpu)) {
353 pr_warning("NO_HZ: CPU %d is not RCU nocb: "
354 "cleared from nohz_full range", cpu);
355 cpumask_clear_cpu(cpu, nohz_full_mask);
356 }
357 }
358
359 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
360 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
361}
362#else
363#define have_nohz_full_mask (0)
364#endif
365
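The tick can only be stopped on such a CPU while it runs a single task (see the sched_can_stop_tick() check above), so a latency-sensitive workload is expected to pin itself to one of the nohz_full CPUs. A hedged userspace sketch, assuming CPU 1 was placed in the nohz_full= set:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(1, &set);	/* assumption: CPU 1 is listed in nohz_full= */

		/* 0 == the calling thread */
		if (sched_setaffinity(0, sizeof(set), &set) < 0) {
			perror("sched_setaffinity");
			return 1;
		}

		/* ... the latency-sensitive loop runs here, ideally tick-free ... */
		printf("pinned to CPU 1\n");
		return 0;
	}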
145/* 366/*
146 * NOHZ - aka dynamic tick functionality 367 * NOHZ - aka dynamic tick functionality
147 */ 368 */
148#ifdef CONFIG_NO_HZ 369#ifdef CONFIG_NO_HZ_COMMON
149/* 370/*
150 * NO HZ enabled ? 371 * NO HZ enabled ?
151 */ 372 */
@@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
345 delta_jiffies = rcu_delta_jiffies; 566 delta_jiffies = rcu_delta_jiffies;
346 } 567 }
347 } 568 }
569
348 /* 570 /*
349 * Do not stop the tick, if we are only one off 571 * Do not stop the tick, if we are only one off (or less)
350 * or if the cpu is required for rcu 572 * or if the cpu is required for RCU:
351 */ 573 */
352 if (!ts->tick_stopped && delta_jiffies == 1) 574 if (!ts->tick_stopped && delta_jiffies <= 1)
353 goto out; 575 goto out;
354 576
355 /* Schedule the tick, if we are at least one jiffie off */ 577 /* Schedule the tick, if we are at least one jiffie off */
@@ -378,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
378 time_delta = KTIME_MAX; 600 time_delta = KTIME_MAX;
379 } 601 }
380 602
603#ifdef CONFIG_NO_HZ_FULL
604 if (!ts->inidle) {
605 time_delta = min(time_delta,
606 scheduler_tick_max_deferment());
607 }
608#endif
609
381 /* 610 /*
382 * calculate the expiry time for the next timer wheel 611 * calculate the expiry time for the next timer wheel
383 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals 612 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
@@ -421,6 +650,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
421 650
422 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 651 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
423 ts->tick_stopped = 1; 652 ts->tick_stopped = 1;
653 trace_tick_stop(1, " ");
424 } 654 }
425 655
426 /* 656 /*
@@ -457,6 +687,24 @@ out:
457 return ret; 687 return ret;
458} 688}
459 689
690static void tick_nohz_full_stop_tick(struct tick_sched *ts)
691{
692#ifdef CONFIG_NO_HZ_FULL
693 int cpu = smp_processor_id();
694
695 if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
696 return;
697
698 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
699 return;
700
701 if (!can_stop_full_tick())
702 return;
703
704 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
705#endif
706}
707
460static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) 708static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
461{ 709{
462 /* 710 /*
@@ -469,6 +717,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
469 if (unlikely(!cpu_online(cpu))) { 717 if (unlikely(!cpu_online(cpu))) {
470 if (cpu == tick_do_timer_cpu) 718 if (cpu == tick_do_timer_cpu)
471 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 719 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
720 return false;
472 } 721 }
473 722
474 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 723 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -482,13 +731,28 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
482 731
483 if (ratelimit < 10 && 732 if (ratelimit < 10 &&
484 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { 733 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
485 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 734 pr_warn("NOHZ: local_softirq_pending %02x\n",
486 (unsigned int) local_softirq_pending()); 735 (unsigned int) local_softirq_pending());
487 ratelimit++; 736 ratelimit++;
488 } 737 }
489 return false; 738 return false;
490 } 739 }
491 740
741 if (have_nohz_full_mask) {
742 /*
743 * Keep the tick alive to guarantee timekeeping progression
744 * if there are full dynticks CPUs around
745 */
746 if (tick_do_timer_cpu == cpu)
747 return false;
748 /*
749 * Boot safety: make sure the timekeeping duty has been
750 * assigned before entering dyntick-idle mode.
751 */
752 if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
753 return false;
754 }
755
492 return true; 756 return true;
493} 757}
494 758
@@ -568,12 +832,13 @@ void tick_nohz_irq_exit(void)
568{ 832{
569 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 833 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
570 834
571 if (!ts->inidle) 835 if (ts->inidle) {
572 return; 836 /* Cancel the timer because the CPU has already woken up from the C-states */
573 837 menu_hrtimer_cancel();
574 /* Cancel the timer because the CPU has already woken up from the C-states */ 838 __tick_nohz_idle_enter(ts);
575 menu_hrtimer_cancel(); 839 } else {
576 __tick_nohz_idle_enter(ts); 840 tick_nohz_full_stop_tick(ts);
841 }
577} 842}
578 843
579/** 844/**
@@ -802,7 +1067,7 @@ static inline void tick_check_nohz(int cpu)
802static inline void tick_nohz_switch_to_nohz(void) { } 1067static inline void tick_nohz_switch_to_nohz(void) { }
803static inline void tick_check_nohz(int cpu) { } 1068static inline void tick_check_nohz(int cpu) { }
804 1069
805#endif /* NO_HZ */ 1070#endif /* CONFIG_NO_HZ_COMMON */
806 1071
807/* 1072/*
808 * Called from irq_enter to notify about the possible interruption of idle() 1073 * Called from irq_enter to notify about the possible interruption of idle()
@@ -887,14 +1152,14 @@ void tick_setup_sched_timer(void)
887 now = ktime_get(); 1152 now = ktime_get();
888 } 1153 }
889 1154
890#ifdef CONFIG_NO_HZ 1155#ifdef CONFIG_NO_HZ_COMMON
891 if (tick_nohz_enabled) 1156 if (tick_nohz_enabled)
892 ts->nohz_mode = NOHZ_MODE_HIGHRES; 1157 ts->nohz_mode = NOHZ_MODE_HIGHRES;
893#endif 1158#endif
894} 1159}
895#endif /* HIGH_RES_TIMERS */ 1160#endif /* HIGH_RES_TIMERS */
896 1161
897#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS 1162#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
898void tick_cancel_sched_timer(int cpu) 1163void tick_cancel_sched_timer(int cpu)
899{ 1164{
900 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 1165 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -904,7 +1169,7 @@ void tick_cancel_sched_timer(int cpu)
904 hrtimer_cancel(&ts->sched_timer); 1169 hrtimer_cancel(&ts->sched_timer);
905# endif 1170# endif
906 1171
907 ts->nohz_mode = NOHZ_MODE_INACTIVE; 1172 memset(ts, 0, sizeof(*ts));
908} 1173}
909#endif 1174#endif
910 1175
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9a0bc98fbe1d..baeeb5c87cf1 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -23,8 +23,13 @@
23#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h> 24#include <linux/pvclock_gtod.h>
25 25
26#include "tick-internal.h"
27#include "ntp_internal.h"
26 28
27static struct timekeeper timekeeper; 29static struct timekeeper timekeeper;
30static DEFINE_RAW_SPINLOCK(timekeeper_lock);
31static seqcount_t timekeeper_seq;
32static struct timekeeper shadow_timekeeper;
28 33
29/* flag for if timekeeping is suspended */ 34/* flag for if timekeeping is suspended */
30int __read_mostly timekeeping_suspended; 35int __read_mostly timekeeping_suspended;
@@ -67,6 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
67 tk->wall_to_monotonic = wtm; 72 tk->wall_to_monotonic = wtm;
68 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); 73 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
69 tk->offs_real = timespec_to_ktime(tmp); 74 tk->offs_real = timespec_to_ktime(tmp);
75 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0));
70} 76}
71 77
72static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) 78static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
@@ -96,7 +102,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
96 102
97 old_clock = tk->clock; 103 old_clock = tk->clock;
98 tk->clock = clock; 104 tk->clock = clock;
99 clock->cycle_last = clock->read(clock); 105 tk->cycle_last = clock->cycle_last = clock->read(clock);
100 106
101 /* Do the ns -> cycle conversion first, using original mult */ 107 /* Do the ns -> cycle conversion first, using original mult */
102 tmp = NTP_INTERVAL_LENGTH; 108 tmp = NTP_INTERVAL_LENGTH;
@@ -201,8 +207,6 @@ static void update_pvclock_gtod(struct timekeeper *tk)
201 207
202/** 208/**
203 * pvclock_gtod_register_notifier - register a pvclock timedata update listener 209 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
204 *
205 * Must hold write on timekeeper.lock
206 */ 210 */
207int pvclock_gtod_register_notifier(struct notifier_block *nb) 211int pvclock_gtod_register_notifier(struct notifier_block *nb)
208{ 212{
@@ -210,11 +214,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb)
210 unsigned long flags; 214 unsigned long flags;
211 int ret; 215 int ret;
212 216
213 write_seqlock_irqsave(&tk->lock, flags); 217 raw_spin_lock_irqsave(&timekeeper_lock, flags);
214 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); 218 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
215 /* update timekeeping data */
216 update_pvclock_gtod(tk); 219 update_pvclock_gtod(tk);
217 write_sequnlock_irqrestore(&tk->lock, flags); 220 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
218 221
219 return ret; 222 return ret;
220} 223}
@@ -223,25 +226,22 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
223/** 226/**
224 * pvclock_gtod_unregister_notifier - unregister a pvclock 227 * pvclock_gtod_unregister_notifier - unregister a pvclock
225 * timedata update listener 228 * timedata update listener
226 *
227 * Must hold write on timekeeper.lock
228 */ 229 */
229int pvclock_gtod_unregister_notifier(struct notifier_block *nb) 230int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
230{ 231{
231 struct timekeeper *tk = &timekeeper;
232 unsigned long flags; 232 unsigned long flags;
233 int ret; 233 int ret;
234 234
235 write_seqlock_irqsave(&tk->lock, flags); 235 raw_spin_lock_irqsave(&timekeeper_lock, flags);
236 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); 236 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
237 write_sequnlock_irqrestore(&tk->lock, flags); 237 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
238 238
239 return ret; 239 return ret;
240} 240}
241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
242 242
243/* must hold write on timekeeper.lock */ 243/* must hold timekeeper_lock */
244static void timekeeping_update(struct timekeeper *tk, bool clearntp) 244static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror)
245{ 245{
246 if (clearntp) { 246 if (clearntp) {
247 tk->ntp_error = 0; 247 tk->ntp_error = 0;
@@ -249,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
249 } 249 }
250 update_vsyscall(tk); 250 update_vsyscall(tk);
251 update_pvclock_gtod(tk); 251 update_pvclock_gtod(tk);
252
253 if (mirror)
254 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
252} 255}
253 256
254/** 257/**
@@ -267,7 +270,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
267 clock = tk->clock; 270 clock = tk->clock;
268 cycle_now = clock->read(clock); 271 cycle_now = clock->read(clock);
269 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 272 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
270 clock->cycle_last = cycle_now; 273 tk->cycle_last = clock->cycle_last = cycle_now;
271 274
272 tk->xtime_nsec += cycle_delta * tk->mult; 275 tk->xtime_nsec += cycle_delta * tk->mult;
273 276
@@ -294,12 +297,12 @@ int __getnstimeofday(struct timespec *ts)
294 s64 nsecs = 0; 297 s64 nsecs = 0;
295 298
296 do { 299 do {
297 seq = read_seqbegin(&tk->lock); 300 seq = read_seqcount_begin(&timekeeper_seq);
298 301
299 ts->tv_sec = tk->xtime_sec; 302 ts->tv_sec = tk->xtime_sec;
300 nsecs = timekeeping_get_ns(tk); 303 nsecs = timekeeping_get_ns(tk);
301 304
302 } while (read_seqretry(&tk->lock, seq)); 305 } while (read_seqcount_retry(&timekeeper_seq, seq));
303 306
304 ts->tv_nsec = 0; 307 ts->tv_nsec = 0;
305 timespec_add_ns(ts, nsecs); 308 timespec_add_ns(ts, nsecs);
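This hunk and the ones that follow replace the per-timekeeper seqlock with a global raw spinlock for writers plus a bare seqcount for readers. A minimal userspace sketch of the read/retry protocol, with made-up names (tk_seq, tk_sec, tk_nsec) and memory ordering simplified to the default sequentially consistent atomics:

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_uint tk_seq;		/* even: stable, odd: writer active */
	static long long tk_sec, tk_nsec;	/* data guarded by the seqcount */

	static unsigned read_begin(void)
	{
		unsigned seq;

		/* Wait until no write is in progress (sequence is even). */
		while ((seq = atomic_load(&tk_seq)) & 1)
			;
		return seq;
	}

	static int read_retry(unsigned seq)
	{
		/* A changed sequence means a writer ran; the reader must retry. */
		return atomic_load(&tk_seq) != seq;
	}

	static void write_begin(void) { atomic_fetch_add(&tk_seq, 1); }	/* -> odd */
	static void write_end(void)   { atomic_fetch_add(&tk_seq, 1); }	/* -> even */

	int main(void)
	{
		long long sec, nsec;
		unsigned seq;

		write_begin();
		tk_sec = 1234;
		tk_nsec = 567;
		write_end();

		do {	/* same shape as the __getnstimeofday() loop above */
			seq = read_begin();
			sec = tk_sec;
			nsec = tk_nsec;
		} while (read_retry(seq));

		printf("%lld.%09lld\n", sec, nsec);
		return 0;
	}

In the kernel, writers are additionally serialized against each other by timekeeper_lock, which is what each raw_spin_lock_irqsave()/write_seqcount_begin() pair in the hunks below implements.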
@@ -335,11 +338,11 @@ ktime_t ktime_get(void)
335 WARN_ON(timekeeping_suspended); 338 WARN_ON(timekeeping_suspended);
336 339
337 do { 340 do {
338 seq = read_seqbegin(&tk->lock); 341 seq = read_seqcount_begin(&timekeeper_seq);
339 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 342 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
340 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; 343 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
341 344
342 } while (read_seqretry(&tk->lock, seq)); 345 } while (read_seqcount_retry(&timekeeper_seq, seq));
343 /* 346 /*
344 * Use ktime_set/ktime_add_ns to create a proper ktime on 347 * Use ktime_set/ktime_add_ns to create a proper ktime on
345 * 32-bit architectures without CONFIG_KTIME_SCALAR. 348 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -366,12 +369,12 @@ void ktime_get_ts(struct timespec *ts)
366 WARN_ON(timekeeping_suspended); 369 WARN_ON(timekeeping_suspended);
367 370
368 do { 371 do {
369 seq = read_seqbegin(&tk->lock); 372 seq = read_seqcount_begin(&timekeeper_seq);
370 ts->tv_sec = tk->xtime_sec; 373 ts->tv_sec = tk->xtime_sec;
371 nsec = timekeeping_get_ns(tk); 374 nsec = timekeeping_get_ns(tk);
372 tomono = tk->wall_to_monotonic; 375 tomono = tk->wall_to_monotonic;
373 376
374 } while (read_seqretry(&tk->lock, seq)); 377 } while (read_seqcount_retry(&timekeeper_seq, seq));
375 378
376 ts->tv_sec += tomono.tv_sec; 379 ts->tv_sec += tomono.tv_sec;
377 ts->tv_nsec = 0; 380 ts->tv_nsec = 0;
@@ -379,6 +382,50 @@ void ktime_get_ts(struct timespec *ts)
379} 382}
380EXPORT_SYMBOL_GPL(ktime_get_ts); 383EXPORT_SYMBOL_GPL(ktime_get_ts);
381 384
385
386/**
387 * timekeeping_clocktai - Returns the TAI time of day in a timespec
388 * @ts: pointer to the timespec to be set
389 *
390 * Returns the time of day in a timespec.
391 */
392void timekeeping_clocktai(struct timespec *ts)
393{
394 struct timekeeper *tk = &timekeeper;
395 unsigned long seq;
396 u64 nsecs;
397
398 WARN_ON(timekeeping_suspended);
399
400 do {
401 seq = read_seqcount_begin(&timekeeper_seq);
402
403 ts->tv_sec = tk->xtime_sec + tk->tai_offset;
404 nsecs = timekeeping_get_ns(tk);
405
406 } while (read_seqcount_retry(&timekeeper_seq, seq));
407
408 ts->tv_nsec = 0;
409 timespec_add_ns(ts, nsecs);
410
411}
412EXPORT_SYMBOL(timekeeping_clocktai);
413
414
415/**
416 * ktime_get_clocktai - Returns the TAI time of day in a ktime
417 *
418 * Returns the time of day in a ktime.
419 */
420ktime_t ktime_get_clocktai(void)
421{
422 struct timespec ts;
423
424 timekeeping_clocktai(&ts);
425 return timespec_to_ktime(ts);
426}
427EXPORT_SYMBOL(ktime_get_clocktai);
428
382#ifdef CONFIG_NTP_PPS 429#ifdef CONFIG_NTP_PPS
383 430
384/** 431/**
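These accessors expose the TAI relationship kept in tk->tai_offset (TAI = UTC + tai_offset). A hedged userspace check via the CLOCK_TAI clock id these helpers are presumably exposed through; the fallback define and the expected offset are assumptions, and older glibc may need -lrt for clock_gettime():

	#include <stdio.h>
	#include <time.h>

	#ifndef CLOCK_TAI
	#define CLOCK_TAI 11	/* assumption: the clock id used for TAI */
	#endif

	int main(void)
	{
		struct timespec utc, tai;

		if (clock_gettime(CLOCK_TAI, &tai) < 0) {
			perror("clock_gettime(CLOCK_TAI)");
			return 1;
		}
		clock_gettime(CLOCK_REALTIME, &utc);

		/* With a populated leap-second table this is around 35 s in the
		 * 2013 timeframe; it reads 0 until the offset has been set. */
		printf("TAI - UTC = %ld s\n", (long)(tai.tv_sec - utc.tv_sec));
		return 0;
	}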
@@ -399,7 +446,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
399 WARN_ON_ONCE(timekeeping_suspended); 446 WARN_ON_ONCE(timekeeping_suspended);
400 447
401 do { 448 do {
402 seq = read_seqbegin(&tk->lock); 449 seq = read_seqcount_begin(&timekeeper_seq);
403 450
404 *ts_raw = tk->raw_time; 451 *ts_raw = tk->raw_time;
405 ts_real->tv_sec = tk->xtime_sec; 452 ts_real->tv_sec = tk->xtime_sec;
@@ -408,7 +455,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
408 nsecs_raw = timekeeping_get_ns_raw(tk); 455 nsecs_raw = timekeeping_get_ns_raw(tk);
409 nsecs_real = timekeeping_get_ns(tk); 456 nsecs_real = timekeeping_get_ns(tk);
410 457
411 } while (read_seqretry(&tk->lock, seq)); 458 } while (read_seqcount_retry(&timekeeper_seq, seq));
412 459
413 timespec_add_ns(ts_raw, nsecs_raw); 460 timespec_add_ns(ts_raw, nsecs_raw);
414 timespec_add_ns(ts_real, nsecs_real); 461 timespec_add_ns(ts_real, nsecs_real);
@@ -448,7 +495,8 @@ int do_settimeofday(const struct timespec *tv)
448 if (!timespec_valid_strict(tv)) 495 if (!timespec_valid_strict(tv))
449 return -EINVAL; 496 return -EINVAL;
450 497
451 write_seqlock_irqsave(&tk->lock, flags); 498 raw_spin_lock_irqsave(&timekeeper_lock, flags);
499 write_seqcount_begin(&timekeeper_seq);
452 500
453 timekeeping_forward_now(tk); 501 timekeeping_forward_now(tk);
454 502
@@ -460,9 +508,10 @@ int do_settimeofday(const struct timespec *tv)
460 508
461 tk_set_xtime(tk, tv); 509 tk_set_xtime(tk, tv);
462 510
463 timekeeping_update(tk, true); 511 timekeeping_update(tk, true, true);
464 512
465 write_sequnlock_irqrestore(&tk->lock, flags); 513 write_seqcount_end(&timekeeper_seq);
514 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
466 515
467 /* signal hrtimers about time change */ 516 /* signal hrtimers about time change */
468 clock_was_set(); 517 clock_was_set();
@@ -487,7 +536,8 @@ int timekeeping_inject_offset(struct timespec *ts)
487 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 536 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
488 return -EINVAL; 537 return -EINVAL;
489 538
490 write_seqlock_irqsave(&tk->lock, flags); 539 raw_spin_lock_irqsave(&timekeeper_lock, flags);
540 write_seqcount_begin(&timekeeper_seq);
491 541
492 timekeeping_forward_now(tk); 542 timekeeping_forward_now(tk);
493 543
@@ -502,9 +552,10 @@ int timekeeping_inject_offset(struct timespec *ts)
502 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 552 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
503 553
504error: /* even if we error out, we forwarded the time, so call update */ 554error: /* even if we error out, we forwarded the time, so call update */
505 timekeeping_update(tk, true); 555 timekeeping_update(tk, true, true);
506 556
507 write_sequnlock_irqrestore(&tk->lock, flags); 557 write_seqcount_end(&timekeeper_seq);
558 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
508 559
509 /* signal hrtimers about time change */ 560 /* signal hrtimers about time change */
510 clock_was_set(); 561 clock_was_set();
@@ -513,6 +564,52 @@ error: /* even if we error out, we forwarded the time, so call update */
513} 564}
514EXPORT_SYMBOL(timekeeping_inject_offset); 565EXPORT_SYMBOL(timekeeping_inject_offset);
515 566
567
568/**
569 * timekeeping_get_tai_offset - Returns current TAI offset from UTC
570 *
571 */
572s32 timekeeping_get_tai_offset(void)
573{
574 struct timekeeper *tk = &timekeeper;
575 unsigned int seq;
576 s32 ret;
577
578 do {
579 seq = read_seqcount_begin(&timekeeper_seq);
580 ret = tk->tai_offset;
581 } while (read_seqcount_retry(&timekeeper_seq, seq));
582
583 return ret;
584}
585
586/**
587 * __timekeeping_set_tai_offset - Lock free worker function
588 *
589 */
590static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
591{
592 tk->tai_offset = tai_offset;
593 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0));
594}
595
596/**
597 * timekeeping_set_tai_offset - Sets the current TAI offset from UTC
598 *
599 */
600void timekeeping_set_tai_offset(s32 tai_offset)
601{
602 struct timekeeper *tk = &timekeeper;
603 unsigned long flags;
604
605 raw_spin_lock_irqsave(&timekeeper_lock, flags);
606 write_seqcount_begin(&timekeeper_seq);
607 __timekeeping_set_tai_offset(tk, tai_offset);
608 write_seqcount_end(&timekeeper_seq);
609 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
610 clock_was_set();
611}
612
516/** 613/**
517 * change_clocksource - Swaps clocksources if a new one is available 614 * change_clocksource - Swaps clocksources if a new one is available
518 * 615 *
@@ -526,7 +623,8 @@ static int change_clocksource(void *data)
526 623
527 new = (struct clocksource *) data; 624 new = (struct clocksource *) data;
528 625
529 write_seqlock_irqsave(&tk->lock, flags); 626 raw_spin_lock_irqsave(&timekeeper_lock, flags);
627 write_seqcount_begin(&timekeeper_seq);
530 628
531 timekeeping_forward_now(tk); 629 timekeeping_forward_now(tk);
532 if (!new->enable || new->enable(new) == 0) { 630 if (!new->enable || new->enable(new) == 0) {
@@ -535,9 +633,10 @@ static int change_clocksource(void *data)
535 if (old->disable) 633 if (old->disable)
536 old->disable(old); 634 old->disable(old);
537 } 635 }
538 timekeeping_update(tk, true); 636 timekeeping_update(tk, true, true);
539 637
540 write_sequnlock_irqrestore(&tk->lock, flags); 638 write_seqcount_end(&timekeeper_seq);
639 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
541 640
542 return 0; 641 return 0;
543} 642}
@@ -587,11 +686,11 @@ void getrawmonotonic(struct timespec *ts)
587 s64 nsecs; 686 s64 nsecs;
588 687
589 do { 688 do {
590 seq = read_seqbegin(&tk->lock); 689 seq = read_seqcount_begin(&timekeeper_seq);
591 nsecs = timekeeping_get_ns_raw(tk); 690 nsecs = timekeeping_get_ns_raw(tk);
592 *ts = tk->raw_time; 691 *ts = tk->raw_time;
593 692
594 } while (read_seqretry(&tk->lock, seq)); 693 } while (read_seqcount_retry(&timekeeper_seq, seq));
595 694
596 timespec_add_ns(ts, nsecs); 695 timespec_add_ns(ts, nsecs);
597} 696}
@@ -607,11 +706,11 @@ int timekeeping_valid_for_hres(void)
607 int ret; 706 int ret;
608 707
609 do { 708 do {
610 seq = read_seqbegin(&tk->lock); 709 seq = read_seqcount_begin(&timekeeper_seq);
611 710
612 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 711 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
613 712
614 } while (read_seqretry(&tk->lock, seq)); 713 } while (read_seqcount_retry(&timekeeper_seq, seq));
615 714
616 return ret; 715 return ret;
617} 716}
@@ -626,11 +725,11 @@ u64 timekeeping_max_deferment(void)
626 u64 ret; 725 u64 ret;
627 726
628 do { 727 do {
629 seq = read_seqbegin(&tk->lock); 728 seq = read_seqcount_begin(&timekeeper_seq);
630 729
631 ret = tk->clock->max_idle_ns; 730 ret = tk->clock->max_idle_ns;
632 731
633 } while (read_seqretry(&tk->lock, seq)); 732 } while (read_seqcount_retry(&timekeeper_seq, seq));
634 733
635 return ret; 734 return ret;
636} 735}
@@ -693,11 +792,10 @@ void __init timekeeping_init(void)
693 boot.tv_nsec = 0; 792 boot.tv_nsec = 0;
694 } 793 }
695 794
696 seqlock_init(&tk->lock); 795 raw_spin_lock_irqsave(&timekeeper_lock, flags);
697 796 write_seqcount_begin(&timekeeper_seq);
698 ntp_init(); 797 ntp_init();
699 798
700 write_seqlock_irqsave(&tk->lock, flags);
701 clock = clocksource_default_clock(); 799 clock = clocksource_default_clock();
702 if (clock->enable) 800 if (clock->enable)
703 clock->enable(clock); 801 clock->enable(clock);
@@ -716,7 +814,10 @@ void __init timekeeping_init(void)
716 tmp.tv_nsec = 0; 814 tmp.tv_nsec = 0;
717 tk_set_sleep_time(tk, tmp); 815 tk_set_sleep_time(tk, tmp);
718 816
719 write_sequnlock_irqrestore(&tk->lock, flags); 817 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
818
819 write_seqcount_end(&timekeeper_seq);
820 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
720} 821}
721 822
722/* time in seconds when suspend began */ 823/* time in seconds when suspend began */
@@ -764,15 +865,17 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
764 if (has_persistent_clock()) 865 if (has_persistent_clock())
765 return; 866 return;
766 867
767 write_seqlock_irqsave(&tk->lock, flags); 868 raw_spin_lock_irqsave(&timekeeper_lock, flags);
869 write_seqcount_begin(&timekeeper_seq);
768 870
769 timekeeping_forward_now(tk); 871 timekeeping_forward_now(tk);
770 872
771 __timekeeping_inject_sleeptime(tk, delta); 873 __timekeeping_inject_sleeptime(tk, delta);
772 874
773 timekeeping_update(tk, true); 875 timekeeping_update(tk, true, true);
774 876
775 write_sequnlock_irqrestore(&tk->lock, flags); 877 write_seqcount_end(&timekeeper_seq);
878 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
776 879
777 /* signal hrtimers about time change */ 880 /* signal hrtimers about time change */
778 clock_was_set(); 881 clock_was_set();
@@ -788,26 +891,72 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
788static void timekeeping_resume(void) 891static void timekeeping_resume(void)
789{ 892{
790 struct timekeeper *tk = &timekeeper; 893 struct timekeeper *tk = &timekeeper;
894 struct clocksource *clock = tk->clock;
791 unsigned long flags; 895 unsigned long flags;
792 struct timespec ts; 896 struct timespec ts_new, ts_delta;
897 cycle_t cycle_now, cycle_delta;
898 bool suspendtime_found = false;
793 899
794 read_persistent_clock(&ts); 900 read_persistent_clock(&ts_new);
795 901
796 clockevents_resume(); 902 clockevents_resume();
797 clocksource_resume(); 903 clocksource_resume();
798 904
799 write_seqlock_irqsave(&tk->lock, flags); 905 raw_spin_lock_irqsave(&timekeeper_lock, flags);
906 write_seqcount_begin(&timekeeper_seq);
907
908 /*
909 * After the system resumes, we need to calculate the suspended time and
910 * add it back into the OS time. There are 3 sources that could be
911 * used: Nonstop clocksource during suspend, persistent clock and rtc
912 * device.
913 *
914 * One specific platform may have 1 or 2 or all of them, and the
915 * preference will be:
916 * suspend-nonstop clocksource -> persistent clock -> rtc
917 * The less preferred source will only be tried if there is no better
918 * usable source. The rtc part is handled separately in rtc core code.
919 */
920 cycle_now = clock->read(clock);
921 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
922 cycle_now > clock->cycle_last) {
923 u64 num, max = ULLONG_MAX;
924 u32 mult = clock->mult;
925 u32 shift = clock->shift;
926 s64 nsec = 0;
927
928 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
929
930 /*
931 * "cycle_delta * mutl" may cause 64 bits overflow, if the
932 * suspended time is too long. In that case we need do the
933 * 64 bits math carefully
934 */
935 do_div(max, mult);
936 if (cycle_delta > max) {
937 num = div64_u64(cycle_delta, max);
938 nsec = (((u64) max * mult) >> shift) * num;
939 cycle_delta -= num * max;
940 }
941 nsec += ((u64) cycle_delta * mult) >> shift;
800 942
801 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 943 ts_delta = ns_to_timespec(nsec);
802 ts = timespec_sub(ts, timekeeping_suspend_time); 944 suspendtime_found = true;
803 __timekeeping_inject_sleeptime(tk, &ts); 945 } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) {
946 ts_delta = timespec_sub(ts_new, timekeeping_suspend_time);
947 suspendtime_found = true;
804 } 948 }
805 /* re-base the last cycle value */ 949
806 tk->clock->cycle_last = tk->clock->read(tk->clock); 950 if (suspendtime_found)
951 __timekeeping_inject_sleeptime(tk, &ts_delta);
952
953 /* Re-base the last cycle value */
954 tk->cycle_last = clock->cycle_last = cycle_now;
807 tk->ntp_error = 0; 955 tk->ntp_error = 0;
808 timekeeping_suspended = 0; 956 timekeeping_suspended = 0;
809 timekeeping_update(tk, false); 957 timekeeping_update(tk, false, true);
810 write_sequnlock_irqrestore(&tk->lock, flags); 958 write_seqcount_end(&timekeeper_seq);
959 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
811 960
812 touch_softlockup_watchdog(); 961 touch_softlockup_watchdog();
813 962
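The chunked conversion above is the subtle part: (cycle_delta * mult) >> shift can overflow 64 bits after a long suspend, so it is accumulated in chunks of at most ULLONG_MAX / mult cycles. A standalone sketch of the same arithmetic, with made-up mult/shift values approximating a 24 MHz counter rather than any real clocksource:

	#include <limits.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint64_t cycles_to_ns(uint64_t cycle_delta, uint32_t mult, uint32_t shift)
	{
		uint64_t max = ULLONG_MAX / mult;	/* largest chunk that cannot overflow */
		uint64_t nsec = 0;

		if (cycle_delta > max) {
			uint64_t num = cycle_delta / max;

			nsec = ((max * mult) >> shift) * num;
			cycle_delta -= num * max;
		}
		return nsec + ((cycle_delta * mult) >> shift);
	}

	int main(void)
	{
		/* ns = cycles * mult >> shift ~= cycles * (1e9 / 24e6) */
		uint32_t mult = 2796202667u, shift = 26;
		uint64_t one_week = 24000000ULL * 3600 * 24 * 7;	/* cycles */

		/* Prints roughly 604800e9, i.e. one week in nanoseconds. */
		printf("%llu ns\n", (unsigned long long)cycles_to_ns(one_week, mult, shift));
		return 0;
	}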
@@ -826,7 +975,16 @@ static int timekeeping_suspend(void)
826 975
827 read_persistent_clock(&timekeeping_suspend_time); 976 read_persistent_clock(&timekeeping_suspend_time);
828 977
829 write_seqlock_irqsave(&tk->lock, flags); 978 /*
979 * On some systems the persistent_clock can not be detected at
980 * timekeeping_init by its return value, so if we see a valid
981 * value returned, update the persistent_clock_exists flag.
982 */
983 if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
984 persistent_clock_exist = true;
985
986 raw_spin_lock_irqsave(&timekeeper_lock, flags);
987 write_seqcount_begin(&timekeeper_seq);
830 timekeeping_forward_now(tk); 988 timekeeping_forward_now(tk);
831 timekeeping_suspended = 1; 989 timekeeping_suspended = 1;
832 990
@@ -849,7 +1007,8 @@ static int timekeeping_suspend(void)
849 timekeeping_suspend_time = 1007 timekeeping_suspend_time =
850 timespec_add(timekeeping_suspend_time, delta_delta); 1008 timespec_add(timekeeping_suspend_time, delta_delta);
851 } 1009 }
852 write_sequnlock_irqrestore(&tk->lock, flags); 1010 write_seqcount_end(&timekeeper_seq);
1011 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
853 1012
854 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 1013 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
855 clocksource_suspend(); 1014 clocksource_suspend();
@@ -1099,6 +1258,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1099 tk_set_wall_to_mono(tk, 1258 tk_set_wall_to_mono(tk,
1100 timespec_sub(tk->wall_to_monotonic, ts)); 1259 timespec_sub(tk->wall_to_monotonic, ts));
1101 1260
1261 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1262
1102 clock_was_set_delayed(); 1263 clock_was_set_delayed();
1103 } 1264 }
1104 } 1265 }
@@ -1116,15 +1277,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1116static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, 1277static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1117 u32 shift) 1278 u32 shift)
1118{ 1279{
1280 cycle_t interval = tk->cycle_interval << shift;
1119 u64 raw_nsecs; 1281 u64 raw_nsecs;
1120 1282
1121 /* If the offset is smaller than a shifted interval, do nothing */ 1283 /* If the offset is smaller than a shifted interval, do nothing */
1122 if (offset < tk->cycle_interval<<shift) 1284 if (offset < interval)
1123 return offset; 1285 return offset;
1124 1286
1125 /* Accumulate one shifted interval */ 1287 /* Accumulate one shifted interval */
1126 offset -= tk->cycle_interval << shift; 1288 offset -= interval;
1127 tk->clock->cycle_last += tk->cycle_interval << shift; 1289 tk->cycle_last += interval;
1128 1290
1129 tk->xtime_nsec += tk->xtime_interval << shift; 1291 tk->xtime_nsec += tk->xtime_interval << shift;
1130 accumulate_nsecs_to_secs(tk); 1292 accumulate_nsecs_to_secs(tk);
@@ -1181,27 +1343,28 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
1181static void update_wall_time(void) 1343static void update_wall_time(void)
1182{ 1344{
1183 struct clocksource *clock; 1345 struct clocksource *clock;
1184 struct timekeeper *tk = &timekeeper; 1346 struct timekeeper *real_tk = &timekeeper;
1347 struct timekeeper *tk = &shadow_timekeeper;
1185 cycle_t offset; 1348 cycle_t offset;
1186 int shift = 0, maxshift; 1349 int shift = 0, maxshift;
1187 unsigned long flags; 1350 unsigned long flags;
1188 1351
1189 write_seqlock_irqsave(&tk->lock, flags); 1352 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1190 1353
1191 /* Make sure we're fully resumed: */ 1354 /* Make sure we're fully resumed: */
1192 if (unlikely(timekeeping_suspended)) 1355 if (unlikely(timekeeping_suspended))
1193 goto out; 1356 goto out;
1194 1357
1195 clock = tk->clock; 1358 clock = real_tk->clock;
1196 1359
1197#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1360#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1198 offset = tk->cycle_interval; 1361 offset = real_tk->cycle_interval;
1199#else 1362#else
1200 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1363 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1201#endif 1364#endif
1202 1365
1203 /* Check if there's really nothing to do */ 1366 /* Check if there's really nothing to do */
1204 if (offset < tk->cycle_interval) 1367 if (offset < real_tk->cycle_interval)
1205 goto out; 1368 goto out;
1206 1369
1207 /* 1370 /*
@@ -1238,11 +1401,24 @@ static void update_wall_time(void)
1238 */ 1401 */
1239 accumulate_nsecs_to_secs(tk); 1402 accumulate_nsecs_to_secs(tk);
1240 1403
1241 timekeeping_update(tk, false); 1404 write_seqcount_begin(&timekeeper_seq);
1242 1405 /* Update clock->cycle_last with the new value */
1406 clock->cycle_last = tk->cycle_last;
1407 /*
1408 * Update the real timekeeper.
1409 *
1410 * We could avoid this memcpy by switching pointers, but that
1411 * requires changes to all other timekeeper usage sites as
1412 * well, i.e. move the timekeeper pointer getter into the
1413 * spinlocked/seqcount protected sections. And we trade this
1414 * memcpy under the timekeeper_seq against one before we start
1415 * updating.
1416 */
1417 memcpy(real_tk, tk, sizeof(*tk));
1418 timekeeping_update(real_tk, false, false);
1419 write_seqcount_end(&timekeeper_seq);
1243out: 1420out:
1244 write_sequnlock_irqrestore(&tk->lock, flags); 1421 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1245
1246} 1422}
1247 1423
1248/** 1424/**
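A compact sketch of the shadow-copy scheme update_wall_time() switches to here: the accumulation work runs on a private copy while readers keep using the live structure, and publication is a single memcpy inside the seqcount write section. Names and the simplified sequence counter are illustrative only, reusing the pattern from the earlier sketch:

	#include <stdatomic.h>
	#include <stdio.h>
	#include <string.h>

	struct tk_state { long long sec, nsec; };

	static atomic_uint seqcnt;		/* stand-in for timekeeper_seq */
	static struct tk_state live_tk;		/* what seqcount readers see */
	static struct tk_state shadow_tk;	/* scratch copy for the updater */

	static void advance(long long delta_ns)
	{
		/* The potentially long accumulation happens outside the write section. */
		shadow_tk.nsec += delta_ns;
		while (shadow_tk.nsec >= 1000000000LL) {
			shadow_tk.nsec -= 1000000000LL;
			shadow_tk.sec++;
		}

		/* Publish with one copy, bracketed like the hunk above. */
		atomic_fetch_add(&seqcnt, 1);	/* write_seqcount_begin() */
		memcpy(&live_tk, &shadow_tk, sizeof(live_tk));
		atomic_fetch_add(&seqcnt, 1);	/* write_seqcount_end() */
	}

	int main(void)
	{
		advance(2500000000LL);		/* 2.5 s */
		printf("%lld.%09lld\n", live_tk.sec, live_tk.nsec);
		return 0;
	}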
@@ -1289,13 +1465,13 @@ void get_monotonic_boottime(struct timespec *ts)
1289 WARN_ON(timekeeping_suspended); 1465 WARN_ON(timekeeping_suspended);
1290 1466
1291 do { 1467 do {
1292 seq = read_seqbegin(&tk->lock); 1468 seq = read_seqcount_begin(&timekeeper_seq);
1293 ts->tv_sec = tk->xtime_sec; 1469 ts->tv_sec = tk->xtime_sec;
1294 nsec = timekeeping_get_ns(tk); 1470 nsec = timekeeping_get_ns(tk);
1295 tomono = tk->wall_to_monotonic; 1471 tomono = tk->wall_to_monotonic;
1296 sleep = tk->total_sleep_time; 1472 sleep = tk->total_sleep_time;
1297 1473
1298 } while (read_seqretry(&tk->lock, seq)); 1474 } while (read_seqcount_retry(&timekeeper_seq, seq));
1299 1475
1300 ts->tv_sec += tomono.tv_sec + sleep.tv_sec; 1476 ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
1301 ts->tv_nsec = 0; 1477 ts->tv_nsec = 0;
@@ -1354,10 +1530,10 @@ struct timespec current_kernel_time(void)
1354 unsigned long seq; 1530 unsigned long seq;
1355 1531
1356 do { 1532 do {
1357 seq = read_seqbegin(&tk->lock); 1533 seq = read_seqcount_begin(&timekeeper_seq);
1358 1534
1359 now = tk_xtime(tk); 1535 now = tk_xtime(tk);
1360 } while (read_seqretry(&tk->lock, seq)); 1536 } while (read_seqcount_retry(&timekeeper_seq, seq));
1361 1537
1362 return now; 1538 return now;
1363} 1539}
@@ -1370,11 +1546,11 @@ struct timespec get_monotonic_coarse(void)
1370 unsigned long seq; 1546 unsigned long seq;
1371 1547
1372 do { 1548 do {
1373 seq = read_seqbegin(&tk->lock); 1549 seq = read_seqcount_begin(&timekeeper_seq);
1374 1550
1375 now = tk_xtime(tk); 1551 now = tk_xtime(tk);
1376 mono = tk->wall_to_monotonic; 1552 mono = tk->wall_to_monotonic;
1377 } while (read_seqretry(&tk->lock, seq)); 1553 } while (read_seqcount_retry(&timekeeper_seq, seq));
1378 1554
1379 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1555 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1380 now.tv_nsec + mono.tv_nsec); 1556 now.tv_nsec + mono.tv_nsec);
@@ -1405,11 +1581,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1405 unsigned long seq; 1581 unsigned long seq;
1406 1582
1407 do { 1583 do {
1408 seq = read_seqbegin(&tk->lock); 1584 seq = read_seqcount_begin(&timekeeper_seq);
1409 *xtim = tk_xtime(tk); 1585 *xtim = tk_xtime(tk);
1410 *wtom = tk->wall_to_monotonic; 1586 *wtom = tk->wall_to_monotonic;
1411 *sleep = tk->total_sleep_time; 1587 *sleep = tk->total_sleep_time;
1412 } while (read_seqretry(&tk->lock, seq)); 1588 } while (read_seqcount_retry(&timekeeper_seq, seq));
1413} 1589}
1414 1590
1415#ifdef CONFIG_HIGH_RES_TIMERS 1591#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1421,7 +1597,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1421 * Returns current monotonic time and updates the offsets 1597 * Returns current monotonic time and updates the offsets
1422 * Called from hrtimer_interrupt() or retrigger_next_event() 1598
1423 */ 1599 */
1424ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) 1600ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
1601 ktime_t *offs_tai)
1425{ 1602{
1426 struct timekeeper *tk = &timekeeper; 1603 struct timekeeper *tk = &timekeeper;
1427 ktime_t now; 1604 ktime_t now;
@@ -1429,14 +1606,15 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1429 u64 secs, nsecs; 1606 u64 secs, nsecs;
1430 1607
1431 do { 1608 do {
1432 seq = read_seqbegin(&tk->lock); 1609 seq = read_seqcount_begin(&timekeeper_seq);
1433 1610
1434 secs = tk->xtime_sec; 1611 secs = tk->xtime_sec;
1435 nsecs = timekeeping_get_ns(tk); 1612 nsecs = timekeeping_get_ns(tk);
1436 1613
1437 *offs_real = tk->offs_real; 1614 *offs_real = tk->offs_real;
1438 *offs_boot = tk->offs_boot; 1615 *offs_boot = tk->offs_boot;
1439 } while (read_seqretry(&tk->lock, seq)); 1616 *offs_tai = tk->offs_tai;
1617 } while (read_seqcount_retry(&timekeeper_seq, seq));
1440 1618
1441 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1619 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1442 now = ktime_sub(now, *offs_real); 1620 now = ktime_sub(now, *offs_real);
@@ -1454,15 +1632,79 @@ ktime_t ktime_get_monotonic_offset(void)
1454 struct timespec wtom; 1632 struct timespec wtom;
1455 1633
1456 do { 1634 do {
1457 seq = read_seqbegin(&tk->lock); 1635 seq = read_seqcount_begin(&timekeeper_seq);
1458 wtom = tk->wall_to_monotonic; 1636 wtom = tk->wall_to_monotonic;
1459 } while (read_seqretry(&tk->lock, seq)); 1637 } while (read_seqcount_retry(&timekeeper_seq, seq));
1460 1638
1461 return timespec_to_ktime(wtom); 1639 return timespec_to_ktime(wtom);
1462} 1640}
1463EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); 1641EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1464 1642
1465/** 1643/**
1644 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
1645 */
1646int do_adjtimex(struct timex *txc)
1647{
1648 struct timekeeper *tk = &timekeeper;
1649 unsigned long flags;
1650 struct timespec ts;
1651 s32 orig_tai, tai;
1652 int ret;
1653
1654 /* Validate the data before disabling interrupts */
1655 ret = ntp_validate_timex(txc);
1656 if (ret)
1657 return ret;
1658
1659 if (txc->modes & ADJ_SETOFFSET) {
1660 struct timespec delta;
1661 delta.tv_sec = txc->time.tv_sec;
1662 delta.tv_nsec = txc->time.tv_usec;
1663 if (!(txc->modes & ADJ_NANO))
1664 delta.tv_nsec *= 1000;
1665 ret = timekeeping_inject_offset(&delta);
1666 if (ret)
1667 return ret;
1668 }
1669
1670 getnstimeofday(&ts);
1671
1672 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1673 write_seqcount_begin(&timekeeper_seq);
1674
1675 orig_tai = tai = tk->tai_offset;
1676 ret = __do_adjtimex(txc, &ts, &tai);
1677
1678 if (tai != orig_tai) {
1679 __timekeeping_set_tai_offset(tk, tai);
1680 clock_was_set_delayed();
1681 }
1682 write_seqcount_end(&timekeeper_seq);
1683 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1684
1685 return ret;
1686}
1687
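The ADJ_SETOFFSET branch above is the kernel side of offset injection from userspace; a hedged example of exercising it through adjtimex(2). It needs CAP_SYS_TIME, and the fallback defines assume the usual UAPI values rather than anything stated in this patch:

	#include <stdio.h>
	#include <sys/timex.h>

	#ifndef ADJ_SETOFFSET
	#define ADJ_SETOFFSET	0x0100
	#endif
	#ifndef ADJ_NANO
	#define ADJ_NANO	0x2000
	#endif

	int main(void)
	{
		struct timex tx = { 0 };

		tx.modes = ADJ_SETOFFSET | ADJ_NANO;
		tx.time.tv_sec = 0;
		tx.time.tv_usec = 1500000;	/* nanoseconds when ADJ_NANO is set */

		/* Goes through ntp_validate_timex() and then the offset injection. */
		if (adjtimex(&tx) < 0) {
			perror("adjtimex");
			return 1;
		}
		printf("stepped the clock forward by 1.5 ms\n");
		return 0;
	}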
1688#ifdef CONFIG_NTP_PPS
1689/**
1690 * hardpps() - Accessor function to NTP __hardpps function
1691 */
1692void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
1693{
1694 unsigned long flags;
1695
1696 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1697 write_seqcount_begin(&timekeeper_seq);
1698
1699 __hardpps(phase_ts, raw_ts);
1700
1701 write_seqcount_end(&timekeeper_seq);
1702 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1703}
1704EXPORT_SYMBOL(hardpps);
1705#endif
1706
1707/**
1466 * xtime_update() - advances the timekeeping infrastructure 1708 * xtime_update() - advances the timekeeping infrastructure
1467 * @ticks: number of ticks, that have elapsed since the last call. 1709 * @ticks: number of ticks, that have elapsed since the last call.
1468 * 1710 *
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index af5a7e9f164b..3bdf28323012 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -20,6 +20,13 @@
20 20
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23
24struct timer_list_iter {
25 int cpu;
26 bool second_pass;
27 u64 now;
28};
29
23typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); 30typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
24 31
25DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); 32DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
@@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
133 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 140 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
134 int i; 141 int i;
135 142
136 SEQ_printf(m, "\n");
137 SEQ_printf(m, "cpu: %d\n", cpu); 143 SEQ_printf(m, "cpu: %d\n", cpu);
138 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 144 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
139 SEQ_printf(m, " clock %d:\n", i); 145 SEQ_printf(m, " clock %d:\n", i);
@@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
187 193
188#undef P 194#undef P
189#undef P_ns 195#undef P_ns
196 SEQ_printf(m, "\n");
190} 197}
191 198
192#ifdef CONFIG_GENERIC_CLOCKEVENTS 199#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
195{ 202{
196 struct clock_event_device *dev = td->evtdev; 203 struct clock_event_device *dev = td->evtdev;
197 204
198 SEQ_printf(m, "\n");
199 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); 205 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
200 if (cpu < 0) 206 if (cpu < 0)
201 SEQ_printf(m, "Broadcast device\n"); 207 SEQ_printf(m, "Broadcast device\n");
@@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
230 print_name_offset(m, dev->event_handler); 236 print_name_offset(m, dev->event_handler);
231 SEQ_printf(m, "\n"); 237 SEQ_printf(m, "\n");
232 SEQ_printf(m, " retries: %lu\n", dev->retries); 238 SEQ_printf(m, " retries: %lu\n", dev->retries);
239 SEQ_printf(m, "\n");
233} 240}
234 241
235static void timer_list_show_tickdevices(struct seq_file *m) 242static void timer_list_show_tickdevices_header(struct seq_file *m)
236{ 243{
237 int cpu;
238
239#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 244#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
240 print_tickdevice(m, tick_get_broadcast_device(), -1); 245 print_tickdevice(m, tick_get_broadcast_device(), -1);
241 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 246 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
@@ -246,47 +251,104 @@ static void timer_list_show_tickdevices(struct seq_file *m)
246#endif 251#endif
247 SEQ_printf(m, "\n"); 252 SEQ_printf(m, "\n");
248#endif 253#endif
249 for_each_online_cpu(cpu)
250 print_tickdevice(m, tick_get_device(cpu), cpu);
251 SEQ_printf(m, "\n");
252} 254}
253#else
254static void timer_list_show_tickdevices(struct seq_file *m) { }
255#endif 255#endif
256 256
257static inline void timer_list_header(struct seq_file *m, u64 now)
258{
259 SEQ_printf(m, "Timer List Version: v0.7\n");
260 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
261 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
262 SEQ_printf(m, "\n");
263}
264
257static int timer_list_show(struct seq_file *m, void *v) 265static int timer_list_show(struct seq_file *m, void *v)
258{ 266{
267 struct timer_list_iter *iter = v;
268 u64 now = ktime_to_ns(ktime_get());
269
270 if (iter->cpu == -1 && !iter->second_pass)
271 timer_list_header(m, now);
272 else if (!iter->second_pass)
273 print_cpu(m, iter->cpu, iter->now);
274#ifdef CONFIG_GENERIC_CLOCKEVENTS
275 else if (iter->cpu == -1 && iter->second_pass)
276 timer_list_show_tickdevices_header(m);
277 else
278 print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
279#endif
280 return 0;
281}
282
283void sysrq_timer_list_show(void)
284{
259 u64 now = ktime_to_ns(ktime_get()); 285 u64 now = ktime_to_ns(ktime_get());
260 int cpu; 286 int cpu;
261 287
262 SEQ_printf(m, "Timer List Version: v0.7\n"); 288 timer_list_header(NULL, now);
263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
265 289
266 for_each_online_cpu(cpu) 290 for_each_online_cpu(cpu)
267 print_cpu(m, cpu, now); 291 print_cpu(NULL, cpu, now);
268 292
269 SEQ_printf(m, "\n"); 293#ifdef CONFIG_GENERIC_CLOCKEVENTS
270 timer_list_show_tickdevices(m); 294 timer_list_show_tickdevices_header(NULL);
295 for_each_online_cpu(cpu)
296 print_tickdevice(NULL, tick_get_device(cpu), cpu);
297#endif
298 return;
299}
271 300
272 return 0; 301static void *timer_list_start(struct seq_file *file, loff_t *offset)
302{
303 struct timer_list_iter *iter = file->private;
304
305 if (!*offset) {
306 iter->cpu = -1;
307 iter->now = ktime_to_ns(ktime_get());
308 } else if (iter->cpu >= nr_cpu_ids) {
309#ifdef CONFIG_GENERIC_CLOCKEVENTS
310 if (!iter->second_pass) {
311 iter->cpu = -1;
312 iter->second_pass = true;
313 } else
314 return NULL;
315#else
316 return NULL;
317#endif
318 }
319 return iter;
273} 320}
274 321
275void sysrq_timer_list_show(void) 322static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
323{
324 struct timer_list_iter *iter = file->private;
325 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
326 ++*offset;
327 return timer_list_start(file, offset);
328}
329
330static void timer_list_stop(struct seq_file *seq, void *v)
276{ 331{
277 timer_list_show(NULL, NULL);
278} 332}
279 333
334static const struct seq_operations timer_list_sops = {
335 .start = timer_list_start,
336 .next = timer_list_next,
337 .stop = timer_list_stop,
338 .show = timer_list_show,
339};
340
280static int timer_list_open(struct inode *inode, struct file *filp) 341static int timer_list_open(struct inode *inode, struct file *filp)
281{ 342{
282 return single_open(filp, timer_list_show, NULL); 343 return seq_open_private(filp, &timer_list_sops,
344 sizeof(struct timer_list_iter));
283} 345}
284 346
285static const struct file_operations timer_list_fops = { 347static const struct file_operations timer_list_fops = {
286 .open = timer_list_open, 348 .open = timer_list_open,
287 .read = seq_read, 349 .read = seq_read,
288 .llseek = seq_lseek, 350 .llseek = seq_lseek,
289 .release = single_release, 351 .release = seq_release_private,
290}; 352};
291 353
292static int __init init_timer_list_procfs(void) 354static int __init init_timer_list_procfs(void)
diff --git a/kernel/timer.c b/kernel/timer.c
index dbf7a78a1ef1..15ffdb3f1948 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * linux/kernel/timer.c 2 * linux/kernel/timer.c
3 * 3 *
4 * Kernel internal timers, basic process system calls 4 * Kernel internal timers
5 * 5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds 6 * Copyright (C) 1991, 1992 Linus Torvalds
7 * 7 *
@@ -41,6 +41,7 @@
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/sched/sysctl.h> 42#include <linux/sched/sysctl.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/compat.h>
44 45
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46#include <asm/unistd.h> 47#include <asm/unistd.h>
@@ -738,7 +739,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
738 739
739 cpu = smp_processor_id(); 740 cpu = smp_processor_id();
740 741
741#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 742#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
742 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) 743 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
743 cpu = get_nohz_timer_target(); 744 cpu = get_nohz_timer_target();
744#endif 745#endif
@@ -930,14 +931,14 @@ void add_timer_on(struct timer_list *timer, int cpu)
930 debug_activate(timer, timer->expires); 931 debug_activate(timer, timer->expires);
931 internal_add_timer(base, timer); 932 internal_add_timer(base, timer);
932 /* 933 /*
933 * Check whether the other CPU is idle and needs to be 934 * Check whether the other CPU is in dynticks mode and needs
934 * triggered to reevaluate the timer wheel when nohz is 935 * to be triggered to reevaluate the timer wheel.
935 * active. We are protected against the other CPU fiddling 936 * We are protected against the other CPU fiddling
936 * with the timer by holding the timer base lock. This also 937 * with the timer by holding the timer base lock. This also
937 * makes sure that a CPU on the way to idle can not evaluate 938 * makes sure that a CPU on the way to stop its tick can not
938 * the timer wheel. 939 * evaluate the timer wheel.
939 */ 940 */
940 wake_up_idle_cpu(cpu); 941 wake_up_nohz_cpu(cpu);
941 spin_unlock_irqrestore(&base->lock, flags); 942 spin_unlock_irqrestore(&base->lock, flags);
942} 943}
943EXPORT_SYMBOL_GPL(add_timer_on); 944EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1188,7 +1189,7 @@ static inline void __run_timers(struct tvec_base *base)
1188 spin_unlock_irq(&base->lock); 1189 spin_unlock_irq(&base->lock);
1189} 1190}
1190 1191
1191#ifdef CONFIG_NO_HZ 1192#ifdef CONFIG_NO_HZ_COMMON
1192/* 1193/*
1193 * Find out when the next timer event is due to happen. This 1194 * Find out when the next timer event is due to happen. This
1194 * is used on S/390 to stop all activity when a CPU is idle. 1195 * is used on S/390 to stop all activity when a CPU is idle.
@@ -1395,61 +1396,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1395 1396
1396#endif 1397#endif
1397 1398
1398/**
1399 * sys_getpid - return the thread group id of the current process
1400 *
1401 * Note, despite the name, this returns the tgid not the pid. The tgid and
1402 * the pid are identical unless CLONE_THREAD was specified on clone() in
1403 * which case the tgid is the same in all threads of the same group.
1404 *
1405 * This is SMP safe as current->tgid does not change.
1406 */
1407SYSCALL_DEFINE0(getpid)
1408{
1409 return task_tgid_vnr(current);
1410}
1411
1412/*
1413 * Accessing ->real_parent is not SMP-safe, it could
1414 * change from under us. However, we can use a stale
1415 * value of ->real_parent under rcu_read_lock(), see
1416 * release_task()->call_rcu(delayed_put_task_struct).
1417 */
1418SYSCALL_DEFINE0(getppid)
1419{
1420 int pid;
1421
1422 rcu_read_lock();
1423 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1424 rcu_read_unlock();
1425
1426 return pid;
1427}
1428
1429SYSCALL_DEFINE0(getuid)
1430{
1431 /* Only we change this so SMP safe */
1432 return from_kuid_munged(current_user_ns(), current_uid());
1433}
1434
1435SYSCALL_DEFINE0(geteuid)
1436{
1437 /* Only we change this so SMP safe */
1438 return from_kuid_munged(current_user_ns(), current_euid());
1439}
1440
1441SYSCALL_DEFINE0(getgid)
1442{
1443 /* Only we change this so SMP safe */
1444 return from_kgid_munged(current_user_ns(), current_gid());
1445}
1446
1447SYSCALL_DEFINE0(getegid)
1448{
1449 /* Only we change this so SMP safe */
1450 return from_kgid_munged(current_user_ns(), current_egid());
1451}
1452
1453static void process_timeout(unsigned long __data) 1399static void process_timeout(unsigned long __data)
1454{ 1400{
1455 wake_up_process((struct task_struct *)__data); 1401 wake_up_process((struct task_struct *)__data);
@@ -1557,91 +1503,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1557} 1503}
1558EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1504EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1559 1505
1560/* Thread ID - the internal kernel "pid" */
1561SYSCALL_DEFINE0(gettid)
1562{
1563 return task_pid_vnr(current);
1564}
1565
1566/**
1567 * do_sysinfo - fill in sysinfo struct
1568 * @info: pointer to buffer to fill
1569 */
1570int do_sysinfo(struct sysinfo *info)
1571{
1572 unsigned long mem_total, sav_total;
1573 unsigned int mem_unit, bitcount;
1574 struct timespec tp;
1575
1576 memset(info, 0, sizeof(struct sysinfo));
1577
1578 ktime_get_ts(&tp);
1579 monotonic_to_bootbased(&tp);
1580 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1581
1582 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
1583
1584 info->procs = nr_threads;
1585
1586 si_meminfo(info);
1587 si_swapinfo(info);
1588
1589 /*
1590 * If the sum of all the available memory (i.e. ram + swap)
1591 * is less than can be stored in a 32 bit unsigned long then
1592 * we can be binary compatible with 2.2.x kernels. If not,
1593 * well, in that case 2.2.x was broken anyways...
1594 *
1595 * -Erik Andersen <andersee@debian.org>
1596 */
1597
1598 mem_total = info->totalram + info->totalswap;
1599 if (mem_total < info->totalram || mem_total < info->totalswap)
1600 goto out;
1601 bitcount = 0;
1602 mem_unit = info->mem_unit;
1603 while (mem_unit > 1) {
1604 bitcount++;
1605 mem_unit >>= 1;
1606 sav_total = mem_total;
1607 mem_total <<= 1;
1608 if (mem_total < sav_total)
1609 goto out;
1610 }
1611
1612 /*
1613 * If mem_total did not overflow, multiply all memory values by
1614 * info->mem_unit and set it to 1. This leaves things compatible
1615 * with 2.2.x, and also retains compatibility with earlier 2.4.x
1616 * kernels...
1617 */
1618
1619 info->mem_unit = 1;
1620 info->totalram <<= bitcount;
1621 info->freeram <<= bitcount;
1622 info->sharedram <<= bitcount;
1623 info->bufferram <<= bitcount;
1624 info->totalswap <<= bitcount;
1625 info->freeswap <<= bitcount;
1626 info->totalhigh <<= bitcount;
1627 info->freehigh <<= bitcount;
1628
1629out:
1630 return 0;
1631}
1632
1633SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
1634{
1635 struct sysinfo val;
1636
1637 do_sysinfo(&val);
1638
1639 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
1640 return -EFAULT;
1641
1642 return 0;
1643}
1644
1645static int __cpuinit init_timers_cpu(int cpu) 1506static int __cpuinit init_timers_cpu(int cpu)
1646{ 1507{
1647 int j; 1508 int j;
@@ -1678,12 +1539,12 @@ static int __cpuinit init_timers_cpu(int cpu)
1678 boot_done = 1; 1539 boot_done = 1;
1679 base = &boot_tvec_bases; 1540 base = &boot_tvec_bases;
1680 } 1541 }
1542 spin_lock_init(&base->lock);
1681 tvec_base_done[cpu] = 1; 1543 tvec_base_done[cpu] = 1;
1682 } else { 1544 } else {
1683 base = per_cpu(tvec_bases, cpu); 1545 base = per_cpu(tvec_bases, cpu);
1684 } 1546 }
1685 1547
1686 spin_lock_init(&base->lock);
1687 1548
1688 for (j = 0; j < TVN_SIZE; j++) { 1549 for (j = 0; j < TVN_SIZE; j++) {
1689 INIT_LIST_HEAD(base->tv5.vec + j); 1550 INIT_LIST_HEAD(base->tv5.vec + j);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fc382d6e2765..015f85aaca08 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -71,6 +71,7 @@ config TRACE_CLOCK
71config RING_BUFFER 71config RING_BUFFER
72 bool 72 bool
73 select TRACE_CLOCK 73 select TRACE_CLOCK
74 select IRQ_WORK
74 75
75config FTRACE_NMI_ENTER 76config FTRACE_NMI_ENTER
76 bool 77 bool
@@ -107,7 +108,6 @@ config TRACING
107 select BINARY_PRINTF 108 select BINARY_PRINTF
108 select EVENT_TRACING 109 select EVENT_TRACING
109 select TRACE_CLOCK 110 select TRACE_CLOCK
110 select IRQ_WORK
111 111
112config GENERIC_TRACER 112config GENERIC_TRACER
113 bool 113 bool
@@ -176,6 +176,8 @@ config IRQSOFF_TRACER
176 select GENERIC_TRACER 176 select GENERIC_TRACER
177 select TRACER_MAX_TRACE 177 select TRACER_MAX_TRACE
178 select RING_BUFFER_ALLOW_SWAP 178 select RING_BUFFER_ALLOW_SWAP
179 select TRACER_SNAPSHOT
180 select TRACER_SNAPSHOT_PER_CPU_SWAP
179 help 181 help
180 This option measures the time spent in irqs-off critical 182 This option measures the time spent in irqs-off critical
181 sections, with microsecond accuracy. 183 sections, with microsecond accuracy.
@@ -198,6 +200,8 @@ config PREEMPT_TRACER
198 select GENERIC_TRACER 200 select GENERIC_TRACER
199 select TRACER_MAX_TRACE 201 select TRACER_MAX_TRACE
200 select RING_BUFFER_ALLOW_SWAP 202 select RING_BUFFER_ALLOW_SWAP
203 select TRACER_SNAPSHOT
204 select TRACER_SNAPSHOT_PER_CPU_SWAP
201 help 205 help
202 This option measures the time spent in preemption-off critical 206 This option measures the time spent in preemption-off critical
203 sections, with microsecond accuracy. 207 sections, with microsecond accuracy.
@@ -217,6 +221,7 @@ config SCHED_TRACER
217 select GENERIC_TRACER 221 select GENERIC_TRACER
218 select CONTEXT_SWITCH_TRACER 222 select CONTEXT_SWITCH_TRACER
219 select TRACER_MAX_TRACE 223 select TRACER_MAX_TRACE
224 select TRACER_SNAPSHOT
220 help 225 help
221 This tracer tracks the latency of the highest priority task 226 This tracer tracks the latency of the highest priority task
222 to be scheduled in, starting from the point it has woken up. 227 to be scheduled in, starting from the point it has woken up.
@@ -248,6 +253,27 @@ config TRACER_SNAPSHOT
248 echo 1 > /sys/kernel/debug/tracing/snapshot 253 echo 1 > /sys/kernel/debug/tracing/snapshot
249 cat snapshot 254 cat snapshot
250 255
256config TRACER_SNAPSHOT_PER_CPU_SWAP
257 bool "Allow snapshot to swap per CPU"
258 depends on TRACER_SNAPSHOT
259 select RING_BUFFER_ALLOW_SWAP
260 help
261 Allow doing a snapshot of a single CPU buffer instead of a
262 full swap (all buffers). If this is set, then the following is
263 allowed:
264
265 echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
266
267 After which, only the tracing buffer for CPU 2 is swapped with
268 the main tracing buffer, and the other CPU buffers remain the same.
269
270 When this is enabled, it adds a little more overhead to the
271 trace recording, as it needs to add some checks to synchronize
272 recording with swaps. But this does not affect the performance
273 of the overall system. This is enabled by default when the preempt
274 or irq latency tracers are enabled, as those need to swap as well
275 and already add the overhead (plus a lot more).
276
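For readers who want to drive the new per-CPU snapshot file programmatically rather than with echo/cat, here is a minimal user-space C sketch. It assumes debugfs is mounted at /sys/kernel/debug and that this option is enabled; cpu2 is an arbitrary example and error cleanup is kept minimal.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* Path taken from the help text above; cpu2 is just an example. */
    const char *snap = "/sys/kernel/debug/tracing/per_cpu/cpu2/snapshot";
    char buf[4096];
    ssize_t n;
    int fd;

    fd = open(snap, O_WRONLY);
    if (fd < 0)
        return 1;
    /* Writing "1" swaps only CPU 2's buffer into the snapshot. */
    if (write(fd, "1", 1) != 1) {
        close(fd);
        return 1;
    }
    close(fd);

    /* Reading the same file dumps the swapped-out CPU 2 buffer. */
    fd = open(snap, O_RDONLY);
    if (fd < 0)
        return 1;
    while ((n = read(fd, buf, sizeof(buf))) > 0)
        fwrite(buf, 1, (size_t)n, stdout);
    close(fd);
    return 0;
}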
251config TRACE_BRANCH_PROFILING 277config TRACE_BRANCH_PROFILING
252 bool 278 bool
253 select GENERIC_TRACER 279 select GENERIC_TRACER
@@ -524,6 +550,29 @@ config RING_BUFFER_BENCHMARK
524 550
525 If unsure, say N. 551 If unsure, say N.
526 552
553config RING_BUFFER_STARTUP_TEST
554 bool "Ring buffer startup self test"
555 depends on RING_BUFFER
556 help
557 Run a simple self test on the ring buffer on boot up. Late in the
558 kernel boot sequence, the test will start and kick off
559 a thread per cpu. Each thread will write various size events
560 into the ring buffer. Another thread is created to send IPIs
561 to each of the CPUs, where the IPI handler will also write
562 to the ring buffer, to test/stress the nesting ability.
563 If any anomalies are discovered, a warning will be displayed
564 and all ring buffers will be disabled.
565
566 The test runs for 10 seconds. This will slow your boot time
567 by at least 10 more seconds.
568
569 At the end of the test, statistics and more checks are done.
570 It will output the stats of each per-cpu buffer: what
571 was written, the sizes, what was read, what was lost, and
572 other similar details.
573
574 If unsure, say N.
575
527endif # FTRACE 576endif # FTRACE
528 577
529endif # TRACING_SUPPORT 578endif # TRACING_SUPPORT
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e5b8c272eec..b8b8560bfb95 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
72 bool blk_tracer = blk_tracer_enabled; 72 bool blk_tracer = blk_tracer_enabled;
73 73
74 if (blk_tracer) { 74 if (blk_tracer) {
75 buffer = blk_tr->buffer; 75 buffer = blk_tr->trace_buffer.buffer;
76 pc = preempt_count(); 76 pc = preempt_count();
77 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 77 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
78 sizeof(*t) + len, 78 sizeof(*t) + len,
@@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
218 if (blk_tracer) { 218 if (blk_tracer) {
219 tracing_record_cmdline(current); 219 tracing_record_cmdline(current);
220 220
221 buffer = blk_tr->buffer; 221 buffer = blk_tr->trace_buffer.buffer;
222 pc = preempt_count(); 222 pc = preempt_count();
223 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 223 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
224 sizeof(*t) + pdu_len, 224 sizeof(*t) + pdu_len,
@@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,
739 struct request_queue *q, 739 struct request_queue *q,
740 struct request *rq) 740 struct request *rq)
741{ 741{
742 struct blk_trace *bt = q->blk_trace;
743
744 /* if control ever passes through here, it's a request based driver */
745 if (unlikely(bt && !bt->rq_based))
746 bt->rq_based = true;
747
748 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 742 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
749} 743}
750 744
@@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
780 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); 774 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
781} 775}
782 776
783static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) 777static void blk_add_trace_bio_complete(void *ignore,
778 struct request_queue *q, struct bio *bio,
779 int error)
784{ 780{
785 struct request_queue *q;
786 struct blk_trace *bt;
787
788 if (!bio->bi_bdev)
789 return;
790
791 q = bdev_get_queue(bio->bi_bdev);
792 bt = q->blk_trace;
793
794 /*
795 * Request based drivers will generate both rq and bio completions.
796 * Ignore bio ones.
797 */
798 if (likely(!bt) || bt->rq_based)
799 return;
800
801 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); 781 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
802} 782}
803 783
@@ -1828,6 +1808,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1828 1808
1829 rwbs[i] = '\0'; 1809 rwbs[i] = '\0';
1830} 1810}
1811EXPORT_SYMBOL_GPL(blk_fill_rwbs);
1831 1812
1832#endif /* CONFIG_EVENT_TRACING */ 1813#endif /* CONFIG_EVENT_TRACING */
1833 1814
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ab25b88aae56..6c508ff33c62 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -64,9 +64,16 @@
64 64
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) 65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
66 66
67#ifdef CONFIG_DYNAMIC_FTRACE
68#define INIT_REGEX_LOCK(opsname) \
69 .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock),
70#else
71#define INIT_REGEX_LOCK(opsname)
72#endif
73
67static struct ftrace_ops ftrace_list_end __read_mostly = { 74static struct ftrace_ops ftrace_list_end __read_mostly = {
68 .func = ftrace_stub, 75 .func = ftrace_stub,
69 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 76 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
70}; 77};
71 78
72/* ftrace_enabled is a method to turn ftrace on or off */ 79/* ftrace_enabled is a method to turn ftrace on or off */
@@ -113,24 +120,34 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
113 120
114/* 121/*
115 * Traverse the ftrace_global_list, invoking all entries. The reason that we 122 * Traverse the ftrace_global_list, invoking all entries. The reason that we
116 * can use rcu_dereference_raw() is that elements removed from this list 123 * can use rcu_dereference_raw_notrace() is that elements removed from this list
117 * are simply leaked, so there is no need to interact with a grace-period 124 * are simply leaked, so there is no need to interact with a grace-period
118 * mechanism. The rcu_dereference_raw() calls are needed to handle 125 * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
119 * concurrent insertions into the ftrace_global_list. 126 * concurrent insertions into the ftrace_global_list.
120 * 127 *
121 * Silly Alpha and silly pointer-speculation compiler optimizations! 128 * Silly Alpha and silly pointer-speculation compiler optimizations!
122 */ 129 */
123#define do_for_each_ftrace_op(op, list) \ 130#define do_for_each_ftrace_op(op, list) \
124 op = rcu_dereference_raw(list); \ 131 op = rcu_dereference_raw_notrace(list); \
125 do 132 do
126 133
127/* 134/*
128 * Optimized for just a single item in the list (as that is the normal case). 135 * Optimized for just a single item in the list (as that is the normal case).
129 */ 136 */
130#define while_for_each_ftrace_op(op) \ 137#define while_for_each_ftrace_op(op) \
131 while (likely(op = rcu_dereference_raw((op)->next)) && \ 138 while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
132 unlikely((op) != &ftrace_list_end)) 139 unlikely((op) != &ftrace_list_end))
133 140
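For orientation, the two macros are used together to walk the ops list from the function-tracing hot path inside kernel/trace/ftrace.c; a hedged sketch of a caller (modelled loosely on ftrace_ops_list_func and ftrace_ops_control_func later in this patch, with the callback body trimmed) looks like:

/* Sketch: iterate every registered ftrace_ops and invoke the ones
 * whose filter hash matches this ip. */
static void walk_ops_sketch(unsigned long ip, unsigned long parent_ip,
                            struct pt_regs *regs)
{
    struct ftrace_ops *op;

    do_for_each_ftrace_op(op, ftrace_ops_list) {
        if (ftrace_ops_test(op, ip))
            op->func(ip, parent_ip, op, regs);
    } while_for_each_ftrace_op(op);
}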
141static inline void ftrace_ops_init(struct ftrace_ops *ops)
142{
143#ifdef CONFIG_DYNAMIC_FTRACE
144 if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
145 mutex_init(&ops->regex_lock);
146 ops->flags |= FTRACE_OPS_FL_INITIALIZED;
147 }
148#endif
149}
150
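In other words, INIT_REGEX_LOCK() covers ops that are defined statically at build time, while ftrace_ops_init() covers ops that are allocated or zero-initialized at run time and only get their mutex on first use. A hedged sketch of both styles follows; the ops name and callback are invented for illustration.

/* Statically defined ops: lock initialized at build time, and the
 * INITIALIZED flag makes ftrace_ops_init() a no-op later. */
static void my_callback(unsigned long ip, unsigned long parent_ip,
                        struct ftrace_ops *op, struct pt_regs *regs);

static struct ftrace_ops my_ops = {
    .func  = my_callback,
    .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
    INIT_REGEX_LOCK(my_ops)
};

/* Dynamically set up ops: leave INITIALIZED out of .flags and let the
 * first API call (register_ftrace_function(), ftrace_set_filter(), ...)
 * run ftrace_ops_init(), which does mutex_init(&ops->regex_lock) once. */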
134/** 151/**
135 * ftrace_nr_registered_ops - return number of ops registered 152 * ftrace_nr_registered_ops - return number of ops registered
136 * 153 *
@@ -486,7 +503,6 @@ struct ftrace_profile_stat {
486#define PROFILES_PER_PAGE \ 503#define PROFILES_PER_PAGE \
487 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) 504 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
488 505
489static int ftrace_profile_bits __read_mostly;
490static int ftrace_profile_enabled __read_mostly; 506static int ftrace_profile_enabled __read_mostly;
491 507
492/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ 508/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
@@ -494,7 +510,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);
494 510
495static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); 511static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
496 512
497#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ 513#define FTRACE_PROFILE_HASH_BITS 10
514#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)
498 515
499static void * 516static void *
500function_stat_next(void *v, int idx) 517function_stat_next(void *v, int idx)
@@ -676,7 +693,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
676 693
677 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); 694 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
678 695
679 for (i = 0; i < pages; i++) { 696 for (i = 1; i < pages; i++) {
680 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 697 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
681 if (!pg->next) 698 if (!pg->next)
682 goto out_free; 699 goto out_free;
@@ -694,7 +711,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
694 free_page(tmp); 711 free_page(tmp);
695 } 712 }
696 713
697 free_page((unsigned long)stat->pages);
698 stat->pages = NULL; 714 stat->pages = NULL;
699 stat->start = NULL; 715 stat->start = NULL;
700 716
@@ -725,13 +741,6 @@ static int ftrace_profile_init_cpu(int cpu)
725 if (!stat->hash) 741 if (!stat->hash)
726 return -ENOMEM; 742 return -ENOMEM;
727 743
728 if (!ftrace_profile_bits) {
729 size--;
730
731 for (; size; size >>= 1)
732 ftrace_profile_bits++;
733 }
734
735 /* Preallocate the function profiling pages */ 744 /* Preallocate the function profiling pages */
736 if (ftrace_profile_pages_init(stat) < 0) { 745 if (ftrace_profile_pages_init(stat) < 0) {
737 kfree(stat->hash); 746 kfree(stat->hash);
@@ -764,13 +773,13 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
764 struct hlist_head *hhd; 773 struct hlist_head *hhd;
765 unsigned long key; 774 unsigned long key;
766 775
767 key = hash_long(ip, ftrace_profile_bits); 776 key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);
768 hhd = &stat->hash[key]; 777 hhd = &stat->hash[key];
769 778
770 if (hlist_empty(hhd)) 779 if (hlist_empty(hhd))
771 return NULL; 780 return NULL;
772 781
773 hlist_for_each_entry_rcu(rec, hhd, node) { 782 hlist_for_each_entry_rcu_notrace(rec, hhd, node) {
774 if (rec->ip == ip) 783 if (rec->ip == ip)
775 return rec; 784 return rec;
776 } 785 }
@@ -783,7 +792,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
783{ 792{
784 unsigned long key; 793 unsigned long key;
785 794
786 key = hash_long(rec->ip, ftrace_profile_bits); 795 key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);
787 hlist_add_head_rcu(&rec->node, &stat->hash[key]); 796 hlist_add_head_rcu(&rec->node, &stat->hash[key]);
788} 797}
789 798
@@ -915,7 +924,8 @@ static void unregister_ftrace_profiler(void)
915#else 924#else
916static struct ftrace_ops ftrace_profile_ops __read_mostly = { 925static struct ftrace_ops ftrace_profile_ops __read_mostly = {
917 .func = function_profile_call, 926 .func = function_profile_call,
918 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 927 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
928 INIT_REGEX_LOCK(ftrace_profile_ops)
919}; 929};
920 930
921static int register_ftrace_profiler(void) 931static int register_ftrace_profiler(void)
@@ -1053,6 +1063,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1053 1063
1054static struct pid * const ftrace_swapper_pid = &init_struct_pid; 1064static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1055 1065
1066loff_t
1067ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
1068{
1069 loff_t ret;
1070
1071 if (file->f_mode & FMODE_READ)
1072 ret = seq_lseek(file, offset, whence);
1073 else
1074 file->f_pos = ret = 1;
1075
1076 return ret;
1077}
1078
1056#ifdef CONFIG_DYNAMIC_FTRACE 1079#ifdef CONFIG_DYNAMIC_FTRACE
1057 1080
1058#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1081#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1067,7 +1090,7 @@ struct ftrace_func_probe {
1067 unsigned long flags; 1090 unsigned long flags;
1068 unsigned long ip; 1091 unsigned long ip;
1069 void *data; 1092 void *data;
1070 struct rcu_head rcu; 1093 struct list_head free_list;
1071}; 1094};
1072 1095
1073struct ftrace_func_entry { 1096struct ftrace_func_entry {
@@ -1098,11 +1121,10 @@ static struct ftrace_ops global_ops = {
1098 .func = ftrace_stub, 1121 .func = ftrace_stub,
1099 .notrace_hash = EMPTY_HASH, 1122 .notrace_hash = EMPTY_HASH,
1100 .filter_hash = EMPTY_HASH, 1123 .filter_hash = EMPTY_HASH,
1101 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 1124 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
1125 INIT_REGEX_LOCK(global_ops)
1102}; 1126};
1103 1127
1104static DEFINE_MUTEX(ftrace_regex_lock);
1105
1106struct ftrace_page { 1128struct ftrace_page {
1107 struct ftrace_page *next; 1129 struct ftrace_page *next;
1108 struct dyn_ftrace *records; 1130 struct dyn_ftrace *records;
@@ -1143,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1143 1165
1144 hhd = &hash->buckets[key]; 1166 hhd = &hash->buckets[key];
1145 1167
1146 hlist_for_each_entry_rcu(entry, hhd, hlist) { 1168 hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
1147 if (entry->ip == ip) 1169 if (entry->ip == ip)
1148 return entry; 1170 return entry;
1149 } 1171 }
@@ -1242,6 +1264,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1242 1264
1243void ftrace_free_filter(struct ftrace_ops *ops) 1265void ftrace_free_filter(struct ftrace_ops *ops)
1244{ 1266{
1267 ftrace_ops_init(ops);
1245 free_ftrace_hash(ops->filter_hash); 1268 free_ftrace_hash(ops->filter_hash);
1246 free_ftrace_hash(ops->notrace_hash); 1269 free_ftrace_hash(ops->notrace_hash);
1247} 1270}
@@ -1317,7 +1340,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1317 struct hlist_head *hhd; 1340 struct hlist_head *hhd;
1318 struct ftrace_hash *old_hash; 1341 struct ftrace_hash *old_hash;
1319 struct ftrace_hash *new_hash; 1342 struct ftrace_hash *new_hash;
1320 unsigned long key;
1321 int size = src->count; 1343 int size = src->count;
1322 int bits = 0; 1344 int bits = 0;
1323 int ret; 1345 int ret;
@@ -1360,10 +1382,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1360 for (i = 0; i < size; i++) { 1382 for (i = 0; i < size; i++) {
1361 hhd = &src->buckets[i]; 1383 hhd = &src->buckets[i];
1362 hlist_for_each_entry_safe(entry, tn, hhd, hlist) { 1384 hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
1363 if (bits > 0)
1364 key = hash_long(entry->ip, bits);
1365 else
1366 key = 0;
1367 remove_hash_entry(src, entry); 1385 remove_hash_entry(src, entry);
1368 __add_hash_entry(new_hash, entry); 1386 __add_hash_entry(new_hash, entry);
1369 } 1387 }
@@ -1404,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1404 struct ftrace_hash *notrace_hash; 1422 struct ftrace_hash *notrace_hash;
1405 int ret; 1423 int ret;
1406 1424
1407 filter_hash = rcu_dereference_raw(ops->filter_hash); 1425 filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
1408 notrace_hash = rcu_dereference_raw(ops->notrace_hash); 1426 notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
1409 1427
1410 if ((ftrace_hash_empty(filter_hash) || 1428 if ((ftrace_hash_empty(filter_hash) ||
1411 ftrace_lookup_ip(filter_hash, ip)) && 1429 ftrace_lookup_ip(filter_hash, ip)) &&
@@ -2441,7 +2459,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
2441 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || 2459 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
2442 2460
2443 ((iter->flags & FTRACE_ITER_ENABLED) && 2461 ((iter->flags & FTRACE_ITER_ENABLED) &&
2444 !(rec->flags & ~FTRACE_FL_MASK))) { 2462 !(rec->flags & FTRACE_FL_ENABLED))) {
2445 2463
2446 rec = NULL; 2464 rec = NULL;
2447 goto retry; 2465 goto retry;
@@ -2613,7 +2631,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2613 * routine, you can use ftrace_filter_write() for the write 2631 * routine, you can use ftrace_filter_write() for the write
2614 * routine if @flag has FTRACE_ITER_FILTER set, or 2632 * routine if @flag has FTRACE_ITER_FILTER set, or
2615 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. 2633 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2616 * ftrace_regex_lseek() should be used as the lseek routine, and 2634 * ftrace_filter_lseek() should be used as the lseek routine, and
2617 * release must call ftrace_regex_release(). 2635 * release must call ftrace_regex_release().
2618 */ 2636 */
2619int 2637int
@@ -2624,6 +2642,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2624 struct ftrace_hash *hash; 2642 struct ftrace_hash *hash;
2625 int ret = 0; 2643 int ret = 0;
2626 2644
2645 ftrace_ops_init(ops);
2646
2627 if (unlikely(ftrace_disabled)) 2647 if (unlikely(ftrace_disabled))
2628 return -ENODEV; 2648 return -ENODEV;
2629 2649
@@ -2636,28 +2656,26 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2636 return -ENOMEM; 2656 return -ENOMEM;
2637 } 2657 }
2638 2658
2659 iter->ops = ops;
2660 iter->flags = flag;
2661
2662 mutex_lock(&ops->regex_lock);
2663
2639 if (flag & FTRACE_ITER_NOTRACE) 2664 if (flag & FTRACE_ITER_NOTRACE)
2640 hash = ops->notrace_hash; 2665 hash = ops->notrace_hash;
2641 else 2666 else
2642 hash = ops->filter_hash; 2667 hash = ops->filter_hash;
2643 2668
2644 iter->ops = ops;
2645 iter->flags = flag;
2646
2647 if (file->f_mode & FMODE_WRITE) { 2669 if (file->f_mode & FMODE_WRITE) {
2648 mutex_lock(&ftrace_lock);
2649 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); 2670 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
2650 mutex_unlock(&ftrace_lock);
2651
2652 if (!iter->hash) { 2671 if (!iter->hash) {
2653 trace_parser_put(&iter->parser); 2672 trace_parser_put(&iter->parser);
2654 kfree(iter); 2673 kfree(iter);
2655 return -ENOMEM; 2674 ret = -ENOMEM;
2675 goto out_unlock;
2656 } 2676 }
2657 } 2677 }
2658 2678
2659 mutex_lock(&ftrace_regex_lock);
2660
2661 if ((file->f_mode & FMODE_WRITE) && 2679 if ((file->f_mode & FMODE_WRITE) &&
2662 (file->f_flags & O_TRUNC)) 2680 (file->f_flags & O_TRUNC))
2663 ftrace_filter_reset(iter->hash); 2681 ftrace_filter_reset(iter->hash);
@@ -2677,7 +2695,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2677 } 2695 }
2678 } else 2696 } else
2679 file->private_data = iter; 2697 file->private_data = iter;
2680 mutex_unlock(&ftrace_regex_lock); 2698
2699 out_unlock:
2700 mutex_unlock(&ops->regex_lock);
2681 2701
2682 return ret; 2702 return ret;
2683} 2703}
@@ -2697,19 +2717,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2697 inode, file); 2717 inode, file);
2698} 2718}
2699 2719
2700loff_t
2701ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
2702{
2703 loff_t ret;
2704
2705 if (file->f_mode & FMODE_READ)
2706 ret = seq_lseek(file, offset, whence);
2707 else
2708 file->f_pos = ret = 1;
2709
2710 return ret;
2711}
2712
2713static int ftrace_match(char *str, char *regex, int len, int type) 2720static int ftrace_match(char *str, char *regex, int len, int type)
2714{ 2721{
2715 int matched = 0; 2722 int matched = 0;
@@ -2913,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2913 * on the hash. rcu_read_lock is too dangerous here. 2920 * on the hash. rcu_read_lock is too dangerous here.
2914 */ 2921 */
2915 preempt_disable_notrace(); 2922 preempt_disable_notrace();
2916 hlist_for_each_entry_rcu(entry, hhd, node) { 2923 hlist_for_each_entry_rcu_notrace(entry, hhd, node) {
2917 if (entry->ip == ip) 2924 if (entry->ip == ip)
2918 entry->ops->func(ip, parent_ip, &entry->data); 2925 entry->ops->func(ip, parent_ip, &entry->data);
2919 } 2926 }
@@ -2923,6 +2930,8 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2923static struct ftrace_ops trace_probe_ops __read_mostly = 2930static struct ftrace_ops trace_probe_ops __read_mostly =
2924{ 2931{
2925 .func = function_trace_probe_call, 2932 .func = function_trace_probe_call,
2933 .flags = FTRACE_OPS_FL_INITIALIZED,
2934 INIT_REGEX_LOCK(trace_probe_ops)
2926}; 2935};
2927 2936
2928static int ftrace_probe_registered; 2937static int ftrace_probe_registered;
@@ -2932,8 +2941,12 @@ static void __enable_ftrace_function_probe(void)
2932 int ret; 2941 int ret;
2933 int i; 2942 int i;
2934 2943
2935 if (ftrace_probe_registered) 2944 if (ftrace_probe_registered) {
2945 /* still need to update the function call sites */
2946 if (ftrace_enabled)
2947 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
2936 return; 2948 return;
2949 }
2937 2950
2938 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 2951 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
2939 struct hlist_head *hhd = &ftrace_func_hash[i]; 2952 struct hlist_head *hhd = &ftrace_func_hash[i];
@@ -2974,28 +2987,27 @@ static void __disable_ftrace_function_probe(void)
2974} 2987}
2975 2988
2976 2989
2977static void ftrace_free_entry_rcu(struct rcu_head *rhp) 2990static void ftrace_free_entry(struct ftrace_func_probe *entry)
2978{ 2991{
2979 struct ftrace_func_probe *entry =
2980 container_of(rhp, struct ftrace_func_probe, rcu);
2981
2982 if (entry->ops->free) 2992 if (entry->ops->free)
2983 entry->ops->free(&entry->data); 2993 entry->ops->free(entry->ops, entry->ip, &entry->data);
2984 kfree(entry); 2994 kfree(entry);
2985} 2995}
2986 2996
2987
2988int 2997int
2989register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 2998register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2990 void *data) 2999 void *data)
2991{ 3000{
2992 struct ftrace_func_probe *entry; 3001 struct ftrace_func_probe *entry;
3002 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
3003 struct ftrace_hash *hash;
2993 struct ftrace_page *pg; 3004 struct ftrace_page *pg;
2994 struct dyn_ftrace *rec; 3005 struct dyn_ftrace *rec;
2995 int type, len, not; 3006 int type, len, not;
2996 unsigned long key; 3007 unsigned long key;
2997 int count = 0; 3008 int count = 0;
2998 char *search; 3009 char *search;
3010 int ret;
2999 3011
3000 type = filter_parse_regex(glob, strlen(glob), &search, &not); 3012 type = filter_parse_regex(glob, strlen(glob), &search, &not);
3001 len = strlen(search); 3013 len = strlen(search);
@@ -3004,10 +3016,20 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3004 if (WARN_ON(not)) 3016 if (WARN_ON(not))
3005 return -EINVAL; 3017 return -EINVAL;
3006 3018
3007 mutex_lock(&ftrace_lock); 3019 mutex_lock(&trace_probe_ops.regex_lock);
3008 3020
3009 if (unlikely(ftrace_disabled)) 3021 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3010 goto out_unlock; 3022 if (!hash) {
3023 count = -ENOMEM;
3024 goto out;
3025 }
3026
3027 if (unlikely(ftrace_disabled)) {
3028 count = -ENODEV;
3029 goto out;
3030 }
3031
3032 mutex_lock(&ftrace_lock);
3011 3033
3012 do_for_each_ftrace_rec(pg, rec) { 3034 do_for_each_ftrace_rec(pg, rec) {
3013 3035
@@ -3031,14 +3053,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3031 * for each function we find. We call the callback 3053 * for each function we find. We call the callback
3032 * to give the caller an opportunity to do so. 3054 * to give the caller an opportunity to do so.
3033 */ 3055 */
3034 if (ops->callback) { 3056 if (ops->init) {
3035 if (ops->callback(rec->ip, &entry->data) < 0) { 3057 if (ops->init(ops, rec->ip, &entry->data) < 0) {
3036 /* caller does not like this func */ 3058 /* caller does not like this func */
3037 kfree(entry); 3059 kfree(entry);
3038 continue; 3060 continue;
3039 } 3061 }
3040 } 3062 }
3041 3063
3064 ret = enter_record(hash, rec, 0);
3065 if (ret < 0) {
3066 kfree(entry);
3067 count = ret;
3068 goto out_unlock;
3069 }
3070
3042 entry->ops = ops; 3071 entry->ops = ops;
3043 entry->ip = rec->ip; 3072 entry->ip = rec->ip;
3044 3073
@@ -3046,10 +3075,18 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3046 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); 3075 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
3047 3076
3048 } while_for_each_ftrace_rec(); 3077 } while_for_each_ftrace_rec();
3078
3079 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3080 if (ret < 0)
3081 count = ret;
3082
3049 __enable_ftrace_function_probe(); 3083 __enable_ftrace_function_probe();
3050 3084
3051 out_unlock: 3085 out_unlock:
3052 mutex_unlock(&ftrace_lock); 3086 mutex_unlock(&ftrace_lock);
3087 out:
3088 mutex_unlock(&trace_probe_ops.regex_lock);
3089 free_ftrace_hash(hash);
3053 3090
3054 return count; 3091 return count;
3055} 3092}
@@ -3063,7 +3100,12 @@ static void
3063__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3100__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3064 void *data, int flags) 3101 void *data, int flags)
3065{ 3102{
3103 struct ftrace_func_entry *rec_entry;
3066 struct ftrace_func_probe *entry; 3104 struct ftrace_func_probe *entry;
3105 struct ftrace_func_probe *p;
3106 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
3107 struct list_head free_list;
3108 struct ftrace_hash *hash;
3067 struct hlist_node *tmp; 3109 struct hlist_node *tmp;
3068 char str[KSYM_SYMBOL_LEN]; 3110 char str[KSYM_SYMBOL_LEN];
3069 int type = MATCH_FULL; 3111 int type = MATCH_FULL;
@@ -3083,7 +3125,15 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3083 return; 3125 return;
3084 } 3126 }
3085 3127
3086 mutex_lock(&ftrace_lock); 3128 mutex_lock(&trace_probe_ops.regex_lock);
3129
3130 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3131 if (!hash)
3132 /* Hmm, should report this somehow */
3133 goto out_unlock;
3134
3135 INIT_LIST_HEAD(&free_list);
3136
3087 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3137 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3088 struct hlist_head *hhd = &ftrace_func_hash[i]; 3138 struct hlist_head *hhd = &ftrace_func_hash[i];
3089 3139
@@ -3104,12 +3154,32 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3104 continue; 3154 continue;
3105 } 3155 }
3106 3156
3107 hlist_del(&entry->node); 3157 rec_entry = ftrace_lookup_ip(hash, entry->ip);
3108 call_rcu(&entry->rcu, ftrace_free_entry_rcu); 3158 /* It is possible more than one entry had this ip */
3159 if (rec_entry)
3160 free_hash_entry(hash, rec_entry);
3161
3162 hlist_del_rcu(&entry->node);
3163 list_add(&entry->free_list, &free_list);
3109 } 3164 }
3110 } 3165 }
3166 mutex_lock(&ftrace_lock);
3111 __disable_ftrace_function_probe(); 3167 __disable_ftrace_function_probe();
3168 /*
3169 * Remove after the disable is called. Otherwise, if the last
3170 * probe is removed, a null hash means *all enabled*.
3171 */
3172 ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3173 synchronize_sched();
3174 list_for_each_entry_safe(entry, p, &free_list, free_list) {
3175 list_del(&entry->free_list);
3176 ftrace_free_entry(entry);
3177 }
3112 mutex_unlock(&ftrace_lock); 3178 mutex_unlock(&ftrace_lock);
3179
3180 out_unlock:
3181 mutex_unlock(&trace_probe_ops.regex_lock);
3182 free_ftrace_hash(hash);
3113} 3183}
3114 3184
3115void 3185void
@@ -3218,18 +3288,17 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
3218 if (!cnt) 3288 if (!cnt)
3219 return 0; 3289 return 0;
3220 3290
3221 mutex_lock(&ftrace_regex_lock);
3222
3223 ret = -ENODEV;
3224 if (unlikely(ftrace_disabled))
3225 goto out_unlock;
3226
3227 if (file->f_mode & FMODE_READ) { 3291 if (file->f_mode & FMODE_READ) {
3228 struct seq_file *m = file->private_data; 3292 struct seq_file *m = file->private_data;
3229 iter = m->private; 3293 iter = m->private;
3230 } else 3294 } else
3231 iter = file->private_data; 3295 iter = file->private_data;
3232 3296
3297 if (unlikely(ftrace_disabled))
3298 return -ENODEV;
3299
3300 /* iter->hash is a local copy, so we don't need regex_lock */
3301
3233 parser = &iter->parser; 3302 parser = &iter->parser;
3234 read = trace_get_user(parser, ubuf, cnt, ppos); 3303 read = trace_get_user(parser, ubuf, cnt, ppos);
3235 3304
@@ -3238,14 +3307,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
3238 ret = ftrace_process_regex(iter->hash, parser->buffer, 3307 ret = ftrace_process_regex(iter->hash, parser->buffer,
3239 parser->idx, enable); 3308 parser->idx, enable);
3240 trace_parser_clear(parser); 3309 trace_parser_clear(parser);
3241 if (ret) 3310 if (ret < 0)
3242 goto out_unlock; 3311 goto out;
3243 } 3312 }
3244 3313
3245 ret = read; 3314 ret = read;
3246out_unlock: 3315 out:
3247 mutex_unlock(&ftrace_regex_lock);
3248
3249 return ret; 3316 return ret;
3250} 3317}
3251 3318
@@ -3297,16 +3364,19 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3297 if (unlikely(ftrace_disabled)) 3364 if (unlikely(ftrace_disabled))
3298 return -ENODEV; 3365 return -ENODEV;
3299 3366
3367 mutex_lock(&ops->regex_lock);
3368
3300 if (enable) 3369 if (enable)
3301 orig_hash = &ops->filter_hash; 3370 orig_hash = &ops->filter_hash;
3302 else 3371 else
3303 orig_hash = &ops->notrace_hash; 3372 orig_hash = &ops->notrace_hash;
3304 3373
3305 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 3374 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3306 if (!hash) 3375 if (!hash) {
3307 return -ENOMEM; 3376 ret = -ENOMEM;
3377 goto out_regex_unlock;
3378 }
3308 3379
3309 mutex_lock(&ftrace_regex_lock);
3310 if (reset) 3380 if (reset)
3311 ftrace_filter_reset(hash); 3381 ftrace_filter_reset(hash);
3312 if (buf && !ftrace_match_records(hash, buf, len)) { 3382 if (buf && !ftrace_match_records(hash, buf, len)) {
@@ -3328,7 +3398,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3328 mutex_unlock(&ftrace_lock); 3398 mutex_unlock(&ftrace_lock);
3329 3399
3330 out_regex_unlock: 3400 out_regex_unlock:
3331 mutex_unlock(&ftrace_regex_lock); 3401 mutex_unlock(&ops->regex_lock);
3332 3402
3333 free_ftrace_hash(hash); 3403 free_ftrace_hash(hash);
3334 return ret; 3404 return ret;
@@ -3354,6 +3424,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
3354int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, 3424int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
3355 int remove, int reset) 3425 int remove, int reset)
3356{ 3426{
3427 ftrace_ops_init(ops);
3357 return ftrace_set_addr(ops, ip, remove, reset, 1); 3428 return ftrace_set_addr(ops, ip, remove, reset, 1);
3358} 3429}
3359EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); 3430EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
@@ -3378,6 +3449,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3378int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, 3449int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
3379 int len, int reset) 3450 int len, int reset)
3380{ 3451{
3452 ftrace_ops_init(ops);
3381 return ftrace_set_regex(ops, buf, len, reset, 1); 3453 return ftrace_set_regex(ops, buf, len, reset, 1);
3382} 3454}
3383EXPORT_SYMBOL_GPL(ftrace_set_filter); 3455EXPORT_SYMBOL_GPL(ftrace_set_filter);
@@ -3396,6 +3468,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
3396int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, 3468int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3397 int len, int reset) 3469 int len, int reset)
3398{ 3470{
3471 ftrace_ops_init(ops);
3399 return ftrace_set_regex(ops, buf, len, reset, 0); 3472 return ftrace_set_regex(ops, buf, len, reset, 0);
3400} 3473}
3401EXPORT_SYMBOL_GPL(ftrace_set_notrace); 3474EXPORT_SYMBOL_GPL(ftrace_set_notrace);
@@ -3441,14 +3514,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
3441 3514
3442static int __init set_ftrace_notrace(char *str) 3515static int __init set_ftrace_notrace(char *str)
3443{ 3516{
3444 strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); 3517 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
3445 return 1; 3518 return 1;
3446} 3519}
3447__setup("ftrace_notrace=", set_ftrace_notrace); 3520__setup("ftrace_notrace=", set_ftrace_notrace);
3448 3521
3449static int __init set_ftrace_filter(char *str) 3522static int __init set_ftrace_filter(char *str)
3450{ 3523{
3451 strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); 3524 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
3452 return 1; 3525 return 1;
3453} 3526}
3454__setup("ftrace_filter=", set_ftrace_filter); 3527__setup("ftrace_filter=", set_ftrace_filter);
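The strncpy() to strlcpy() change here is about NUL termination: if the boot argument is at least FTRACE_FILTER_SIZE bytes long, strncpy() leaves the buffer unterminated, while strlcpy() always terminates and truncates. A tiny illustration with a made-up 8-byte buffer:

char buf[8];

/* strncpy: copies exactly 8 bytes here, so no terminating NUL is written. */
strncpy(buf, "function_graph", sizeof(buf));   /* buf = "function", unterminated */

/* strlcpy: copies at most 7 bytes and always appends a NUL. */
strlcpy(buf, "function_graph", sizeof(buf));   /* buf = "functio" + '\0' */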
@@ -3486,6 +3559,8 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3486{ 3559{
3487 char *func; 3560 char *func;
3488 3561
3562 ftrace_ops_init(ops);
3563
3489 while (buf) { 3564 while (buf) {
3490 func = strsep(&buf, ","); 3565 func = strsep(&buf, ",");
3491 ftrace_set_regex(ops, func, strlen(func), 0, enable); 3566 ftrace_set_regex(ops, func, strlen(func), 0, enable);
@@ -3513,10 +3588,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
3513 int filter_hash; 3588 int filter_hash;
3514 int ret; 3589 int ret;
3515 3590
3516 mutex_lock(&ftrace_regex_lock);
3517 if (file->f_mode & FMODE_READ) { 3591 if (file->f_mode & FMODE_READ) {
3518 iter = m->private; 3592 iter = m->private;
3519
3520 seq_release(inode, file); 3593 seq_release(inode, file);
3521 } else 3594 } else
3522 iter = file->private_data; 3595 iter = file->private_data;
@@ -3529,6 +3602,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
3529 3602
3530 trace_parser_put(parser); 3603 trace_parser_put(parser);
3531 3604
3605 mutex_lock(&iter->ops->regex_lock);
3606
3532 if (file->f_mode & FMODE_WRITE) { 3607 if (file->f_mode & FMODE_WRITE) {
3533 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); 3608 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3534 3609
@@ -3546,10 +3621,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
3546 3621
3547 mutex_unlock(&ftrace_lock); 3622 mutex_unlock(&ftrace_lock);
3548 } 3623 }
3624
3625 mutex_unlock(&iter->ops->regex_lock);
3549 free_ftrace_hash(iter->hash); 3626 free_ftrace_hash(iter->hash);
3550 kfree(iter); 3627 kfree(iter);
3551 3628
3552 mutex_unlock(&ftrace_regex_lock);
3553 return 0; 3629 return 0;
3554} 3630}
3555 3631
@@ -3571,7 +3647,7 @@ static const struct file_operations ftrace_filter_fops = {
3571 .open = ftrace_filter_open, 3647 .open = ftrace_filter_open,
3572 .read = seq_read, 3648 .read = seq_read,
3573 .write = ftrace_filter_write, 3649 .write = ftrace_filter_write,
3574 .llseek = ftrace_regex_lseek, 3650 .llseek = ftrace_filter_lseek,
3575 .release = ftrace_regex_release, 3651 .release = ftrace_regex_release,
3576}; 3652};
3577 3653
@@ -3579,7 +3655,7 @@ static const struct file_operations ftrace_notrace_fops = {
3579 .open = ftrace_notrace_open, 3655 .open = ftrace_notrace_open,
3580 .read = seq_read, 3656 .read = seq_read,
3581 .write = ftrace_notrace_write, 3657 .write = ftrace_notrace_write,
3582 .llseek = ftrace_regex_lseek, 3658 .llseek = ftrace_filter_lseek,
3583 .release = ftrace_regex_release, 3659 .release = ftrace_regex_release,
3584}; 3660};
3585 3661
@@ -3737,7 +3813,8 @@ out:
3737 if (fail) 3813 if (fail)
3738 return -EINVAL; 3814 return -EINVAL;
3739 3815
3740 ftrace_graph_filter_enabled = 1; 3816 ftrace_graph_filter_enabled = !!(*idx);
3817
3741 return 0; 3818 return 0;
3742} 3819}
3743 3820
@@ -3784,8 +3861,8 @@ static const struct file_operations ftrace_graph_fops = {
3784 .open = ftrace_graph_open, 3861 .open = ftrace_graph_open,
3785 .read = seq_read, 3862 .read = seq_read,
3786 .write = ftrace_graph_write, 3863 .write = ftrace_graph_write,
3864 .llseek = ftrace_filter_lseek,
3787 .release = ftrace_graph_release, 3865 .release = ftrace_graph_release,
3788 .llseek = seq_lseek,
3789}; 3866};
3790#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3867#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3791 3868
@@ -4087,7 +4164,8 @@ void __init ftrace_init(void)
4087 4164
4088static struct ftrace_ops global_ops = { 4165static struct ftrace_ops global_ops = {
4089 .func = ftrace_stub, 4166 .func = ftrace_stub,
4090 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 4167 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
4168 INIT_REGEX_LOCK(global_ops)
4091}; 4169};
4092 4170
4093static int __init ftrace_nodyn_init(void) 4171static int __init ftrace_nodyn_init(void)
@@ -4131,7 +4209,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4131 preempt_disable_notrace(); 4209 preempt_disable_notrace();
4132 trace_recursion_set(TRACE_CONTROL_BIT); 4210 trace_recursion_set(TRACE_CONTROL_BIT);
4133 do_for_each_ftrace_op(op, ftrace_control_list) { 4211 do_for_each_ftrace_op(op, ftrace_control_list) {
4134 if (!ftrace_function_local_disabled(op) && 4212 if (!(op->flags & FTRACE_OPS_FL_STUB) &&
4213 !ftrace_function_local_disabled(op) &&
4135 ftrace_ops_test(op, ip)) 4214 ftrace_ops_test(op, ip))
4136 op->func(ip, parent_ip, op, regs); 4215 op->func(ip, parent_ip, op, regs);
4137 } while_for_each_ftrace_op(op); 4216 } while_for_each_ftrace_op(op);
@@ -4140,8 +4219,9 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4140} 4219}
4141 4220
4142static struct ftrace_ops control_ops = { 4221static struct ftrace_ops control_ops = {
4143 .func = ftrace_ops_control_func, 4222 .func = ftrace_ops_control_func,
4144 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 4223 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
4224 INIT_REGEX_LOCK(control_ops)
4145}; 4225};
4146 4226
4147static inline void 4227static inline void
@@ -4439,7 +4519,7 @@ static const struct file_operations ftrace_pid_fops = {
4439 .open = ftrace_pid_open, 4519 .open = ftrace_pid_open,
4440 .write = ftrace_pid_write, 4520 .write = ftrace_pid_write,
4441 .read = seq_read, 4521 .read = seq_read,
4442 .llseek = seq_lseek, 4522 .llseek = ftrace_filter_lseek,
4443 .release = ftrace_pid_release, 4523 .release = ftrace_pid_release,
4444}; 4524};
4445 4525
@@ -4499,6 +4579,8 @@ int register_ftrace_function(struct ftrace_ops *ops)
4499{ 4579{
4500 int ret = -1; 4580 int ret = -1;
4501 4581
4582 ftrace_ops_init(ops);
4583
4502 mutex_lock(&ftrace_lock); 4584 mutex_lock(&ftrace_lock);
4503 4585
4504 ret = __register_ftrace_function(ops); 4586 ret = __register_ftrace_function(ops);
@@ -4555,12 +4637,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
4555 ftrace_startup_sysctl(); 4637 ftrace_startup_sysctl();
4556 4638
4557 /* we are starting ftrace again */ 4639 /* we are starting ftrace again */
4558 if (ftrace_ops_list != &ftrace_list_end) { 4640 if (ftrace_ops_list != &ftrace_list_end)
4559 if (ftrace_ops_list->next == &ftrace_list_end) 4641 update_ftrace_function();
4560 ftrace_trace_function = ftrace_ops_list->func;
4561 else
4562 ftrace_trace_function = ftrace_ops_list_func;
4563 }
4564 4642
4565 } else { 4643 } else {
4566 /* stopping ftrace calls (just send to ftrace_stub) */ 4644 /* stopping ftrace calls (just send to ftrace_stub) */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6989df2ba194..e444ff88f0a4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -8,13 +8,16 @@
8#include <linux/trace_clock.h> 8#include <linux/trace_clock.h>
9#include <linux/trace_seq.h> 9#include <linux/trace_seq.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/irq_work.h>
11#include <linux/debugfs.h> 12#include <linux/debugfs.h>
12#include <linux/uaccess.h> 13#include <linux/uaccess.h>
13#include <linux/hardirq.h> 14#include <linux/hardirq.h>
15#include <linux/kthread.h> /* for self test */
14#include <linux/kmemcheck.h> 16#include <linux/kmemcheck.h>
15#include <linux/module.h> 17#include <linux/module.h>
16#include <linux/percpu.h> 18#include <linux/percpu.h>
17#include <linux/mutex.h> 19#include <linux/mutex.h>
20#include <linux/delay.h>
18#include <linux/slab.h> 21#include <linux/slab.h>
19#include <linux/init.h> 22#include <linux/init.h>
20#include <linux/hash.h> 23#include <linux/hash.h>
@@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
444 return ret; 447 return ret;
445} 448}
446 449
450struct rb_irq_work {
451 struct irq_work work;
452 wait_queue_head_t waiters;
453 bool waiters_pending;
454};
455
447/* 456/*
448 * head_page == tail_page && head == tail then buffer is empty. 457 * head_page == tail_page && head == tail then buffer is empty.
449 */ 458 */
@@ -478,6 +487,8 @@ struct ring_buffer_per_cpu {
478 struct list_head new_pages; /* new pages to add */ 487 struct list_head new_pages; /* new pages to add */
479 struct work_struct update_pages_work; 488 struct work_struct update_pages_work;
480 struct completion update_done; 489 struct completion update_done;
490
491 struct rb_irq_work irq_work;
481}; 492};
482 493
483struct ring_buffer { 494struct ring_buffer {
@@ -497,6 +508,8 @@ struct ring_buffer {
497 struct notifier_block cpu_notify; 508 struct notifier_block cpu_notify;
498#endif 509#endif
499 u64 (*clock)(void); 510 u64 (*clock)(void);
511
512 struct rb_irq_work irq_work;
500}; 513};
501 514
502struct ring_buffer_iter { 515struct ring_buffer_iter {
@@ -508,6 +521,121 @@ struct ring_buffer_iter {
508 u64 read_stamp; 521 u64 read_stamp;
509}; 522};
510 523
524/*
525 * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
526 *
527 * irq_work callback that wakes up any task that is blocked on the
528 * ring buffer waiters queue.
529 */
530static void rb_wake_up_waiters(struct irq_work *work)
531{
532 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
533
534 wake_up_all(&rbwork->waiters);
535}
536
537/**
538 * ring_buffer_wait - wait for input to the ring buffer
539 * @buffer: buffer to wait on
540 * @cpu: the cpu buffer to wait on
541 *
542 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
543 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer.
545 */
546void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
547{
548 struct ring_buffer_per_cpu *cpu_buffer;
549 DEFINE_WAIT(wait);
550 struct rb_irq_work *work;
551
552 /*
553 * Depending on what the caller is waiting for, either any
554 * data in any cpu buffer, or a specific buffer, put the
555 * caller on the appropriate wait queue.
556 */
557 if (cpu == RING_BUFFER_ALL_CPUS)
558 work = &buffer->irq_work;
559 else {
560 cpu_buffer = buffer->buffers[cpu];
561 work = &cpu_buffer->irq_work;
562 }
563
564
565 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
566
567 /*
568 * The events can happen in critical sections where
569 * checking a work queue can cause deadlocks.
570 * After adding a task to the queue, this flag is set
571 * only to notify events to try to wake up the queue
572 * using irq_work.
573 *
574 * We don't clear it even if the buffer is no longer
575 * empty. The flag only causes the next event to run
576 * irq_work to do the work queue wake up. The worst
577 * that can happen if we race with !trace_empty() is that
578 * an event will cause an irq_work to try to wake up
579 * an empty queue.
580 *
581 * There's no reason to protect this flag either, as
582 * the work queue and irq_work logic will do the necessary
583 * synchronization for the wake ups. The only thing
584 * that is necessary is that the wake up happens after
585 * a task has been queued. It's OK for spurious wake ups.
586 */
587 work->waiters_pending = true;
588
589 if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
590 (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
591 schedule();
592
593 finish_wait(&work->waiters, &wait);
594}
595
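To make the intended calling pattern concrete, here is a hedged reader-side sketch: block in ring_buffer_wait() until a writer commits something, then drain events with ring_buffer_consume(). The process_event() helper is a placeholder, not an existing kernel function.

/* Illustrative reader for a single CPU's buffer. */
static void example_reader(struct ring_buffer *buffer, int cpu)
{
    struct ring_buffer_event *event;
    u64 ts;

    /* Sleeps until rb_wakeups() queues the irq_work that wakes us. */
    ring_buffer_wait(buffer, cpu);

    while ((event = ring_buffer_consume(buffer, cpu, &ts, NULL)) != NULL)
        process_event(ring_buffer_event_data(event),
                      ring_buffer_event_length(event));
}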
596/**
597 * ring_buffer_poll_wait - poll on buffer input
598 * @buffer: buffer to wait on
599 * @cpu: the cpu buffer to wait on
600 * @filp: the file descriptor
601 * @poll_table: The poll descriptor
602 *
603 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
604 * as data is added to any of the @buffer's cpu buffers. Otherwise
605 * it will wait for data to be added to a specific cpu buffer.
606 *
607 * Returns POLLIN | POLLRDNORM if data exists in the buffers,
608 * zero otherwise.
609 */
610int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
611 struct file *filp, poll_table *poll_table)
612{
613 struct ring_buffer_per_cpu *cpu_buffer;
614 struct rb_irq_work *work;
615
616 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
617 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
618 return POLLIN | POLLRDNORM;
619
620 if (cpu == RING_BUFFER_ALL_CPUS)
621 work = &buffer->irq_work;
622 else {
623 if (!cpumask_test_cpu(cpu, buffer->cpumask))
624 return -EINVAL;
625
626 cpu_buffer = buffer->buffers[cpu];
627 work = &cpu_buffer->irq_work;
628 }
629
630 work->waiters_pending = true;
631 poll_wait(filp, &work->waiters, poll_table);
632
633 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
634 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
635 return POLLIN | POLLRDNORM;
636 return 0;
637}
638
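The helper above is meant to be wired into a ->poll file operation; a hedged sketch follows, where struct example_poll_private and the way it reaches file->private_data are invented for illustration.

#include <linux/poll.h>

struct example_poll_private {
    struct ring_buffer *buffer;
    int                 cpu;    /* a specific CPU or RING_BUFFER_ALL_CPUS */
};

/* Illustrative ->poll handler: registers the caller on the buffer's wait
 * queue and reports POLLIN | POLLRDNORM when data is already available. */
static unsigned int example_poll(struct file *filp, poll_table *poll_table)
{
    struct example_poll_private *priv = filp->private_data;

    return ring_buffer_poll_wait(priv->buffer, priv->cpu, filp, poll_table);
}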
511/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 639/* buffer may be either ring_buffer or ring_buffer_per_cpu */
512#define RB_WARN_ON(b, cond) \ 640#define RB_WARN_ON(b, cond) \
513 ({ \ 641 ({ \
@@ -1063,6 +1191,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1063 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1191 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1064 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); 1192 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
1065 init_completion(&cpu_buffer->update_done); 1193 init_completion(&cpu_buffer->update_done);
1194 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
1195 init_waitqueue_head(&cpu_buffer->irq_work.waiters);
1066 1196
1067 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1197 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1068 GFP_KERNEL, cpu_to_node(cpu)); 1198 GFP_KERNEL, cpu_to_node(cpu));
@@ -1158,6 +1288,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1158 buffer->clock = trace_clock_local; 1288 buffer->clock = trace_clock_local;
1159 buffer->reader_lock_key = key; 1289 buffer->reader_lock_key = key;
1160 1290
1291 init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
1292 init_waitqueue_head(&buffer->irq_work.waiters);
1293
1161 /* need at least two pages */ 1294 /* need at least two pages */
1162 if (nr_pages < 2) 1295 if (nr_pages < 2)
1163 nr_pages = 2; 1296 nr_pages = 2;
@@ -1553,11 +1686,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1553 if (!cpu_buffer->nr_pages_to_update) 1686 if (!cpu_buffer->nr_pages_to_update)
1554 continue; 1687 continue;
1555 1688
1556 if (cpu_online(cpu)) 1689 /* The update must run on the CPU that is being updated. */
1690 preempt_disable();
1691 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
1692 rb_update_pages(cpu_buffer);
1693 cpu_buffer->nr_pages_to_update = 0;
1694 } else {
1695 /*
1696 * Can not disable preemption for schedule_work_on()
1697 * on PREEMPT_RT.
1698 */
1699 preempt_enable();
1557 schedule_work_on(cpu, 1700 schedule_work_on(cpu,
1558 &cpu_buffer->update_pages_work); 1701 &cpu_buffer->update_pages_work);
1559 else 1702 preempt_disable();
1560 rb_update_pages(cpu_buffer); 1703 }
1704 preempt_enable();
1561 } 1705 }
1562 1706
1563 /* wait for all the updates to complete */ 1707 /* wait for all the updates to complete */
@@ -1595,12 +1739,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1595 1739
1596 get_online_cpus(); 1740 get_online_cpus();
1597 1741
1598 if (cpu_online(cpu_id)) { 1742 preempt_disable();
1743 /* The update must run on the CPU that is being updated. */
1744 if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
1745 rb_update_pages(cpu_buffer);
1746 else {
1747 /*
1748 * Can not disable preemption for schedule_work_on()
1749 * on PREEMPT_RT.
1750 */
1751 preempt_enable();
1599 schedule_work_on(cpu_id, 1752 schedule_work_on(cpu_id,
1600 &cpu_buffer->update_pages_work); 1753 &cpu_buffer->update_pages_work);
1601 wait_for_completion(&cpu_buffer->update_done); 1754 wait_for_completion(&cpu_buffer->update_done);
1602 } else 1755 preempt_disable();
1603 rb_update_pages(cpu_buffer); 1756 }
1757 preempt_enable();
1604 1758
1605 cpu_buffer->nr_pages_to_update = 0; 1759 cpu_buffer->nr_pages_to_update = 0;
1606 put_online_cpus(); 1760 put_online_cpus();
@@ -2612,6 +2766,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2612 rb_end_commit(cpu_buffer); 2766 rb_end_commit(cpu_buffer);
2613} 2767}
2614 2768
2769static __always_inline void
2770rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2771{
2772 if (buffer->irq_work.waiters_pending) {
2773 buffer->irq_work.waiters_pending = false;
2774 /* irq_work_queue() supplies its own memory barriers */
2775 irq_work_queue(&buffer->irq_work.work);
2776 }
2777
2778 if (cpu_buffer->irq_work.waiters_pending) {
2779 cpu_buffer->irq_work.waiters_pending = false;
2780 /* irq_work_queue() supplies its own memory barriers */
2781 irq_work_queue(&cpu_buffer->irq_work.work);
2782 }
2783}
2784
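On the write side nothing changes for callers; a plain reserve/commit sequence now also wakes sleepers, because ring_buffer_unlock_commit() calls rb_wakeups() right after rb_commit(). A hedged sketch with a made-up u32 payload:

/* Illustrative writer: the commit below now also kicks pending waiters. */
static int example_write(struct ring_buffer *buffer, u32 value)
{
    struct ring_buffer_event *event;
    u32 *payload;

    event = ring_buffer_lock_reserve(buffer, sizeof(*payload));
    if (!event)
        return -EBUSY;          /* buffer full or recording disabled */

    payload = ring_buffer_event_data(event);
    *payload = value;

    /* Commits the event; rb_wakeups() then queues the irq_work that
     * wakes ring_buffer_wait() and poll sleepers, if any are pending. */
    return ring_buffer_unlock_commit(buffer, event);
}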
2615/** 2785/**
2616 * ring_buffer_unlock_commit - commit a reserved 2786 * ring_buffer_unlock_commit - commit a reserved
2617 * @buffer: The buffer to commit to 2787 * @buffer: The buffer to commit to
@@ -2631,6 +2801,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2631 2801
2632 rb_commit(cpu_buffer, event); 2802 rb_commit(cpu_buffer, event);
2633 2803
2804 rb_wakeups(buffer, cpu_buffer);
2805
2634 trace_recursive_unlock(); 2806 trace_recursive_unlock();
2635 2807
2636 preempt_enable_notrace(); 2808 preempt_enable_notrace();
@@ -2803,6 +2975,8 @@ int ring_buffer_write(struct ring_buffer *buffer,
2803 2975
2804 rb_commit(cpu_buffer, event); 2976 rb_commit(cpu_buffer, event);
2805 2977
2978 rb_wakeups(buffer, cpu_buffer);
2979
2806 ret = 0; 2980 ret = 0;
2807 out: 2981 out:
2808 preempt_enable_notrace(); 2982 preempt_enable_notrace();
@@ -4467,3 +4641,320 @@ static int rb_cpu_notify(struct notifier_block *self,
4467 return NOTIFY_OK; 4641 return NOTIFY_OK;
4468} 4642}
4469#endif 4643#endif
4644
4645#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
4646/*
4647 * This is a basic integrity check of the ring buffer.
4648 * Late in the boot cycle this test will run when configured in.
4649 * It will kick off a thread per CPU that will go into a loop
4650 * writing to the per cpu ring buffer various sizes of data.
4651 * Some of the data will be large items, some small.
4652 *
4653 * Another thread is created that goes into a spin, sending out
4654 * IPIs to the other CPUs to also write into the ring buffer.
4655 * This is to test the nesting ability of the buffer.
4656 *
4657 * Basic stats are recorded and reported. If something in the
4658 * ring buffer should happen that's not expected, a big warning
4659 * is displayed and all ring buffers are disabled.
4660 */
4661static struct task_struct *rb_threads[NR_CPUS] __initdata;
4662
4663struct rb_test_data {
4664 struct ring_buffer *buffer;
4665 unsigned long events;
4666 unsigned long bytes_written;
4667 unsigned long bytes_alloc;
4668 unsigned long bytes_dropped;
4669 unsigned long events_nested;
4670 unsigned long bytes_written_nested;
4671 unsigned long bytes_alloc_nested;
4672 unsigned long bytes_dropped_nested;
4673 int min_size_nested;
4674 int max_size_nested;
4675 int max_size;
4676 int min_size;
4677 int cpu;
4678 int cnt;
4679};
4680
4681static struct rb_test_data rb_data[NR_CPUS] __initdata;
4682
4683/* 1 meg per cpu */
4684#define RB_TEST_BUFFER_SIZE 1048576
4685
4686static char rb_string[] __initdata =
4687 "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
4688 "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
4689 "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
4690
4691static bool rb_test_started __initdata;
4692
4693struct rb_item {
4694 int size;
4695 char str[];
4696};
4697
4698static __init int rb_write_something(struct rb_test_data *data, bool nested)
4699{
4700 struct ring_buffer_event *event;
4701 struct rb_item *item;
4702 bool started;
4703 int event_len;
4704 int size;
4705 int len;
4706 int cnt;
4707
4708	/* Have nested writes differ from what is normally written */
4709 cnt = data->cnt + (nested ? 27 : 0);
4710
4711	/* Multiply cnt by ~e to make a reasonably unique increment */
4712	size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
4713
4714 len = size + sizeof(struct rb_item);
4715
4716 started = rb_test_started;
4717 /* read rb_test_started before checking buffer enabled */
4718 smp_rmb();
4719
4720 event = ring_buffer_lock_reserve(data->buffer, len);
4721 if (!event) {
4722 /* Ignore dropped events before test starts. */
4723 if (started) {
4724			if (nested)
4725				data->bytes_dropped_nested += len;
4726			else
4727				data->bytes_dropped += len;
4728 }
4729 return len;
4730 }
4731
4732 event_len = ring_buffer_event_length(event);
4733
4734 if (RB_WARN_ON(data->buffer, event_len < len))
4735 goto out;
4736
4737 item = ring_buffer_event_data(event);
4738 item->size = size;
4739 memcpy(item->str, rb_string, size);
4740
4741 if (nested) {
4742 data->bytes_alloc_nested += event_len;
4743 data->bytes_written_nested += len;
4744 data->events_nested++;
4745 if (!data->min_size_nested || len < data->min_size_nested)
4746 data->min_size_nested = len;
4747 if (len > data->max_size_nested)
4748 data->max_size_nested = len;
4749 } else {
4750 data->bytes_alloc += event_len;
4751 data->bytes_written += len;
4752 data->events++;
4753 if (!data->min_size || len < data->min_size)
4754			data->min_size = len;
4755 if (len > data->max_size)
4756 data->max_size = len;
4757 }
4758
4759 out:
4760 ring_buffer_unlock_commit(data->buffer, event);
4761
4762 return 0;
4763}
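rb_write_something() keeps two byte counts per record on purpose: len is what the test asked ring_buffer_lock_reserve() for, while event_len is what ring_buffer_event_length() reports after the buffer has added its own padding, and the test asserts event_len >= len. The report at the end of test_ringbuffer() relies on the reader recomputing both numbers. A hedged sketch of the bookkeeping (the concrete byte values are made up; the real padding is an internal ring buffer detail):

/*
 * Illustration only: the event_len value below is hypothetical; the
 * equalities in the final comment are the ones test_ringbuffer() checks.
 */
static void rb_accounting_example(struct rb_test_data *data,
				  struct ring_buffer_event *event,
				  int size)
{
	int len       = size + sizeof(struct rb_item);	  /* e.g. 10 + 4 = 14 */
	int event_len = ring_buffer_event_length(event);  /* e.g. 16 with padding */

	data->bytes_written += len;	   /* what the test meant to write */
	data->bytes_alloc   += event_len;  /* what the buffer really used  */

	/*
	 * On the read side, total_size sums item->size + sizeof(struct
	 * rb_item) and total_len sums ring_buffer_event_length(), so
	 * with no lost events total_size == bytes_written and
	 * total_len == bytes_alloc.
	 */
}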
4764
4765static __init int rb_test(void *arg)
4766{
4767 struct rb_test_data *data = arg;
4768
4769 while (!kthread_should_stop()) {
4770 rb_write_something(data, false);
4771 data->cnt++;
4772
4773 set_current_state(TASK_INTERRUPTIBLE);
4774		/* Sleep between 100-300us (min, varies with cnt) and 1ms (max) */
4775 usleep_range(((data->cnt % 3) + 1) * 100, 1000);
4776 }
4777
4778 return 0;
4779}
4780
4781static __init void rb_ipi(void *ignore)
4782{
4783 struct rb_test_data *data;
4784 int cpu = smp_processor_id();
4785
4786 data = &rb_data[cpu];
4787 rb_write_something(data, true);
4788}
4789
4790static __init int rb_hammer_test(void *arg)
4791{
4792 while (!kthread_should_stop()) {
4793
4794 /* Send an IPI to all cpus to write data! */
4795 smp_call_function(rb_ipi, NULL, 1);
4796		/* No sleep, but on non-preempt kernels, let others run */
4797 schedule();
4798 }
4799
4800 return 0;
4801}
4802
4803static __init int test_ringbuffer(void)
4804{
4805 struct task_struct *rb_hammer;
4806 struct ring_buffer *buffer;
4807 int cpu;
4808 int ret = 0;
4809
4810 pr_info("Running ring buffer tests...\n");
4811
4812 buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
4813 if (WARN_ON(!buffer))
4814 return 0;
4815
4816 /* Disable buffer so that threads can't write to it yet */
4817 ring_buffer_record_off(buffer);
4818
4819 for_each_online_cpu(cpu) {
4820 rb_data[cpu].buffer = buffer;
4821 rb_data[cpu].cpu = cpu;
4822 rb_data[cpu].cnt = cpu;
4823 rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
4824 "rbtester/%d", cpu);
4825 if (WARN_ON(!rb_threads[cpu])) {
4826 pr_cont("FAILED\n");
4827 ret = -1;
4828 goto out_free;
4829 }
4830
4831 kthread_bind(rb_threads[cpu], cpu);
4832 wake_up_process(rb_threads[cpu]);
4833 }
4834
4835 /* Now create the rb hammer! */
4836 rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
4837 if (WARN_ON(!rb_hammer)) {
4838 pr_cont("FAILED\n");
4839 ret = -1;
4840 goto out_free;
4841 }
4842
4843 ring_buffer_record_on(buffer);
4844 /*
4845	 * Show the buffer is enabled before setting rb_test_started.
4846	 * Yes, there's a small race window where events could be
4847	 * dropped and the thread won't catch it. But when a ring
4848 * buffer gets enabled, there will always be some kind of
4849 * delay before other CPUs see it. Thus, we don't care about
4850 * those dropped events. We care about events dropped after
4851 * the threads see that the buffer is active.
4852 */
4853 smp_wmb();
4854 rb_test_started = true;
4855
4856 set_current_state(TASK_INTERRUPTIBLE);
4857	/* Just run for 10 seconds */
4858 schedule_timeout(10 * HZ);
4859
4860 kthread_stop(rb_hammer);
4861
4862 out_free:
4863 for_each_online_cpu(cpu) {
4864 if (!rb_threads[cpu])
4865 break;
4866 kthread_stop(rb_threads[cpu]);
4867 }
4868 if (ret) {
4869 ring_buffer_free(buffer);
4870 return ret;
4871 }
4872
4873 /* Report! */
4874 pr_info("finished\n");
4875 for_each_online_cpu(cpu) {
4876 struct ring_buffer_event *event;
4877 struct rb_test_data *data = &rb_data[cpu];
4878 struct rb_item *item;
4879 unsigned long total_events;
4880 unsigned long total_dropped;
4881 unsigned long total_written;
4882 unsigned long total_alloc;
4883 unsigned long total_read = 0;
4884 unsigned long total_size = 0;
4885 unsigned long total_len = 0;
4886 unsigned long total_lost = 0;
4887 unsigned long lost;
4888 int big_event_size;
4889 int small_event_size;
4890
4891 ret = -1;
4892
4893 total_events = data->events + data->events_nested;
4894 total_written = data->bytes_written + data->bytes_written_nested;
4895 total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
4896 total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
4897
4898 big_event_size = data->max_size + data->max_size_nested;
4899 small_event_size = data->min_size + data->min_size_nested;
4900
4901 pr_info("CPU %d:\n", cpu);
4902 pr_info(" events: %ld\n", total_events);
4903 pr_info(" dropped bytes: %ld\n", total_dropped);
4904 pr_info(" alloced bytes: %ld\n", total_alloc);
4905 pr_info(" written bytes: %ld\n", total_written);
4906 pr_info(" biggest event: %d\n", big_event_size);
4907 pr_info(" smallest event: %d\n", small_event_size);
4908
4909 if (RB_WARN_ON(buffer, total_dropped))
4910 break;
4911
4912 ret = 0;
4913
4914 while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
4915 total_lost += lost;
4916 item = ring_buffer_event_data(event);
4917 total_len += ring_buffer_event_length(event);
4918 total_size += item->size + sizeof(struct rb_item);
4919 if (memcmp(&item->str[0], rb_string, item->size) != 0) {
4920 pr_info("FAILED!\n");
4921 pr_info("buffer had: %.*s\n", item->size, item->str);
4922 pr_info("expected: %.*s\n", item->size, rb_string);
4923 RB_WARN_ON(buffer, 1);
4924 ret = -1;
4925 break;
4926 }
4927 total_read++;
4928 }
4929 if (ret)
4930 break;
4931
4932 ret = -1;
4933
4934 pr_info(" read events: %ld\n", total_read);
4935 pr_info(" lost events: %ld\n", total_lost);
4936 pr_info(" total events: %ld\n", total_lost + total_read);
4937 pr_info(" recorded len bytes: %ld\n", total_len);
4938 pr_info(" recorded size bytes: %ld\n", total_size);
4939 if (total_lost)
4940 pr_info(" With dropped events, record len and size may not match\n"
4941 " alloced and written from above\n");
4942 if (!total_lost) {
4943 if (RB_WARN_ON(buffer, total_len != total_alloc ||
4944 total_size != total_written))
4945 break;
4946 }
4947 if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
4948 break;
4949
4950 ret = 0;
4951 }
4952 if (!ret)
4953 pr_info("Ring buffer PASSED!\n");
4954
4955 ring_buffer_free(buffer);
4956 return 0;
4957}
4958
4959late_initcall(test_ringbuffer);
4960#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
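One detail of the test worth spelling out is the barrier pairing: rb_write_something() reads rb_test_started and issues smp_rmb() before reacting to a failed reserve, while test_ringbuffer() issues the matching smp_wmb() between enabling the buffer and setting the flag. That pairing is what lets the test ignore drops that happen before the start flag is visibly set. A stripped-down sketch of the same publish/observe pattern (all names here are illustrative):

#include <linux/kernel.h>

/*
 * Publisher makes the "enabled" state visible before the "started"
 * flag; observer reads the flag before acting on the enabled state.
 */
static bool resource_enabled;	/* stands in for the ring buffer being on */
static bool test_started;	/* stands in for rb_test_started          */

static void publisher(void)
{
	resource_enabled = true;
	/* make resource_enabled visible before test_started */
	smp_wmb();
	test_started = true;
}

static void observer(void)
{
	bool started = test_started;

	/* read test_started before looking at resource_enabled */
	smp_rmb();

	/* if we saw started, we must also see the resource enabled */
	if (started && !resource_enabled)
		pr_warn("dropped after start: this one counts\n");
}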
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1f835a83cb2c..1a41023a1f88 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * ring buffer based function tracer 2 * ring buffer based function tracer
3 * 3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> 5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 * 6 *
7 * Originally taken from the RT patch by: 7 * Originally taken from the RT patch by:
@@ -19,7 +19,6 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
24#include <linux/pagemap.h> 23#include <linux/pagemap.h>
25#include <linux/hardirq.h> 24#include <linux/hardirq.h>
@@ -48,7 +47,7 @@
48 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
49 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
50 */ 49 */
51int ring_buffer_expanded; 50bool ring_buffer_expanded;
52 51
53/* 52/*
54 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
87static DEFINE_PER_CPU(bool, trace_cmdline_save); 86static DEFINE_PER_CPU(bool, trace_cmdline_save);
88 87
89/* 88/*
90 * When a reader is waiting for data, then this variable is
91 * set to true.
92 */
93static bool trace_wakeup_needed;
94
95static struct irq_work trace_work_wakeup;
96
97/*
98 * Kill all tracing for good (never come back). 89 * Kill all tracing for good (never come back).
99 * It is initialized to 1 but will turn to zero if the initialization 90 * It is initialized to 1 but will turn to zero if the initialization
100 * of the tracer is successful. But that is the only place that sets 91 * of the tracer is successful. But that is the only place that sets
@@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf);
130static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 121static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
131static char *default_bootup_tracer; 122static char *default_bootup_tracer;
132 123
124static bool allocate_snapshot;
125
133static int __init set_cmdline_ftrace(char *str) 126static int __init set_cmdline_ftrace(char *str)
134{ 127{
135 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 128 strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
136 default_bootup_tracer = bootup_tracer_buf; 129 default_bootup_tracer = bootup_tracer_buf;
137 /* We are using ftrace early, expand it */ 130 /* We are using ftrace early, expand it */
138 ring_buffer_expanded = 1; 131 ring_buffer_expanded = true;
139 return 1; 132 return 1;
140} 133}
141__setup("ftrace=", set_cmdline_ftrace); 134__setup("ftrace=", set_cmdline_ftrace);
@@ -156,13 +149,22 @@ static int __init set_ftrace_dump_on_oops(char *str)
156} 149}
157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 150__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
158 151
152static int __init boot_alloc_snapshot(char *str)
153{
154 allocate_snapshot = true;
155 /* We also need the main ring buffer expanded */
156 ring_buffer_expanded = true;
157 return 1;
158}
159__setup("alloc_snapshot", boot_alloc_snapshot);
160
159 161
160static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; 162static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
161static char *trace_boot_options __initdata; 163static char *trace_boot_options __initdata;
162 164
163static int __init set_trace_boot_options(char *str) 165static int __init set_trace_boot_options(char *str)
164{ 166{
165 strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); 167 strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
166 trace_boot_options = trace_boot_options_buf; 168 trace_boot_options = trace_boot_options_buf;
167 return 0; 169 return 0;
168} 170}
@@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec)
189 */ 191 */
190static struct trace_array global_trace; 192static struct trace_array global_trace;
191 193
192static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 194LIST_HEAD(ftrace_trace_arrays);
193 195
194int filter_current_check_discard(struct ring_buffer *buffer, 196int filter_current_check_discard(struct ring_buffer *buffer,
195 struct ftrace_event_call *call, void *rec, 197 struct ftrace_event_call *call, void *rec,
@@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu)
204 u64 ts; 206 u64 ts;
205 207
206 /* Early boot up does not have a buffer yet */ 208 /* Early boot up does not have a buffer yet */
207 if (!global_trace.buffer) 209 if (!global_trace.trace_buffer.buffer)
208 return trace_clock_local(); 210 return trace_clock_local();
209 211
210 ts = ring_buffer_time_stamp(global_trace.buffer, cpu); 212 ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu);
211 ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); 213 ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);
212 214
213 return ts; 215 return ts;
214} 216}
215 217
216/*
217 * The max_tr is used to snapshot the global_trace when a maximum
218 * latency is reached. Some tracers will use this to store a maximum
219 * trace while it continues examining live traces.
220 *
221 * The buffers for the max_tr are set up the same as the global_trace.
222 * When a snapshot is taken, the link list of the max_tr is swapped
223 * with the link list of the global_trace and the buffers are reset for
224 * the global_trace so the tracing can continue.
225 */
226static struct trace_array max_tr;
227
228static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
229
230int tracing_is_enabled(void) 218int tracing_is_enabled(void)
231{ 219{
232 return tracing_is_on(); 220 return tracing_is_on();
@@ -249,9 +237,6 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
249/* trace_types holds a link list of available tracers. */ 237/* trace_types holds a link list of available tracers. */
250static struct tracer *trace_types __read_mostly; 238static struct tracer *trace_types __read_mostly;
251 239
252/* current_trace points to the tracer that is currently active */
253static struct tracer *current_trace __read_mostly = &nop_trace;
254
255/* 240/*
256 * trace_types_lock is used to protect the trace_types list. 241 * trace_types_lock is used to protect the trace_types list.
257 */ 242 */
@@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
285 270
286static inline void trace_access_lock(int cpu) 271static inline void trace_access_lock(int cpu)
287{ 272{
288 if (cpu == TRACE_PIPE_ALL_CPU) { 273 if (cpu == RING_BUFFER_ALL_CPUS) {
289 /* gain it for accessing the whole ring buffer. */ 274 /* gain it for accessing the whole ring buffer. */
290 down_write(&all_cpu_access_lock); 275 down_write(&all_cpu_access_lock);
291 } else { 276 } else {
292 /* gain it for accessing a cpu ring buffer. */ 277 /* gain it for accessing a cpu ring buffer. */
293 278
294 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ 279 /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */
295 down_read(&all_cpu_access_lock); 280 down_read(&all_cpu_access_lock);
296 281
297 /* Secondly block other access to this @cpu ring buffer. */ 282 /* Secondly block other access to this @cpu ring buffer. */
@@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu)
301 286
302static inline void trace_access_unlock(int cpu) 287static inline void trace_access_unlock(int cpu)
303{ 288{
304 if (cpu == TRACE_PIPE_ALL_CPU) { 289 if (cpu == RING_BUFFER_ALL_CPUS) {
305 up_write(&all_cpu_access_lock); 290 up_write(&all_cpu_access_lock);
306 } else { 291 } else {
307 mutex_unlock(&per_cpu(cpu_access_lock, cpu)); 292 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
@@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void)
339 324
340#endif 325#endif
341 326
342/* trace_wait is a waitqueue for tasks blocked on trace_poll */
343static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344
345/* trace_flags holds trace_options default values */ 327/* trace_flags holds trace_options default values */
346unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 328unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
347 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 329 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
348 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 330 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
349 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; 331 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
350
351static int trace_stop_count;
352static DEFINE_RAW_SPINLOCK(tracing_start_lock);
353
354/**
355 * trace_wake_up - wake up tasks waiting for trace input
356 *
357 * Schedules a delayed work to wake up any task that is blocked on the
358 * trace_wait queue. These is used with trace_poll for tasks polling the
359 * trace.
360 */
361static void trace_wake_up(struct irq_work *work)
362{
363 wake_up_all(&trace_wait);
364
365}
366 332
367/** 333/**
368 * tracing_on - enable tracing buffers 334 * tracing_on - enable tracing buffers
@@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work)
372 */ 338 */
373void tracing_on(void) 339void tracing_on(void)
374{ 340{
375 if (global_trace.buffer) 341 if (global_trace.trace_buffer.buffer)
376 ring_buffer_record_on(global_trace.buffer); 342 ring_buffer_record_on(global_trace.trace_buffer.buffer);
377 /* 343 /*
378 * This flag is only looked at when buffers haven't been 344 * This flag is only looked at when buffers haven't been
379 * allocated yet. We don't really care about the race 345 * allocated yet. We don't really care about the race
@@ -385,6 +351,196 @@ void tracing_on(void)
385EXPORT_SYMBOL_GPL(tracing_on); 351EXPORT_SYMBOL_GPL(tracing_on);
386 352
387/** 353/**
354 * __trace_puts - write a constant string into the trace buffer.
355 * @ip: The address of the caller
356 * @str: The constant string to write
357 * @size: The size of the string.
358 */
359int __trace_puts(unsigned long ip, const char *str, int size)
360{
361 struct ring_buffer_event *event;
362 struct ring_buffer *buffer;
363 struct print_entry *entry;
364 unsigned long irq_flags;
365 int alloc;
366
367 alloc = sizeof(*entry) + size + 2; /* possible \n added */
368
369 local_save_flags(irq_flags);
370 buffer = global_trace.trace_buffer.buffer;
371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
372 irq_flags, preempt_count());
373 if (!event)
374 return 0;
375
376 entry = ring_buffer_event_data(event);
377 entry->ip = ip;
378
379 memcpy(&entry->buf, str, size);
380
381 /* Add a newline if necessary */
382 if (entry->buf[size - 1] != '\n') {
383 entry->buf[size] = '\n';
384 entry->buf[size + 1] = '\0';
385 } else
386 entry->buf[size] = '\0';
387
388 __buffer_unlock_commit(buffer, event);
389
390 return size;
391}
392EXPORT_SYMBOL_GPL(__trace_puts);
393
394/**
395 * __trace_bputs - write the pointer to a constant string into trace buffer
396 * @ip: The address of the caller
397 * @str: The constant string to write to the buffer to
398 */
399int __trace_bputs(unsigned long ip, const char *str)
400{
401 struct ring_buffer_event *event;
402 struct ring_buffer *buffer;
403 struct bputs_entry *entry;
404 unsigned long irq_flags;
405 int size = sizeof(struct bputs_entry);
406
407 local_save_flags(irq_flags);
408 buffer = global_trace.trace_buffer.buffer;
409 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
410 irq_flags, preempt_count());
411 if (!event)
412 return 0;
413
414 entry = ring_buffer_event_data(event);
415 entry->ip = ip;
416 entry->str = str;
417
418 __buffer_unlock_commit(buffer, event);
419
420 return 1;
421}
422EXPORT_SYMBOL_GPL(__trace_bputs);
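The two helpers differ in what they store: __trace_puts() copies the string bytes (plus a possible newline) into the ring buffer, while __trace_bputs() records only the string's address, which is why it is only safe for strings with static storage duration. A hedged sketch of a selector in the spirit of the trace_puts() macro this series adds elsewhere, assuming the __trace_puts()/__trace_bputs() declarations it introduces are visible (my_trace_puts is a made-up name, and the upstream macro additionally stashes the format pointer in a dedicated section, which is omitted here):

#include <linux/kernel.h>	/* _THIS_IP_ */
#include <linux/string.h>	/* strlen()  */

#define my_trace_puts(str)						\
({									\
	int __ret;							\
									\
	if (__builtin_constant_p(str))					\
		/* string literal: record just the pointer, no copy */	\
		__ret = __trace_bputs(_THIS_IP_, str);			\
	else								\
		/* runtime string: copy the bytes into the buffer */	\
		__ret = __trace_puts(_THIS_IP_, str, strlen(str));	\
	__ret;								\
})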
423
424#ifdef CONFIG_TRACER_SNAPSHOT
425/**
426 * tracing_snapshot - take a snapshot of the current buffer.
427 *
428 * This causes a swap between the snapshot buffer and the current live
429 * tracing buffer. You can use this to take snapshots of the live
430 * trace when some condition is triggered, but continue to trace.
431 *
432 * Note, make sure to allocate the snapshot beforehand, either
433 * with tracing_snapshot_alloc(), or by doing it manually
434 * with: echo 1 > /sys/kernel/debug/tracing/snapshot
435 *
436 * If the snapshot buffer is not allocated, it will stop tracing.
437 * Basically making a permanent snapshot.
438 */
439void tracing_snapshot(void)
440{
441 struct trace_array *tr = &global_trace;
442 struct tracer *tracer = tr->current_trace;
443 unsigned long flags;
444
445 if (in_nmi()) {
446 internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
447 internal_trace_puts("*** snapshot is being ignored ***\n");
448 return;
449 }
450
451 if (!tr->allocated_snapshot) {
452 internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
453 internal_trace_puts("*** stopping trace here! ***\n");
454 tracing_off();
455 return;
456 }
457
458 /* Note, snapshot can not be used when the tracer uses it */
459 if (tracer->use_max_tr) {
460 internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
461 internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
462 return;
463 }
464
465 local_irq_save(flags);
466 update_max_tr(tr, current, smp_processor_id());
467 local_irq_restore(flags);
468}
469EXPORT_SYMBOL_GPL(tracing_snapshot);
470
471static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
472 struct trace_buffer *size_buf, int cpu_id);
473static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
474
475static int alloc_snapshot(struct trace_array *tr)
476{
477 int ret;
478
479 if (!tr->allocated_snapshot) {
480
481 /* allocate spare buffer */
482 ret = resize_buffer_duplicate_size(&tr->max_buffer,
483 &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
484 if (ret < 0)
485 return ret;
486
487 tr->allocated_snapshot = true;
488 }
489
490 return 0;
491}
492
493void free_snapshot(struct trace_array *tr)
494{
495 /*
496	 * We don't free the ring buffer; instead, we resize it because
497	 * the max_tr ring buffer has some state (e.g. ring->clock) and
498	 * we want to preserve it.
499 */
500 ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
501 set_buffer_entries(&tr->max_buffer, 1);
502 tracing_reset_online_cpus(&tr->max_buffer);
503 tr->allocated_snapshot = false;
504}
505
506/**
507 * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
508 *
509 * This is similar to tracing_snapshot(), but it will allocate the
510 * snapshot buffer if it isn't already allocated. Use this only
511 * where it is safe to sleep, as the allocation may sleep.
512 *
513 * This causes a swap between the snapshot buffer and the current live
514 * tracing buffer. You can use this to take snapshots of the live
515 * trace when some condition is triggered, but continue to trace.
516 */
517void tracing_snapshot_alloc(void)
518{
519 struct trace_array *tr = &global_trace;
520 int ret;
521
522 ret = alloc_snapshot(tr);
523 if (WARN_ON(ret < 0))
524 return;
525
526 tracing_snapshot();
527}
528EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
529#else
530void tracing_snapshot(void)
531{
532 WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
533}
534EXPORT_SYMBOL_GPL(tracing_snapshot);
535void tracing_snapshot_alloc(void)
536{
537 /* Give warning */
538 tracing_snapshot();
539}
540EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
541#endif /* CONFIG_TRACER_SNAPSHOT */
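For reference, a caller of this API could look like the sketch below. Only tracing_snapshot() and tracing_snapshot_alloc() come from the code above; the surrounding driver, its names, and the error condition are hypothetical:

/* Hypothetical consumer of the snapshot API */
static bool snapshot_armed;

static int my_driver_probe(void)
{
	/*
	 * May sleep: allocates the spare (max) buffer if needed and
	 * then takes an initial snapshot, per the code above.
	 */
	tracing_snapshot_alloc();
	snapshot_armed = true;
	return 0;
}

static void my_driver_error_path(int status)
{
	/*
	 * Usable from atomic context (it only swaps buffers), but it
	 * refuses to run from NMI, and without an allocated spare
	 * buffer it turns tracing off instead of snapshotting.
	 */
	if (snapshot_armed && status < 0) {
		tracing_snapshot();
		snapshot_armed = false;
	}
}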
542
543/**
388 * tracing_off - turn off tracing buffers 544 * tracing_off - turn off tracing buffers
389 * 545 *
390 * This function stops the tracing buffers from recording data. 546 * This function stops the tracing buffers from recording data.
@@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on);
394 */ 550 */
395void tracing_off(void) 551void tracing_off(void)
396{ 552{
397 if (global_trace.buffer) 553 if (global_trace.trace_buffer.buffer)
398 ring_buffer_record_off(global_trace.buffer); 554 ring_buffer_record_off(global_trace.trace_buffer.buffer);
399 /* 555 /*
400 * This flag is only looked at when buffers haven't been 556 * This flag is only looked at when buffers haven't been
401 * allocated yet. We don't really care about the race 557 * allocated yet. We don't really care about the race
@@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off);
411 */ 567 */
412int tracing_is_on(void) 568int tracing_is_on(void)
413{ 569{
414 if (global_trace.buffer) 570 if (global_trace.trace_buffer.buffer)
415 return ring_buffer_record_is_on(global_trace.buffer); 571 return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
416 return !global_trace.buffer_disabled; 572 return !global_trace.buffer_disabled;
417} 573}
418EXPORT_SYMBOL_GPL(tracing_is_on); 574EXPORT_SYMBOL_GPL(tracing_is_on);
@@ -479,6 +635,7 @@ static const char *trace_options[] = {
479 "disable_on_free", 635 "disable_on_free",
480 "irq-info", 636 "irq-info",
481 "markers", 637 "markers",
638 "function-trace",
482 NULL 639 NULL
483}; 640};
484 641
@@ -490,6 +647,8 @@ static struct {
490 { trace_clock_local, "local", 1 }, 647 { trace_clock_local, "local", 1 },
491 { trace_clock_global, "global", 1 }, 648 { trace_clock_global, "global", 1 },
492 { trace_clock_counter, "counter", 0 }, 649 { trace_clock_counter, "counter", 0 },
650 { trace_clock_jiffies, "uptime", 1 },
651 { trace_clock, "perf", 1 },
493 ARCH_TRACE_CLOCKS 652 ARCH_TRACE_CLOCKS
494}; 653};
495 654
@@ -670,20 +829,29 @@ unsigned long __read_mostly tracing_max_latency;
670static void 829static void
671__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 830__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
672{ 831{
673 struct trace_array_cpu *data = tr->data[cpu]; 832 struct trace_buffer *trace_buf = &tr->trace_buffer;
674 struct trace_array_cpu *max_data; 833 struct trace_buffer *max_buf = &tr->max_buffer;
834 struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
835 struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
675 836
676 max_tr.cpu = cpu; 837 max_buf->cpu = cpu;
677 max_tr.time_start = data->preempt_timestamp; 838 max_buf->time_start = data->preempt_timestamp;
678 839
679 max_data = max_tr.data[cpu];
680 max_data->saved_latency = tracing_max_latency; 840 max_data->saved_latency = tracing_max_latency;
681 max_data->critical_start = data->critical_start; 841 max_data->critical_start = data->critical_start;
682 max_data->critical_end = data->critical_end; 842 max_data->critical_end = data->critical_end;
683 843
684 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); 844 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
685 max_data->pid = tsk->pid; 845 max_data->pid = tsk->pid;
686 max_data->uid = task_uid(tsk); 846 /*
847 * If tsk == current, then use current_uid(), as that does not use
848 * RCU. The irq tracer can be called out of RCU scope.
849 */
850 if (tsk == current)
851 max_data->uid = current_uid();
852 else
853 max_data->uid = task_uid(tsk);
854
687 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 855 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
688 max_data->policy = tsk->policy; 856 max_data->policy = tsk->policy;
689 max_data->rt_priority = tsk->rt_priority; 857 max_data->rt_priority = tsk->rt_priority;
@@ -704,23 +872,24 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
704void 872void
705update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 873update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
706{ 874{
707 struct ring_buffer *buf = tr->buffer; 875 struct ring_buffer *buf;
708 876
709 if (trace_stop_count) 877 if (tr->stop_count)
710 return; 878 return;
711 879
712 WARN_ON_ONCE(!irqs_disabled()); 880 WARN_ON_ONCE(!irqs_disabled());
713 881
714 if (!current_trace->allocated_snapshot) { 882 if (!tr->allocated_snapshot) {
715 /* Only the nop tracer should hit this when disabling */ 883 /* Only the nop tracer should hit this when disabling */
716 WARN_ON_ONCE(current_trace != &nop_trace); 884 WARN_ON_ONCE(tr->current_trace != &nop_trace);
717 return; 885 return;
718 } 886 }
719 887
720 arch_spin_lock(&ftrace_max_lock); 888 arch_spin_lock(&ftrace_max_lock);
721 889
722 tr->buffer = max_tr.buffer; 890 buf = tr->trace_buffer.buffer;
723 max_tr.buffer = buf; 891 tr->trace_buffer.buffer = tr->max_buffer.buffer;
892 tr->max_buffer.buffer = buf;
724 893
725 __update_max_tr(tr, tsk, cpu); 894 __update_max_tr(tr, tsk, cpu);
726 arch_spin_unlock(&ftrace_max_lock); 895 arch_spin_unlock(&ftrace_max_lock);
@@ -739,16 +908,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
739{ 908{
740 int ret; 909 int ret;
741 910
742 if (trace_stop_count) 911 if (tr->stop_count)
743 return; 912 return;
744 913
745 WARN_ON_ONCE(!irqs_disabled()); 914 WARN_ON_ONCE(!irqs_disabled());
746 if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) 915 if (!tr->allocated_snapshot) {
916 /* Only the nop tracer should hit this when disabling */
917 WARN_ON_ONCE(tr->current_trace != &nop_trace);
747 return; 918 return;
919 }
748 920
749 arch_spin_lock(&ftrace_max_lock); 921 arch_spin_lock(&ftrace_max_lock);
750 922
751 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 923 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
752 924
753 if (ret == -EBUSY) { 925 if (ret == -EBUSY) {
754 /* 926 /*
@@ -757,7 +929,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
757 * the max trace buffer (no one writes directly to it) 929 * the max trace buffer (no one writes directly to it)
758 * and flag that it failed. 930 * and flag that it failed.
759 */ 931 */
760 trace_array_printk(&max_tr, _THIS_IP_, 932 trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
761 "Failed to swap buffers due to commit in progress\n"); 933 "Failed to swap buffers due to commit in progress\n");
762 } 934 }
763 935
@@ -770,37 +942,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
770 942
771static void default_wait_pipe(struct trace_iterator *iter) 943static void default_wait_pipe(struct trace_iterator *iter)
772{ 944{
773 DEFINE_WAIT(wait); 945 /* Iterators are static, they should be filled or empty */
946 if (trace_buffer_iter(iter, iter->cpu_file))
947 return;
948
949 ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
950}
951
952#ifdef CONFIG_FTRACE_STARTUP_TEST
953static int run_tracer_selftest(struct tracer *type)
954{
955 struct trace_array *tr = &global_trace;
956 struct tracer *saved_tracer = tr->current_trace;
957 int ret;
774 958
775 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); 959 if (!type->selftest || tracing_selftest_disabled)
960 return 0;
776 961
777 /* 962 /*
778 * The events can happen in critical sections where 963 * Run a selftest on this tracer.
779 * checking a work queue can cause deadlocks. 964 * Here we reset the trace buffer, and set the current
780 * After adding a task to the queue, this flag is set 965 * tracer to be this tracer. The tracer can then run some
781 * only to notify events to try to wake up the queue 966 * internal tracing to verify that everything is in order.
782 * using irq_work. 967 * If we fail, we do not register this tracer.
783 *
784 * We don't clear it even if the buffer is no longer
785 * empty. The flag only causes the next event to run
786 * irq_work to do the work queue wake up. The worse
787 * that can happen if we race with !trace_empty() is that
788 * an event will cause an irq_work to try to wake up
789 * an empty queue.
790 *
791 * There's no reason to protect this flag either, as
792 * the work queue and irq_work logic will do the necessary
793 * synchronization for the wake ups. The only thing
794 * that is necessary is that the wake up happens after
795 * a task has been queued. It's OK for spurious wake ups.
796 */ 968 */
797 trace_wakeup_needed = true; 969 tracing_reset_online_cpus(&tr->trace_buffer);
798 970
799 if (trace_empty(iter)) 971 tr->current_trace = type;
800 schedule(); 972
973#ifdef CONFIG_TRACER_MAX_TRACE
974 if (type->use_max_tr) {
975 /* If we expanded the buffers, make sure the max is expanded too */
976 if (ring_buffer_expanded)
977 ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
978 RING_BUFFER_ALL_CPUS);
979 tr->allocated_snapshot = true;
980 }
981#endif
982
983 /* the test is responsible for initializing and enabling */
984 pr_info("Testing tracer %s: ", type->name);
985 ret = type->selftest(type, tr);
986 /* the test is responsible for resetting too */
987 tr->current_trace = saved_tracer;
988 if (ret) {
989 printk(KERN_CONT "FAILED!\n");
990 /* Add the warning after printing 'FAILED' */
991 WARN_ON(1);
992 return -1;
993 }
994 /* Only reset on passing, to avoid touching corrupted buffers */
995 tracing_reset_online_cpus(&tr->trace_buffer);
996
997#ifdef CONFIG_TRACER_MAX_TRACE
998 if (type->use_max_tr) {
999 tr->allocated_snapshot = false;
1000
1001 /* Shrink the max buffer again */
1002 if (ring_buffer_expanded)
1003 ring_buffer_resize(tr->max_buffer.buffer, 1,
1004 RING_BUFFER_ALL_CPUS);
1005 }
1006#endif
801 1007
802 finish_wait(&trace_wait, &wait); 1008 printk(KERN_CONT "PASSED\n");
1009 return 0;
803} 1010}
1011#else
1012static inline int run_tracer_selftest(struct tracer *type)
1013{
1014 return 0;
1015}
1016#endif /* CONFIG_FTRACE_STARTUP_TEST */
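run_tracer_selftest() is only half of the contract; the other half is a tracer supplying a ->selftest callback when CONFIG_FTRACE_STARTUP_TEST is set. A sketch of what such a tracer could look like, living in kernel/trace/ (the "sample" tracer and its callbacks are invented; the struct tracer fields and register_tracer() are the ones used in this file):

#include "trace.h"

static int sample_tracer_init(struct trace_array *tr)
{
	/* hook up whatever this tracer records */
	return 0;
}

static void sample_tracer_reset(struct trace_array *tr)
{
	/* undo sample_tracer_init() */
}

#ifdef CONFIG_FTRACE_STARTUP_TEST
static int sample_tracer_selftest(struct tracer *trace, struct trace_array *tr)
{
	/*
	 * Called with the trace buffer freshly reset and this tracer
	 * installed as tr->current_trace: generate a few events,
	 * verify them, and return non-zero to veto registration.
	 */
	return 0;
}
#endif

static struct tracer sample_tracer __read_mostly = {
	.name		= "sample",
	.init		= sample_tracer_init,
	.reset		= sample_tracer_reset,
#ifdef CONFIG_FTRACE_STARTUP_TEST
	.selftest	= sample_tracer_selftest,
#endif
};

static __init int init_sample_tracer(void)
{
	return register_tracer(&sample_tracer);
}
core_initcall(init_sample_tracer);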
804 1017
805/** 1018/**
806 * register_tracer - register a tracer with the ftrace system. 1019 * register_tracer - register a tracer with the ftrace system.
@@ -847,57 +1060,9 @@ int register_tracer(struct tracer *type)
847 if (!type->wait_pipe) 1060 if (!type->wait_pipe)
848 type->wait_pipe = default_wait_pipe; 1061 type->wait_pipe = default_wait_pipe;
849 1062
850 1063 ret = run_tracer_selftest(type);
851#ifdef CONFIG_FTRACE_STARTUP_TEST 1064 if (ret < 0)
852 if (type->selftest && !tracing_selftest_disabled) { 1065 goto out;
853 struct tracer *saved_tracer = current_trace;
854 struct trace_array *tr = &global_trace;
855
856 /*
857 * Run a selftest on this tracer.
858 * Here we reset the trace buffer, and set the current
859 * tracer to be this tracer. The tracer can then run some
860 * internal tracing to verify that everything is in order.
861 * If we fail, we do not register this tracer.
862 */
863 tracing_reset_online_cpus(tr);
864
865 current_trace = type;
866
867 if (type->use_max_tr) {
868 /* If we expanded the buffers, make sure the max is expanded too */
869 if (ring_buffer_expanded)
870 ring_buffer_resize(max_tr.buffer, trace_buf_size,
871 RING_BUFFER_ALL_CPUS);
872 type->allocated_snapshot = true;
873 }
874
875 /* the test is responsible for initializing and enabling */
876 pr_info("Testing tracer %s: ", type->name);
877 ret = type->selftest(type, tr);
878 /* the test is responsible for resetting too */
879 current_trace = saved_tracer;
880 if (ret) {
881 printk(KERN_CONT "FAILED!\n");
882 /* Add the warning after printing 'FAILED' */
883 WARN_ON(1);
884 goto out;
885 }
886 /* Only reset on passing, to avoid touching corrupted buffers */
887 tracing_reset_online_cpus(tr);
888
889 if (type->use_max_tr) {
890 type->allocated_snapshot = false;
891
892 /* Shrink the max buffer again */
893 if (ring_buffer_expanded)
894 ring_buffer_resize(max_tr.buffer, 1,
895 RING_BUFFER_ALL_CPUS);
896 }
897
898 printk(KERN_CONT "PASSED\n");
899 }
900#endif
901 1066
902 type->next = trace_types; 1067 type->next = trace_types;
903 trace_types = type; 1068 trace_types = type;
@@ -917,7 +1082,7 @@ int register_tracer(struct tracer *type)
917 tracing_set_tracer(type->name); 1082 tracing_set_tracer(type->name);
918 default_bootup_tracer = NULL; 1083 default_bootup_tracer = NULL;
919 /* disable other selftests, since this will break it. */ 1084 /* disable other selftests, since this will break it. */
920 tracing_selftest_disabled = 1; 1085 tracing_selftest_disabled = true;
921#ifdef CONFIG_FTRACE_STARTUP_TEST 1086#ifdef CONFIG_FTRACE_STARTUP_TEST
922 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", 1087 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
923 type->name); 1088 type->name);
@@ -927,9 +1092,9 @@ int register_tracer(struct tracer *type)
927 return ret; 1092 return ret;
928} 1093}
929 1094
930void tracing_reset(struct trace_array *tr, int cpu) 1095void tracing_reset(struct trace_buffer *buf, int cpu)
931{ 1096{
932 struct ring_buffer *buffer = tr->buffer; 1097 struct ring_buffer *buffer = buf->buffer;
933 1098
934 if (!buffer) 1099 if (!buffer)
935 return; 1100 return;
@@ -943,9 +1108,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
943 ring_buffer_record_enable(buffer); 1108 ring_buffer_record_enable(buffer);
944} 1109}
945 1110
946void tracing_reset_online_cpus(struct trace_array *tr) 1111void tracing_reset_online_cpus(struct trace_buffer *buf)
947{ 1112{
948 struct ring_buffer *buffer = tr->buffer; 1113 struct ring_buffer *buffer = buf->buffer;
949 int cpu; 1114 int cpu;
950 1115
951 if (!buffer) 1116 if (!buffer)
@@ -956,7 +1121,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
956 /* Make sure all commits have finished */ 1121 /* Make sure all commits have finished */
957 synchronize_sched(); 1122 synchronize_sched();
958 1123
959 tr->time_start = ftrace_now(tr->cpu); 1124 buf->time_start = ftrace_now(buf->cpu);
960 1125
961 for_each_online_cpu(cpu) 1126 for_each_online_cpu(cpu)
962 ring_buffer_reset_cpu(buffer, cpu); 1127 ring_buffer_reset_cpu(buffer, cpu);
@@ -966,12 +1131,21 @@ void tracing_reset_online_cpus(struct trace_array *tr)
966 1131
967void tracing_reset_current(int cpu) 1132void tracing_reset_current(int cpu)
968{ 1133{
969 tracing_reset(&global_trace, cpu); 1134 tracing_reset(&global_trace.trace_buffer, cpu);
970} 1135}
971 1136
972void tracing_reset_current_online_cpus(void) 1137void tracing_reset_all_online_cpus(void)
973{ 1138{
974 tracing_reset_online_cpus(&global_trace); 1139 struct trace_array *tr;
1140
1141 mutex_lock(&trace_types_lock);
1142 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
1143 tracing_reset_online_cpus(&tr->trace_buffer);
1144#ifdef CONFIG_TRACER_MAX_TRACE
1145 tracing_reset_online_cpus(&tr->max_buffer);
1146#endif
1147 }
1148 mutex_unlock(&trace_types_lock);
975} 1149}
976 1150
977#define SAVED_CMDLINES 128 1151#define SAVED_CMDLINES 128
@@ -994,7 +1168,7 @@ static void trace_init_cmdlines(void)
994 1168
995int is_tracing_stopped(void) 1169int is_tracing_stopped(void)
996{ 1170{
997 return trace_stop_count; 1171 return global_trace.stop_count;
998} 1172}
999 1173
1000/** 1174/**
@@ -1026,12 +1200,12 @@ void tracing_start(void)
1026 if (tracing_disabled) 1200 if (tracing_disabled)
1027 return; 1201 return;
1028 1202
1029 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1203 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1030 if (--trace_stop_count) { 1204 if (--global_trace.stop_count) {
1031 if (trace_stop_count < 0) { 1205 if (global_trace.stop_count < 0) {
1032 /* Someone screwed up their debugging */ 1206 /* Someone screwed up their debugging */
1033 WARN_ON_ONCE(1); 1207 WARN_ON_ONCE(1);
1034 trace_stop_count = 0; 1208 global_trace.stop_count = 0;
1035 } 1209 }
1036 goto out; 1210 goto out;
1037 } 1211 }
@@ -1039,19 +1213,52 @@ void tracing_start(void)
1039 /* Prevent the buffers from switching */ 1213 /* Prevent the buffers from switching */
1040 arch_spin_lock(&ftrace_max_lock); 1214 arch_spin_lock(&ftrace_max_lock);
1041 1215
1042 buffer = global_trace.buffer; 1216 buffer = global_trace.trace_buffer.buffer;
1043 if (buffer) 1217 if (buffer)
1044 ring_buffer_record_enable(buffer); 1218 ring_buffer_record_enable(buffer);
1045 1219
1046 buffer = max_tr.buffer; 1220#ifdef CONFIG_TRACER_MAX_TRACE
1221 buffer = global_trace.max_buffer.buffer;
1047 if (buffer) 1222 if (buffer)
1048 ring_buffer_record_enable(buffer); 1223 ring_buffer_record_enable(buffer);
1224#endif
1049 1225
1050 arch_spin_unlock(&ftrace_max_lock); 1226 arch_spin_unlock(&ftrace_max_lock);
1051 1227
1052 ftrace_start(); 1228 ftrace_start();
1053 out: 1229 out:
1054 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1230 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1231}
1232
1233static void tracing_start_tr(struct trace_array *tr)
1234{
1235 struct ring_buffer *buffer;
1236 unsigned long flags;
1237
1238 if (tracing_disabled)
1239 return;
1240
1241 /* If global, we need to also start the max tracer */
1242 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
1243 return tracing_start();
1244
1245 raw_spin_lock_irqsave(&tr->start_lock, flags);
1246
1247 if (--tr->stop_count) {
1248 if (tr->stop_count < 0) {
1249 /* Someone screwed up their debugging */
1250 WARN_ON_ONCE(1);
1251 tr->stop_count = 0;
1252 }
1253 goto out;
1254 }
1255
1256 buffer = tr->trace_buffer.buffer;
1257 if (buffer)
1258 ring_buffer_record_enable(buffer);
1259
1260 out:
1261 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1055} 1262}
1056 1263
1057/** 1264/**
@@ -1066,25 +1273,48 @@ void tracing_stop(void)
1066 unsigned long flags; 1273 unsigned long flags;
1067 1274
1068 ftrace_stop(); 1275 ftrace_stop();
1069 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1276 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1070 if (trace_stop_count++) 1277 if (global_trace.stop_count++)
1071 goto out; 1278 goto out;
1072 1279
1073 /* Prevent the buffers from switching */ 1280 /* Prevent the buffers from switching */
1074 arch_spin_lock(&ftrace_max_lock); 1281 arch_spin_lock(&ftrace_max_lock);
1075 1282
1076 buffer = global_trace.buffer; 1283 buffer = global_trace.trace_buffer.buffer;
1077 if (buffer) 1284 if (buffer)
1078 ring_buffer_record_disable(buffer); 1285 ring_buffer_record_disable(buffer);
1079 1286
1080 buffer = max_tr.buffer; 1287#ifdef CONFIG_TRACER_MAX_TRACE
1288 buffer = global_trace.max_buffer.buffer;
1081 if (buffer) 1289 if (buffer)
1082 ring_buffer_record_disable(buffer); 1290 ring_buffer_record_disable(buffer);
1291#endif
1083 1292
1084 arch_spin_unlock(&ftrace_max_lock); 1293 arch_spin_unlock(&ftrace_max_lock);
1085 1294
1086 out: 1295 out:
1087 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1296 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1297}
1298
1299static void tracing_stop_tr(struct trace_array *tr)
1300{
1301 struct ring_buffer *buffer;
1302 unsigned long flags;
1303
1304 /* If global, we need to also stop the max tracer */
1305 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
1306 return tracing_stop();
1307
1308 raw_spin_lock_irqsave(&tr->start_lock, flags);
1309 if (tr->stop_count++)
1310 goto out;
1311
1312 buffer = tr->trace_buffer.buffer;
1313 if (buffer)
1314 ring_buffer_record_disable(buffer);
1315
1316 out:
1317 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1088} 1318}
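Both the global and the per-array variants treat stop_count as a nesting counter: tracing_stop() only disables the buffers on the 0 -> 1 transition, and tracing_start() only re-enables them on the 1 -> 0 transition (warning if the count ever goes negative). A small hypothetical caller to make the pairing concrete:

/* Hypothetical caller inside kernel code that already sees these symbols */
static void inspect_trace_quiesced(void)
{
	tracing_stop();		/* stop_count 0 -> 1: recording disabled  */
	tracing_stop();		/* stop_count 1 -> 2: already disabled    */

	/* ... read the buffers without new events racing in ... */

	tracing_start();	/* stop_count 2 -> 1: still disabled       */
	tracing_start();	/* stop_count 1 -> 0: recording re-enabled */
}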
1089 1319
1090void trace_stop_cmdline_recording(void); 1320void trace_stop_cmdline_recording(void);
@@ -1217,11 +1447,6 @@ void
1217__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) 1447__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
1218{ 1448{
1219 __this_cpu_write(trace_cmdline_save, true); 1449 __this_cpu_write(trace_cmdline_save, true);
1220 if (trace_wakeup_needed) {
1221 trace_wakeup_needed = false;
1222 /* irq_work_queue() supplies it's own memory barriers */
1223 irq_work_queue(&trace_work_wakeup);
1224 }
1225 ring_buffer_unlock_commit(buffer, event); 1450 ring_buffer_unlock_commit(buffer, event);
1226} 1451}
1227 1452
@@ -1245,11 +1470,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1245EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); 1470EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1246 1471
1247struct ring_buffer_event * 1472struct ring_buffer_event *
1473trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
1474 struct ftrace_event_file *ftrace_file,
1475 int type, unsigned long len,
1476 unsigned long flags, int pc)
1477{
1478 *current_rb = ftrace_file->tr->trace_buffer.buffer;
1479 return trace_buffer_lock_reserve(*current_rb,
1480 type, len, flags, pc);
1481}
1482EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
1483
1484struct ring_buffer_event *
1248trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1485trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
1249 int type, unsigned long len, 1486 int type, unsigned long len,
1250 unsigned long flags, int pc) 1487 unsigned long flags, int pc)
1251{ 1488{
1252 *current_rb = global_trace.buffer; 1489 *current_rb = global_trace.trace_buffer.buffer;
1253 return trace_buffer_lock_reserve(*current_rb, 1490 return trace_buffer_lock_reserve(*current_rb,
1254 type, len, flags, pc); 1491 type, len, flags, pc);
1255} 1492}
@@ -1288,7 +1525,7 @@ trace_function(struct trace_array *tr,
1288 int pc) 1525 int pc)
1289{ 1526{
1290 struct ftrace_event_call *call = &event_function; 1527 struct ftrace_event_call *call = &event_function;
1291 struct ring_buffer *buffer = tr->buffer; 1528 struct ring_buffer *buffer = tr->trace_buffer.buffer;
1292 struct ring_buffer_event *event; 1529 struct ring_buffer_event *event;
1293 struct ftrace_entry *entry; 1530 struct ftrace_entry *entry;
1294 1531
@@ -1429,13 +1666,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1429void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1666void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1430 int pc) 1667 int pc)
1431{ 1668{
1432 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); 1669 __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);
1433} 1670}
1434 1671
1435/** 1672/**
1436 * trace_dump_stack - record a stack back trace in the trace buffer 1673 * trace_dump_stack - record a stack back trace in the trace buffer
1674 * @skip: Number of functions to skip (helper handlers)
1437 */ 1675 */
1438void trace_dump_stack(void) 1676void trace_dump_stack(int skip)
1439{ 1677{
1440 unsigned long flags; 1678 unsigned long flags;
1441 1679
@@ -1444,8 +1682,13 @@ void trace_dump_stack(void)
1444 1682
1445 local_save_flags(flags); 1683 local_save_flags(flags);
1446 1684
1447 /* skipping 3 traces, seems to get us at the caller of this function */ 1685 /*
1448 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); 1686 * Skip 3 more, seems to get us at the caller of
1687 * this function.
1688 */
1689 skip += 3;
1690 __ftrace_trace_stack(global_trace.trace_buffer.buffer,
1691 flags, skip, preempt_count(), NULL);
1449} 1692}
1450 1693
1451static DEFINE_PER_CPU(int, user_stack_count); 1694static DEFINE_PER_CPU(int, user_stack_count);
@@ -1615,7 +1858,7 @@ void trace_printk_init_buffers(void)
1615 * directly here. If the global_trace.buffer is already 1858 * directly here. If the global_trace.buffer is already
1616 * allocated here, then this was called by module code. 1859 * allocated here, then this was called by module code.
1617 */ 1860 */
1618 if (global_trace.buffer) 1861 if (global_trace.trace_buffer.buffer)
1619 tracing_start_cmdline_record(); 1862 tracing_start_cmdline_record();
1620} 1863}
1621 1864
@@ -1675,7 +1918,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1675 1918
1676 local_save_flags(flags); 1919 local_save_flags(flags);
1677 size = sizeof(*entry) + sizeof(u32) * len; 1920 size = sizeof(*entry) + sizeof(u32) * len;
1678 buffer = tr->buffer; 1921 buffer = tr->trace_buffer.buffer;
1679 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, 1922 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1680 flags, pc); 1923 flags, pc);
1681 if (!event) 1924 if (!event)
@@ -1698,27 +1941,12 @@ out:
1698} 1941}
1699EXPORT_SYMBOL_GPL(trace_vbprintk); 1942EXPORT_SYMBOL_GPL(trace_vbprintk);
1700 1943
1701int trace_array_printk(struct trace_array *tr, 1944static int
1702 unsigned long ip, const char *fmt, ...) 1945__trace_array_vprintk(struct ring_buffer *buffer,
1703{ 1946 unsigned long ip, const char *fmt, va_list args)
1704 int ret;
1705 va_list ap;
1706
1707 if (!(trace_flags & TRACE_ITER_PRINTK))
1708 return 0;
1709
1710 va_start(ap, fmt);
1711 ret = trace_array_vprintk(tr, ip, fmt, ap);
1712 va_end(ap);
1713 return ret;
1714}
1715
1716int trace_array_vprintk(struct trace_array *tr,
1717 unsigned long ip, const char *fmt, va_list args)
1718{ 1947{
1719 struct ftrace_event_call *call = &event_print; 1948 struct ftrace_event_call *call = &event_print;
1720 struct ring_buffer_event *event; 1949 struct ring_buffer_event *event;
1721 struct ring_buffer *buffer;
1722 int len = 0, size, pc; 1950 int len = 0, size, pc;
1723 struct print_entry *entry; 1951 struct print_entry *entry;
1724 unsigned long flags; 1952 unsigned long flags;
@@ -1746,7 +1974,6 @@ int trace_array_vprintk(struct trace_array *tr,
1746 1974
1747 local_save_flags(flags); 1975 local_save_flags(flags);
1748 size = sizeof(*entry) + len + 1; 1976 size = sizeof(*entry) + len + 1;
1749 buffer = tr->buffer;
1750 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1977 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1751 flags, pc); 1978 flags, pc);
1752 if (!event) 1979 if (!event)
@@ -1767,6 +1994,42 @@ int trace_array_vprintk(struct trace_array *tr,
1767 return len; 1994 return len;
1768} 1995}
1769 1996
1997int trace_array_vprintk(struct trace_array *tr,
1998 unsigned long ip, const char *fmt, va_list args)
1999{
2000 return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
2001}
2002
2003int trace_array_printk(struct trace_array *tr,
2004 unsigned long ip, const char *fmt, ...)
2005{
2006 int ret;
2007 va_list ap;
2008
2009 if (!(trace_flags & TRACE_ITER_PRINTK))
2010 return 0;
2011
2012 va_start(ap, fmt);
2013 ret = trace_array_vprintk(tr, ip, fmt, ap);
2014 va_end(ap);
2015 return ret;
2016}
2017
2018int trace_array_printk_buf(struct ring_buffer *buffer,
2019 unsigned long ip, const char *fmt, ...)
2020{
2021 int ret;
2022 va_list ap;
2023
2024 if (!(trace_flags & TRACE_ITER_PRINTK))
2025 return 0;
2026
2027 va_start(ap, fmt);
2028 ret = __trace_array_vprintk(buffer, ip, fmt, ap);
2029 va_end(ap);
2030 return ret;
2031}
2032
1770int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 2033int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1771{ 2034{
1772 return trace_array_vprintk(&global_trace, ip, fmt, args); 2035 return trace_array_vprintk(&global_trace, ip, fmt, args);
@@ -1792,7 +2055,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1792 if (buf_iter) 2055 if (buf_iter)
1793 event = ring_buffer_iter_peek(buf_iter, ts); 2056 event = ring_buffer_iter_peek(buf_iter, ts);
1794 else 2057 else
1795 event = ring_buffer_peek(iter->tr->buffer, cpu, ts, 2058 event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,
1796 lost_events); 2059 lost_events);
1797 2060
1798 if (event) { 2061 if (event) {
@@ -1807,7 +2070,7 @@ static struct trace_entry *
1807__find_next_entry(struct trace_iterator *iter, int *ent_cpu, 2070__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1808 unsigned long *missing_events, u64 *ent_ts) 2071 unsigned long *missing_events, u64 *ent_ts)
1809{ 2072{
1810 struct ring_buffer *buffer = iter->tr->buffer; 2073 struct ring_buffer *buffer = iter->trace_buffer->buffer;
1811 struct trace_entry *ent, *next = NULL; 2074 struct trace_entry *ent, *next = NULL;
1812 unsigned long lost_events = 0, next_lost = 0; 2075 unsigned long lost_events = 0, next_lost = 0;
1813 int cpu_file = iter->cpu_file; 2076 int cpu_file = iter->cpu_file;
@@ -1820,7 +2083,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1820 * If we are in a per_cpu trace file, don't bother by iterating over 2083 * If we are in a per_cpu trace file, don't bother by iterating over
1821 * all cpu and peek directly. 2084 * all cpu and peek directly.
1822 */ 2085 */
1823 if (cpu_file > TRACE_PIPE_ALL_CPU) { 2086 if (cpu_file > RING_BUFFER_ALL_CPUS) {
1824 if (ring_buffer_empty_cpu(buffer, cpu_file)) 2087 if (ring_buffer_empty_cpu(buffer, cpu_file))
1825 return NULL; 2088 return NULL;
1826 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); 2089 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
@@ -1884,7 +2147,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
1884 2147
1885static void trace_consume(struct trace_iterator *iter) 2148static void trace_consume(struct trace_iterator *iter)
1886{ 2149{
1887 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, 2150 ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,
1888 &iter->lost_events); 2151 &iter->lost_events);
1889} 2152}
1890 2153
@@ -1917,13 +2180,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1917 2180
1918void tracing_iter_reset(struct trace_iterator *iter, int cpu) 2181void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1919{ 2182{
1920 struct trace_array *tr = iter->tr;
1921 struct ring_buffer_event *event; 2183 struct ring_buffer_event *event;
1922 struct ring_buffer_iter *buf_iter; 2184 struct ring_buffer_iter *buf_iter;
1923 unsigned long entries = 0; 2185 unsigned long entries = 0;
1924 u64 ts; 2186 u64 ts;
1925 2187
1926 tr->data[cpu]->skipped_entries = 0; 2188 per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
1927 2189
1928 buf_iter = trace_buffer_iter(iter, cpu); 2190 buf_iter = trace_buffer_iter(iter, cpu);
1929 if (!buf_iter) 2191 if (!buf_iter)
@@ -1937,13 +2199,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1937 * by the timestamp being before the start of the buffer. 2199 * by the timestamp being before the start of the buffer.
1938 */ 2200 */
1939 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { 2201 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1940 if (ts >= iter->tr->time_start) 2202 if (ts >= iter->trace_buffer->time_start)
1941 break; 2203 break;
1942 entries++; 2204 entries++;
1943 ring_buffer_read(buf_iter, NULL); 2205 ring_buffer_read(buf_iter, NULL);
1944 } 2206 }
1945 2207
1946 tr->data[cpu]->skipped_entries = entries; 2208 per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
1947} 2209}
1948 2210
1949/* 2211/*
@@ -1953,6 +2215,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1953static void *s_start(struct seq_file *m, loff_t *pos) 2215static void *s_start(struct seq_file *m, loff_t *pos)
1954{ 2216{
1955 struct trace_iterator *iter = m->private; 2217 struct trace_iterator *iter = m->private;
2218 struct trace_array *tr = iter->tr;
1956 int cpu_file = iter->cpu_file; 2219 int cpu_file = iter->cpu_file;
1957 void *p = NULL; 2220 void *p = NULL;
1958 loff_t l = 0; 2221 loff_t l = 0;
@@ -1965,12 +2228,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1965 * will point to the same string as current_trace->name. 2228 * will point to the same string as current_trace->name.
1966 */ 2229 */
1967 mutex_lock(&trace_types_lock); 2230 mutex_lock(&trace_types_lock);
1968 if (unlikely(current_trace && iter->trace->name != current_trace->name)) 2231 if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
1969 *iter->trace = *current_trace; 2232 *iter->trace = *tr->current_trace;
1970 mutex_unlock(&trace_types_lock); 2233 mutex_unlock(&trace_types_lock);
1971 2234
2235#ifdef CONFIG_TRACER_MAX_TRACE
1972 if (iter->snapshot && iter->trace->use_max_tr) 2236 if (iter->snapshot && iter->trace->use_max_tr)
1973 return ERR_PTR(-EBUSY); 2237 return ERR_PTR(-EBUSY);
2238#endif
1974 2239
1975 if (!iter->snapshot) 2240 if (!iter->snapshot)
1976 atomic_inc(&trace_record_cmdline_disabled); 2241 atomic_inc(&trace_record_cmdline_disabled);
@@ -1980,7 +2245,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1980 iter->cpu = 0; 2245 iter->cpu = 0;
1981 iter->idx = -1; 2246 iter->idx = -1;
1982 2247
1983 if (cpu_file == TRACE_PIPE_ALL_CPU) { 2248 if (cpu_file == RING_BUFFER_ALL_CPUS) {
1984 for_each_tracing_cpu(cpu) 2249 for_each_tracing_cpu(cpu)
1985 tracing_iter_reset(iter, cpu); 2250 tracing_iter_reset(iter, cpu);
1986 } else 2251 } else
@@ -2012,17 +2277,21 @@ static void s_stop(struct seq_file *m, void *p)
2012{ 2277{
2013 struct trace_iterator *iter = m->private; 2278 struct trace_iterator *iter = m->private;
2014 2279
2280#ifdef CONFIG_TRACER_MAX_TRACE
2015 if (iter->snapshot && iter->trace->use_max_tr) 2281 if (iter->snapshot && iter->trace->use_max_tr)
2016 return; 2282 return;
2283#endif
2017 2284
2018 if (!iter->snapshot) 2285 if (!iter->snapshot)
2019 atomic_dec(&trace_record_cmdline_disabled); 2286 atomic_dec(&trace_record_cmdline_disabled);
2287
2020 trace_access_unlock(iter->cpu_file); 2288 trace_access_unlock(iter->cpu_file);
2021 trace_event_read_unlock(); 2289 trace_event_read_unlock();
2022} 2290}
2023 2291
2024static void 2292static void
2025get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) 2293get_total_entries(struct trace_buffer *buf,
2294 unsigned long *total, unsigned long *entries)
2026{ 2295{
2027 unsigned long count; 2296 unsigned long count;
2028 int cpu; 2297 int cpu;
@@ -2031,19 +2300,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e
2031 *entries = 0; 2300 *entries = 0;
2032 2301
2033 for_each_tracing_cpu(cpu) { 2302 for_each_tracing_cpu(cpu) {
2034 count = ring_buffer_entries_cpu(tr->buffer, cpu); 2303 count = ring_buffer_entries_cpu(buf->buffer, cpu);
2035 /* 2304 /*
2036 * If this buffer has skipped entries, then we hold all 2305 * If this buffer has skipped entries, then we hold all
2037 * entries for the trace and we need to ignore the 2306 * entries for the trace and we need to ignore the
2038 * ones before the time stamp. 2307 * ones before the time stamp.
2039 */ 2308 */
2040 if (tr->data[cpu]->skipped_entries) { 2309 if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
2041 count -= tr->data[cpu]->skipped_entries; 2310 count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
2042 /* total is the same as the entries */ 2311 /* total is the same as the entries */
2043 *total += count; 2312 *total += count;
2044 } else 2313 } else
2045 *total += count + 2314 *total += count +
2046 ring_buffer_overrun_cpu(tr->buffer, cpu); 2315 ring_buffer_overrun_cpu(buf->buffer, cpu);
2047 *entries += count; 2316 *entries += count;
2048 } 2317 }
2049} 2318}
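Note: get_total_entries() now takes the trace_buffer directly, but the accounting is unchanged and worth spelling out: *entries counts what is still sitting in each CPU buffer (minus anything marked skipped before the trace start), while *total additionally counts events lost to overwrite. The same per-CPU step restated on plain numbers (names are illustrative):

struct cpu_counts_sketch {
	unsigned long in_buffer;	/* ring_buffer_entries_cpu()  */
	unsigned long overrun;		/* ring_buffer_overrun_cpu()  */
	unsigned long skipped;		/* ->skipped_entries          */
};

static void account_one_cpu(const struct cpu_counts_sketch *c,
			    unsigned long *total, unsigned long *entries)
{
	unsigned long count = c->in_buffer;

	if (c->skipped) {
		/* the buffer holds the whole trace; drop pre-start entries */
		count -= c->skipped;
		*total += count;
	} else {
		*total += count + c->overrun;
	}
	*entries += count;
}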
@@ -2060,27 +2329,27 @@ static void print_lat_help_header(struct seq_file *m)
2060 seq_puts(m, "# \\ / ||||| \\ | / \n"); 2329 seq_puts(m, "# \\ / ||||| \\ | / \n");
2061} 2330}
2062 2331
2063static void print_event_info(struct trace_array *tr, struct seq_file *m) 2332static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
2064{ 2333{
2065 unsigned long total; 2334 unsigned long total;
2066 unsigned long entries; 2335 unsigned long entries;
2067 2336
2068 get_total_entries(tr, &total, &entries); 2337 get_total_entries(buf, &total, &entries);
2069 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", 2338 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
2070 entries, total, num_online_cpus()); 2339 entries, total, num_online_cpus());
2071 seq_puts(m, "#\n"); 2340 seq_puts(m, "#\n");
2072} 2341}
2073 2342
2074static void print_func_help_header(struct trace_array *tr, struct seq_file *m) 2343static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
2075{ 2344{
2076 print_event_info(tr, m); 2345 print_event_info(buf, m);
2077 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 2346 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
2078 seq_puts(m, "# | | | | |\n"); 2347 seq_puts(m, "# | | | | |\n");
2079} 2348}
2080 2349
2081static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) 2350static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
2082{ 2351{
2083 print_event_info(tr, m); 2352 print_event_info(buf, m);
2084 seq_puts(m, "# _-----=> irqs-off\n"); 2353 seq_puts(m, "# _-----=> irqs-off\n");
2085 seq_puts(m, "# / _----=> need-resched\n"); 2354 seq_puts(m, "# / _----=> need-resched\n");
2086 seq_puts(m, "# | / _---=> hardirq/softirq\n"); 2355 seq_puts(m, "# | / _---=> hardirq/softirq\n");
@@ -2094,16 +2363,16 @@ void
2094print_trace_header(struct seq_file *m, struct trace_iterator *iter) 2363print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2095{ 2364{
2096 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 2365 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
2097 struct trace_array *tr = iter->tr; 2366 struct trace_buffer *buf = iter->trace_buffer;
2098 struct trace_array_cpu *data = tr->data[tr->cpu]; 2367 struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
2099 struct tracer *type = current_trace; 2368 struct tracer *type = iter->trace;
2100 unsigned long entries; 2369 unsigned long entries;
2101 unsigned long total; 2370 unsigned long total;
2102 const char *name = "preemption"; 2371 const char *name = "preemption";
2103 2372
2104 name = type->name; 2373 name = type->name;
2105 2374
2106 get_total_entries(tr, &total, &entries); 2375 get_total_entries(buf, &total, &entries);
2107 2376
2108 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 2377 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
2109 name, UTS_RELEASE); 2378 name, UTS_RELEASE);
@@ -2114,7 +2383,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2114 nsecs_to_usecs(data->saved_latency), 2383 nsecs_to_usecs(data->saved_latency),
2115 entries, 2384 entries,
2116 total, 2385 total,
2117 tr->cpu, 2386 buf->cpu,
2118#if defined(CONFIG_PREEMPT_NONE) 2387#if defined(CONFIG_PREEMPT_NONE)
2119 "server", 2388 "server",
2120#elif defined(CONFIG_PREEMPT_VOLUNTARY) 2389#elif defined(CONFIG_PREEMPT_VOLUNTARY)
@@ -2165,7 +2434,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
2165 if (cpumask_test_cpu(iter->cpu, iter->started)) 2434 if (cpumask_test_cpu(iter->cpu, iter->started))
2166 return; 2435 return;
2167 2436
2168 if (iter->tr->data[iter->cpu]->skipped_entries) 2437 if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
2169 return; 2438 return;
2170 2439
2171 cpumask_set_cpu(iter->cpu, iter->started); 2440 cpumask_set_cpu(iter->cpu, iter->started);
@@ -2288,14 +2557,14 @@ int trace_empty(struct trace_iterator *iter)
2288 int cpu; 2557 int cpu;
2289 2558
2290 /* If we are looking at one CPU buffer, only check that one */ 2559 /* If we are looking at one CPU buffer, only check that one */
2291 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { 2560 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
2292 cpu = iter->cpu_file; 2561 cpu = iter->cpu_file;
2293 buf_iter = trace_buffer_iter(iter, cpu); 2562 buf_iter = trace_buffer_iter(iter, cpu);
2294 if (buf_iter) { 2563 if (buf_iter) {
2295 if (!ring_buffer_iter_empty(buf_iter)) 2564 if (!ring_buffer_iter_empty(buf_iter))
2296 return 0; 2565 return 0;
2297 } else { 2566 } else {
2298 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2567 if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
2299 return 0; 2568 return 0;
2300 } 2569 }
2301 return 1; 2570 return 1;
@@ -2307,7 +2576,7 @@ int trace_empty(struct trace_iterator *iter)
2307 if (!ring_buffer_iter_empty(buf_iter)) 2576 if (!ring_buffer_iter_empty(buf_iter))
2308 return 0; 2577 return 0;
2309 } else { 2578 } else {
2310 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2579 if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
2311 return 0; 2580 return 0;
2312 } 2581 }
2313 } 2582 }
@@ -2331,6 +2600,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2331 return ret; 2600 return ret;
2332 } 2601 }
2333 2602
2603 if (iter->ent->type == TRACE_BPUTS &&
2604 trace_flags & TRACE_ITER_PRINTK &&
2605 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
2606 return trace_print_bputs_msg_only(iter);
2607
2334 if (iter->ent->type == TRACE_BPRINT && 2608 if (iter->ent->type == TRACE_BPRINT &&
2335 trace_flags & TRACE_ITER_PRINTK && 2609 trace_flags & TRACE_ITER_PRINTK &&
2336 trace_flags & TRACE_ITER_PRINTK_MSGONLY) 2610 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
@@ -2385,9 +2659,9 @@ void trace_default_header(struct seq_file *m)
2385 } else { 2659 } else {
2386 if (!(trace_flags & TRACE_ITER_VERBOSE)) { 2660 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2387 if (trace_flags & TRACE_ITER_IRQ_INFO) 2661 if (trace_flags & TRACE_ITER_IRQ_INFO)
2388 print_func_help_header_irq(iter->tr, m); 2662 print_func_help_header_irq(iter->trace_buffer, m);
2389 else 2663 else
2390 print_func_help_header(iter->tr, m); 2664 print_func_help_header(iter->trace_buffer, m);
2391 } 2665 }
2392 } 2666 }
2393} 2667}
@@ -2401,14 +2675,8 @@ static void test_ftrace_alive(struct seq_file *m)
2401} 2675}
2402 2676
2403#ifdef CONFIG_TRACER_MAX_TRACE 2677#ifdef CONFIG_TRACER_MAX_TRACE
2404static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) 2678static void show_snapshot_main_help(struct seq_file *m)
2405{ 2679{
2406 if (iter->trace->allocated_snapshot)
2407 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
2408 else
2409 seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
2410
2411 seq_printf(m, "# Snapshot commands:\n");
2412 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); 2680 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
2413 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2681 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2414 seq_printf(m, "# Takes a snapshot of the main buffer.\n"); 2682 seq_printf(m, "# Takes a snapshot of the main buffer.\n");
@@ -2416,6 +2684,35 @@ static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2416	seq_printf(m, "# (Doesn't have to be '2', works with any number that\n"); 2684	seq_printf(m, "# (Doesn't have to be '2', works with any number that\n");

2417 seq_printf(m, "# is not a '0' or '1')\n"); 2685 seq_printf(m, "# is not a '0' or '1')\n");
2418} 2686}
2687
2688static void show_snapshot_percpu_help(struct seq_file *m)
2689{
2690 seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
2691#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2692 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2693 seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n");
2694#else
2695 seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
2696 seq_printf(m, "# Must use main snapshot file to allocate.\n");
2697#endif
2698 seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
2699	seq_printf(m, "# (Doesn't have to be '2', works with any number that\n");
2700 seq_printf(m, "# is not a '0' or '1')\n");
2701}
2702
2703static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2704{
2705 if (iter->tr->allocated_snapshot)
2706 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
2707 else
2708 seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
2709
2710 seq_printf(m, "# Snapshot commands:\n");
2711 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
2712 show_snapshot_main_help(m);
2713 else
2714 show_snapshot_percpu_help(m);
2715}
2419#else 2716#else
2420/* Should never be called */ 2717/* Should never be called */
2421static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } 2718static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
@@ -2475,7 +2772,8 @@ static const struct seq_operations tracer_seq_ops = {
2475static struct trace_iterator * 2772static struct trace_iterator *
2476__tracing_open(struct inode *inode, struct file *file, bool snapshot) 2773__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2477{ 2774{
2478 long cpu_file = (long) inode->i_private; 2775 struct trace_cpu *tc = inode->i_private;
2776 struct trace_array *tr = tc->tr;
2479 struct trace_iterator *iter; 2777 struct trace_iterator *iter;
2480 int cpu; 2778 int cpu;
2481 2779
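Note: from this hunk on, inode->i_private no longer carries a CPU number cast to a pointer; it points at a struct trace_cpu pairing the CPU id with its owning trace_array, so the same file operations serve both the top-level buffer and instances. Roughly, trimmed to the two fields this diff uses (the real definition in kernel/trace/trace.h may carry more, such as a dentry):

#include <linux/fs.h>	/* struct inode */

struct trace_cpu_sketch {
	struct trace_array *tr;	/* global_trace or an instance      */
	int cpu;		/* one CPU, or RING_BUFFER_ALL_CPUS */
};

/* Typical decode, as done at the top of the open handlers below. */
static void decode_i_private(struct inode *inode,
			     struct trace_array **tr, int *cpu)
{
	struct trace_cpu_sketch *tc = inode->i_private;

	*tr = tc->tr;
	*cpu = tc->cpu;
}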
@@ -2500,26 +2798,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2500 if (!iter->trace) 2798 if (!iter->trace)
2501 goto fail; 2799 goto fail;
2502 2800
2503 *iter->trace = *current_trace; 2801 *iter->trace = *tr->current_trace;
2504 2802
2505 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) 2803 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
2506 goto fail; 2804 goto fail;
2507 2805
2508 if (current_trace->print_max || snapshot) 2806 iter->tr = tr;
2509 iter->tr = &max_tr; 2807
2808#ifdef CONFIG_TRACER_MAX_TRACE
2809 /* Currently only the top directory has a snapshot */
2810 if (tr->current_trace->print_max || snapshot)
2811 iter->trace_buffer = &tr->max_buffer;
2510 else 2812 else
2511 iter->tr = &global_trace; 2813#endif
2814 iter->trace_buffer = &tr->trace_buffer;
2512 iter->snapshot = snapshot; 2815 iter->snapshot = snapshot;
2513 iter->pos = -1; 2816 iter->pos = -1;
2514 mutex_init(&iter->mutex); 2817 mutex_init(&iter->mutex);
2515 iter->cpu_file = cpu_file; 2818 iter->cpu_file = tc->cpu;
2516 2819
2517 /* Notify the tracer early; before we stop tracing. */ 2820 /* Notify the tracer early; before we stop tracing. */
2518 if (iter->trace && iter->trace->open) 2821 if (iter->trace && iter->trace->open)
2519 iter->trace->open(iter); 2822 iter->trace->open(iter);
2520 2823
2521 /* Annotate start of buffers if we had overruns */ 2824 /* Annotate start of buffers if we had overruns */
2522 if (ring_buffer_overruns(iter->tr->buffer)) 2825 if (ring_buffer_overruns(iter->trace_buffer->buffer))
2523 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2826 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2524 2827
2525 /* Output in nanoseconds only if we are using a clock in nanoseconds. */ 2828 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -2528,12 +2831,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2528 2831
2529 /* stop the trace while dumping if we are not opening "snapshot" */ 2832 /* stop the trace while dumping if we are not opening "snapshot" */
2530 if (!iter->snapshot) 2833 if (!iter->snapshot)
2531 tracing_stop(); 2834 tracing_stop_tr(tr);
2532 2835
2533 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2836 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
2534 for_each_tracing_cpu(cpu) { 2837 for_each_tracing_cpu(cpu) {
2535 iter->buffer_iter[cpu] = 2838 iter->buffer_iter[cpu] =
2536 ring_buffer_read_prepare(iter->tr->buffer, cpu); 2839 ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
2537 } 2840 }
2538 ring_buffer_read_prepare_sync(); 2841 ring_buffer_read_prepare_sync();
2539 for_each_tracing_cpu(cpu) { 2842 for_each_tracing_cpu(cpu) {
@@ -2543,12 +2846,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2543 } else { 2846 } else {
2544 cpu = iter->cpu_file; 2847 cpu = iter->cpu_file;
2545 iter->buffer_iter[cpu] = 2848 iter->buffer_iter[cpu] =
2546 ring_buffer_read_prepare(iter->tr->buffer, cpu); 2849 ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
2547 ring_buffer_read_prepare_sync(); 2850 ring_buffer_read_prepare_sync();
2548 ring_buffer_read_start(iter->buffer_iter[cpu]); 2851 ring_buffer_read_start(iter->buffer_iter[cpu]);
2549 tracing_iter_reset(iter, cpu); 2852 tracing_iter_reset(iter, cpu);
2550 } 2853 }
2551 2854
2855 tr->ref++;
2856
2552 mutex_unlock(&trace_types_lock); 2857 mutex_unlock(&trace_types_lock);
2553 2858
2554 return iter; 2859 return iter;
@@ -2575,14 +2880,20 @@ static int tracing_release(struct inode *inode, struct file *file)
2575{ 2880{
2576 struct seq_file *m = file->private_data; 2881 struct seq_file *m = file->private_data;
2577 struct trace_iterator *iter; 2882 struct trace_iterator *iter;
2883 struct trace_array *tr;
2578 int cpu; 2884 int cpu;
2579 2885
2580 if (!(file->f_mode & FMODE_READ)) 2886 if (!(file->f_mode & FMODE_READ))
2581 return 0; 2887 return 0;
2582 2888
2583 iter = m->private; 2889 iter = m->private;
2890 tr = iter->tr;
2584 2891
2585 mutex_lock(&trace_types_lock); 2892 mutex_lock(&trace_types_lock);
2893
2894 WARN_ON(!tr->ref);
2895 tr->ref--;
2896
2586 for_each_tracing_cpu(cpu) { 2897 for_each_tracing_cpu(cpu) {
2587 if (iter->buffer_iter[cpu]) 2898 if (iter->buffer_iter[cpu])
2588 ring_buffer_read_finish(iter->buffer_iter[cpu]); 2899 ring_buffer_read_finish(iter->buffer_iter[cpu]);
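Note: __tracing_open() now bumps tr->ref under trace_types_lock and tracing_release() drops it. The presumable intent is that a multi-buffer instance cannot be removed (rmdir under instances/) while one of its files is still open; a hedged sketch of the check an instance-removal path would make under that assumption (the helper below is hypothetical, the real check is outside this excerpt):

/* Hypothetical helper; name and placement are assumptions. */
static bool trace_array_busy(struct trace_array *tr)
{
	lockdep_assert_held(&trace_types_lock);
	return tr->ref != 0;	/* still open somewhere: refuse removal */
}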
@@ -2593,7 +2904,7 @@ static int tracing_release(struct inode *inode, struct file *file)
2593 2904
2594 if (!iter->snapshot) 2905 if (!iter->snapshot)
2595 /* reenable tracing if it was previously enabled */ 2906 /* reenable tracing if it was previously enabled */
2596 tracing_start(); 2907 tracing_start_tr(tr);
2597 mutex_unlock(&trace_types_lock); 2908 mutex_unlock(&trace_types_lock);
2598 2909
2599 mutex_destroy(&iter->mutex); 2910 mutex_destroy(&iter->mutex);
@@ -2612,12 +2923,13 @@ static int tracing_open(struct inode *inode, struct file *file)
2612 /* If this file was open for write, then erase contents */ 2923 /* If this file was open for write, then erase contents */
2613 if ((file->f_mode & FMODE_WRITE) && 2924 if ((file->f_mode & FMODE_WRITE) &&
2614 (file->f_flags & O_TRUNC)) { 2925 (file->f_flags & O_TRUNC)) {
2615 long cpu = (long) inode->i_private; 2926 struct trace_cpu *tc = inode->i_private;
2927 struct trace_array *tr = tc->tr;
2616 2928
2617 if (cpu == TRACE_PIPE_ALL_CPU) 2929 if (tc->cpu == RING_BUFFER_ALL_CPUS)
2618 tracing_reset_online_cpus(&global_trace); 2930 tracing_reset_online_cpus(&tr->trace_buffer);
2619 else 2931 else
2620 tracing_reset(&global_trace, cpu); 2932 tracing_reset(&tr->trace_buffer, tc->cpu);
2621 } 2933 }
2622 2934
2623 if (file->f_mode & FMODE_READ) { 2935 if (file->f_mode & FMODE_READ) {
@@ -2764,8 +3076,9 @@ static ssize_t
2764tracing_cpumask_write(struct file *filp, const char __user *ubuf, 3076tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2765 size_t count, loff_t *ppos) 3077 size_t count, loff_t *ppos)
2766{ 3078{
2767 int err, cpu; 3079 struct trace_array *tr = filp->private_data;
2768 cpumask_var_t tracing_cpumask_new; 3080 cpumask_var_t tracing_cpumask_new;
3081 int err, cpu;
2769 3082
2770 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 3083 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2771 return -ENOMEM; 3084 return -ENOMEM;
@@ -2785,13 +3098,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2785 */ 3098 */
2786 if (cpumask_test_cpu(cpu, tracing_cpumask) && 3099 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2787 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3100 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2788 atomic_inc(&global_trace.data[cpu]->disabled); 3101 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
2789 ring_buffer_record_disable_cpu(global_trace.buffer, cpu); 3102 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
2790 } 3103 }
2791 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 3104 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2792 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3105 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2793 atomic_dec(&global_trace.data[cpu]->disabled); 3106 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
2794 ring_buffer_record_enable_cpu(global_trace.buffer, cpu); 3107 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
2795 } 3108 }
2796 } 3109 }
2797 arch_spin_unlock(&ftrace_max_lock); 3110 arch_spin_unlock(&ftrace_max_lock);
@@ -2820,12 +3133,13 @@ static const struct file_operations tracing_cpumask_fops = {
2820static int tracing_trace_options_show(struct seq_file *m, void *v) 3133static int tracing_trace_options_show(struct seq_file *m, void *v)
2821{ 3134{
2822 struct tracer_opt *trace_opts; 3135 struct tracer_opt *trace_opts;
3136 struct trace_array *tr = m->private;
2823 u32 tracer_flags; 3137 u32 tracer_flags;
2824 int i; 3138 int i;
2825 3139
2826 mutex_lock(&trace_types_lock); 3140 mutex_lock(&trace_types_lock);
2827 tracer_flags = current_trace->flags->val; 3141 tracer_flags = tr->current_trace->flags->val;
2828 trace_opts = current_trace->flags->opts; 3142 trace_opts = tr->current_trace->flags->opts;
2829 3143
2830 for (i = 0; trace_options[i]; i++) { 3144 for (i = 0; trace_options[i]; i++) {
2831 if (trace_flags & (1 << i)) 3145 if (trace_flags & (1 << i))
@@ -2880,11 +3194,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2880 return -EINVAL; 3194 return -EINVAL;
2881} 3195}
2882 3196
2883static void set_tracer_flags(unsigned int mask, int enabled) 3197/* Some tracers require overwrite to stay enabled */
3198int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
3199{
3200 if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
3201 return -1;
3202
3203 return 0;
3204}
3205
3206int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
2884{ 3207{
2885 /* do nothing if flag is already set */ 3208 /* do nothing if flag is already set */
2886 if (!!(trace_flags & mask) == !!enabled) 3209 if (!!(trace_flags & mask) == !!enabled)
2887 return; 3210 return 0;
3211
3212 /* Give the tracer a chance to approve the change */
3213 if (tr->current_trace->flag_changed)
3214 if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled))
3215 return -EINVAL;
2888 3216
2889 if (enabled) 3217 if (enabled)
2890 trace_flags |= mask; 3218 trace_flags |= mask;
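Note: set_tracer_flag() can now fail because the current tracer gets a veto through the new ->flag_changed callback, and trace_keep_overwrite() is the stock helper for tracers that must keep ring-buffer overwrite enabled while they run. A hedged sketch of how a tracer could wire it up; the wiring itself is illustrative, only the helper is added by this patch:

/* Assumes kernel/trace/trace.h for struct tracer and trace_keep_overwrite(). */
static struct tracer example_latency_tracer_sketch __read_mostly = {
	.name		= "example-latency",
	.flag_changed	= trace_keep_overwrite,	/* veto clearing 'overwrite' while enabled */
	/* .init, .reset, .start, .stop, ... omitted */
};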
@@ -2894,18 +3222,24 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2894 if (mask == TRACE_ITER_RECORD_CMD) 3222 if (mask == TRACE_ITER_RECORD_CMD)
2895 trace_event_enable_cmd_record(enabled); 3223 trace_event_enable_cmd_record(enabled);
2896 3224
2897 if (mask == TRACE_ITER_OVERWRITE) 3225 if (mask == TRACE_ITER_OVERWRITE) {
2898 ring_buffer_change_overwrite(global_trace.buffer, enabled); 3226 ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
3227#ifdef CONFIG_TRACER_MAX_TRACE
3228 ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
3229#endif
3230 }
2899 3231
2900 if (mask == TRACE_ITER_PRINTK) 3232 if (mask == TRACE_ITER_PRINTK)
2901 trace_printk_start_stop_comm(enabled); 3233 trace_printk_start_stop_comm(enabled);
3234
3235 return 0;
2902} 3236}
2903 3237
2904static int trace_set_options(char *option) 3238static int trace_set_options(struct trace_array *tr, char *option)
2905{ 3239{
2906 char *cmp; 3240 char *cmp;
2907 int neg = 0; 3241 int neg = 0;
2908 int ret = 0; 3242 int ret = -ENODEV;
2909 int i; 3243 int i;
2910 3244
2911 cmp = strstrip(option); 3245 cmp = strstrip(option);
@@ -2915,19 +3249,20 @@ static int trace_set_options(char *option)
2915 cmp += 2; 3249 cmp += 2;
2916 } 3250 }
2917 3251
3252 mutex_lock(&trace_types_lock);
3253
2918 for (i = 0; trace_options[i]; i++) { 3254 for (i = 0; trace_options[i]; i++) {
2919 if (strcmp(cmp, trace_options[i]) == 0) { 3255 if (strcmp(cmp, trace_options[i]) == 0) {
2920 set_tracer_flags(1 << i, !neg); 3256 ret = set_tracer_flag(tr, 1 << i, !neg);
2921 break; 3257 break;
2922 } 3258 }
2923 } 3259 }
2924 3260
2925 /* If no option could be set, test the specific tracer options */ 3261 /* If no option could be set, test the specific tracer options */
2926 if (!trace_options[i]) { 3262 if (!trace_options[i])
2927 mutex_lock(&trace_types_lock); 3263 ret = set_tracer_option(tr->current_trace, cmp, neg);
2928 ret = set_tracer_option(current_trace, cmp, neg); 3264
2929 mutex_unlock(&trace_types_lock); 3265 mutex_unlock(&trace_types_lock);
2930 }
2931 3266
2932 return ret; 3267 return ret;
2933} 3268}
@@ -2936,7 +3271,10 @@ static ssize_t
2936tracing_trace_options_write(struct file *filp, const char __user *ubuf, 3271tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2937 size_t cnt, loff_t *ppos) 3272 size_t cnt, loff_t *ppos)
2938{ 3273{
3274 struct seq_file *m = filp->private_data;
3275 struct trace_array *tr = m->private;
2939 char buf[64]; 3276 char buf[64];
3277 int ret;
2940 3278
2941 if (cnt >= sizeof(buf)) 3279 if (cnt >= sizeof(buf))
2942 return -EINVAL; 3280 return -EINVAL;
@@ -2946,7 +3284,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2946 3284
2947 buf[cnt] = 0; 3285 buf[cnt] = 0;
2948 3286
2949 trace_set_options(buf); 3287 ret = trace_set_options(tr, buf);
3288 if (ret < 0)
3289 return ret;
2950 3290
2951 *ppos += cnt; 3291 *ppos += cnt;
2952 3292
@@ -2957,7 +3297,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)
2957{ 3297{
2958 if (tracing_disabled) 3298 if (tracing_disabled)
2959 return -ENODEV; 3299 return -ENODEV;
2960 return single_open(file, tracing_trace_options_show, NULL); 3300
3301 return single_open(file, tracing_trace_options_show, inode->i_private);
2961} 3302}
2962 3303
2963static const struct file_operations tracing_iter_fops = { 3304static const struct file_operations tracing_iter_fops = {
@@ -2970,20 +3311,84 @@ static const struct file_operations tracing_iter_fops = {
2970 3311
2971static const char readme_msg[] = 3312static const char readme_msg[] =
2972 "tracing mini-HOWTO:\n\n" 3313 "tracing mini-HOWTO:\n\n"
2973 "# mount -t debugfs nodev /sys/kernel/debug\n\n" 3314 "# echo 0 > tracing_on : quick way to disable tracing\n"
2974 "# cat /sys/kernel/debug/tracing/available_tracers\n" 3315 "# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
2975 "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" 3316 " Important files:\n"
2976 "# cat /sys/kernel/debug/tracing/current_tracer\n" 3317 " trace\t\t\t- The static contents of the buffer\n"
2977 "nop\n" 3318 "\t\t\t To clear the buffer write into this file: echo > trace\n"
2978 "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" 3319 " trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
2979 "# cat /sys/kernel/debug/tracing/current_tracer\n" 3320 " current_tracer\t- function and latency tracers\n"
2980 "wakeup\n" 3321 " available_tracers\t- list of configured tracers for current_tracer\n"
2981 "# cat /sys/kernel/debug/tracing/trace_options\n" 3322 " buffer_size_kb\t- view and modify size of per cpu buffer\n"
2982 "noprint-parent nosym-offset nosym-addr noverbose\n" 3323 " buffer_total_size_kb - view total size of all cpu buffers\n\n"
2983 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 3324 " trace_clock\t\t- change the clock used to order events\n"
2984 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" 3325 " local: Per cpu clock but may not be synced across CPUs\n"
2985 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 3326 " global: Synced across CPUs but slows tracing down.\n"
2986 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" 3327 " counter: Not a clock, but just an increment\n"
3328 " uptime: Jiffy counter from time of boot\n"
3329 " perf: Same clock that perf events use\n"
3330#ifdef CONFIG_X86_64
3331 " x86-tsc: TSC cycle counter\n"
3332#endif
3333 "\n trace_marker\t\t- Writing into this file writes into the kernel buffer\n"
3334 " tracing_cpumask\t- Limit which CPUs to trace\n"
3335 " instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
3336 "\t\t\t Remove sub-buffer with rmdir\n"
3337 " trace_options\t\t- Set format or modify how tracing happens\n"
3338 "\t\t\t Disable an option by prefixing the option name with 'no'\n"
3339#ifdef CONFIG_DYNAMIC_FTRACE
3340 "\n available_filter_functions - list of functions that can be filtered on\n"
3341 " set_ftrace_filter\t- echo function name in here to only trace these functions\n"
3342 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3343 " modules: Can select a group via module\n"
3344 " Format: :mod:<module-name>\n"
3345 " example: echo :mod:ext3 > set_ftrace_filter\n"
3346 " triggers: a command to perform when function is hit\n"
3347 " Format: <function>:<trigger>[:count]\n"
3348 " trigger: traceon, traceoff\n"
3349 " enable_event:<system>:<event>\n"
3350 " disable_event:<system>:<event>\n"
3351#ifdef CONFIG_STACKTRACE
3352 " stacktrace\n"
3353#endif
3354#ifdef CONFIG_TRACER_SNAPSHOT
3355 " snapshot\n"
3356#endif
3357 " example: echo do_fault:traceoff > set_ftrace_filter\n"
3358 " echo do_trap:traceoff:3 > set_ftrace_filter\n"
3359 " The first one will disable tracing every time do_fault is hit\n"
3360 " The second will disable tracing at most 3 times when do_trap is hit\n"
3361 " The first time do_trap is hit and it disables tracing, the counter\n"
3362 " will decrement to 2. If tracing is already disabled, the counter\n"
3363 " will not decrement. It only decrements when the trigger did work\n"
3364 " To remove trigger without count:\n"
3365 " echo '!<function>:<trigger>' > set_ftrace_filter\n"
3366 " To remove trigger with a count:\n"
3367 " echo '!<function>:<trigger>:0' > set_ftrace_filter\n"
3368 " set_ftrace_notrace\t- echo function name in here to never trace.\n"
3369 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3370 " modules: Can select a group via module command :mod:\n"
3371 " Does not accept triggers\n"
3372#endif /* CONFIG_DYNAMIC_FTRACE */
3373#ifdef CONFIG_FUNCTION_TRACER
3374 " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n"
3375#endif
3376#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3377 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
3378 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
3379#endif
3380#ifdef CONFIG_TRACER_SNAPSHOT
3381 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
3382 "\t\t\t Read the contents for more information\n"
3383#endif
3384#ifdef CONFIG_STACKTRACE
3385 " stack_trace\t\t- Shows the max stack trace when active\n"
3386 " stack_max_size\t- Shows current max stack size that was traced\n"
3387 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n"
3388#ifdef CONFIG_DYNAMIC_FTRACE
3389 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
3390#endif
3391#endif /* CONFIG_STACKTRACE */
2987; 3392;
2988 3393
2989static ssize_t 3394static ssize_t
@@ -3055,11 +3460,12 @@ static ssize_t
3055tracing_set_trace_read(struct file *filp, char __user *ubuf, 3460tracing_set_trace_read(struct file *filp, char __user *ubuf,
3056 size_t cnt, loff_t *ppos) 3461 size_t cnt, loff_t *ppos)
3057{ 3462{
3463 struct trace_array *tr = filp->private_data;
3058 char buf[MAX_TRACER_SIZE+2]; 3464 char buf[MAX_TRACER_SIZE+2];
3059 int r; 3465 int r;
3060 3466
3061 mutex_lock(&trace_types_lock); 3467 mutex_lock(&trace_types_lock);
3062 r = sprintf(buf, "%s\n", current_trace->name); 3468 r = sprintf(buf, "%s\n", tr->current_trace->name);
3063 mutex_unlock(&trace_types_lock); 3469 mutex_unlock(&trace_types_lock);
3064 3470
3065 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3471 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3067,43 +3473,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
3067 3473
3068int tracer_init(struct tracer *t, struct trace_array *tr) 3474int tracer_init(struct tracer *t, struct trace_array *tr)
3069{ 3475{
3070 tracing_reset_online_cpus(tr); 3476 tracing_reset_online_cpus(&tr->trace_buffer);
3071 return t->init(tr); 3477 return t->init(tr);
3072} 3478}
3073 3479
3074static void set_buffer_entries(struct trace_array *tr, unsigned long val) 3480static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
3075{ 3481{
3076 int cpu; 3482 int cpu;
3483
3077 for_each_tracing_cpu(cpu) 3484 for_each_tracing_cpu(cpu)
3078 tr->data[cpu]->entries = val; 3485 per_cpu_ptr(buf->data, cpu)->entries = val;
3079} 3486}
3080 3487
3488#ifdef CONFIG_TRACER_MAX_TRACE
3081/* resize @tr's buffer to the size of @size_tr's entries */ 3489/* resize @tr's buffer to the size of @size_tr's entries */
3082static int resize_buffer_duplicate_size(struct trace_array *tr, 3490static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
3083 struct trace_array *size_tr, int cpu_id) 3491 struct trace_buffer *size_buf, int cpu_id)
3084{ 3492{
3085 int cpu, ret = 0; 3493 int cpu, ret = 0;
3086 3494
3087 if (cpu_id == RING_BUFFER_ALL_CPUS) { 3495 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3088 for_each_tracing_cpu(cpu) { 3496 for_each_tracing_cpu(cpu) {
3089 ret = ring_buffer_resize(tr->buffer, 3497 ret = ring_buffer_resize(trace_buf->buffer,
3090 size_tr->data[cpu]->entries, cpu); 3498 per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
3091 if (ret < 0) 3499 if (ret < 0)
3092 break; 3500 break;
3093 tr->data[cpu]->entries = size_tr->data[cpu]->entries; 3501 per_cpu_ptr(trace_buf->data, cpu)->entries =
3502 per_cpu_ptr(size_buf->data, cpu)->entries;
3094 } 3503 }
3095 } else { 3504 } else {
3096 ret = ring_buffer_resize(tr->buffer, 3505 ret = ring_buffer_resize(trace_buf->buffer,
3097 size_tr->data[cpu_id]->entries, cpu_id); 3506 per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
3098 if (ret == 0) 3507 if (ret == 0)
3099 tr->data[cpu_id]->entries = 3508 per_cpu_ptr(trace_buf->data, cpu_id)->entries =
3100 size_tr->data[cpu_id]->entries; 3509 per_cpu_ptr(size_buf->data, cpu_id)->entries;
3101 } 3510 }
3102 3511
3103 return ret; 3512 return ret;
3104} 3513}
3514#endif /* CONFIG_TRACER_MAX_TRACE */
3105 3515
3106static int __tracing_resize_ring_buffer(unsigned long size, int cpu) 3516static int __tracing_resize_ring_buffer(struct trace_array *tr,
3517 unsigned long size, int cpu)
3107{ 3518{
3108 int ret; 3519 int ret;
3109 3520
@@ -3112,23 +3523,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3112 * we use the size that was given, and we can forget about 3523 * we use the size that was given, and we can forget about
3113 * expanding it later. 3524 * expanding it later.
3114 */ 3525 */
3115 ring_buffer_expanded = 1; 3526 ring_buffer_expanded = true;
3116 3527
3117 /* May be called before buffers are initialized */ 3528 /* May be called before buffers are initialized */
3118 if (!global_trace.buffer) 3529 if (!tr->trace_buffer.buffer)
3119 return 0; 3530 return 0;
3120 3531
3121 ret = ring_buffer_resize(global_trace.buffer, size, cpu); 3532 ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);
3122 if (ret < 0) 3533 if (ret < 0)
3123 return ret; 3534 return ret;
3124 3535
3125 if (!current_trace->use_max_tr) 3536#ifdef CONFIG_TRACER_MAX_TRACE
3537 if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
3538 !tr->current_trace->use_max_tr)
3126 goto out; 3539 goto out;
3127 3540
3128 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 3541 ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
3129 if (ret < 0) { 3542 if (ret < 0) {
3130 int r = resize_buffer_duplicate_size(&global_trace, 3543 int r = resize_buffer_duplicate_size(&tr->trace_buffer,
3131 &global_trace, cpu); 3544 &tr->trace_buffer, cpu);
3132 if (r < 0) { 3545 if (r < 0) {
3133 /* 3546 /*
3134 * AARGH! We are left with different 3547 * AARGH! We are left with different
@@ -3151,20 +3564,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3151 } 3564 }
3152 3565
3153 if (cpu == RING_BUFFER_ALL_CPUS) 3566 if (cpu == RING_BUFFER_ALL_CPUS)
3154 set_buffer_entries(&max_tr, size); 3567 set_buffer_entries(&tr->max_buffer, size);
3155 else 3568 else
3156 max_tr.data[cpu]->entries = size; 3569 per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
3157 3570
3158 out: 3571 out:
3572#endif /* CONFIG_TRACER_MAX_TRACE */
3573
3159 if (cpu == RING_BUFFER_ALL_CPUS) 3574 if (cpu == RING_BUFFER_ALL_CPUS)
3160 set_buffer_entries(&global_trace, size); 3575 set_buffer_entries(&tr->trace_buffer, size);
3161 else 3576 else
3162 global_trace.data[cpu]->entries = size; 3577 per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;
3163 3578
3164 return ret; 3579 return ret;
3165} 3580}
3166 3581
3167static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) 3582static ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
3583 unsigned long size, int cpu_id)
3168{ 3584{
3169 int ret = size; 3585 int ret = size;
3170 3586
@@ -3178,7 +3594,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
3178 } 3594 }
3179 } 3595 }
3180 3596
3181 ret = __tracing_resize_ring_buffer(size, cpu_id); 3597 ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
3182 if (ret < 0) 3598 if (ret < 0)
3183 ret = -ENOMEM; 3599 ret = -ENOMEM;
3184 3600
@@ -3205,7 +3621,7 @@ int tracing_update_buffers(void)
3205 3621
3206 mutex_lock(&trace_types_lock); 3622 mutex_lock(&trace_types_lock);
3207 if (!ring_buffer_expanded) 3623 if (!ring_buffer_expanded)
3208 ret = __tracing_resize_ring_buffer(trace_buf_size, 3624 ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
3209 RING_BUFFER_ALL_CPUS); 3625 RING_BUFFER_ALL_CPUS);
3210 mutex_unlock(&trace_types_lock); 3626 mutex_unlock(&trace_types_lock);
3211 3627
@@ -3215,7 +3631,7 @@ int tracing_update_buffers(void)
3215struct trace_option_dentry; 3631struct trace_option_dentry;
3216 3632
3217static struct trace_option_dentry * 3633static struct trace_option_dentry *
3218create_trace_option_files(struct tracer *tracer); 3634create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
3219 3635
3220static void 3636static void
3221destroy_trace_option_files(struct trace_option_dentry *topts); 3637destroy_trace_option_files(struct trace_option_dentry *topts);
@@ -3225,13 +3641,15 @@ static int tracing_set_tracer(const char *buf)
3225 static struct trace_option_dentry *topts; 3641 static struct trace_option_dentry *topts;
3226 struct trace_array *tr = &global_trace; 3642 struct trace_array *tr = &global_trace;
3227 struct tracer *t; 3643 struct tracer *t;
3644#ifdef CONFIG_TRACER_MAX_TRACE
3228 bool had_max_tr; 3645 bool had_max_tr;
3646#endif
3229 int ret = 0; 3647 int ret = 0;
3230 3648
3231 mutex_lock(&trace_types_lock); 3649 mutex_lock(&trace_types_lock);
3232 3650
3233 if (!ring_buffer_expanded) { 3651 if (!ring_buffer_expanded) {
3234 ret = __tracing_resize_ring_buffer(trace_buf_size, 3652 ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
3235 RING_BUFFER_ALL_CPUS); 3653 RING_BUFFER_ALL_CPUS);
3236 if (ret < 0) 3654 if (ret < 0)
3237 goto out; 3655 goto out;
@@ -3246,15 +3664,21 @@ static int tracing_set_tracer(const char *buf)
3246 ret = -EINVAL; 3664 ret = -EINVAL;
3247 goto out; 3665 goto out;
3248 } 3666 }
3249 if (t == current_trace) 3667 if (t == tr->current_trace)
3250 goto out; 3668 goto out;
3251 3669
3252 trace_branch_disable(); 3670 trace_branch_disable();
3253 if (current_trace->reset)
3254 current_trace->reset(tr);
3255 3671
3256 had_max_tr = current_trace->allocated_snapshot; 3672 tr->current_trace->enabled = false;
3257 current_trace = &nop_trace; 3673
3674 if (tr->current_trace->reset)
3675 tr->current_trace->reset(tr);
3676
3677 /* Current trace needs to be nop_trace before synchronize_sched */
3678 tr->current_trace = &nop_trace;
3679
3680#ifdef CONFIG_TRACER_MAX_TRACE
3681 had_max_tr = tr->allocated_snapshot;
3258 3682
3259 if (had_max_tr && !t->use_max_tr) { 3683 if (had_max_tr && !t->use_max_tr) {
3260 /* 3684 /*
@@ -3265,27 +3689,20 @@ static int tracing_set_tracer(const char *buf)
3265 * so a synchronized_sched() is sufficient. 3689 * so a synchronized_sched() is sufficient.
3266 */ 3690 */
3267 synchronize_sched(); 3691 synchronize_sched();
3268 /* 3692 free_snapshot(tr);
3269 * We don't free the ring buffer. instead, resize it because
3270 * The max_tr ring buffer has some state (e.g. ring->clock) and
3271 * we want preserve it.
3272 */
3273 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3274 set_buffer_entries(&max_tr, 1);
3275 tracing_reset_online_cpus(&max_tr);
3276 current_trace->allocated_snapshot = false;
3277 } 3693 }
3694#endif
3278 destroy_trace_option_files(topts); 3695 destroy_trace_option_files(topts);
3279 3696
3280 topts = create_trace_option_files(t); 3697 topts = create_trace_option_files(tr, t);
3698
3699#ifdef CONFIG_TRACER_MAX_TRACE
3281 if (t->use_max_tr && !had_max_tr) { 3700 if (t->use_max_tr && !had_max_tr) {
3282 /* we need to make per cpu buffer sizes equivalent */ 3701 ret = alloc_snapshot(tr);
3283 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3284 RING_BUFFER_ALL_CPUS);
3285 if (ret < 0) 3702 if (ret < 0)
3286 goto out; 3703 goto out;
3287 t->allocated_snapshot = true;
3288 } 3704 }
3705#endif
3289 3706
3290 if (t->init) { 3707 if (t->init) {
3291 ret = tracer_init(t, tr); 3708 ret = tracer_init(t, tr);
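Note: the open-coded shrink-the-max-buffer sequence removed above is folded into free_snapshot(), and the buffer-size duplication into alloc_snapshot(); both are defined earlier in the patch, outside this excerpt. Judging from the removed lines, free_snapshot() is expected to amount to roughly the following (the buffer is shrunk rather than freed so ring-buffer state such as its clock is preserved):

/* Sketch inferred from the code it replaces; not the literal definition. */
static void free_snapshot_sketch(struct trace_array *tr)
{
	ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
	set_buffer_entries(&tr->max_buffer, 1);
	tracing_reset_online_cpus(&tr->max_buffer);
	tr->allocated_snapshot = false;
}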
@@ -3293,7 +3710,8 @@ static int tracing_set_tracer(const char *buf)
3293 goto out; 3710 goto out;
3294 } 3711 }
3295 3712
3296 current_trace = t; 3713 tr->current_trace = t;
3714 tr->current_trace->enabled = true;
3297 trace_branch_enable(tr); 3715 trace_branch_enable(tr);
3298 out: 3716 out:
3299 mutex_unlock(&trace_types_lock); 3717 mutex_unlock(&trace_types_lock);
@@ -3367,7 +3785,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3367 3785
3368static int tracing_open_pipe(struct inode *inode, struct file *filp) 3786static int tracing_open_pipe(struct inode *inode, struct file *filp)
3369{ 3787{
3370 long cpu_file = (long) inode->i_private; 3788 struct trace_cpu *tc = inode->i_private;
3789 struct trace_array *tr = tc->tr;
3371 struct trace_iterator *iter; 3790 struct trace_iterator *iter;
3372 int ret = 0; 3791 int ret = 0;
3373 3792
@@ -3392,7 +3811,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3392 ret = -ENOMEM; 3811 ret = -ENOMEM;
3393 goto fail; 3812 goto fail;
3394 } 3813 }
3395 *iter->trace = *current_trace; 3814 *iter->trace = *tr->current_trace;
3396 3815
3397 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 3816 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3398 ret = -ENOMEM; 3817 ret = -ENOMEM;
@@ -3409,8 +3828,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3409 if (trace_clocks[trace_clock_id].in_ns) 3828 if (trace_clocks[trace_clock_id].in_ns)
3410 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 3829 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3411 3830
3412 iter->cpu_file = cpu_file; 3831 iter->cpu_file = tc->cpu;
3413 iter->tr = &global_trace; 3832 iter->tr = tc->tr;
3833 iter->trace_buffer = &tc->tr->trace_buffer;
3414 mutex_init(&iter->mutex); 3834 mutex_init(&iter->mutex);
3415 filp->private_data = iter; 3835 filp->private_data = iter;
3416 3836
@@ -3449,24 +3869,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3449} 3869}
3450 3870
3451static unsigned int 3871static unsigned int
3452tracing_poll_pipe(struct file *filp, poll_table *poll_table) 3872trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
3453{ 3873{
3454 struct trace_iterator *iter = filp->private_data; 3874 /* Iterators are static, they should be filled or empty */
3875 if (trace_buffer_iter(iter, iter->cpu_file))
3876 return POLLIN | POLLRDNORM;
3455 3877
3456 if (trace_flags & TRACE_ITER_BLOCK) { 3878 if (trace_flags & TRACE_ITER_BLOCK)
3457 /* 3879 /*
3458 * Always select as readable when in blocking mode 3880 * Always select as readable when in blocking mode
3459 */ 3881 */
3460 return POLLIN | POLLRDNORM; 3882 return POLLIN | POLLRDNORM;
3461 } else { 3883 else
3462 if (!trace_empty(iter)) 3884 return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
3463 return POLLIN | POLLRDNORM; 3885 filp, poll_table);
3464 poll_wait(filp, &trace_wait, poll_table); 3886}
3465 if (!trace_empty(iter))
3466 return POLLIN | POLLRDNORM;
3467 3887
3468 return 0; 3888static unsigned int
3469 } 3889tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3890{
3891 struct trace_iterator *iter = filp->private_data;
3892
3893 return trace_poll(iter, filp, poll_table);
3470} 3894}
3471 3895
3472/* 3896/*
@@ -3532,6 +3956,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3532 size_t cnt, loff_t *ppos) 3956 size_t cnt, loff_t *ppos)
3533{ 3957{
3534 struct trace_iterator *iter = filp->private_data; 3958 struct trace_iterator *iter = filp->private_data;
3959 struct trace_array *tr = iter->tr;
3535 ssize_t sret; 3960 ssize_t sret;
3536 3961
3537 /* return any leftover data */ 3962 /* return any leftover data */
@@ -3543,8 +3968,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3543 3968
3544 /* copy the tracer to avoid using a global lock all around */ 3969 /* copy the tracer to avoid using a global lock all around */
3545 mutex_lock(&trace_types_lock); 3970 mutex_lock(&trace_types_lock);
3546 if (unlikely(iter->trace->name != current_trace->name)) 3971 if (unlikely(iter->trace->name != tr->current_trace->name))
3547 *iter->trace = *current_trace; 3972 *iter->trace = *tr->current_trace;
3548 mutex_unlock(&trace_types_lock); 3973 mutex_unlock(&trace_types_lock);
3549 3974
3550 /* 3975 /*
@@ -3700,6 +4125,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3700 .ops = &tracing_pipe_buf_ops, 4125 .ops = &tracing_pipe_buf_ops,
3701 .spd_release = tracing_spd_release_pipe, 4126 .spd_release = tracing_spd_release_pipe,
3702 }; 4127 };
4128 struct trace_array *tr = iter->tr;
3703 ssize_t ret; 4129 ssize_t ret;
3704 size_t rem; 4130 size_t rem;
3705 unsigned int i; 4131 unsigned int i;
@@ -3709,8 +4135,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3709 4135
3710 /* copy the tracer to avoid using a global lock all around */ 4136 /* copy the tracer to avoid using a global lock all around */
3711 mutex_lock(&trace_types_lock); 4137 mutex_lock(&trace_types_lock);
3712 if (unlikely(iter->trace->name != current_trace->name)) 4138 if (unlikely(iter->trace->name != tr->current_trace->name))
3713 *iter->trace = *current_trace; 4139 *iter->trace = *tr->current_trace;
3714 mutex_unlock(&trace_types_lock); 4140 mutex_unlock(&trace_types_lock);
3715 4141
3716 mutex_lock(&iter->mutex); 4142 mutex_lock(&iter->mutex);
@@ -3772,43 +4198,19 @@ out_err:
3772 goto out; 4198 goto out;
3773} 4199}
3774 4200
3775struct ftrace_entries_info {
3776 struct trace_array *tr;
3777 int cpu;
3778};
3779
3780static int tracing_entries_open(struct inode *inode, struct file *filp)
3781{
3782 struct ftrace_entries_info *info;
3783
3784 if (tracing_disabled)
3785 return -ENODEV;
3786
3787 info = kzalloc(sizeof(*info), GFP_KERNEL);
3788 if (!info)
3789 return -ENOMEM;
3790
3791 info->tr = &global_trace;
3792 info->cpu = (unsigned long)inode->i_private;
3793
3794 filp->private_data = info;
3795
3796 return 0;
3797}
3798
3799static ssize_t 4201static ssize_t
3800tracing_entries_read(struct file *filp, char __user *ubuf, 4202tracing_entries_read(struct file *filp, char __user *ubuf,
3801 size_t cnt, loff_t *ppos) 4203 size_t cnt, loff_t *ppos)
3802{ 4204{
3803 struct ftrace_entries_info *info = filp->private_data; 4205 struct trace_cpu *tc = filp->private_data;
3804 struct trace_array *tr = info->tr; 4206 struct trace_array *tr = tc->tr;
3805 char buf[64]; 4207 char buf[64];
3806 int r = 0; 4208 int r = 0;
3807 ssize_t ret; 4209 ssize_t ret;
3808 4210
3809 mutex_lock(&trace_types_lock); 4211 mutex_lock(&trace_types_lock);
3810 4212
3811 if (info->cpu == RING_BUFFER_ALL_CPUS) { 4213 if (tc->cpu == RING_BUFFER_ALL_CPUS) {
3812 int cpu, buf_size_same; 4214 int cpu, buf_size_same;
3813 unsigned long size; 4215 unsigned long size;
3814 4216
@@ -3818,8 +4220,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
3818 for_each_tracing_cpu(cpu) { 4220 for_each_tracing_cpu(cpu) {
3819 /* fill in the size from first enabled cpu */ 4221 /* fill in the size from first enabled cpu */
3820 if (size == 0) 4222 if (size == 0)
3821 size = tr->data[cpu]->entries; 4223 size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries;
3822 if (size != tr->data[cpu]->entries) { 4224 if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {
3823 buf_size_same = 0; 4225 buf_size_same = 0;
3824 break; 4226 break;
3825 } 4227 }
@@ -3835,7 +4237,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
3835 } else 4237 } else
3836 r = sprintf(buf, "X\n"); 4238 r = sprintf(buf, "X\n");
3837 } else 4239 } else
3838 r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); 4240 r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10);
3839 4241
3840 mutex_unlock(&trace_types_lock); 4242 mutex_unlock(&trace_types_lock);
3841 4243
@@ -3847,7 +4249,7 @@ static ssize_t
3847tracing_entries_write(struct file *filp, const char __user *ubuf, 4249tracing_entries_write(struct file *filp, const char __user *ubuf,
3848 size_t cnt, loff_t *ppos) 4250 size_t cnt, loff_t *ppos)
3849{ 4251{
3850 struct ftrace_entries_info *info = filp->private_data; 4252 struct trace_cpu *tc = filp->private_data;
3851 unsigned long val; 4253 unsigned long val;
3852 int ret; 4254 int ret;
3853 4255
@@ -3862,7 +4264,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3862 /* value is in KB */ 4264 /* value is in KB */
3863 val <<= 10; 4265 val <<= 10;
3864 4266
3865 ret = tracing_resize_ring_buffer(val, info->cpu); 4267 ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);
3866 if (ret < 0) 4268 if (ret < 0)
3867 return ret; 4269 return ret;
3868 4270
@@ -3871,16 +4273,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3871 return cnt; 4273 return cnt;
3872} 4274}
3873 4275
3874static int
3875tracing_entries_release(struct inode *inode, struct file *filp)
3876{
3877 struct ftrace_entries_info *info = filp->private_data;
3878
3879 kfree(info);
3880
3881 return 0;
3882}
3883
3884static ssize_t 4276static ssize_t
3885tracing_total_entries_read(struct file *filp, char __user *ubuf, 4277tracing_total_entries_read(struct file *filp, char __user *ubuf,
3886 size_t cnt, loff_t *ppos) 4278 size_t cnt, loff_t *ppos)
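Note: the per-open ftrace_entries_info allocation and its release handler go away because the (tr, cpu) pair now lives for the lifetime of the dentry behind inode->i_private; the buffer_size_kb file can therefore use tracing_open_generic() (see the updated tracing_entries_fops below), which effectively does no more than this:

/* Sketch of what the generic open amounts to for this file. */
static int entries_open_sketch(struct inode *inode, struct file *filp)
{
	if (tracing_disabled)
		return -ENODEV;

	filp->private_data = inode->i_private;	/* struct trace_cpu * */
	return 0;
}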
@@ -3892,7 +4284,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
3892 4284
3893 mutex_lock(&trace_types_lock); 4285 mutex_lock(&trace_types_lock);
3894 for_each_tracing_cpu(cpu) { 4286 for_each_tracing_cpu(cpu) {
3895 size += tr->data[cpu]->entries >> 10; 4287 size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;
3896 if (!ring_buffer_expanded) 4288 if (!ring_buffer_expanded)
3897 expanded_size += trace_buf_size >> 10; 4289 expanded_size += trace_buf_size >> 10;
3898 } 4290 }
@@ -3922,11 +4314,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3922static int 4314static int
3923tracing_free_buffer_release(struct inode *inode, struct file *filp) 4315tracing_free_buffer_release(struct inode *inode, struct file *filp)
3924{ 4316{
4317 struct trace_array *tr = inode->i_private;
4318
3925 /* disable tracing ? */ 4319 /* disable tracing ? */
3926 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 4320 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3927 tracing_off(); 4321 tracing_off();
3928 /* resize the ring buffer to 0 */ 4322 /* resize the ring buffer to 0 */
3929 tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); 4323 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
3930 4324
3931 return 0; 4325 return 0;
3932} 4326}
@@ -3995,7 +4389,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3995 4389
3996 local_save_flags(irq_flags); 4390 local_save_flags(irq_flags);
3997 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 4391 size = sizeof(*entry) + cnt + 2; /* possible \n added */
3998 buffer = global_trace.buffer; 4392 buffer = global_trace.trace_buffer.buffer;
3999 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 4393 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
4000 irq_flags, preempt_count()); 4394 irq_flags, preempt_count());
4001 if (!event) { 4395 if (!event) {
@@ -4037,13 +4431,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4037 4431
4038static int tracing_clock_show(struct seq_file *m, void *v) 4432static int tracing_clock_show(struct seq_file *m, void *v)
4039{ 4433{
4434 struct trace_array *tr = m->private;
4040 int i; 4435 int i;
4041 4436
4042 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) 4437 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
4043 seq_printf(m, 4438 seq_printf(m,
4044 "%s%s%s%s", i ? " " : "", 4439 "%s%s%s%s", i ? " " : "",
4045 i == trace_clock_id ? "[" : "", trace_clocks[i].name, 4440 i == tr->clock_id ? "[" : "", trace_clocks[i].name,
4046 i == trace_clock_id ? "]" : ""); 4441 i == tr->clock_id ? "]" : "");
4047 seq_putc(m, '\n'); 4442 seq_putc(m, '\n');
4048 4443
4049 return 0; 4444 return 0;
@@ -4052,6 +4447,8 @@ static int tracing_clock_show(struct seq_file *m, void *v)
4052static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 4447static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4053 size_t cnt, loff_t *fpos) 4448 size_t cnt, loff_t *fpos)
4054{ 4449{
4450 struct seq_file *m = filp->private_data;
4451 struct trace_array *tr = m->private;
4055 char buf[64]; 4452 char buf[64];
4056 const char *clockstr; 4453 const char *clockstr;
4057 int i; 4454 int i;
@@ -4073,20 +4470,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4073 if (i == ARRAY_SIZE(trace_clocks)) 4470 if (i == ARRAY_SIZE(trace_clocks))
4074 return -EINVAL; 4471 return -EINVAL;
4075 4472
4076 trace_clock_id = i;
4077
4078 mutex_lock(&trace_types_lock); 4473 mutex_lock(&trace_types_lock);
4079 4474
4080 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); 4475 tr->clock_id = i;
4081 if (max_tr.buffer) 4476
4082 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 4477 ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);
4083 4478
4084 /* 4479 /*
4085 * New clock may not be consistent with the previous clock. 4480 * New clock may not be consistent with the previous clock.
4086 * Reset the buffer so that it doesn't have incomparable timestamps. 4481 * Reset the buffer so that it doesn't have incomparable timestamps.
4087 */ 4482 */
4088 tracing_reset_online_cpus(&global_trace); 4483 tracing_reset_online_cpus(&global_trace.trace_buffer);
4089 tracing_reset_online_cpus(&max_tr); 4484
4485#ifdef CONFIG_TRACER_MAX_TRACE
4486 if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
4487 ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
4488 tracing_reset_online_cpus(&global_trace.max_buffer);
4489#endif
4090 4490
4091 mutex_unlock(&trace_types_lock); 4491 mutex_unlock(&trace_types_lock);
4092 4492
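Note: the selected clock is now stored per trace_array (tr->clock_id) instead of in the global trace_clock_id, but the table being indexed is unchanged. Its shape, inferred from the fields this diff touches (.func, .name and, further up, .in_ns), is roughly:

#include <linux/types.h>	/* u64 */

/* Rough shape of the trace_clocks[] entries indexed above; the real table
 * is defined earlier in trace.c. */
struct trace_clock_entry_sketch {
	u64 (*func)(void);	/* handed to ring_buffer_set_clock()        */
	const char *name;	/* "local", "global", "counter", ...        */
	int in_ns;		/* nonzero if timestamps are in nanoseconds */
};

The buffers are reset right after switching because timestamps taken with different clocks are not comparable, as the retained comment in the hunk notes.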
@@ -4099,20 +4499,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
4099{ 4499{
4100 if (tracing_disabled) 4500 if (tracing_disabled)
4101 return -ENODEV; 4501 return -ENODEV;
4102 return single_open(file, tracing_clock_show, NULL); 4502
4503 return single_open(file, tracing_clock_show, inode->i_private);
4103} 4504}
4104 4505
4506struct ftrace_buffer_info {
4507 struct trace_iterator iter;
4508 void *spare;
4509 unsigned int read;
4510};
4511
4105#ifdef CONFIG_TRACER_SNAPSHOT 4512#ifdef CONFIG_TRACER_SNAPSHOT
4106static int tracing_snapshot_open(struct inode *inode, struct file *file) 4513static int tracing_snapshot_open(struct inode *inode, struct file *file)
4107{ 4514{
4515 struct trace_cpu *tc = inode->i_private;
4108 struct trace_iterator *iter; 4516 struct trace_iterator *iter;
4517 struct seq_file *m;
4109 int ret = 0; 4518 int ret = 0;
4110 4519
4111 if (file->f_mode & FMODE_READ) { 4520 if (file->f_mode & FMODE_READ) {
4112 iter = __tracing_open(inode, file, true); 4521 iter = __tracing_open(inode, file, true);
4113 if (IS_ERR(iter)) 4522 if (IS_ERR(iter))
4114 ret = PTR_ERR(iter); 4523 ret = PTR_ERR(iter);
4524 } else {
4525 /* Writes still need the seq_file to hold the private data */
4526 m = kzalloc(sizeof(*m), GFP_KERNEL);
4527 if (!m)
4528 return -ENOMEM;
4529 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
4530 if (!iter) {
4531 kfree(m);
4532 return -ENOMEM;
4533 }
4534 iter->tr = tc->tr;
4535 iter->trace_buffer = &tc->tr->max_buffer;
4536 iter->cpu_file = tc->cpu;
4537 m->private = iter;
4538 file->private_data = m;
4115 } 4539 }
4540
4116 return ret; 4541 return ret;
4117} 4542}
4118 4543
@@ -4120,6 +4545,9 @@ static ssize_t
4120tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, 4545tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4121 loff_t *ppos) 4546 loff_t *ppos)
4122{ 4547{
4548 struct seq_file *m = filp->private_data;
4549 struct trace_iterator *iter = m->private;
4550 struct trace_array *tr = iter->tr;
4123 unsigned long val; 4551 unsigned long val;
4124 int ret; 4552 int ret;
4125 4553
@@ -4133,40 +4561,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4133 4561
4134 mutex_lock(&trace_types_lock); 4562 mutex_lock(&trace_types_lock);
4135 4563
4136 if (current_trace->use_max_tr) { 4564 if (tr->current_trace->use_max_tr) {
4137 ret = -EBUSY; 4565 ret = -EBUSY;
4138 goto out; 4566 goto out;
4139 } 4567 }
4140 4568
4141 switch (val) { 4569 switch (val) {
4142 case 0: 4570 case 0:
4143 if (current_trace->allocated_snapshot) { 4571 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
4144 /* free spare buffer */ 4572 ret = -EINVAL;
4145 ring_buffer_resize(max_tr.buffer, 1, 4573 break;
4146 RING_BUFFER_ALL_CPUS);
4147 set_buffer_entries(&max_tr, 1);
4148 tracing_reset_online_cpus(&max_tr);
4149 current_trace->allocated_snapshot = false;
4150 } 4574 }
4575 if (tr->allocated_snapshot)
4576 free_snapshot(tr);
4151 break; 4577 break;
4152 case 1: 4578 case 1:
4153 if (!current_trace->allocated_snapshot) { 4579/* Only allow per-cpu swap if the ring buffer supports it */
4154 /* allocate spare buffer */ 4580#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
4155 ret = resize_buffer_duplicate_size(&max_tr, 4581 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
4156 &global_trace, RING_BUFFER_ALL_CPUS); 4582 ret = -EINVAL;
4583 break;
4584 }
4585#endif
4586 if (!tr->allocated_snapshot) {
4587 ret = alloc_snapshot(tr);
4157 if (ret < 0) 4588 if (ret < 0)
4158 break; 4589 break;
4159 current_trace->allocated_snapshot = true;
4160 } 4590 }
4161
4162 local_irq_disable(); 4591 local_irq_disable();
4163 /* Now, we're going to swap */ 4592 /* Now, we're going to swap */
4164 update_max_tr(&global_trace, current, smp_processor_id()); 4593 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
4594 update_max_tr(tr, current, smp_processor_id());
4595 else
4596 update_max_tr_single(tr, current, iter->cpu_file);
4165 local_irq_enable(); 4597 local_irq_enable();
4166 break; 4598 break;
4167 default: 4599 default:
4168 if (current_trace->allocated_snapshot) 4600 if (tr->allocated_snapshot) {
4169 tracing_reset_online_cpus(&max_tr); 4601 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
4602 tracing_reset_online_cpus(&tr->max_buffer);
4603 else
4604 tracing_reset(&tr->max_buffer, iter->cpu_file);
4605 }
4170 break; 4606 break;
4171 } 4607 }
4172 4608
@@ -4178,6 +4614,51 @@ out:
4178 mutex_unlock(&trace_types_lock); 4614 mutex_unlock(&trace_types_lock);
4179 return ret; 4615 return ret;
4180} 4616}
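A minimal user-space sketch of driving the snapshot file handled by tracing_snapshot_write() above; the debugfs mount point and the cpu0 path are assumptions, and the per-cpu write only succeeds when CONFIG_RING_BUFFER_ALLOW_SWAP is set:

/* Sketch: drive the snapshot files from user space.
 * Assumes debugfs is mounted at /sys/kernel/debug; adjust if not.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	const char *top = "/sys/kernel/debug/tracing/snapshot";
	const char *cpu0 = "/sys/kernel/debug/tracing/per_cpu/cpu0/snapshot";

	write_str(top, "1");	/* allocate (if needed) and swap all CPUs */
	write_str(cpu0, "1");	/* per-cpu swap; -EINVAL without CONFIG_RING_BUFFER_ALLOW_SWAP */
	write_str(top, "2");	/* any other value: clear the snapshot buffer */
	write_str(top, "0");	/* free the spare buffer again */
	return 0;
}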
4617
4618static int tracing_snapshot_release(struct inode *inode, struct file *file)
4619{
4620 struct seq_file *m = file->private_data;
4621
4622 if (file->f_mode & FMODE_READ)
4623 return tracing_release(inode, file);
4624
4625 /* If write only, the seq_file is just a stub */
4626 if (m)
4627 kfree(m->private);
4628 kfree(m);
4629
4630 return 0;
4631}
4632
4633static int tracing_buffers_open(struct inode *inode, struct file *filp);
4634static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
4635 size_t count, loff_t *ppos);
4636static int tracing_buffers_release(struct inode *inode, struct file *file);
4637static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4638 struct pipe_inode_info *pipe, size_t len, unsigned int flags);
4639
4640static int snapshot_raw_open(struct inode *inode, struct file *filp)
4641{
4642 struct ftrace_buffer_info *info;
4643 int ret;
4644
4645 ret = tracing_buffers_open(inode, filp);
4646 if (ret < 0)
4647 return ret;
4648
4649 info = filp->private_data;
4650
4651 if (info->iter.trace->use_max_tr) {
4652 tracing_buffers_release(inode, filp);
4653 return -EBUSY;
4654 }
4655
4656 info->iter.snapshot = true;
4657 info->iter.trace_buffer = &info->iter.tr->max_buffer;
4658
4659 return ret;
4660}
4661
4181#endif /* CONFIG_TRACER_SNAPSHOT */ 4662#endif /* CONFIG_TRACER_SNAPSHOT */
4182 4663
4183 4664
@@ -4205,10 +4686,9 @@ static const struct file_operations tracing_pipe_fops = {
4205}; 4686};
4206 4687
4207static const struct file_operations tracing_entries_fops = { 4688static const struct file_operations tracing_entries_fops = {
4208 .open = tracing_entries_open, 4689 .open = tracing_open_generic,
4209 .read = tracing_entries_read, 4690 .read = tracing_entries_read,
4210 .write = tracing_entries_write, 4691 .write = tracing_entries_write,
4211 .release = tracing_entries_release,
4212 .llseek = generic_file_llseek, 4692 .llseek = generic_file_llseek,
4213}; 4693};
4214 4694
@@ -4243,20 +4723,23 @@ static const struct file_operations snapshot_fops = {
4243 .read = seq_read, 4723 .read = seq_read,
4244 .write = tracing_snapshot_write, 4724 .write = tracing_snapshot_write,
4245 .llseek = tracing_seek, 4725 .llseek = tracing_seek,
4246 .release = tracing_release, 4726 .release = tracing_snapshot_release,
4247}; 4727};
4248#endif /* CONFIG_TRACER_SNAPSHOT */
4249 4728
4250struct ftrace_buffer_info { 4729static const struct file_operations snapshot_raw_fops = {
4251 struct trace_array *tr; 4730 .open = snapshot_raw_open,
4252 void *spare; 4731 .read = tracing_buffers_read,
4253 int cpu; 4732 .release = tracing_buffers_release,
4254 unsigned int read; 4733 .splice_read = tracing_buffers_splice_read,
4734 .llseek = no_llseek,
4255}; 4735};
4256 4736
4737#endif /* CONFIG_TRACER_SNAPSHOT */
4738
4257static int tracing_buffers_open(struct inode *inode, struct file *filp) 4739static int tracing_buffers_open(struct inode *inode, struct file *filp)
4258{ 4740{
4259 int cpu = (int)(long)inode->i_private; 4741 struct trace_cpu *tc = inode->i_private;
4742 struct trace_array *tr = tc->tr;
4260 struct ftrace_buffer_info *info; 4743 struct ftrace_buffer_info *info;
4261 4744
4262 if (tracing_disabled) 4745 if (tracing_disabled)
@@ -4266,72 +4749,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4266 if (!info) 4749 if (!info)
4267 return -ENOMEM; 4750 return -ENOMEM;
4268 4751
4269 info->tr = &global_trace; 4752 mutex_lock(&trace_types_lock);
4270 info->cpu = cpu; 4753
4271 info->spare = NULL; 4754 tr->ref++;
4755
4756 info->iter.tr = tr;
4757 info->iter.cpu_file = tc->cpu;
4758 info->iter.trace = tr->current_trace;
4759 info->iter.trace_buffer = &tr->trace_buffer;
4760 info->spare = NULL;
4272 /* Force reading ring buffer for first read */ 4761 /* Force reading ring buffer for first read */
4273 info->read = (unsigned int)-1; 4762 info->read = (unsigned int)-1;
4274 4763
4275 filp->private_data = info; 4764 filp->private_data = info;
4276 4765
4766 mutex_unlock(&trace_types_lock);
4767
4277 return nonseekable_open(inode, filp); 4768 return nonseekable_open(inode, filp);
4278} 4769}
4279 4770
4771static unsigned int
4772tracing_buffers_poll(struct file *filp, poll_table *poll_table)
4773{
4774 struct ftrace_buffer_info *info = filp->private_data;
4775 struct trace_iterator *iter = &info->iter;
4776
4777 return trace_poll(iter, filp, poll_table);
4778}
4779
4280static ssize_t 4780static ssize_t
4281tracing_buffers_read(struct file *filp, char __user *ubuf, 4781tracing_buffers_read(struct file *filp, char __user *ubuf,
4282 size_t count, loff_t *ppos) 4782 size_t count, loff_t *ppos)
4283{ 4783{
4284 struct ftrace_buffer_info *info = filp->private_data; 4784 struct ftrace_buffer_info *info = filp->private_data;
4785 struct trace_iterator *iter = &info->iter;
4285 ssize_t ret; 4786 ssize_t ret;
4286 size_t size; 4787 ssize_t size;
4287 4788
4288 if (!count) 4789 if (!count)
4289 return 0; 4790 return 0;
4290 4791
4792 mutex_lock(&trace_types_lock);
4793
4794#ifdef CONFIG_TRACER_MAX_TRACE
4795 if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
4796 size = -EBUSY;
4797 goto out_unlock;
4798 }
4799#endif
4800
4291 if (!info->spare) 4801 if (!info->spare)
4292 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); 4802 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
4803 iter->cpu_file);
4804 size = -ENOMEM;
4293 if (!info->spare) 4805 if (!info->spare)
4294 return -ENOMEM; 4806 goto out_unlock;
4295 4807
4296 /* Do we have previous read data to read? */ 4808 /* Do we have previous read data to read? */
4297 if (info->read < PAGE_SIZE) 4809 if (info->read < PAGE_SIZE)
4298 goto read; 4810 goto read;
4299 4811
4300 trace_access_lock(info->cpu); 4812 again:
4301 ret = ring_buffer_read_page(info->tr->buffer, 4813 trace_access_lock(iter->cpu_file);
4814 ret = ring_buffer_read_page(iter->trace_buffer->buffer,
4302 &info->spare, 4815 &info->spare,
4303 count, 4816 count,
4304 info->cpu, 0); 4817 iter->cpu_file, 0);
4305 trace_access_unlock(info->cpu); 4818 trace_access_unlock(iter->cpu_file);
4306 if (ret < 0)
4307 return 0;
4308 4819
4309 info->read = 0; 4820 if (ret < 0) {
4821 if (trace_empty(iter)) {
4822 if ((filp->f_flags & O_NONBLOCK)) {
4823 size = -EAGAIN;
4824 goto out_unlock;
4825 }
4826 mutex_unlock(&trace_types_lock);
4827 iter->trace->wait_pipe(iter);
4828 mutex_lock(&trace_types_lock);
4829 if (signal_pending(current)) {
4830 size = -EINTR;
4831 goto out_unlock;
4832 }
4833 goto again;
4834 }
4835 size = 0;
4836 goto out_unlock;
4837 }
4310 4838
4311read: 4839 info->read = 0;
4840 read:
4312 size = PAGE_SIZE - info->read; 4841 size = PAGE_SIZE - info->read;
4313 if (size > count) 4842 if (size > count)
4314 size = count; 4843 size = count;
4315 4844
4316 ret = copy_to_user(ubuf, info->spare + info->read, size); 4845 ret = copy_to_user(ubuf, info->spare + info->read, size);
4317 if (ret == size) 4846 if (ret == size) {
4318 return -EFAULT; 4847 size = -EFAULT;
4848 goto out_unlock;
4849 }
4319 size -= ret; 4850 size -= ret;
4320 4851
4321 *ppos += size; 4852 *ppos += size;
4322 info->read += size; 4853 info->read += size;
4323 4854
4855 out_unlock:
4856 mutex_unlock(&trace_types_lock);
4857
4324 return size; 4858 return size;
4325} 4859}
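With the blocking and -EAGAIN handling added here, plus the new poll handler above, a binary reader can sleep until data arrives instead of spinning. A hedged user-space sketch; the mount point, the cpu0 path and the 4096-byte page size are assumptions:

/* Sketch of a non-blocking per-cpu binary reader; trace_pipe_raw
 * hands back whole ring-buffer pages.
 */
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char page[4096];	/* PAGE_SIZE on most configurations */
	struct pollfd pfd;
	ssize_t r;

	pfd.fd = open("/sys/kernel/debug/tracing/per_cpu/cpu0/trace_pipe_raw",
		      O_RDONLY | O_NONBLOCK);
	if (pfd.fd < 0)
		return 1;
	pfd.events = POLLIN;

	for (;;) {
		r = read(pfd.fd, page, sizeof(page));
		if (r > 0) {
			fwrite(page, 1, r, stdout);
			continue;
		}
		if (r < 0 && errno != EAGAIN)
			break;
		/* Nothing buffered: sleep until the poll handler wakes us */
		poll(&pfd, 1, -1);
	}
	close(pfd.fd);
	return 0;
}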
4326 4860
4327static int tracing_buffers_release(struct inode *inode, struct file *file) 4861static int tracing_buffers_release(struct inode *inode, struct file *file)
4328{ 4862{
4329 struct ftrace_buffer_info *info = file->private_data; 4863 struct ftrace_buffer_info *info = file->private_data;
4864 struct trace_iterator *iter = &info->iter;
4865
4866 mutex_lock(&trace_types_lock);
4867
4868 WARN_ON(!iter->tr->ref);
4869 iter->tr->ref--;
4330 4870
4331 if (info->spare) 4871 if (info->spare)
4332 ring_buffer_free_read_page(info->tr->buffer, info->spare); 4872 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
4333 kfree(info); 4873 kfree(info);
4334 4874
4875 mutex_unlock(&trace_types_lock);
4876
4335 return 0; 4877 return 0;
4336} 4878}
4337 4879
@@ -4396,6 +4938,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4396 unsigned int flags) 4938 unsigned int flags)
4397{ 4939{
4398 struct ftrace_buffer_info *info = file->private_data; 4940 struct ftrace_buffer_info *info = file->private_data;
4941 struct trace_iterator *iter = &info->iter;
4399 struct partial_page partial_def[PIPE_DEF_BUFFERS]; 4942 struct partial_page partial_def[PIPE_DEF_BUFFERS];
4400 struct page *pages_def[PIPE_DEF_BUFFERS]; 4943 struct page *pages_def[PIPE_DEF_BUFFERS];
4401 struct splice_pipe_desc spd = { 4944 struct splice_pipe_desc spd = {
@@ -4408,10 +4951,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4408 }; 4951 };
4409 struct buffer_ref *ref; 4952 struct buffer_ref *ref;
4410 int entries, size, i; 4953 int entries, size, i;
4411 size_t ret; 4954 ssize_t ret;
4412 4955
4413 if (splice_grow_spd(pipe, &spd)) 4956 mutex_lock(&trace_types_lock);
4414 return -ENOMEM; 4957
4958#ifdef CONFIG_TRACER_MAX_TRACE
4959 if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
4960 ret = -EBUSY;
4961 goto out;
4962 }
4963#endif
4964
4965 if (splice_grow_spd(pipe, &spd)) {
4966 ret = -ENOMEM;
4967 goto out;
4968 }
4415 4969
4416 if (*ppos & (PAGE_SIZE - 1)) { 4970 if (*ppos & (PAGE_SIZE - 1)) {
4417 ret = -EINVAL; 4971 ret = -EINVAL;
@@ -4426,8 +4980,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4426 len &= PAGE_MASK; 4980 len &= PAGE_MASK;
4427 } 4981 }
4428 4982
4429 trace_access_lock(info->cpu); 4983 again:
4430 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 4984 trace_access_lock(iter->cpu_file);
4985 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
4431 4986
4432 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { 4987 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
4433 struct page *page; 4988 struct page *page;
@@ -4438,15 +4993,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4438 break; 4993 break;
4439 4994
4440 ref->ref = 1; 4995 ref->ref = 1;
4441 ref->buffer = info->tr->buffer; 4996 ref->buffer = iter->trace_buffer->buffer;
4442 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); 4997 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
4443 if (!ref->page) { 4998 if (!ref->page) {
4444 kfree(ref); 4999 kfree(ref);
4445 break; 5000 break;
4446 } 5001 }
4447 5002
4448 r = ring_buffer_read_page(ref->buffer, &ref->page, 5003 r = ring_buffer_read_page(ref->buffer, &ref->page,
4449 len, info->cpu, 1); 5004 len, iter->cpu_file, 1);
4450 if (r < 0) { 5005 if (r < 0) {
4451 ring_buffer_free_read_page(ref->buffer, ref->page); 5006 ring_buffer_free_read_page(ref->buffer, ref->page);
4452 kfree(ref); 5007 kfree(ref);
@@ -4470,31 +5025,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4470 spd.nr_pages++; 5025 spd.nr_pages++;
4471 *ppos += PAGE_SIZE; 5026 *ppos += PAGE_SIZE;
4472 5027
4473 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 5028 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
4474 } 5029 }
4475 5030
4476 trace_access_unlock(info->cpu); 5031 trace_access_unlock(iter->cpu_file);
4477 spd.nr_pages = i; 5032 spd.nr_pages = i;
4478 5033
4479 /* did we read anything? */ 5034 /* did we read anything? */
4480 if (!spd.nr_pages) { 5035 if (!spd.nr_pages) {
4481 if (flags & SPLICE_F_NONBLOCK) 5036 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
4482 ret = -EAGAIN; 5037 ret = -EAGAIN;
4483 else 5038 goto out;
4484 ret = 0; 5039 }
4485 /* TODO: block */ 5040 mutex_unlock(&trace_types_lock);
4486 goto out; 5041 iter->trace->wait_pipe(iter);
5042 mutex_lock(&trace_types_lock);
5043 if (signal_pending(current)) {
5044 ret = -EINTR;
5045 goto out;
5046 }
5047 goto again;
4487 } 5048 }
4488 5049
4489 ret = splice_to_pipe(pipe, &spd); 5050 ret = splice_to_pipe(pipe, &spd);
4490 splice_shrink_spd(&spd); 5051 splice_shrink_spd(&spd);
4491out: 5052out:
5053 mutex_unlock(&trace_types_lock);
5054
4492 return ret; 5055 return ret;
4493} 5056}
4494 5057
4495static const struct file_operations tracing_buffers_fops = { 5058static const struct file_operations tracing_buffers_fops = {
4496 .open = tracing_buffers_open, 5059 .open = tracing_buffers_open,
4497 .read = tracing_buffers_read, 5060 .read = tracing_buffers_read,
5061 .poll = tracing_buffers_poll,
4498 .release = tracing_buffers_release, 5062 .release = tracing_buffers_release,
4499 .splice_read = tracing_buffers_splice_read, 5063 .splice_read = tracing_buffers_splice_read,
4500 .llseek = no_llseek, 5064 .llseek = no_llseek,
@@ -4504,12 +5068,14 @@ static ssize_t
4504tracing_stats_read(struct file *filp, char __user *ubuf, 5068tracing_stats_read(struct file *filp, char __user *ubuf,
4505 size_t count, loff_t *ppos) 5069 size_t count, loff_t *ppos)
4506{ 5070{
4507 unsigned long cpu = (unsigned long)filp->private_data; 5071 struct trace_cpu *tc = filp->private_data;
4508 struct trace_array *tr = &global_trace; 5072 struct trace_array *tr = tc->tr;
5073 struct trace_buffer *trace_buf = &tr->trace_buffer;
4509 struct trace_seq *s; 5074 struct trace_seq *s;
4510 unsigned long cnt; 5075 unsigned long cnt;
4511 unsigned long long t; 5076 unsigned long long t;
4512 unsigned long usec_rem; 5077 unsigned long usec_rem;
5078 int cpu = tc->cpu;
4513 5079
4514 s = kmalloc(sizeof(*s), GFP_KERNEL); 5080 s = kmalloc(sizeof(*s), GFP_KERNEL);
4515 if (!s) 5081 if (!s)
@@ -4517,41 +5083,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4517 5083
4518 trace_seq_init(s); 5084 trace_seq_init(s);
4519 5085
4520 cnt = ring_buffer_entries_cpu(tr->buffer, cpu); 5086 cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);
4521 trace_seq_printf(s, "entries: %ld\n", cnt); 5087 trace_seq_printf(s, "entries: %ld\n", cnt);
4522 5088
4523 cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); 5089 cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);
4524 trace_seq_printf(s, "overrun: %ld\n", cnt); 5090 trace_seq_printf(s, "overrun: %ld\n", cnt);
4525 5091
4526 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 5092 cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);
4527 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 5093 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4528 5094
4529 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); 5095 cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
4530 trace_seq_printf(s, "bytes: %ld\n", cnt); 5096 trace_seq_printf(s, "bytes: %ld\n", cnt);
4531 5097
4532 if (trace_clocks[trace_clock_id].in_ns) { 5098 if (trace_clocks[trace_clock_id].in_ns) {
4533 /* local or global for trace_clock */ 5099 /* local or global for trace_clock */
4534 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); 5100 t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
4535 usec_rem = do_div(t, USEC_PER_SEC); 5101 usec_rem = do_div(t, USEC_PER_SEC);
4536 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", 5102 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4537 t, usec_rem); 5103 t, usec_rem);
4538 5104
4539 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); 5105 t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
4540 usec_rem = do_div(t, USEC_PER_SEC); 5106 usec_rem = do_div(t, USEC_PER_SEC);
4541 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); 5107 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4542 } else { 5108 } else {
4543 /* counter or tsc mode for trace_clock */ 5109 /* counter or tsc mode for trace_clock */
4544 trace_seq_printf(s, "oldest event ts: %llu\n", 5110 trace_seq_printf(s, "oldest event ts: %llu\n",
4545 ring_buffer_oldest_event_ts(tr->buffer, cpu)); 5111 ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
4546 5112
4547 trace_seq_printf(s, "now ts: %llu\n", 5113 trace_seq_printf(s, "now ts: %llu\n",
4548 ring_buffer_time_stamp(tr->buffer, cpu)); 5114 ring_buffer_time_stamp(trace_buf->buffer, cpu));
4549 } 5115 }
4550 5116
4551 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); 5117 cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
4552 trace_seq_printf(s, "dropped events: %ld\n", cnt); 5118 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4553 5119
4554 cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); 5120 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
4555 trace_seq_printf(s, "read events: %ld\n", cnt); 5121 trace_seq_printf(s, "read events: %ld\n", cnt);
4556 5122
4557 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 5123 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
@@ -4603,60 +5169,161 @@ static const struct file_operations tracing_dyn_info_fops = {
4603 .read = tracing_read_dyn_info, 5169 .read = tracing_read_dyn_info,
4604 .llseek = generic_file_llseek, 5170 .llseek = generic_file_llseek,
4605}; 5171};
4606#endif 5172#endif /* CONFIG_DYNAMIC_FTRACE */
4607 5173
4608static struct dentry *d_tracer; 5174#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
5175static void
5176ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
5177{
5178 tracing_snapshot();
5179}
4609 5180
4610struct dentry *tracing_init_dentry(void) 5181static void
5182ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
5183{
5184 unsigned long *count = (long *)data;
5185
5186 if (!*count)
5187 return;
5188
5189 if (*count != -1)
5190 (*count)--;
5191
5192 tracing_snapshot();
5193}
5194
5195static int
5196ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
5197 struct ftrace_probe_ops *ops, void *data)
5198{
5199 long count = (long)data;
5200
5201 seq_printf(m, "%ps:", (void *)ip);
5202
5203 seq_printf(m, "snapshot");
5204
5205 if (count == -1)
5206 seq_printf(m, ":unlimited\n");
5207 else
5208 seq_printf(m, ":count=%ld\n", count);
5209
5210 return 0;
5211}
5212
5213static struct ftrace_probe_ops snapshot_probe_ops = {
5214 .func = ftrace_snapshot,
5215 .print = ftrace_snapshot_print,
5216};
5217
5218static struct ftrace_probe_ops snapshot_count_probe_ops = {
5219 .func = ftrace_count_snapshot,
5220 .print = ftrace_snapshot_print,
5221};
5222
5223static int
5224ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
5225 char *glob, char *cmd, char *param, int enable)
5226{
5227 struct ftrace_probe_ops *ops;
5228 void *count = (void *)-1;
5229 char *number;
5230 int ret;
5231
5232 /* hash funcs only work with set_ftrace_filter */
5233 if (!enable)
5234 return -EINVAL;
5235
5236 ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
5237
5238 if (glob[0] == '!') {
5239 unregister_ftrace_function_probe_func(glob+1, ops);
5240 return 0;
5241 }
5242
5243 if (!param)
5244 goto out_reg;
5245
5246 number = strsep(&param, ":");
5247
5248 if (!strlen(number))
5249 goto out_reg;
5250
5251 /*
5252 * We use the callback data field (which is a pointer)
5253 * as our counter.
5254 */
5255 ret = kstrtoul(number, 0, (unsigned long *)&count);
5256 if (ret)
5257 return ret;
5258
5259 out_reg:
5260 ret = register_ftrace_function_probe(glob, ops, count);
5261
5262 if (ret >= 0)
5263 alloc_snapshot(&global_trace);
5264
5265 return ret < 0 ? ret : 0;
5266}
5267
5268static struct ftrace_func_command ftrace_snapshot_cmd = {
5269 .name = "snapshot",
5270 .func = ftrace_trace_snapshot_callback,
5271};
5272
5273static int register_snapshot_cmd(void)
4611{ 5274{
4612 static int once; 5275 return register_ftrace_command(&ftrace_snapshot_cmd);
5276}
5277#else
5278static inline int register_snapshot_cmd(void) { return 0; }
5279#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
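Because the command is registered through register_ftrace_command(), it is driven through set_ftrace_filter like the existing traceon/traceoff commands. A sketch, where the function name "schedule" and the mount point are only examples:

/* Sketch: arm the new "snapshot" function command for one hit. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *filter = "/sys/kernel/debug/tracing/set_ftrace_filter";
	const char *cmd = "schedule:snapshot:1";	/* one snapshot when schedule() is hit */
	int fd = open(filter, O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");
	close(fd);

	/* "!schedule:snapshot" written to the same file removes the probe again */
	return 0;
}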
4613 5280
4614 if (d_tracer) 5281struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
4615 return d_tracer; 5282{
5283 if (tr->dir)
5284 return tr->dir;
4616 5285
4617 if (!debugfs_initialized()) 5286 if (!debugfs_initialized())
4618 return NULL; 5287 return NULL;
4619 5288
4620 d_tracer = debugfs_create_dir("tracing", NULL); 5289 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
5290 tr->dir = debugfs_create_dir("tracing", NULL);
4621 5291
4622 if (!d_tracer && !once) { 5292 if (!tr->dir)
4623 once = 1; 5293 pr_warn_once("Could not create debugfs directory 'tracing'\n");
4624 pr_warning("Could not create debugfs directory 'tracing'\n");
4625 return NULL;
4626 }
4627 5294
4628 return d_tracer; 5295 return tr->dir;
4629} 5296}
4630 5297
4631static struct dentry *d_percpu; 5298struct dentry *tracing_init_dentry(void)
5299{
5300 return tracing_init_dentry_tr(&global_trace);
5301}
4632 5302
4633static struct dentry *tracing_dentry_percpu(void) 5303static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
4634{ 5304{
4635 static int once;
4636 struct dentry *d_tracer; 5305 struct dentry *d_tracer;
4637 5306
4638 if (d_percpu) 5307 if (tr->percpu_dir)
4639 return d_percpu; 5308 return tr->percpu_dir;
4640
4641 d_tracer = tracing_init_dentry();
4642 5309
5310 d_tracer = tracing_init_dentry_tr(tr);
4643 if (!d_tracer) 5311 if (!d_tracer)
4644 return NULL; 5312 return NULL;
4645 5313
4646 d_percpu = debugfs_create_dir("per_cpu", d_tracer); 5314 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
4647 5315
4648 if (!d_percpu && !once) { 5316 WARN_ONCE(!tr->percpu_dir,
4649 once = 1; 5317 "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
4650 pr_warning("Could not create debugfs directory 'per_cpu'\n");
4651 return NULL;
4652 }
4653 5318
4654 return d_percpu; 5319 return tr->percpu_dir;
4655} 5320}
4656 5321
4657static void tracing_init_debugfs_percpu(long cpu) 5322static void
5323tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
4658{ 5324{
4659 struct dentry *d_percpu = tracing_dentry_percpu(); 5325 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
5326 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
4660 struct dentry *d_cpu; 5327 struct dentry *d_cpu;
4661 char cpu_dir[30]; /* 30 characters should be more than enough */ 5328 char cpu_dir[30]; /* 30 characters should be more than enough */
4662 5329
@@ -4672,20 +5339,28 @@ static void tracing_init_debugfs_percpu(long cpu)
4672 5339
4673 /* per cpu trace_pipe */ 5340 /* per cpu trace_pipe */
4674 trace_create_file("trace_pipe", 0444, d_cpu, 5341 trace_create_file("trace_pipe", 0444, d_cpu,
4675 (void *) cpu, &tracing_pipe_fops); 5342 (void *)&data->trace_cpu, &tracing_pipe_fops);
4676 5343
4677 /* per cpu trace */ 5344 /* per cpu trace */
4678 trace_create_file("trace", 0644, d_cpu, 5345 trace_create_file("trace", 0644, d_cpu,
4679 (void *) cpu, &tracing_fops); 5346 (void *)&data->trace_cpu, &tracing_fops);
4680 5347
4681 trace_create_file("trace_pipe_raw", 0444, d_cpu, 5348 trace_create_file("trace_pipe_raw", 0444, d_cpu,
4682 (void *) cpu, &tracing_buffers_fops); 5349 (void *)&data->trace_cpu, &tracing_buffers_fops);
4683 5350
4684 trace_create_file("stats", 0444, d_cpu, 5351 trace_create_file("stats", 0444, d_cpu,
4685 (void *) cpu, &tracing_stats_fops); 5352 (void *)&data->trace_cpu, &tracing_stats_fops);
4686 5353
4687 trace_create_file("buffer_size_kb", 0444, d_cpu, 5354 trace_create_file("buffer_size_kb", 0444, d_cpu,
4688 (void *) cpu, &tracing_entries_fops); 5355 (void *)&data->trace_cpu, &tracing_entries_fops);
5356
5357#ifdef CONFIG_TRACER_SNAPSHOT
5358 trace_create_file("snapshot", 0644, d_cpu,
5359 (void *)&data->trace_cpu, &snapshot_fops);
5360
5361 trace_create_file("snapshot_raw", 0444, d_cpu,
5362 (void *)&data->trace_cpu, &snapshot_raw_fops);
5363#endif
4689} 5364}
4690 5365
4691#ifdef CONFIG_FTRACE_SELFTEST 5366#ifdef CONFIG_FTRACE_SELFTEST
@@ -4696,6 +5371,7 @@ static void tracing_init_debugfs_percpu(long cpu)
4696struct trace_option_dentry { 5371struct trace_option_dentry {
4697 struct tracer_opt *opt; 5372 struct tracer_opt *opt;
4698 struct tracer_flags *flags; 5373 struct tracer_flags *flags;
5374 struct trace_array *tr;
4699 struct dentry *entry; 5375 struct dentry *entry;
4700}; 5376};
4701 5377
@@ -4731,7 +5407,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4731 5407
4732 if (!!(topt->flags->val & topt->opt->bit) != val) { 5408 if (!!(topt->flags->val & topt->opt->bit) != val) {
4733 mutex_lock(&trace_types_lock); 5409 mutex_lock(&trace_types_lock);
4734 ret = __set_tracer_option(current_trace, topt->flags, 5410 ret = __set_tracer_option(topt->tr->current_trace, topt->flags,
4735 topt->opt, !val); 5411 topt->opt, !val);
4736 mutex_unlock(&trace_types_lock); 5412 mutex_unlock(&trace_types_lock);
4737 if (ret) 5413 if (ret)
@@ -4770,6 +5446,7 @@ static ssize_t
4770trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, 5446trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4771 loff_t *ppos) 5447 loff_t *ppos)
4772{ 5448{
5449 struct trace_array *tr = &global_trace;
4773 long index = (long)filp->private_data; 5450 long index = (long)filp->private_data;
4774 unsigned long val; 5451 unsigned long val;
4775 int ret; 5452 int ret;
@@ -4780,7 +5457,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4780 5457
4781 if (val != 0 && val != 1) 5458 if (val != 0 && val != 1)
4782 return -EINVAL; 5459 return -EINVAL;
4783 set_tracer_flags(1 << index, val); 5460
5461 mutex_lock(&trace_types_lock);
5462 ret = set_tracer_flag(tr, 1 << index, val);
5463 mutex_unlock(&trace_types_lock);
5464
5465 if (ret < 0)
5466 return ret;
4784 5467
4785 *ppos += cnt; 5468 *ppos += cnt;
4786 5469
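A user-space sketch of flipping one of these per-flag files; "overwrite" is one entry of trace_options[], and the mount point is assumed:

/* Sketch: toggle a core option; set_tracer_flag() may now refuse the
 * change and the write returns its error.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/tracing/options/overwrite", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, "0", 1) < 0)
		perror("write");	/* e.g. the current tracer vetoed the flag change */
	close(fd);
	return 0;
}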
@@ -4810,40 +5493,41 @@ struct dentry *trace_create_file(const char *name,
4810} 5493}
4811 5494
4812 5495
4813static struct dentry *trace_options_init_dentry(void) 5496static struct dentry *trace_options_init_dentry(struct trace_array *tr)
4814{ 5497{
4815 struct dentry *d_tracer; 5498 struct dentry *d_tracer;
4816 static struct dentry *t_options;
4817 5499
4818 if (t_options) 5500 if (tr->options)
4819 return t_options; 5501 return tr->options;
4820 5502
4821 d_tracer = tracing_init_dentry(); 5503 d_tracer = tracing_init_dentry_tr(tr);
4822 if (!d_tracer) 5504 if (!d_tracer)
4823 return NULL; 5505 return NULL;
4824 5506
4825 t_options = debugfs_create_dir("options", d_tracer); 5507 tr->options = debugfs_create_dir("options", d_tracer);
4826 if (!t_options) { 5508 if (!tr->options) {
4827 pr_warning("Could not create debugfs directory 'options'\n"); 5509 pr_warning("Could not create debugfs directory 'options'\n");
4828 return NULL; 5510 return NULL;
4829 } 5511 }
4830 5512
4831 return t_options; 5513 return tr->options;
4832} 5514}
4833 5515
4834static void 5516static void
4835create_trace_option_file(struct trace_option_dentry *topt, 5517create_trace_option_file(struct trace_array *tr,
5518 struct trace_option_dentry *topt,
4836 struct tracer_flags *flags, 5519 struct tracer_flags *flags,
4837 struct tracer_opt *opt) 5520 struct tracer_opt *opt)
4838{ 5521{
4839 struct dentry *t_options; 5522 struct dentry *t_options;
4840 5523
4841 t_options = trace_options_init_dentry(); 5524 t_options = trace_options_init_dentry(tr);
4842 if (!t_options) 5525 if (!t_options)
4843 return; 5526 return;
4844 5527
4845 topt->flags = flags; 5528 topt->flags = flags;
4846 topt->opt = opt; 5529 topt->opt = opt;
5530 topt->tr = tr;
4847 5531
4848 topt->entry = trace_create_file(opt->name, 0644, t_options, topt, 5532 topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
4849 &trace_options_fops); 5533 &trace_options_fops);
@@ -4851,7 +5535,7 @@ create_trace_option_file(struct trace_option_dentry *topt,
4851} 5535}
4852 5536
4853static struct trace_option_dentry * 5537static struct trace_option_dentry *
4854create_trace_option_files(struct tracer *tracer) 5538create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
4855{ 5539{
4856 struct trace_option_dentry *topts; 5540 struct trace_option_dentry *topts;
4857 struct tracer_flags *flags; 5541 struct tracer_flags *flags;
@@ -4876,7 +5560,7 @@ create_trace_option_files(struct tracer *tracer)
4876 return NULL; 5560 return NULL;
4877 5561
4878 for (cnt = 0; opts[cnt].name; cnt++) 5562 for (cnt = 0; opts[cnt].name; cnt++)
4879 create_trace_option_file(&topts[cnt], flags, 5563 create_trace_option_file(tr, &topts[cnt], flags,
4880 &opts[cnt]); 5564 &opts[cnt]);
4881 5565
4882 return topts; 5566 return topts;
@@ -4899,11 +5583,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
4899} 5583}
4900 5584
4901static struct dentry * 5585static struct dentry *
4902create_trace_option_core_file(const char *option, long index) 5586create_trace_option_core_file(struct trace_array *tr,
5587 const char *option, long index)
4903{ 5588{
4904 struct dentry *t_options; 5589 struct dentry *t_options;
4905 5590
4906 t_options = trace_options_init_dentry(); 5591 t_options = trace_options_init_dentry(tr);
4907 if (!t_options) 5592 if (!t_options)
4908 return NULL; 5593 return NULL;
4909 5594
@@ -4911,17 +5596,17 @@ create_trace_option_core_file(const char *option, long index)
4911 &trace_options_core_fops); 5596 &trace_options_core_fops);
4912} 5597}
4913 5598
4914static __init void create_trace_options_dir(void) 5599static __init void create_trace_options_dir(struct trace_array *tr)
4915{ 5600{
4916 struct dentry *t_options; 5601 struct dentry *t_options;
4917 int i; 5602 int i;
4918 5603
4919 t_options = trace_options_init_dentry(); 5604 t_options = trace_options_init_dentry(tr);
4920 if (!t_options) 5605 if (!t_options)
4921 return; 5606 return;
4922 5607
4923 for (i = 0; trace_options[i]; i++) 5608 for (i = 0; trace_options[i]; i++)
4924 create_trace_option_core_file(trace_options[i], i); 5609 create_trace_option_core_file(tr, trace_options[i], i);
4925} 5610}
4926 5611
4927static ssize_t 5612static ssize_t
@@ -4929,7 +5614,7 @@ rb_simple_read(struct file *filp, char __user *ubuf,
4929 size_t cnt, loff_t *ppos) 5614 size_t cnt, loff_t *ppos)
4930{ 5615{
4931 struct trace_array *tr = filp->private_data; 5616 struct trace_array *tr = filp->private_data;
4932 struct ring_buffer *buffer = tr->buffer; 5617 struct ring_buffer *buffer = tr->trace_buffer.buffer;
4933 char buf[64]; 5618 char buf[64];
4934 int r; 5619 int r;
4935 5620
@@ -4948,7 +5633,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4948 size_t cnt, loff_t *ppos) 5633 size_t cnt, loff_t *ppos)
4949{ 5634{
4950 struct trace_array *tr = filp->private_data; 5635 struct trace_array *tr = filp->private_data;
4951 struct ring_buffer *buffer = tr->buffer; 5636 struct ring_buffer *buffer = tr->trace_buffer.buffer;
4952 unsigned long val; 5637 unsigned long val;
4953 int ret; 5638 int ret;
4954 5639
@@ -4960,12 +5645,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4960 mutex_lock(&trace_types_lock); 5645 mutex_lock(&trace_types_lock);
4961 if (val) { 5646 if (val) {
4962 ring_buffer_record_on(buffer); 5647 ring_buffer_record_on(buffer);
4963 if (current_trace->start) 5648 if (tr->current_trace->start)
4964 current_trace->start(tr); 5649 tr->current_trace->start(tr);
4965 } else { 5650 } else {
4966 ring_buffer_record_off(buffer); 5651 ring_buffer_record_off(buffer);
4967 if (current_trace->stop) 5652 if (tr->current_trace->stop)
4968 current_trace->stop(tr); 5653 tr->current_trace->stop(tr);
4969 } 5654 }
4970 mutex_unlock(&trace_types_lock); 5655 mutex_unlock(&trace_types_lock);
4971 } 5656 }
@@ -4982,23 +5667,310 @@ static const struct file_operations rb_simple_fops = {
4982 .llseek = default_llseek, 5667 .llseek = default_llseek,
4983}; 5668};
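A sketch of toggling recording through tracing_on, which now also calls the instance's own current_trace->start()/stop(); the instance name "foo" and the mount point are assumptions:

/* Sketch: stop the top-level buffer, start an instance's buffer. */
#include <fcntl.h>
#include <unistd.h>

static void set_tracing_on(const char *file, char on)
{
	int fd = open(file, O_WRONLY);

	if (fd < 0)
		return;
	write(fd, &on, 1);	/* '0' stops the ring buffer, '1' restarts it */
	close(fd);
}

int main(void)
{
	set_tracing_on("/sys/kernel/debug/tracing/tracing_on", '0');
	set_tracing_on("/sys/kernel/debug/tracing/instances/foo/tracing_on", '1');
	return 0;
}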
4984 5669
5670struct dentry *trace_instance_dir;
5671
5672static void
5673init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
5674
5675static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf)
5676{
5677 int cpu;
5678
5679 for_each_tracing_cpu(cpu) {
5680 memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu));
5681 per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu;
5682 per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr;
5683 }
5684}
5685
5686static int
5687allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
5688{
5689 enum ring_buffer_flags rb_flags;
5690
5691 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5692
5693 buf->buffer = ring_buffer_alloc(size, rb_flags);
5694 if (!buf->buffer)
5695 return -ENOMEM;
5696
5697 buf->data = alloc_percpu(struct trace_array_cpu);
5698 if (!buf->data) {
5699 ring_buffer_free(buf->buffer);
5700 return -ENOMEM;
5701 }
5702
5703 init_trace_buffers(tr, buf);
5704
5705 /* Allocate the first page for all buffers */
5706 set_buffer_entries(&tr->trace_buffer,
5707 ring_buffer_size(tr->trace_buffer.buffer, 0));
5708
5709 return 0;
5710}
5711
5712static int allocate_trace_buffers(struct trace_array *tr, int size)
5713{
5714 int ret;
5715
5716 ret = allocate_trace_buffer(tr, &tr->trace_buffer, size);
5717 if (ret)
5718 return ret;
5719
5720#ifdef CONFIG_TRACER_MAX_TRACE
5721 ret = allocate_trace_buffer(tr, &tr->max_buffer,
5722 allocate_snapshot ? size : 1);
5723 if (WARN_ON(ret)) {
5724 ring_buffer_free(tr->trace_buffer.buffer);
5725 free_percpu(tr->trace_buffer.data);
5726 return -ENOMEM;
5727 }
5728 tr->allocated_snapshot = allocate_snapshot;
5729
5730 /*
5731 * Only the top level trace array gets its snapshot allocated
5732 * from the kernel command line.
5733 */
5734 allocate_snapshot = false;
5735#endif
5736 return 0;
5737}
5738
5739static int new_instance_create(const char *name)
5740{
5741 struct trace_array *tr;
5742 int ret;
5743
5744 mutex_lock(&trace_types_lock);
5745
5746 ret = -EEXIST;
5747 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
5748 if (tr->name && strcmp(tr->name, name) == 0)
5749 goto out_unlock;
5750 }
5751
5752 ret = -ENOMEM;
5753 tr = kzalloc(sizeof(*tr), GFP_KERNEL);
5754 if (!tr)
5755 goto out_unlock;
5756
5757 tr->name = kstrdup(name, GFP_KERNEL);
5758 if (!tr->name)
5759 goto out_free_tr;
5760
5761 raw_spin_lock_init(&tr->start_lock);
5762
5763 tr->current_trace = &nop_trace;
5764
5765 INIT_LIST_HEAD(&tr->systems);
5766 INIT_LIST_HEAD(&tr->events);
5767
5768 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
5769 goto out_free_tr;
5770
5771 /* Holder for file callbacks */
5772 tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
5773 tr->trace_cpu.tr = tr;
5774
5775 tr->dir = debugfs_create_dir(name, trace_instance_dir);
5776 if (!tr->dir)
5777 goto out_free_tr;
5778
5779 ret = event_trace_add_tracer(tr->dir, tr);
5780 if (ret)
5781 goto out_free_tr;
5782
5783 init_tracer_debugfs(tr, tr->dir);
5784
5785 list_add(&tr->list, &ftrace_trace_arrays);
5786
5787 mutex_unlock(&trace_types_lock);
5788
5789 return 0;
5790
5791 out_free_tr:
5792 if (tr->trace_buffer.buffer)
5793 ring_buffer_free(tr->trace_buffer.buffer);
5794 kfree(tr->name);
5795 kfree(tr);
5796
5797 out_unlock:
5798 mutex_unlock(&trace_types_lock);
5799
5800 return ret;
5801
5802}
5803
5804static int instance_delete(const char *name)
5805{
5806 struct trace_array *tr;
5807 int found = 0;
5808 int ret;
5809
5810 mutex_lock(&trace_types_lock);
5811
5812 ret = -ENODEV;
5813 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
5814 if (tr->name && strcmp(tr->name, name) == 0) {
5815 found = 1;
5816 break;
5817 }
5818 }
5819 if (!found)
5820 goto out_unlock;
5821
5822 ret = -EBUSY;
5823 if (tr->ref)
5824 goto out_unlock;
5825
5826 list_del(&tr->list);
5827
5828 event_trace_del_tracer(tr);
5829 debugfs_remove_recursive(tr->dir);
5830 free_percpu(tr->trace_buffer.data);
5831 ring_buffer_free(tr->trace_buffer.buffer);
5832
5833 kfree(tr->name);
5834 kfree(tr);
5835
5836 ret = 0;
5837
5838 out_unlock:
5839 mutex_unlock(&trace_types_lock);
5840
5841 return ret;
5842}
5843
5844static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
5845{
5846 struct dentry *parent;
5847 int ret;
5848
5849 /* Paranoid: Make sure the parent is the "instances" directory */
5850 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
5851 if (WARN_ON_ONCE(parent != trace_instance_dir))
5852 return -ENOENT;
5853
5854 /*
5855 * The inode mutex is locked, but debugfs_create_dir() will also
5856 * take the mutex. As the instances directory can not be destroyed
5857 * or changed in any other way, it is safe to unlock it, and
5858 * let the dentry try. If two users try to make the same dir at
5859 * the same time, then the new_instance_create() will determine the
5860 * winner.
5861 */
5862 mutex_unlock(&inode->i_mutex);
5863
5864 ret = new_instance_create(dentry->d_iname);
5865
5866 mutex_lock(&inode->i_mutex);
5867
5868 return ret;
5869}
5870
5871static int instance_rmdir(struct inode *inode, struct dentry *dentry)
5872{
5873 struct dentry *parent;
5874 int ret;
5875
5876 /* Paranoid: Make sure the parent is the "instances" directory */
5877 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
5878 if (WARN_ON_ONCE(parent != trace_instance_dir))
5879 return -ENOENT;
5880
5881 /* The caller did a dget() on dentry */
5882 mutex_unlock(&dentry->d_inode->i_mutex);
5883
5884 /*
5885 * The inode mutex is locked, but debugfs_remove_recursive() will also
5886 * take the mutex. As the instances directory can not be destroyed
5887 * or changed in any other way, it is safe to unlock it, and
5888 * let the dentry try. If two users try to remove the same dir at
5889 * the same time, then instance_delete() will determine the
5890 * winner.
5891 */
5892 mutex_unlock(&inode->i_mutex);
5893
5894 ret = instance_delete(dentry->d_iname);
5895
5896 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
5897 mutex_lock(&dentry->d_inode->i_mutex);
5898
5899 return ret;
5900}
5901
5902static const struct inode_operations instance_dir_inode_operations = {
5903 .lookup = simple_lookup,
5904 .mkdir = instance_mkdir,
5905 .rmdir = instance_rmdir,
5906};
5907
5908static __init void create_trace_instances(struct dentry *d_tracer)
5909{
5910 trace_instance_dir = debugfs_create_dir("instances", d_tracer);
5911 if (WARN_ON(!trace_instance_dir))
5912 return;
5913
5914 /* Hijack the dir inode operations, to allow mkdir */
5915 trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
5916}
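A user-space sketch of exercising the mkdir/rmdir hooks above; the instance name "foo" is arbitrary and the mount point is assumed:

/* Sketch: create and tear down a multi-buffer instance. */
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *dir = "/sys/kernel/debug/tracing/instances/foo";

	if (mkdir(dir, 0755) < 0)	/* ends up in instance_mkdir() -> new_instance_create() */
		perror("mkdir");

	/* The instance now has its own trace, trace_pipe, tracing_on, ... */

	if (rmdir(dir) < 0)		/* fails with EBUSY while tr->ref is held */
		perror("rmdir");
	return 0;
}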
5917
5918static void
5919init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
5920{
5921 int cpu;
5922
5923 trace_create_file("trace_options", 0644, d_tracer,
5924 tr, &tracing_iter_fops);
5925
5926 trace_create_file("trace", 0644, d_tracer,
5927 (void *)&tr->trace_cpu, &tracing_fops);
5928
5929 trace_create_file("trace_pipe", 0444, d_tracer,
5930 (void *)&tr->trace_cpu, &tracing_pipe_fops);
5931
5932 trace_create_file("buffer_size_kb", 0644, d_tracer,
5933 (void *)&tr->trace_cpu, &tracing_entries_fops);
5934
5935 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5936 tr, &tracing_total_entries_fops);
5937
5938 trace_create_file("free_buffer", 0644, d_tracer,
5939 tr, &tracing_free_buffer_fops);
5940
5941 trace_create_file("trace_marker", 0220, d_tracer,
5942 tr, &tracing_mark_fops);
5943
5944 trace_create_file("trace_clock", 0644, d_tracer, tr,
5945 &trace_clock_fops);
5946
5947 trace_create_file("tracing_on", 0644, d_tracer,
5948 tr, &rb_simple_fops);
5949
5950#ifdef CONFIG_TRACER_SNAPSHOT
5951 trace_create_file("snapshot", 0644, d_tracer,
5952 (void *)&tr->trace_cpu, &snapshot_fops);
5953#endif
5954
5955 for_each_tracing_cpu(cpu)
5956 tracing_init_debugfs_percpu(tr, cpu);
5957
5958}
5959
4985static __init int tracer_init_debugfs(void) 5960static __init int tracer_init_debugfs(void)
4986{ 5961{
4987 struct dentry *d_tracer; 5962 struct dentry *d_tracer;
4988 int cpu;
4989 5963
4990 trace_access_lock_init(); 5964 trace_access_lock_init();
4991 5965
4992 d_tracer = tracing_init_dentry(); 5966 d_tracer = tracing_init_dentry();
5967 if (!d_tracer)
5968 return 0;
4993 5969
4994 trace_create_file("trace_options", 0644, d_tracer, 5970 init_tracer_debugfs(&global_trace, d_tracer);
4995 NULL, &tracing_iter_fops);
4996 5971
4997 trace_create_file("tracing_cpumask", 0644, d_tracer, 5972 trace_create_file("tracing_cpumask", 0644, d_tracer,
4998 NULL, &tracing_cpumask_fops); 5973 &global_trace, &tracing_cpumask_fops);
4999
5000 trace_create_file("trace", 0644, d_tracer,
5001 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
5002 5974
5003 trace_create_file("available_tracers", 0444, d_tracer, 5975 trace_create_file("available_tracers", 0444, d_tracer,
5004 &global_trace, &show_traces_fops); 5976 &global_trace, &show_traces_fops);
@@ -5017,44 +5989,17 @@ static __init int tracer_init_debugfs(void)
5017 trace_create_file("README", 0444, d_tracer, 5989 trace_create_file("README", 0444, d_tracer,
5018 NULL, &tracing_readme_fops); 5990 NULL, &tracing_readme_fops);
5019 5991
5020 trace_create_file("trace_pipe", 0444, d_tracer,
5021 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
5022
5023 trace_create_file("buffer_size_kb", 0644, d_tracer,
5024 (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
5025
5026 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5027 &global_trace, &tracing_total_entries_fops);
5028
5029 trace_create_file("free_buffer", 0644, d_tracer,
5030 &global_trace, &tracing_free_buffer_fops);
5031
5032 trace_create_file("trace_marker", 0220, d_tracer,
5033 NULL, &tracing_mark_fops);
5034
5035 trace_create_file("saved_cmdlines", 0444, d_tracer, 5992 trace_create_file("saved_cmdlines", 0444, d_tracer,
5036 NULL, &tracing_saved_cmdlines_fops); 5993 NULL, &tracing_saved_cmdlines_fops);
5037 5994
5038 trace_create_file("trace_clock", 0644, d_tracer, NULL,
5039 &trace_clock_fops);
5040
5041 trace_create_file("tracing_on", 0644, d_tracer,
5042 &global_trace, &rb_simple_fops);
5043
5044#ifdef CONFIG_DYNAMIC_FTRACE 5995#ifdef CONFIG_DYNAMIC_FTRACE
5045 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 5996 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
5046 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 5997 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
5047#endif 5998#endif
5048 5999
5049#ifdef CONFIG_TRACER_SNAPSHOT 6000 create_trace_instances(d_tracer);
5050 trace_create_file("snapshot", 0644, d_tracer,
5051 (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
5052#endif
5053
5054 create_trace_options_dir();
5055 6001
5056 for_each_tracing_cpu(cpu) 6002 create_trace_options_dir(&global_trace);
5057 tracing_init_debugfs_percpu(cpu);
5058 6003
5059 return 0; 6004 return 0;
5060} 6005}
@@ -5110,8 +6055,8 @@ void
5110trace_printk_seq(struct trace_seq *s) 6055trace_printk_seq(struct trace_seq *s)
5111{ 6056{
5112 /* Probably should print a warning here. */ 6057 /* Probably should print a warning here. */
5113 if (s->len >= 1000) 6058 if (s->len >= TRACE_MAX_PRINT)
5114 s->len = 1000; 6059 s->len = TRACE_MAX_PRINT;
5115 6060
5116 /* should be zero ended, but we are paranoid. */ 6061 /* should be zero ended, but we are paranoid. */
5117 s->buffer[s->len] = 0; 6062 s->buffer[s->len] = 0;
@@ -5124,46 +6069,43 @@ trace_printk_seq(struct trace_seq *s)
5124void trace_init_global_iter(struct trace_iterator *iter) 6069void trace_init_global_iter(struct trace_iterator *iter)
5125{ 6070{
5126 iter->tr = &global_trace; 6071 iter->tr = &global_trace;
5127 iter->trace = current_trace; 6072 iter->trace = iter->tr->current_trace;
5128 iter->cpu_file = TRACE_PIPE_ALL_CPU; 6073 iter->cpu_file = RING_BUFFER_ALL_CPUS;
6074 iter->trace_buffer = &global_trace.trace_buffer;
5129} 6075}
5130 6076
5131static void 6077void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5132__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5133{ 6078{
5134 static arch_spinlock_t ftrace_dump_lock =
5135 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
5136 /* use static because iter can be a bit big for the stack */ 6079 /* use static because iter can be a bit big for the stack */
5137 static struct trace_iterator iter; 6080 static struct trace_iterator iter;
6081 static atomic_t dump_running;
5138 unsigned int old_userobj; 6082 unsigned int old_userobj;
5139 static int dump_ran;
5140 unsigned long flags; 6083 unsigned long flags;
5141 int cnt = 0, cpu; 6084 int cnt = 0, cpu;
5142 6085
5143 /* only one dump */ 6086 /* Only allow one dump user at a time. */
5144 local_irq_save(flags); 6087 if (atomic_inc_return(&dump_running) != 1) {
5145 arch_spin_lock(&ftrace_dump_lock); 6088 atomic_dec(&dump_running);
5146 if (dump_ran) 6089 return;
5147 goto out; 6090 }
5148
5149 dump_ran = 1;
5150 6091
6092 /*
6093 * Always turn off tracing when we dump.
6094 * We don't need to show trace output of what happens
6095 * between multiple crashes.
6096 *
6097 * If the user does a sysrq-z, then they can re-enable
6098 * tracing with echo 1 > tracing_on.
6099 */
5151 tracing_off(); 6100 tracing_off();
5152 6101
5153 /* Did function tracer already get disabled? */ 6102 local_irq_save(flags);
5154 if (ftrace_is_dead()) {
5155 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
5156 printk("# MAY BE MISSING FUNCTION EVENTS\n");
5157 }
5158
5159 if (disable_tracing)
5160 ftrace_kill();
5161 6103
5162 /* Simulate the iterator */ 6104 /* Simulate the iterator */
5163 trace_init_global_iter(&iter); 6105 trace_init_global_iter(&iter);
5164 6106
5165 for_each_tracing_cpu(cpu) { 6107 for_each_tracing_cpu(cpu) {
5166 atomic_inc(&iter.tr->data[cpu]->disabled); 6108 atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
5167 } 6109 }
5168 6110
5169 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 6111 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -5173,7 +6115,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5173 6115
5174 switch (oops_dump_mode) { 6116 switch (oops_dump_mode) {
5175 case DUMP_ALL: 6117 case DUMP_ALL:
5176 iter.cpu_file = TRACE_PIPE_ALL_CPU; 6118 iter.cpu_file = RING_BUFFER_ALL_CPUS;
5177 break; 6119 break;
5178 case DUMP_ORIG: 6120 case DUMP_ORIG:
5179 iter.cpu_file = raw_smp_processor_id(); 6121 iter.cpu_file = raw_smp_processor_id();
@@ -5182,11 +6124,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5182 goto out_enable; 6124 goto out_enable;
5183 default: 6125 default:
5184 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); 6126 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
5185 iter.cpu_file = TRACE_PIPE_ALL_CPU; 6127 iter.cpu_file = RING_BUFFER_ALL_CPUS;
5186 } 6128 }
5187 6129
5188 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 6130 printk(KERN_TRACE "Dumping ftrace buffer:\n");
5189 6131
6132 /* Did function tracer already get disabled? */
6133 if (ftrace_is_dead()) {
6134 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
6135 printk("# MAY BE MISSING FUNCTION EVENTS\n");
6136 }
6137
5190 /* 6138 /*
5191 * We need to stop all tracing on all CPUs to read 6139
5192 * the next buffer. This is a bit expensive, but is 6140
@@ -5226,33 +6174,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5226 printk(KERN_TRACE "---------------------------------\n"); 6174 printk(KERN_TRACE "---------------------------------\n");
5227 6175
5228 out_enable: 6176 out_enable:
5229 /* Re-enable tracing if requested */ 6177 trace_flags |= old_userobj;
5230 if (!disable_tracing) {
5231 trace_flags |= old_userobj;
5232 6178
5233 for_each_tracing_cpu(cpu) { 6179 for_each_tracing_cpu(cpu) {
5234 atomic_dec(&iter.tr->data[cpu]->disabled); 6180 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
5235 }
5236 tracing_on();
5237 } 6181 }
5238 6182 atomic_dec(&dump_running);
5239 out:
5240 arch_spin_unlock(&ftrace_dump_lock);
5241 local_irq_restore(flags); 6183 local_irq_restore(flags);
5242} 6184}
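One way to exercise this path from user space is the sysrq-z trigger mentioned in the comment above; a minimal sketch (assumes sysrq is enabled):

/* Sketch: ask the kernel to dump the ftrace buffer to the console. */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sysrq-trigger", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, "z", 1);	/* SysRq 'z' dumps the ftrace buffer */
	close(fd);
	return 0;
}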
5243
5244/* By default: disable tracing after the dump */
5245void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5246{
5247 __ftrace_dump(true, oops_dump_mode);
5248}
5249EXPORT_SYMBOL_GPL(ftrace_dump); 6185EXPORT_SYMBOL_GPL(ftrace_dump);
5250 6186
5251__init static int tracer_alloc_buffers(void) 6187__init static int tracer_alloc_buffers(void)
5252{ 6188{
5253 int ring_buf_size; 6189 int ring_buf_size;
5254 enum ring_buffer_flags rb_flags;
5255 int i;
5256 int ret = -ENOMEM; 6190 int ret = -ENOMEM;
5257 6191
5258 6192
@@ -5273,46 +6207,29 @@ __init static int tracer_alloc_buffers(void)
5273 else 6207 else
5274 ring_buf_size = 1; 6208 ring_buf_size = 1;
5275 6209
5276 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5277
5278 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 6210 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
5279 cpumask_copy(tracing_cpumask, cpu_all_mask); 6211 cpumask_copy(tracing_cpumask, cpu_all_mask);
5280 6212
6213 raw_spin_lock_init(&global_trace.start_lock);
6214
5281 /* TODO: make the number of buffers hot pluggable with CPUS */ 6215 /* TODO: make the number of buffers hot pluggable with CPUS */
5282 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); 6216 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
5283 if (!global_trace.buffer) {
5284 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6217 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
5285 WARN_ON(1); 6218 WARN_ON(1);
5286 goto out_free_cpumask; 6219 goto out_free_cpumask;
5287 } 6220 }
6221
5288 if (global_trace.buffer_disabled) 6222 if (global_trace.buffer_disabled)
5289 tracing_off(); 6223 tracing_off();
5290 6224
5291
5292#ifdef CONFIG_TRACER_MAX_TRACE
5293 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
5294 if (!max_tr.buffer) {
5295 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
5296 WARN_ON(1);
5297 ring_buffer_free(global_trace.buffer);
5298 goto out_free_cpumask;
5299 }
5300#endif
5301
5302 /* Allocate the first page for all buffers */
5303 for_each_tracing_cpu(i) {
5304 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
5305 max_tr.data[i] = &per_cpu(max_tr_data, i);
5306 }
5307
5308 set_buffer_entries(&global_trace,
5309 ring_buffer_size(global_trace.buffer, 0));
5310#ifdef CONFIG_TRACER_MAX_TRACE
5311 set_buffer_entries(&max_tr, 1);
5312#endif
5313
5314 trace_init_cmdlines(); 6225 trace_init_cmdlines();
5315 init_irq_work(&trace_work_wakeup, trace_wake_up); 6226
6227 /*
6228 * register_tracer() might reference current_trace, so it
6229 * needs to be set before we register anything. This is
6230 * just a bootstrap of current_trace anyway.
6231 */
6232 global_trace.current_trace = &nop_trace;
5316 6233
5317 register_tracer(&nop_trace); 6234 register_tracer(&nop_trace);
5318 6235
@@ -5324,16 +6241,32 @@ __init static int tracer_alloc_buffers(void)
5324 6241
5325 register_die_notifier(&trace_die_notifier); 6242 register_die_notifier(&trace_die_notifier);
5326 6243
6244 global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
6245
6246 /* Holder for file callbacks */
6247 global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
6248 global_trace.trace_cpu.tr = &global_trace;
6249
6250 INIT_LIST_HEAD(&global_trace.systems);
6251 INIT_LIST_HEAD(&global_trace.events);
6252 list_add(&global_trace.list, &ftrace_trace_arrays);
6253
5327 while (trace_boot_options) { 6254 while (trace_boot_options) {
5328 char *option; 6255 char *option;
5329 6256
5330 option = strsep(&trace_boot_options, ","); 6257 option = strsep(&trace_boot_options, ",");
5331 trace_set_options(option); 6258 trace_set_options(&global_trace, option);
5332 } 6259 }
5333 6260
6261 register_snapshot_cmd();
6262
5334 return 0; 6263 return 0;
5335 6264
5336out_free_cpumask: 6265out_free_cpumask:
6266 free_percpu(global_trace.trace_buffer.data);
6267#ifdef CONFIG_TRACER_MAX_TRACE
6268 free_percpu(global_trace.max_buffer.data);
6269#endif
5337 free_cpumask_var(tracing_cpumask); 6270 free_cpumask_var(tracing_cpumask);
5338out_free_buffer_mask: 6271out_free_buffer_mask:
5339 free_cpumask_var(tracing_buffer_mask); 6272 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 57d7e5397d56..711ca7d3e7f1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,11 @@
13#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
14#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
15 15
16#ifdef CONFIG_FTRACE_SYSCALLS
17#include <asm/unistd.h> /* For NR_SYSCALLS */
18#include <asm/syscall.h> /* some archs define it here */
19#endif
20
16enum trace_type { 21enum trace_type {
17 __TRACE_FIRST_TYPE = 0, 22 __TRACE_FIRST_TYPE = 0,
18 23
@@ -29,6 +34,7 @@ enum trace_type {
29 TRACE_GRAPH_ENT, 34 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK, 35 TRACE_USER_STACK,
31 TRACE_BLK, 36 TRACE_BLK,
37 TRACE_BPUTS,
32 38
33 __TRACE_LAST_TYPE, 39 __TRACE_LAST_TYPE,
34}; 40};
@@ -103,11 +109,6 @@ struct kretprobe_trace_entry_head {
103 unsigned long ret_ip; 109 unsigned long ret_ip;
104}; 110};
105 111
106struct uprobe_trace_entry_head {
107 struct trace_entry ent;
108 unsigned long ip;
109};
110
111/* 112/*
112 * trace_flag_type is an enumeration that holds different 113 * trace_flag_type is an enumeration that holds different
113 * states when a trace occurs. These are: 114 * states when a trace occurs. These are:
@@ -127,12 +128,21 @@ enum trace_flag_type {
127 128
128#define TRACE_BUF_SIZE 1024 129#define TRACE_BUF_SIZE 1024
129 130
131struct trace_array;
132
133struct trace_cpu {
134 struct trace_array *tr;
135 struct dentry *dir;
136 int cpu;
137};
138
130/* 139/*
131 * The CPU trace array - it consists of thousands of trace entries 140 * The CPU trace array - it consists of thousands of trace entries
132 * plus some other descriptor data: (for example which task started 141 * plus some other descriptor data: (for example which task started
133 * the trace, etc.) 142 * the trace, etc.)
134 */ 143 */
135struct trace_array_cpu { 144struct trace_array_cpu {
145 struct trace_cpu trace_cpu;
136 atomic_t disabled; 146 atomic_t disabled;
137 void *buffer_page; /* ring buffer spare */ 147 void *buffer_page; /* ring buffer spare */
138 148
@@ -151,20 +161,83 @@ struct trace_array_cpu {
151 char comm[TASK_COMM_LEN]; 161 char comm[TASK_COMM_LEN];
152}; 162};
153 163
164struct tracer;
165
166struct trace_buffer {
167 struct trace_array *tr;
168 struct ring_buffer *buffer;
169 struct trace_array_cpu __percpu *data;
170 cycle_t time_start;
171 int cpu;
172};
173
154/* 174/*
155 * The trace array - an array of per-CPU trace arrays. This is the 175 * The trace array - an array of per-CPU trace arrays. This is the
156 * highest level data structure that individual tracers deal with. 176 * highest level data structure that individual tracers deal with.
157 * They have on/off state as well: 177 * They have on/off state as well:
158 */ 178 */
159struct trace_array { 179struct trace_array {
160 struct ring_buffer *buffer; 180 struct list_head list;
161 int cpu; 181 char *name;
182 struct trace_buffer trace_buffer;
183#ifdef CONFIG_TRACER_MAX_TRACE
184 /*
185 * The max_buffer is used to snapshot the trace when a maximum
186 * latency is reached, or when the user initiates a snapshot.
187 * Some tracers will use this to store a maximum trace while
188 * it continues examining live traces.
189 *
190 * The buffers for the max_buffer are set up the same as the trace_buffer.
191 * When a snapshot is taken, the buffer of the max_buffer is swapped
192 * with the buffer of the trace_buffer and the buffers are reset for
193 * the trace_buffer so the tracing can continue.
194 */
195 struct trace_buffer max_buffer;
196 bool allocated_snapshot;
197#endif
162 int buffer_disabled; 198 int buffer_disabled;
163 cycle_t time_start; 199 struct trace_cpu trace_cpu; /* place holder */
200#ifdef CONFIG_FTRACE_SYSCALLS
201 int sys_refcount_enter;
202 int sys_refcount_exit;
203 DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
204 DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
205#endif
206 int stop_count;
207 int clock_id;
208 struct tracer *current_trace;
209 unsigned int flags;
210 raw_spinlock_t start_lock;
211 struct dentry *dir;
212 struct dentry *options;
213 struct dentry *percpu_dir;
214 struct dentry *event_dir;
215 struct list_head systems;
216 struct list_head events;
164 struct task_struct *waiter; 217 struct task_struct *waiter;
165 struct trace_array_cpu *data[NR_CPUS]; 218 int ref;
219};
220
221enum {
222 TRACE_ARRAY_FL_GLOBAL = (1 << 0)
166}; 223};
167 224
225extern struct list_head ftrace_trace_arrays;
226
227/*
228 * The global tracer (top) should be the first trace array added,
229 * but we check the flag anyway.
230 */
231static inline struct trace_array *top_trace_array(void)
232{
233 struct trace_array *tr;
234
235 tr = list_entry(ftrace_trace_arrays.prev,
236 typeof(*tr), list);
237 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
238 return tr;
239}
240
168#define FTRACE_CMP_TYPE(var, type) \ 241#define FTRACE_CMP_TYPE(var, type) \
169 __builtin_types_compatible_p(typeof(var), type *) 242 __builtin_types_compatible_p(typeof(var), type *)
170 243
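
As a conceptual aside on the max_buffer comment above (this sketch is not part of the patch; the real swap lives in the snapshot code in trace.c): when a snapshot fires, the two ring buffers trade places and the now-live buffer is reset, roughly as below, assuming CONFIG_TRACER_MAX_TRACE.

#ifdef CONFIG_TRACER_MAX_TRACE
/* Illustration only: swap the live buffer with the snapshot buffer,
 * then clear the live side so tracing continues into an empty buffer
 * while max_buffer preserves the captured trace. */
static void example_swap_snapshot(struct trace_array *tr)
{
	struct ring_buffer *buf = tr->trace_buffer.buffer;

	tr->trace_buffer.buffer = tr->max_buffer.buffer;
	tr->max_buffer.buffer = buf;

	tracing_reset_online_cpus(&tr->trace_buffer);
}
#endif
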
@@ -200,6 +273,7 @@ extern void __ftrace_bad_type(void);
200 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 273 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
201 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 274 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
202 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 275 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
276 IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
203 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 277 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
204 TRACE_MMIO_RW); \ 278 TRACE_MMIO_RW); \
205 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 279 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -283,11 +357,16 @@ struct tracer {
283 enum print_line_t (*print_line)(struct trace_iterator *iter); 357 enum print_line_t (*print_line)(struct trace_iterator *iter);
284 /* If you handled the flag setting, return 0 */ 358 /* If you handled the flag setting, return 0 */
285 int (*set_flag)(u32 old_flags, u32 bit, int set); 359 int (*set_flag)(u32 old_flags, u32 bit, int set);
360 /* Return 0 if OK with change, else return non-zero */
361 int (*flag_changed)(struct tracer *tracer,
362 u32 mask, int set);
286 struct tracer *next; 363 struct tracer *next;
287 struct tracer_flags *flags; 364 struct tracer_flags *flags;
288 bool print_max; 365 bool print_max;
366 bool enabled;
367#ifdef CONFIG_TRACER_MAX_TRACE
289 bool use_max_tr; 368 bool use_max_tr;
290 bool allocated_snapshot; 369#endif
291}; 370};
292 371
293 372
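
To make the new ->flag_changed() hook concrete, here is a minimal hypothetical handler (illustration only; the in-tree trace_keep_overwrite() declared later in this header plays a similar role). It assumes the existing TRACE_ITER_OVERWRITE iterator flag and simply refuses to clear it while the tracer is active.

/* Return 0 to accept the flag change, non-zero to reject it. */
static int example_flag_changed(struct tracer *tracer, u32 mask, int set)
{
	if (mask == TRACE_ITER_OVERWRITE && !set)
		return -1;	/* refuse to clear overwrite mode */

	return 0;
}
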
@@ -423,8 +502,6 @@ static __always_inline void trace_clear_recursion(int bit)
423 current->trace_recursion = val; 502 current->trace_recursion = val;
424} 503}
425 504
426#define TRACE_PIPE_ALL_CPU -1
427
428static inline struct ring_buffer_iter * 505static inline struct ring_buffer_iter *
429trace_buffer_iter(struct trace_iterator *iter, int cpu) 506trace_buffer_iter(struct trace_iterator *iter, int cpu)
430{ 507{
@@ -435,10 +512,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
435 512
436int tracer_init(struct tracer *t, struct trace_array *tr); 513int tracer_init(struct tracer *t, struct trace_array *tr);
437int tracing_is_enabled(void); 514int tracing_is_enabled(void);
438void tracing_reset(struct trace_array *tr, int cpu); 515void tracing_reset(struct trace_buffer *buf, int cpu);
439void tracing_reset_online_cpus(struct trace_array *tr); 516void tracing_reset_online_cpus(struct trace_buffer *buf);
440void tracing_reset_current(int cpu); 517void tracing_reset_current(int cpu);
441void tracing_reset_current_online_cpus(void); 518void tracing_reset_all_online_cpus(void);
442int tracing_open_generic(struct inode *inode, struct file *filp); 519int tracing_open_generic(struct inode *inode, struct file *filp);
443struct dentry *trace_create_file(const char *name, 520struct dentry *trace_create_file(const char *name,
444 umode_t mode, 521 umode_t mode,
@@ -446,6 +523,7 @@ struct dentry *trace_create_file(const char *name,
446 void *data, 523 void *data,
447 const struct file_operations *fops); 524 const struct file_operations *fops);
448 525
526struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
449struct dentry *tracing_init_dentry(void); 527struct dentry *tracing_init_dentry(void);
450 528
451struct ring_buffer_event; 529struct ring_buffer_event;
@@ -579,7 +657,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
579#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 657#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
580extern int DYN_FTRACE_TEST_NAME2(void); 658extern int DYN_FTRACE_TEST_NAME2(void);
581 659
582extern int ring_buffer_expanded; 660extern bool ring_buffer_expanded;
583extern bool tracing_selftest_disabled; 661extern bool tracing_selftest_disabled;
584DECLARE_PER_CPU(int, ftrace_cpu_disabled); 662DECLARE_PER_CPU(int, ftrace_cpu_disabled);
585 663
@@ -615,6 +693,8 @@ trace_array_vprintk(struct trace_array *tr,
615 unsigned long ip, const char *fmt, va_list args); 693 unsigned long ip, const char *fmt, va_list args);
616int trace_array_printk(struct trace_array *tr, 694int trace_array_printk(struct trace_array *tr,
617 unsigned long ip, const char *fmt, ...); 695 unsigned long ip, const char *fmt, ...);
696int trace_array_printk_buf(struct ring_buffer *buffer,
697 unsigned long ip, const char *fmt, ...);
618void trace_printk_seq(struct trace_seq *s); 698void trace_printk_seq(struct trace_seq *s);
619enum print_line_t print_trace_line(struct trace_iterator *iter); 699enum print_line_t print_trace_line(struct trace_iterator *iter);
620 700
@@ -782,6 +862,7 @@ enum trace_iterator_flags {
782 TRACE_ITER_STOP_ON_FREE = 0x400000, 862 TRACE_ITER_STOP_ON_FREE = 0x400000,
783 TRACE_ITER_IRQ_INFO = 0x800000, 863 TRACE_ITER_IRQ_INFO = 0x800000,
784 TRACE_ITER_MARKERS = 0x1000000, 864 TRACE_ITER_MARKERS = 0x1000000,
865 TRACE_ITER_FUNCTION = 0x2000000,
785}; 866};
786 867
787/* 868/*
@@ -828,8 +909,8 @@ enum {
828 909
829struct ftrace_event_field { 910struct ftrace_event_field {
830 struct list_head link; 911 struct list_head link;
831 char *name; 912 const char *name;
832 char *type; 913 const char *type;
833 int filter_type; 914 int filter_type;
834 int offset; 915 int offset;
835 int size; 916 int size;
@@ -847,12 +928,19 @@ struct event_filter {
847struct event_subsystem { 928struct event_subsystem {
848 struct list_head list; 929 struct list_head list;
849 const char *name; 930 const char *name;
850 struct dentry *entry;
851 struct event_filter *filter; 931 struct event_filter *filter;
852 int nr_events;
853 int ref_count; 932 int ref_count;
854}; 933};
855 934
935struct ftrace_subsystem_dir {
936 struct list_head list;
937 struct event_subsystem *subsystem;
938 struct trace_array *tr;
939 struct dentry *entry;
940 int ref_count;
941 int nr_events;
942};
943
856#define FILTER_PRED_INVALID ((unsigned short)-1) 944#define FILTER_PRED_INVALID ((unsigned short)-1)
857#define FILTER_PRED_IS_RIGHT (1 << 15) 945#define FILTER_PRED_IS_RIGHT (1 << 15)
858#define FILTER_PRED_FOLD (1 << 15) 946#define FILTER_PRED_FOLD (1 << 15)
@@ -902,22 +990,20 @@ struct filter_pred {
902 unsigned short right; 990 unsigned short right;
903}; 991};
904 992
905extern struct list_head ftrace_common_fields;
906
907extern enum regex_type 993extern enum regex_type
908filter_parse_regex(char *buff, int len, char **search, int *not); 994filter_parse_regex(char *buff, int len, char **search, int *not);
909extern void print_event_filter(struct ftrace_event_call *call, 995extern void print_event_filter(struct ftrace_event_call *call,
910 struct trace_seq *s); 996 struct trace_seq *s);
911extern int apply_event_filter(struct ftrace_event_call *call, 997extern int apply_event_filter(struct ftrace_event_call *call,
912 char *filter_string); 998 char *filter_string);
913extern int apply_subsystem_event_filter(struct event_subsystem *system, 999extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
914 char *filter_string); 1000 char *filter_string);
915extern void print_subsystem_event_filter(struct event_subsystem *system, 1001extern void print_subsystem_event_filter(struct event_subsystem *system,
916 struct trace_seq *s); 1002 struct trace_seq *s);
917extern int filter_assign_type(const char *type); 1003extern int filter_assign_type(const char *type);
918 1004
919struct list_head * 1005struct ftrace_event_field *
920trace_get_fields(struct ftrace_event_call *event_call); 1006trace_find_event_field(struct ftrace_event_call *call, char *name);
921 1007
922static inline int 1008static inline int
923filter_check_discard(struct ftrace_event_call *call, void *rec, 1009filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -934,6 +1020,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
934} 1020}
935 1021
936extern void trace_event_enable_cmd_record(bool enable); 1022extern void trace_event_enable_cmd_record(bool enable);
1023extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1024extern int event_trace_del_tracer(struct trace_array *tr);
937 1025
938extern struct mutex event_mutex; 1026extern struct mutex event_mutex;
939extern struct list_head ftrace_events; 1027extern struct list_head ftrace_events;
@@ -943,6 +1031,19 @@ extern const char *__stop___trace_bprintk_fmt[];
943 1031
944void trace_printk_init_buffers(void); 1032void trace_printk_init_buffers(void);
945void trace_printk_start_comm(void); 1033void trace_printk_start_comm(void);
1034int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
1035int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
1036
1037/*
1038 * Normal trace_printk() and friends allocate special buffers
1039 * to do the manipulation, as well as save the print formats
1040 * into sections to display. But the trace infrastructure wants
1041 * to use these without the added overhead, at the price of being
1042 * a bit slower (used mainly for warnings, where we don't care
1043 * about performance). internal_trace_puts() exists for such
1044 * a purpose.
1045 */
1046#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
946 1047
947#undef FTRACE_ENTRY 1048#undef FTRACE_ENTRY
948#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 1049#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
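
A quick illustration of the new helper (not part of the patch): tracing-internal code that wants to drop a fixed warning string into the ring buffer can call internal_trace_puts() directly, which expands to __trace_puts(_THIS_IP_, str, strlen(str)) as defined above.

static void example_internal_warning(void)
{
	/* Cheap one-shot message from inside the tracing core itself */
	internal_trace_puts("*** example: tracer internal warning ***\n");
}
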
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 95e96842ed29..d594da0dc03c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
32{ 32{
33 struct ftrace_event_call *call = &event_branch; 33 struct ftrace_event_call *call = &event_branch;
34 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
35 struct trace_array_cpu *data;
35 struct ring_buffer_event *event; 36 struct ring_buffer_event *event;
36 struct trace_branch *entry; 37 struct trace_branch *entry;
37 struct ring_buffer *buffer; 38 struct ring_buffer *buffer;
@@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
51 52
52 local_irq_save(flags); 53 local_irq_save(flags);
53 cpu = raw_smp_processor_id(); 54 cpu = raw_smp_processor_id();
54 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 55 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
56 if (atomic_inc_return(&data->disabled) != 1)
55 goto out; 57 goto out;
56 58
57 pc = preempt_count(); 59 pc = preempt_count();
58 buffer = tr->buffer; 60 buffer = tr->trace_buffer.buffer;
59 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, 61 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
60 sizeof(*entry), flags, pc); 62 sizeof(*entry), flags, pc);
61 if (!event) 63 if (!event)
@@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
80 __buffer_unlock_commit(buffer, event); 82 __buffer_unlock_commit(buffer, event);
81 83
82 out: 84 out:
83 atomic_dec(&tr->data[cpu]->disabled); 85 atomic_dec(&data->disabled);
84 local_irq_restore(flags); 86 local_irq_restore(flags);
85} 87}
86 88
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index aa8f5f48dae6..26dc348332b7 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -57,6 +57,16 @@ u64 notrace trace_clock(void)
57 return local_clock(); 57 return local_clock();
58} 58}
59 59
60/*
61 * trace_clock_jiffies(): Simply use jiffies as a clock counter.
62 */
63u64 notrace trace_clock_jiffies(void)
64{
65 u64 jiffy = jiffies - INITIAL_JIFFIES;
66
67 /* Return nsecs */
68 return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
69}
60 70
61/* 71/*
62 * trace_clock_global(): special globally coherent trace clock 72 * trace_clock_global(): special globally coherent trace clock
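
As a sanity check on the conversion in trace_clock_jiffies() above (illustration only, with an assumed CONFIG_HZ=1000): a delta of 5 jiffies is jiffies_to_usecs(5) = 5000 us, and multiplying by 1000ULL gives 5,000,000 ns.

/* Hypothetical helper showing the same conversion on a jiffy delta. */
static u64 example_jiffy_delta_ns(unsigned long start, unsigned long now)
{
	return (u64)jiffies_to_usecs(now - start) * 1000ULL;
}
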
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 4108e1250ca2..e2d027ac66a2 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry,
223 __dynamic_array( u32, buf ) 223 __dynamic_array( u32, buf )
224 ), 224 ),
225 225
226 F_printk("%08lx fmt:%p", 226 F_printk("%pf: %s",
227 __entry->ip, __entry->fmt), 227 (void *)__entry->ip, __entry->fmt),
228 228
229 FILTER_OTHER 229 FILTER_OTHER
230); 230);
@@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry,
238 __dynamic_array( char, buf ) 238 __dynamic_array( char, buf )
239 ), 239 ),
240 240
241 F_printk("%08lx %s", 241 F_printk("%pf: %s",
242 __entry->ip, __entry->buf), 242 (void *)__entry->ip, __entry->buf),
243
244 FILTER_OTHER
245);
246
247FTRACE_ENTRY(bputs, bputs_entry,
248
249 TRACE_BPUTS,
250
251 F_STRUCT(
252 __field( unsigned long, ip )
253 __field( const char *, str )
254 ),
255
256 F_printk("%pf: %s",
257 (void *)__entry->ip, __entry->str),
243 258
244 FILTER_OTHER 259 FILTER_OTHER
245); 260);
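
For reference (a sketch, not literal patch content): once the FTRACE_ENTRY() machinery in trace.h processes the bputs entry above, the record carried for TRACE_BPUTS is roughly the following.

struct bputs_entry {
	struct trace_entry	ent;
	unsigned long		ip;	/* call site, printed via %pf */
	const char		*str;	/* pointer to a persistent string */
};
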
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 57e9b284250c..27963e2bf4bf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage); 34EXPORT_SYMBOL_GPL(event_storage);
35 35
36LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
37LIST_HEAD(ftrace_common_fields); 37static LIST_HEAD(ftrace_common_fields);
38 38
39struct list_head * 39#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
40
41static struct kmem_cache *field_cachep;
42static struct kmem_cache *file_cachep;
43
44/* Double loops, do not use break, only goto's work */
45#define do_for_each_event_file(tr, file) \
46 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
47 list_for_each_entry(file, &tr->events, list)
48
49#define do_for_each_event_file_safe(tr, file) \
50 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
51 struct ftrace_event_file *___n; \
52 list_for_each_entry_safe(file, ___n, &tr->events, list)
53
54#define while_for_each_event_file() \
55 }
56
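
A minimal illustration (not in the patch) of why the comment above warns against break: the macros expand to two nested list_for_each_entry() loops, so break only leaves the inner one, while a goto exits the whole walk.

/* Hypothetical helper; callers would hold event_mutex. */
static bool example_call_is_used(struct ftrace_event_call *call)
{
	struct ftrace_event_file *file;
	struct trace_array *tr;

	do_for_each_event_file(tr, file) {
		if (file->event_call == call)
			goto found;	/* break would only end the inner loop */
	} while_for_each_event_file();

	return false;
found:
	return true;
}
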
57static struct list_head *
40trace_get_fields(struct ftrace_event_call *event_call) 58trace_get_fields(struct ftrace_event_call *event_call)
41{ 59{
42 if (!event_call->class->get_fields) 60 if (!event_call->class->get_fields)
@@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call)
44 return event_call->class->get_fields(event_call); 62 return event_call->class->get_fields(event_call);
45} 63}
46 64
65static struct ftrace_event_field *
66__find_event_field(struct list_head *head, char *name)
67{
68 struct ftrace_event_field *field;
69
70 list_for_each_entry(field, head, link) {
71 if (!strcmp(field->name, name))
72 return field;
73 }
74
75 return NULL;
76}
77
78struct ftrace_event_field *
79trace_find_event_field(struct ftrace_event_call *call, char *name)
80{
81 struct ftrace_event_field *field;
82 struct list_head *head;
83
84 field = __find_event_field(&ftrace_common_fields, name);
85 if (field)
86 return field;
87
88 head = trace_get_fields(call);
89 return __find_event_field(head, name);
90}
91
47static int __trace_define_field(struct list_head *head, const char *type, 92static int __trace_define_field(struct list_head *head, const char *type,
48 const char *name, int offset, int size, 93 const char *name, int offset, int size,
49 int is_signed, int filter_type) 94 int is_signed, int filter_type)
50{ 95{
51 struct ftrace_event_field *field; 96 struct ftrace_event_field *field;
52 97
53 field = kzalloc(sizeof(*field), GFP_KERNEL); 98 field = kmem_cache_alloc(field_cachep, GFP_TRACE);
54 if (!field) 99 if (!field)
55 goto err; 100 goto err;
56 101
57 field->name = kstrdup(name, GFP_KERNEL); 102 field->name = name;
58 if (!field->name) 103 field->type = type;
59 goto err;
60
61 field->type = kstrdup(type, GFP_KERNEL);
62 if (!field->type)
63 goto err;
64 104
65 if (filter_type == FILTER_OTHER) 105 if (filter_type == FILTER_OTHER)
66 field->filter_type = filter_assign_type(type); 106 field->filter_type = filter_assign_type(type);
@@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
76 return 0; 116 return 0;
77 117
78err: 118err:
79 if (field) 119 kmem_cache_free(field_cachep, field);
80 kfree(field->name);
81 kfree(field);
82 120
83 return -ENOMEM; 121 return -ENOMEM;
84} 122}
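
As a usage sketch for the trace_find_event_field() helper added above (illustration only): filter code can resolve a field by name, with the common fields consulted first, and then use the descriptor's offset and size to pull the value out of a recorded entry.

static int example_field_offset(struct ftrace_event_call *call)
{
	struct ftrace_event_field *field;

	field = trace_find_event_field(call, "common_pid");
	if (!field)
		return -ENOENT;

	return field->offset;	/* byte offset within the trace record */
}
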
@@ -120,7 +158,7 @@ static int trace_define_common_fields(void)
120 return ret; 158 return ret;
121} 159}
122 160
123void trace_destroy_fields(struct ftrace_event_call *call) 161static void trace_destroy_fields(struct ftrace_event_call *call)
124{ 162{
125 struct ftrace_event_field *field, *next; 163 struct ftrace_event_field *field, *next;
126 struct list_head *head; 164 struct list_head *head;
@@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)
128 head = trace_get_fields(call); 166 head = trace_get_fields(call);
129 list_for_each_entry_safe(field, next, head, link) { 167 list_for_each_entry_safe(field, next, head, link) {
130 list_del(&field->link); 168 list_del(&field->link);
131 kfree(field->type); 169 kmem_cache_free(field_cachep, field);
132 kfree(field->name);
133 kfree(field);
134 } 170 }
135} 171}
136 172
@@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);
149int ftrace_event_reg(struct ftrace_event_call *call, 185int ftrace_event_reg(struct ftrace_event_call *call,
150 enum trace_reg type, void *data) 186 enum trace_reg type, void *data)
151{ 187{
188 struct ftrace_event_file *file = data;
189
152 switch (type) { 190 switch (type) {
153 case TRACE_REG_REGISTER: 191 case TRACE_REG_REGISTER:
154 return tracepoint_probe_register(call->name, 192 return tracepoint_probe_register(call->name,
155 call->class->probe, 193 call->class->probe,
156 call); 194 file);
157 case TRACE_REG_UNREGISTER: 195 case TRACE_REG_UNREGISTER:
158 tracepoint_probe_unregister(call->name, 196 tracepoint_probe_unregister(call->name,
159 call->class->probe, 197 call->class->probe,
160 call); 198 file);
161 return 0; 199 return 0;
162 200
163#ifdef CONFIG_PERF_EVENTS 201#ifdef CONFIG_PERF_EVENTS
@@ -183,54 +221,106 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);
183 221
184void trace_event_enable_cmd_record(bool enable) 222void trace_event_enable_cmd_record(bool enable)
185{ 223{
186 struct ftrace_event_call *call; 224 struct ftrace_event_file *file;
225 struct trace_array *tr;
187 226
188 mutex_lock(&event_mutex); 227 mutex_lock(&event_mutex);
189 list_for_each_entry(call, &ftrace_events, list) { 228 do_for_each_event_file(tr, file) {
190 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) 229
230 if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
191 continue; 231 continue;
192 232
193 if (enable) { 233 if (enable) {
194 tracing_start_cmdline_record(); 234 tracing_start_cmdline_record();
195 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 235 set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
196 } else { 236 } else {
197 tracing_stop_cmdline_record(); 237 tracing_stop_cmdline_record();
198 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 238 clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
199 } 239 }
200 } 240 } while_for_each_event_file();
201 mutex_unlock(&event_mutex); 241 mutex_unlock(&event_mutex);
202} 242}
203 243
204static int ftrace_event_enable_disable(struct ftrace_event_call *call, 244static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
205 int enable) 245 int enable, int soft_disable)
206{ 246{
247 struct ftrace_event_call *call = file->event_call;
207 int ret = 0; 248 int ret = 0;
249 int disable;
208 250
209 switch (enable) { 251 switch (enable) {
210 case 0: 252 case 0:
211 if (call->flags & TRACE_EVENT_FL_ENABLED) { 253 /*
212 call->flags &= ~TRACE_EVENT_FL_ENABLED; 254 * When soft_disable is set and enable is cleared, the sm_ref
213 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { 255 * reference counter is decremented. If it reaches 0, we want
256 * to clear the SOFT_DISABLED flag but leave the event in the
257 * state that it was. That is, if the event was enabled and
258 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
259 * is set we do not want the event to be enabled before we
260 * clear the bit.
261 *
262 * When soft_disable is not set but the SOFT_MODE flag is,
263 * we do nothing. Do not disable the tracepoint, otherwise
264 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
265 */
266 if (soft_disable) {
267 if (atomic_dec_return(&file->sm_ref) > 0)
268 break;
269 disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
270 clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
271 } else
272 disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
273
274 if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
275 clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
276 if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
214 tracing_stop_cmdline_record(); 277 tracing_stop_cmdline_record();
215 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 278 clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
216 } 279 }
217 call->class->reg(call, TRACE_REG_UNREGISTER, NULL); 280 call->class->reg(call, TRACE_REG_UNREGISTER, file);
218 } 281 }
282 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */
283 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
284 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
219 break; 285 break;
220 case 1: 286 case 1:
221 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { 287 /*
288 * When soft_disable is set and enable is set, we want to
289 * register the tracepoint for the event, but leave the event
290 * as is. That means, if the event was already enabled, we do
291 * nothing (but set SOFT_MODE). If the event is disabled, we
292 * set SOFT_DISABLED before enabling the event tracepoint, so
293 * it still seems to be disabled.
294 */
295 if (!soft_disable)
296 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
297 else {
298 if (atomic_inc_return(&file->sm_ref) > 1)
299 break;
300 set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
301 }
302
303 if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
304
305 /* Keep the event disabled, when going to SOFT_MODE. */
306 if (soft_disable)
307 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
308
222 if (trace_flags & TRACE_ITER_RECORD_CMD) { 309 if (trace_flags & TRACE_ITER_RECORD_CMD) {
223 tracing_start_cmdline_record(); 310 tracing_start_cmdline_record();
224 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 311 set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
225 } 312 }
226 ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); 313 ret = call->class->reg(call, TRACE_REG_REGISTER, file);
227 if (ret) { 314 if (ret) {
228 tracing_stop_cmdline_record(); 315 tracing_stop_cmdline_record();
229 pr_info("event trace: Could not enable event " 316 pr_info("event trace: Could not enable event "
230 "%s\n", call->name); 317 "%s\n", call->name);
231 break; 318 break;
232 } 319 }
233 call->flags |= TRACE_EVENT_FL_ENABLED; 320 set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
321
322 /* WAS_ENABLED gets set but never cleared. */
323 call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
234 } 324 }
235 break; 325 break;
236 } 326 }
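
The soft-disable protocol described in the comments above can be summarized with a conceptual sketch (not part of the patch) of a hypothetical consumer inside this file that wants to arm an event ahead of time and then toggle its visibility cheaply; callers hold event_mutex.

static void example_soft_enable_cycle(struct ftrace_event_file *file)
{
	/*
	 * Register the tracepoint but keep output suppressed:
	 * SOFT_MODE is set, and SOFT_DISABLED is set if the event
	 * was not already enabled the normal way.
	 */
	__ftrace_event_enable_disable(file, 1, 1);

	/* "Soft enable": let events through without touching the tracepoint. */
	clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);

	/* Suppress output again. */
	set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);

	/*
	 * Drop the soft reference; once sm_ref hits zero, SOFT_MODE is
	 * cleared and the event is left in whatever state normal
	 * enabling put it in.
	 */
	__ftrace_event_enable_disable(file, 0, 1);
}
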
@@ -238,13 +328,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
238 return ret; 328 return ret;
239} 329}
240 330
241static void ftrace_clear_events(void) 331static int ftrace_event_enable_disable(struct ftrace_event_file *file,
332 int enable)
242{ 333{
243 struct ftrace_event_call *call; 334 return __ftrace_event_enable_disable(file, enable, 0);
335}
336
337static void ftrace_clear_events(struct trace_array *tr)
338{
339 struct ftrace_event_file *file;
244 340
245 mutex_lock(&event_mutex); 341 mutex_lock(&event_mutex);
246 list_for_each_entry(call, &ftrace_events, list) { 342 list_for_each_entry(file, &tr->events, list) {
247 ftrace_event_enable_disable(call, 0); 343 ftrace_event_enable_disable(file, 0);
248 } 344 }
249 mutex_unlock(&event_mutex); 345 mutex_unlock(&event_mutex);
250} 346}
@@ -257,11 +353,12 @@ static void __put_system(struct event_subsystem *system)
257 if (--system->ref_count) 353 if (--system->ref_count)
258 return; 354 return;
259 355
356 list_del(&system->list);
357
260 if (filter) { 358 if (filter) {
261 kfree(filter->filter_string); 359 kfree(filter->filter_string);
262 kfree(filter); 360 kfree(filter);
263 } 361 }
264 kfree(system->name);
265 kfree(system); 362 kfree(system);
266} 363}
267 364
@@ -271,24 +368,45 @@ static void __get_system(struct event_subsystem *system)
271 system->ref_count++; 368 system->ref_count++;
272} 369}
273 370
274static void put_system(struct event_subsystem *system) 371static void __get_system_dir(struct ftrace_subsystem_dir *dir)
372{
373 WARN_ON_ONCE(dir->ref_count == 0);
374 dir->ref_count++;
375 __get_system(dir->subsystem);
376}
377
378static void __put_system_dir(struct ftrace_subsystem_dir *dir)
379{
380 WARN_ON_ONCE(dir->ref_count == 0);
381 /* If the subsystem is about to be freed, the dir must be too */
382 WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1);
383
384 __put_system(dir->subsystem);
385 if (!--dir->ref_count)
386 kfree(dir);
387}
388
389static void put_system(struct ftrace_subsystem_dir *dir)
275{ 390{
276 mutex_lock(&event_mutex); 391 mutex_lock(&event_mutex);
277 __put_system(system); 392 __put_system_dir(dir);
278 mutex_unlock(&event_mutex); 393 mutex_unlock(&event_mutex);
279} 394}
280 395
281/* 396/*
282 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 397 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
283 */ 398 */
284static int __ftrace_set_clr_event(const char *match, const char *sub, 399static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
285 const char *event, int set) 400 const char *sub, const char *event, int set)
286{ 401{
402 struct ftrace_event_file *file;
287 struct ftrace_event_call *call; 403 struct ftrace_event_call *call;
288 int ret = -EINVAL; 404 int ret = -EINVAL;
289 405
290 mutex_lock(&event_mutex); 406 mutex_lock(&event_mutex);
291 list_for_each_entry(call, &ftrace_events, list) { 407 list_for_each_entry(file, &tr->events, list) {
408
409 call = file->event_call;
292 410
293 if (!call->name || !call->class || !call->class->reg) 411 if (!call->name || !call->class || !call->class->reg)
294 continue; 412 continue;
@@ -307,7 +425,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
307 if (event && strcmp(event, call->name) != 0) 425 if (event && strcmp(event, call->name) != 0)
308 continue; 426 continue;
309 427
310 ftrace_event_enable_disable(call, set); 428 ftrace_event_enable_disable(file, set);
311 429
312 ret = 0; 430 ret = 0;
313 } 431 }
@@ -316,7 +434,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
316 return ret; 434 return ret;
317} 435}
318 436
319static int ftrace_set_clr_event(char *buf, int set) 437static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
320{ 438{
321 char *event = NULL, *sub = NULL, *match; 439 char *event = NULL, *sub = NULL, *match;
322 440
@@ -344,7 +462,7 @@ static int ftrace_set_clr_event(char *buf, int set)
344 event = NULL; 462 event = NULL;
345 } 463 }
346 464
347 return __ftrace_set_clr_event(match, sub, event, set); 465 return __ftrace_set_clr_event(tr, match, sub, event, set);
348} 466}
349 467
350/** 468/**
@@ -361,7 +479,9 @@ static int ftrace_set_clr_event(char *buf, int set)
361 */ 479 */
362int trace_set_clr_event(const char *system, const char *event, int set) 480int trace_set_clr_event(const char *system, const char *event, int set)
363{ 481{
364 return __ftrace_set_clr_event(NULL, system, event, set); 482 struct trace_array *tr = top_trace_array();
483
484 return __ftrace_set_clr_event(tr, NULL, system, event, set);
365} 485}
366EXPORT_SYMBOL_GPL(trace_set_clr_event); 486EXPORT_SYMBOL_GPL(trace_set_clr_event);
367 487
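
Illustration only: with the reworked internals, the exported trace_set_clr_event() still gives other kernel code a one-call way to flip an event in the top-level trace array, for example:

static int example_enable_sched_switch(void)
{
	/* Enable the single event sched:sched_switch in the top instance */
	return trace_set_clr_event("sched", "sched_switch", 1);
}
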
@@ -373,6 +493,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
373 size_t cnt, loff_t *ppos) 493 size_t cnt, loff_t *ppos)
374{ 494{
375 struct trace_parser parser; 495 struct trace_parser parser;
496 struct seq_file *m = file->private_data;
497 struct trace_array *tr = m->private;
376 ssize_t read, ret; 498 ssize_t read, ret;
377 499
378 if (!cnt) 500 if (!cnt)
@@ -395,7 +517,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
395 517
396 parser.buffer[parser.idx] = 0; 518 parser.buffer[parser.idx] = 0;
397 519
398 ret = ftrace_set_clr_event(parser.buffer + !set, set); 520 ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
399 if (ret) 521 if (ret)
400 goto out_put; 522 goto out_put;
401 } 523 }
@@ -411,17 +533,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
411static void * 533static void *
412t_next(struct seq_file *m, void *v, loff_t *pos) 534t_next(struct seq_file *m, void *v, loff_t *pos)
413{ 535{
414 struct ftrace_event_call *call = v; 536 struct ftrace_event_file *file = v;
537 struct ftrace_event_call *call;
538 struct trace_array *tr = m->private;
415 539
416 (*pos)++; 540 (*pos)++;
417 541
418 list_for_each_entry_continue(call, &ftrace_events, list) { 542 list_for_each_entry_continue(file, &tr->events, list) {
543 call = file->event_call;
419 /* 544 /*
420 * The ftrace subsystem is for showing formats only. 545 * The ftrace subsystem is for showing formats only.
421 * They can not be enabled or disabled via the event files. 546 * They can not be enabled or disabled via the event files.
422 */ 547 */
423 if (call->class && call->class->reg) 548 if (call->class && call->class->reg)
424 return call; 549 return file;
425 } 550 }
426 551
427 return NULL; 552 return NULL;
@@ -429,30 +554,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
429 554
430static void *t_start(struct seq_file *m, loff_t *pos) 555static void *t_start(struct seq_file *m, loff_t *pos)
431{ 556{
432 struct ftrace_event_call *call; 557 struct ftrace_event_file *file;
558 struct trace_array *tr = m->private;
433 loff_t l; 559 loff_t l;
434 560
435 mutex_lock(&event_mutex); 561 mutex_lock(&event_mutex);
436 562
437 call = list_entry(&ftrace_events, struct ftrace_event_call, list); 563 file = list_entry(&tr->events, struct ftrace_event_file, list);
438 for (l = 0; l <= *pos; ) { 564 for (l = 0; l <= *pos; ) {
439 call = t_next(m, call, &l); 565 file = t_next(m, file, &l);
440 if (!call) 566 if (!file)
441 break; 567 break;
442 } 568 }
443 return call; 569 return file;
444} 570}
445 571
446static void * 572static void *
447s_next(struct seq_file *m, void *v, loff_t *pos) 573s_next(struct seq_file *m, void *v, loff_t *pos)
448{ 574{
449 struct ftrace_event_call *call = v; 575 struct ftrace_event_file *file = v;
576 struct trace_array *tr = m->private;
450 577
451 (*pos)++; 578 (*pos)++;
452 579
453 list_for_each_entry_continue(call, &ftrace_events, list) { 580 list_for_each_entry_continue(file, &tr->events, list) {
454 if (call->flags & TRACE_EVENT_FL_ENABLED) 581 if (file->flags & FTRACE_EVENT_FL_ENABLED)
455 return call; 582 return file;
456 } 583 }
457 584
458 return NULL; 585 return NULL;
@@ -460,23 +587,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
460 587
461static void *s_start(struct seq_file *m, loff_t *pos) 588static void *s_start(struct seq_file *m, loff_t *pos)
462{ 589{
463 struct ftrace_event_call *call; 590 struct ftrace_event_file *file;
591 struct trace_array *tr = m->private;
464 loff_t l; 592 loff_t l;
465 593
466 mutex_lock(&event_mutex); 594 mutex_lock(&event_mutex);
467 595
468 call = list_entry(&ftrace_events, struct ftrace_event_call, list); 596 file = list_entry(&tr->events, struct ftrace_event_file, list);
469 for (l = 0; l <= *pos; ) { 597 for (l = 0; l <= *pos; ) {
470 call = s_next(m, call, &l); 598 file = s_next(m, file, &l);
471 if (!call) 599 if (!file)
472 break; 600 break;
473 } 601 }
474 return call; 602 return file;
475} 603}
476 604
477static int t_show(struct seq_file *m, void *v) 605static int t_show(struct seq_file *m, void *v)
478{ 606{
479 struct ftrace_event_call *call = v; 607 struct ftrace_event_file *file = v;
608 struct ftrace_event_call *call = file->event_call;
480 609
481 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) 610 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
482 seq_printf(m, "%s:", call->class->system); 611 seq_printf(m, "%s:", call->class->system);
@@ -494,25 +623,33 @@ static ssize_t
494event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 623event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
495 loff_t *ppos) 624 loff_t *ppos)
496{ 625{
497 struct ftrace_event_call *call = filp->private_data; 626 struct ftrace_event_file *file = filp->private_data;
498 char *buf; 627 char *buf;
499 628
500 if (call->flags & TRACE_EVENT_FL_ENABLED) 629 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
501 buf = "1\n"; 630 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)
502 else 631 buf = "0*\n";
632 else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
633 buf = "1*\n";
634 else
635 buf = "1\n";
636 } else
503 buf = "0\n"; 637 buf = "0\n";
504 638
505 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); 639 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
506} 640}
507 641
508static ssize_t 642static ssize_t
509event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 643event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
510 loff_t *ppos) 644 loff_t *ppos)
511{ 645{
512 struct ftrace_event_call *call = filp->private_data; 646 struct ftrace_event_file *file = filp->private_data;
513 unsigned long val; 647 unsigned long val;
514 int ret; 648 int ret;
515 649
650 if (!file)
651 return -EINVAL;
652
516 ret = kstrtoul_from_user(ubuf, cnt, 10, &val); 653 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
517 if (ret) 654 if (ret)
518 return ret; 655 return ret;
@@ -525,7 +662,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
525 case 0: 662 case 0:
526 case 1: 663 case 1:
527 mutex_lock(&event_mutex); 664 mutex_lock(&event_mutex);
528 ret = ftrace_event_enable_disable(call, val); 665 ret = ftrace_event_enable_disable(file, val);
529 mutex_unlock(&event_mutex); 666 mutex_unlock(&event_mutex);
530 break; 667 break;
531 668
@@ -543,14 +680,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
543 loff_t *ppos) 680 loff_t *ppos)
544{ 681{
545 const char set_to_char[4] = { '?', '0', '1', 'X' }; 682 const char set_to_char[4] = { '?', '0', '1', 'X' };
546 struct event_subsystem *system = filp->private_data; 683 struct ftrace_subsystem_dir *dir = filp->private_data;
684 struct event_subsystem *system = dir->subsystem;
547 struct ftrace_event_call *call; 685 struct ftrace_event_call *call;
686 struct ftrace_event_file *file;
687 struct trace_array *tr = dir->tr;
548 char buf[2]; 688 char buf[2];
549 int set = 0; 689 int set = 0;
550 int ret; 690 int ret;
551 691
552 mutex_lock(&event_mutex); 692 mutex_lock(&event_mutex);
553 list_for_each_entry(call, &ftrace_events, list) { 693 list_for_each_entry(file, &tr->events, list) {
694 call = file->event_call;
554 if (!call->name || !call->class || !call->class->reg) 695 if (!call->name || !call->class || !call->class->reg)
555 continue; 696 continue;
556 697
@@ -562,7 +703,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
562 * or if all events are cleared, or if we have 703
563 * a mixture. 704 * a mixture.
564 */ 705 */
565 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); 706 set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
566 707
567 /* 708 /*
568 * If we have a mixture, no need to look further. 709 * If we have a mixture, no need to look further.
@@ -584,7 +725,8 @@ static ssize_t
584system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 725system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
585 loff_t *ppos) 726 loff_t *ppos)
586{ 727{
587 struct event_subsystem *system = filp->private_data; 728 struct ftrace_subsystem_dir *dir = filp->private_data;
729 struct event_subsystem *system = dir->subsystem;
588 const char *name = NULL; 730 const char *name = NULL;
589 unsigned long val; 731 unsigned long val;
590 ssize_t ret; 732 ssize_t ret;
@@ -607,7 +749,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
607 if (system) 749 if (system)
608 name = system->name; 750 name = system->name;
609 751
610 ret = __ftrace_set_clr_event(NULL, name, NULL, val); 752 ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
611 if (ret) 753 if (ret)
612 goto out; 754 goto out;
613 755
@@ -845,43 +987,75 @@ static LIST_HEAD(event_subsystems);
845static int subsystem_open(struct inode *inode, struct file *filp) 987static int subsystem_open(struct inode *inode, struct file *filp)
846{ 988{
847 struct event_subsystem *system = NULL; 989 struct event_subsystem *system = NULL;
990 struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
991 struct trace_array *tr;
848 int ret; 992 int ret;
849 993
850 if (!inode->i_private)
851 goto skip_search;
852
853 /* Make sure the system still exists */ 994 /* Make sure the system still exists */
854 mutex_lock(&event_mutex); 995 mutex_lock(&event_mutex);
855 list_for_each_entry(system, &event_subsystems, list) { 996 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
856 if (system == inode->i_private) { 997 list_for_each_entry(dir, &tr->systems, list) {
857 /* Don't open systems with no events */ 998 if (dir == inode->i_private) {
858 if (!system->nr_events) { 999 /* Don't open systems with no events */
859 system = NULL; 1000 if (dir->nr_events) {
860 break; 1001 __get_system_dir(dir);
1002 system = dir->subsystem;
1003 }
1004 goto exit_loop;
861 } 1005 }
862 __get_system(system);
863 break;
864 } 1006 }
865 } 1007 }
1008 exit_loop:
866 mutex_unlock(&event_mutex); 1009 mutex_unlock(&event_mutex);
867 1010
868 if (system != inode->i_private) 1011 if (!system)
869 return -ENODEV; 1012 return -ENODEV;
870 1013
871 skip_search: 1014 /* Some versions of gcc think dir can be uninitialized here */
1015 WARN_ON(!dir);
1016
1017 ret = tracing_open_generic(inode, filp);
1018 if (ret < 0)
1019 put_system(dir);
1020
1021 return ret;
1022}
1023
1024static int system_tr_open(struct inode *inode, struct file *filp)
1025{
1026 struct ftrace_subsystem_dir *dir;
1027 struct trace_array *tr = inode->i_private;
1028 int ret;
1029
1030 /* Make a temporary dir that has no system but points to tr */
1031 dir = kzalloc(sizeof(*dir), GFP_KERNEL);
1032 if (!dir)
1033 return -ENOMEM;
1034
1035 dir->tr = tr;
1036
872 ret = tracing_open_generic(inode, filp); 1037 ret = tracing_open_generic(inode, filp);
873 if (ret < 0 && system) 1038 if (ret < 0)
874 put_system(system); 1039 kfree(dir);
1040
1041 filp->private_data = dir;
875 1042
876 return ret; 1043 return ret;
877} 1044}
878 1045
879static int subsystem_release(struct inode *inode, struct file *file) 1046static int subsystem_release(struct inode *inode, struct file *file)
880{ 1047{
881 struct event_subsystem *system = inode->i_private; 1048 struct ftrace_subsystem_dir *dir = file->private_data;
882 1049
883 if (system) 1050 /*
884 put_system(system); 1051 * If dir->subsystem is NULL, then this is a temporary
1052 * descriptor that was made for a trace_array to enable
1053 * all subsystems.
1054 */
1055 if (dir->subsystem)
1056 put_system(dir);
1057 else
1058 kfree(dir);
885 1059
886 return 0; 1060 return 0;
887} 1061}
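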
@@ -890,7 +1064,8 @@ static ssize_t
890subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 1064subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
891 loff_t *ppos) 1065 loff_t *ppos)
892{ 1066{
893 struct event_subsystem *system = filp->private_data; 1067 struct ftrace_subsystem_dir *dir = filp->private_data;
1068 struct event_subsystem *system = dir->subsystem;
894 struct trace_seq *s; 1069 struct trace_seq *s;
895 int r; 1070 int r;
896 1071
@@ -915,7 +1090,7 @@ static ssize_t
915subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, 1090subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
916 loff_t *ppos) 1091 loff_t *ppos)
917{ 1092{
918 struct event_subsystem *system = filp->private_data; 1093 struct ftrace_subsystem_dir *dir = filp->private_data;
919 char *buf; 1094 char *buf;
920 int err; 1095 int err;
921 1096
@@ -932,7 +1107,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
932 } 1107 }
933 buf[cnt] = '\0'; 1108 buf[cnt] = '\0';
934 1109
935 err = apply_subsystem_event_filter(system, buf); 1110 err = apply_subsystem_event_filter(dir, buf);
936 free_page((unsigned long) buf); 1111 free_page((unsigned long) buf);
937 if (err < 0) 1112 if (err < 0)
938 return err; 1113 return err;
@@ -1041,30 +1216,35 @@ static const struct file_operations ftrace_system_enable_fops = {
1041 .release = subsystem_release, 1216 .release = subsystem_release,
1042}; 1217};
1043 1218
1219static const struct file_operations ftrace_tr_enable_fops = {
1220 .open = system_tr_open,
1221 .read = system_enable_read,
1222 .write = system_enable_write,
1223 .llseek = default_llseek,
1224 .release = subsystem_release,
1225};
1226
1044static const struct file_operations ftrace_show_header_fops = { 1227static const struct file_operations ftrace_show_header_fops = {
1045 .open = tracing_open_generic, 1228 .open = tracing_open_generic,
1046 .read = show_header, 1229 .read = show_header,
1047 .llseek = default_llseek, 1230 .llseek = default_llseek,
1048}; 1231};
1049 1232
1050static struct dentry *event_trace_events_dir(void) 1233static int
1234ftrace_event_open(struct inode *inode, struct file *file,
1235 const struct seq_operations *seq_ops)
1051{ 1236{
1052 static struct dentry *d_tracer; 1237 struct seq_file *m;
1053 static struct dentry *d_events; 1238 int ret;
1054
1055 if (d_events)
1056 return d_events;
1057
1058 d_tracer = tracing_init_dentry();
1059 if (!d_tracer)
1060 return NULL;
1061 1239
1062 d_events = debugfs_create_dir("events", d_tracer); 1240 ret = seq_open(file, seq_ops);
1063 if (!d_events) 1241 if (ret < 0)
1064 pr_warning("Could not create debugfs " 1242 return ret;
1065 "'events' directory\n"); 1243 m = file->private_data;
1244 /* copy tr over to seq ops */
1245 m->private = inode->i_private;
1066 1246
1067 return d_events; 1247 return ret;
1068} 1248}
1069 1249
1070static int 1250static int
@@ -1072,117 +1252,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file)
1072{ 1252{
1073 const struct seq_operations *seq_ops = &show_event_seq_ops; 1253 const struct seq_operations *seq_ops = &show_event_seq_ops;
1074 1254
1075 return seq_open(file, seq_ops); 1255 return ftrace_event_open(inode, file, seq_ops);
1076} 1256}
1077 1257
1078static int 1258static int
1079ftrace_event_set_open(struct inode *inode, struct file *file) 1259ftrace_event_set_open(struct inode *inode, struct file *file)
1080{ 1260{
1081 const struct seq_operations *seq_ops = &show_set_event_seq_ops; 1261 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1262 struct trace_array *tr = inode->i_private;
1082 1263
1083 if ((file->f_mode & FMODE_WRITE) && 1264 if ((file->f_mode & FMODE_WRITE) &&
1084 (file->f_flags & O_TRUNC)) 1265 (file->f_flags & O_TRUNC))
1085 ftrace_clear_events(); 1266 ftrace_clear_events(tr);
1086 1267
1087 return seq_open(file, seq_ops); 1268 return ftrace_event_open(inode, file, seq_ops);
1269}
1270
1271static struct event_subsystem *
1272create_new_subsystem(const char *name)
1273{
1274 struct event_subsystem *system;
1275
1276 /* need to create new entry */
1277 system = kmalloc(sizeof(*system), GFP_KERNEL);
1278 if (!system)
1279 return NULL;
1280
1281 system->ref_count = 1;
1282 system->name = name;
1283
1284 system->filter = NULL;
1285
1286 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
1287 if (!system->filter)
1288 goto out_free;
1289
1290 list_add(&system->list, &event_subsystems);
1291
1292 return system;
1293
1294 out_free:
1295 kfree(system);
1296 return NULL;
1088} 1297}
1089 1298
1090static struct dentry * 1299static struct dentry *
1091event_subsystem_dir(const char *name, struct dentry *d_events) 1300event_subsystem_dir(struct trace_array *tr, const char *name,
1301 struct ftrace_event_file *file, struct dentry *parent)
1092{ 1302{
1303 struct ftrace_subsystem_dir *dir;
1093 struct event_subsystem *system; 1304 struct event_subsystem *system;
1094 struct dentry *entry; 1305 struct dentry *entry;
1095 1306
1096 /* First see if we did not already create this dir */ 1307 /* First see if we did not already create this dir */
1097 list_for_each_entry(system, &event_subsystems, list) { 1308 list_for_each_entry(dir, &tr->systems, list) {
1309 system = dir->subsystem;
1098 if (strcmp(system->name, name) == 0) { 1310 if (strcmp(system->name, name) == 0) {
1099 system->nr_events++; 1311 dir->nr_events++;
1100 return system->entry; 1312 file->system = dir;
1313 return dir->entry;
1101 } 1314 }
1102 } 1315 }
1103 1316
1104 /* need to create new entry */ 1317 /* Now see if the system itself exists. */
1105 system = kmalloc(sizeof(*system), GFP_KERNEL); 1318 list_for_each_entry(system, &event_subsystems, list) {
1106 if (!system) { 1319 if (strcmp(system->name, name) == 0)
1107 pr_warning("No memory to create event subsystem %s\n", 1320 break;
1108 name);
1109 return d_events;
1110 } 1321 }
1322 /* Reset system variable when not found */
1323 if (&system->list == &event_subsystems)
1324 system = NULL;
1111 1325
1112 system->entry = debugfs_create_dir(name, d_events); 1326 dir = kmalloc(sizeof(*dir), GFP_KERNEL);
1113 if (!system->entry) { 1327 if (!dir)
1114 pr_warning("Could not create event subsystem %s\n", 1328 goto out_fail;
1115 name);
1116 kfree(system);
1117 return d_events;
1118 }
1119 1329
1120 system->nr_events = 1; 1330 if (!system) {
1121 system->ref_count = 1; 1331 system = create_new_subsystem(name);
1122 system->name = kstrdup(name, GFP_KERNEL); 1332 if (!system)
1123 if (!system->name) { 1333 goto out_free;
1124 debugfs_remove(system->entry); 1334 } else
1125 kfree(system); 1335 __get_system(system);
1126 return d_events; 1336
1337 dir->entry = debugfs_create_dir(name, parent);
1338 if (!dir->entry) {
1339 pr_warning("Failed to create system directory %s\n", name);
1340 __put_system(system);
1341 goto out_free;
1127 } 1342 }
1128 1343
1129 list_add(&system->list, &event_subsystems); 1344 dir->tr = tr;
1130 1345 dir->ref_count = 1;
1131 system->filter = NULL; 1346 dir->nr_events = 1;
1132 1347 dir->subsystem = system;
1133 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); 1348 file->system = dir;
1134 if (!system->filter) {
1135 pr_warning("Could not allocate filter for subsystem "
1136 "'%s'\n", name);
1137 return system->entry;
1138 }
1139 1349
1140 entry = debugfs_create_file("filter", 0644, system->entry, system, 1350 entry = debugfs_create_file("filter", 0644, dir->entry, dir,
1141 &ftrace_subsystem_filter_fops); 1351 &ftrace_subsystem_filter_fops);
1142 if (!entry) { 1352 if (!entry) {
1143 kfree(system->filter); 1353 kfree(system->filter);
1144 system->filter = NULL; 1354 system->filter = NULL;
1145 pr_warning("Could not create debugfs " 1355 pr_warning("Could not create debugfs '%s/filter' entry\n", name);
1146 "'%s/filter' entry\n", name);
1147 } 1356 }
1148 1357
1149 trace_create_file("enable", 0644, system->entry, system, 1358 trace_create_file("enable", 0644, dir->entry, dir,
1150 &ftrace_system_enable_fops); 1359 &ftrace_system_enable_fops);
1151 1360
1152 return system->entry; 1361 list_add(&dir->list, &tr->systems);
1362
1363 return dir->entry;
1364
1365 out_free:
1366 kfree(dir);
1367 out_fail:
1368 /* Only print this message if the failure was a memory allocation */
1369 if (!dir || !system)
1370 pr_warning("No memory to create event subsystem %s\n",
1371 name);
1372 return NULL;
1153} 1373}
1154 1374
1155static int 1375static int
1156event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, 1376event_create_dir(struct dentry *parent,
1377 struct ftrace_event_file *file,
1157 const struct file_operations *id, 1378 const struct file_operations *id,
1158 const struct file_operations *enable, 1379 const struct file_operations *enable,
1159 const struct file_operations *filter, 1380 const struct file_operations *filter,
1160 const struct file_operations *format) 1381 const struct file_operations *format)
1161{ 1382{
1383 struct ftrace_event_call *call = file->event_call;
1384 struct trace_array *tr = file->tr;
1162 struct list_head *head; 1385 struct list_head *head;
1386 struct dentry *d_events;
1163 int ret; 1387 int ret;
1164 1388
1165 /* 1389 /*
1166 * If the trace point header did not define TRACE_SYSTEM 1390 * If the trace point header did not define TRACE_SYSTEM
1167 * then the system would be called "TRACE_SYSTEM". 1391 * then the system would be called "TRACE_SYSTEM".
1168 */ 1392 */
1169 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) 1393 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
1170 d_events = event_subsystem_dir(call->class->system, d_events); 1394 d_events = event_subsystem_dir(tr, call->class->system, file, parent);
1171 1395 if (!d_events)
1172 call->dir = debugfs_create_dir(call->name, d_events); 1396 return -ENOMEM;
1173 if (!call->dir) { 1397 } else
1174 pr_warning("Could not create debugfs " 1398 d_events = parent;
1175 "'%s' directory\n", call->name); 1399
1400 file->dir = debugfs_create_dir(call->name, d_events);
1401 if (!file->dir) {
1402 pr_warning("Could not create debugfs '%s' directory\n",
1403 call->name);
1176 return -1; 1404 return -1;
1177 } 1405 }
1178 1406
1179 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) 1407 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1180 trace_create_file("enable", 0644, call->dir, call, 1408 trace_create_file("enable", 0644, file->dir, file,
1181 enable); 1409 enable);
1182 1410
1183#ifdef CONFIG_PERF_EVENTS 1411#ifdef CONFIG_PERF_EVENTS
1184 if (call->event.type && call->class->reg) 1412 if (call->event.type && call->class->reg)
1185 trace_create_file("id", 0444, call->dir, call, 1413 trace_create_file("id", 0444, file->dir, call,
1186 id); 1414 id);
1187#endif 1415#endif
1188 1416
@@ -1196,23 +1424,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1196 if (ret < 0) { 1424 if (ret < 0) {
1197 pr_warning("Could not initialize trace point" 1425 pr_warning("Could not initialize trace point"
1198 " events/%s\n", call->name); 1426 " events/%s\n", call->name);
1199 return ret; 1427 return -1;
1200 } 1428 }
1201 } 1429 }
1202 trace_create_file("filter", 0644, call->dir, call, 1430 trace_create_file("filter", 0644, file->dir, call,
1203 filter); 1431 filter);
1204 1432
1205 trace_create_file("format", 0444, call->dir, call, 1433 trace_create_file("format", 0444, file->dir, call,
1206 format); 1434 format);
1207 1435
1208 return 0; 1436 return 0;
1209} 1437}
1210 1438
1439static void remove_subsystem(struct ftrace_subsystem_dir *dir)
1440{
1441 if (!dir)
1442 return;
1443
1444 if (!--dir->nr_events) {
1445 debugfs_remove_recursive(dir->entry);
1446 list_del(&dir->list);
1447 __put_system_dir(dir);
1448 }
1449}
1450
1451static void remove_event_from_tracers(struct ftrace_event_call *call)
1452{
1453 struct ftrace_event_file *file;
1454 struct trace_array *tr;
1455
1456 do_for_each_event_file_safe(tr, file) {
1457
1458 if (file->event_call != call)
1459 continue;
1460
1461 list_del(&file->list);
1462 debugfs_remove_recursive(file->dir);
1463 remove_subsystem(file->system);
1464 kmem_cache_free(file_cachep, file);
1465
1466 /*
1467 * The do_for_each_event_file_safe() is
1468 * a double loop. After finding the call for this
1469 * trace_array, we use break to jump to the next
1470 * trace_array.
1471 */
1472 break;
1473 } while_for_each_event_file();
1474}
1475
1211static void event_remove(struct ftrace_event_call *call) 1476static void event_remove(struct ftrace_event_call *call)
1212{ 1477{
1213 ftrace_event_enable_disable(call, 0); 1478 struct trace_array *tr;
1479 struct ftrace_event_file *file;
1480
1481 do_for_each_event_file(tr, file) {
1482 if (file->event_call != call)
1483 continue;
1484 ftrace_event_enable_disable(file, 0);
1485 /*
1486 * The do_for_each_event_file() is
1487 * a double loop. After finding the call for this
1488 * trace_array, we use break to jump to the next
1489 * trace_array.
1490 */
1491 break;
1492 } while_for_each_event_file();
1493
1214 if (call->event.funcs) 1494 if (call->event.funcs)
1215 __unregister_ftrace_event(&call->event); 1495 __unregister_ftrace_event(&call->event);
1496 remove_event_from_tracers(call);
1216 list_del(&call->list); 1497 list_del(&call->list);
1217} 1498}
1218 1499
@@ -1234,82 +1515,109 @@ static int event_init(struct ftrace_event_call *call)
1234} 1515}
1235 1516
1236static int 1517static int
1237__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, 1518__register_event(struct ftrace_event_call *call, struct module *mod)
1238 const struct file_operations *id,
1239 const struct file_operations *enable,
1240 const struct file_operations *filter,
1241 const struct file_operations *format)
1242{ 1519{
1243 struct dentry *d_events;
1244 int ret; 1520 int ret;
1245 1521
1246 ret = event_init(call); 1522 ret = event_init(call);
1247 if (ret < 0) 1523 if (ret < 0)
1248 return ret; 1524 return ret;
1249 1525
1250 d_events = event_trace_events_dir(); 1526 list_add(&call->list, &ftrace_events);
1251 if (!d_events)
1252 return -ENOENT;
1253
1254 ret = event_create_dir(call, d_events, id, enable, filter, format);
1255 if (!ret)
1256 list_add(&call->list, &ftrace_events);
1257 call->mod = mod; 1527 call->mod = mod;
1258 1528
1259 return ret; 1529 return 0;
1530}
1531
1532static struct ftrace_event_file *
1533trace_create_new_event(struct ftrace_event_call *call,
1534 struct trace_array *tr)
1535{
1536 struct ftrace_event_file *file;
1537
1538 file = kmem_cache_alloc(file_cachep, GFP_TRACE);
1539 if (!file)
1540 return NULL;
1541
1542 file->event_call = call;
1543 file->tr = tr;
1544 atomic_set(&file->sm_ref, 0);
1545 list_add(&file->list, &tr->events);
1546
1547 return file;
1260} 1548}
1261 1549
1550/* Add an event to a trace directory */
1551static int
1552__trace_add_new_event(struct ftrace_event_call *call,
1553 struct trace_array *tr,
1554 const struct file_operations *id,
1555 const struct file_operations *enable,
1556 const struct file_operations *filter,
1557 const struct file_operations *format)
1558{
1559 struct ftrace_event_file *file;
1560
1561 file = trace_create_new_event(call, tr);
1562 if (!file)
1563 return -ENOMEM;
1564
1565 return event_create_dir(tr->event_dir, file, id, enable, filter, format);
1566}
1567
1568/*
1569 * Just create a descriptor for early init. A descriptor is required
1570 * for enabling events at boot. We want to enable events before
1571 * the filesystem is initialized.
1572 */
1573static __init int
1574__trace_early_add_new_event(struct ftrace_event_call *call,
1575 struct trace_array *tr)
1576{
1577 struct ftrace_event_file *file;
1578
1579 file = trace_create_new_event(call, tr);
1580 if (!file)
1581 return -ENOMEM;
1582
1583 return 0;
1584}
1585
1586struct ftrace_module_file_ops;
1587static void __add_event_to_tracers(struct ftrace_event_call *call,
1588 struct ftrace_module_file_ops *file_ops);
1589
1262/* Add an additional event_call dynamically */ 1590/* Add an additional event_call dynamically */
1263int trace_add_event_call(struct ftrace_event_call *call) 1591int trace_add_event_call(struct ftrace_event_call *call)
1264{ 1592{
1265 int ret; 1593 int ret;
1266 mutex_lock(&event_mutex); 1594 mutex_lock(&event_mutex);
1267 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1268 &ftrace_enable_fops,
1269 &ftrace_event_filter_fops,
1270 &ftrace_event_format_fops);
1271 mutex_unlock(&event_mutex);
1272 return ret;
1273}
1274
1275static void remove_subsystem_dir(const char *name)
1276{
1277 struct event_subsystem *system;
1278 1595
1279 if (strcmp(name, TRACE_SYSTEM) == 0) 1596 ret = __register_event(call, NULL);
1280 return; 1597 if (ret >= 0)
1598 __add_event_to_tracers(call, NULL);
1281 1599
1282 list_for_each_entry(system, &event_subsystems, list) { 1600 mutex_unlock(&event_mutex);
1283 if (strcmp(system->name, name) == 0) { 1601 return ret;
1284 if (!--system->nr_events) {
1285 debugfs_remove_recursive(system->entry);
1286 list_del(&system->list);
1287 __put_system(system);
1288 }
1289 break;
1290 }
1291 }
1292} 1602}
1293 1603
1294/* 1604/*
1295 * Must be called under locking both of event_mutex and trace_event_mutex. 1605 * Must be called under locking both of event_mutex and trace_event_sem.
1296 */ 1606 */
1297static void __trace_remove_event_call(struct ftrace_event_call *call) 1607static void __trace_remove_event_call(struct ftrace_event_call *call)
1298{ 1608{
1299 event_remove(call); 1609 event_remove(call);
1300 trace_destroy_fields(call); 1610 trace_destroy_fields(call);
1301 destroy_preds(call); 1611 destroy_preds(call);
1302 debugfs_remove_recursive(call->dir);
1303 remove_subsystem_dir(call->class->system);
1304} 1612}
1305 1613
1306/* Remove an event_call */ 1614/* Remove an event_call */
1307void trace_remove_event_call(struct ftrace_event_call *call) 1615void trace_remove_event_call(struct ftrace_event_call *call)
1308{ 1616{
1309 mutex_lock(&event_mutex); 1617 mutex_lock(&event_mutex);
1310 down_write(&trace_event_mutex); 1618 down_write(&trace_event_sem);
1311 __trace_remove_event_call(call); 1619 __trace_remove_event_call(call);
1312 up_write(&trace_event_mutex); 1620 up_write(&trace_event_sem);
1313 mutex_unlock(&event_mutex); 1621 mutex_unlock(&event_mutex);
1314} 1622}
1315 1623
@@ -1336,6 +1644,26 @@ struct ftrace_module_file_ops {
1336}; 1644};
1337 1645
1338static struct ftrace_module_file_ops * 1646static struct ftrace_module_file_ops *
1647find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1648{
1649 /*
1650 * As event_calls are added in groups by module,
1651 * when we find one file_ops, we don't need to search for
1652 * each call in that module, as the rest should be the
1653 * same. Only search for a new one if the last one did
1654 * not match.
1655 */
1656 if (file_ops && mod == file_ops->mod)
1657 return file_ops;
1658
1659 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1660 if (file_ops->mod == mod)
1661 return file_ops;
1662 }
1663 return NULL;
1664}
1665
1666static struct ftrace_module_file_ops *
1339trace_create_file_ops(struct module *mod) 1667trace_create_file_ops(struct module *mod)
1340{ 1668{
1341 struct ftrace_module_file_ops *file_ops; 1669 struct ftrace_module_file_ops *file_ops;
@@ -1386,9 +1714,8 @@ static void trace_module_add_events(struct module *mod)
1386 return; 1714 return;
1387 1715
1388 for_each_event(call, start, end) { 1716 for_each_event(call, start, end) {
1389 __trace_add_event_call(*call, mod, 1717 __register_event(*call, mod);
1390 &file_ops->id, &file_ops->enable, 1718 __add_event_to_tracers(*call, file_ops);
1391 &file_ops->filter, &file_ops->format);
1392 } 1719 }
1393} 1720}
1394 1721
@@ -1396,12 +1723,13 @@ static void trace_module_remove_events(struct module *mod)
1396{ 1723{
1397 struct ftrace_module_file_ops *file_ops; 1724 struct ftrace_module_file_ops *file_ops;
1398 struct ftrace_event_call *call, *p; 1725 struct ftrace_event_call *call, *p;
1399 bool found = false; 1726 bool clear_trace = false;
1400 1727
1401 down_write(&trace_event_mutex); 1728 down_write(&trace_event_sem);
1402 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1729 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1403 if (call->mod == mod) { 1730 if (call->mod == mod) {
1404 found = true; 1731 if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
1732 clear_trace = true;
1405 __trace_remove_event_call(call); 1733 __trace_remove_event_call(call);
1406 } 1734 }
1407 } 1735 }
@@ -1415,14 +1743,18 @@ static void trace_module_remove_events(struct module *mod)
1415 list_del(&file_ops->list); 1743 list_del(&file_ops->list);
1416 kfree(file_ops); 1744 kfree(file_ops);
1417 } 1745 }
1746 up_write(&trace_event_sem);
1418 1747
1419 /* 1748 /*
1420 * It is safest to reset the ring buffer if the module being unloaded 1749 * It is safest to reset the ring buffer if the module being unloaded
1421 * registered any events. 1750 * registered any events that were used. The only worry is if
1751 * a new module gets loaded, and takes on the same id as the events
1752 * of this module. When printing out the buffer, traced events left
1753 * over from this module may be passed to the new module events and
1754 * unexpected results may occur.
1422 */ 1755 */
1423 if (found) 1756 if (clear_trace)
1424 tracing_reset_current_online_cpus(); 1757 tracing_reset_all_online_cpus();
1425 up_write(&trace_event_mutex);
1426} 1758}
1427 1759
1428static int trace_module_notify(struct notifier_block *self, 1760static int trace_module_notify(struct notifier_block *self,
@@ -1443,14 +1775,445 @@ static int trace_module_notify(struct notifier_block *self,
1443 1775
1444 return 0; 1776 return 0;
1445} 1777}
1778
1779static int
1780__trace_add_new_mod_event(struct ftrace_event_call *call,
1781 struct trace_array *tr,
1782 struct ftrace_module_file_ops *file_ops)
1783{
1784 return __trace_add_new_event(call, tr,
1785 &file_ops->id, &file_ops->enable,
1786 &file_ops->filter, &file_ops->format);
1787}
1788
1446#else 1789#else
1447static int trace_module_notify(struct notifier_block *self, 1790static inline struct ftrace_module_file_ops *
1448 unsigned long val, void *data) 1791find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1792{
1793 return NULL;
1794}
1795static inline int trace_module_notify(struct notifier_block *self,
1796 unsigned long val, void *data)
1449{ 1797{
1450 return 0; 1798 return 0;
1451} 1799}
1800static inline int
1801__trace_add_new_mod_event(struct ftrace_event_call *call,
1802 struct trace_array *tr,
1803 struct ftrace_module_file_ops *file_ops)
1804{
1805 return -ENODEV;
1806}
1452#endif /* CONFIG_MODULES */ 1807#endif /* CONFIG_MODULES */
1453 1808
1809/* Create a new event directory structure for a trace directory. */
1810static void
1811__trace_add_event_dirs(struct trace_array *tr)
1812{
1813 struct ftrace_module_file_ops *file_ops = NULL;
1814 struct ftrace_event_call *call;
1815 int ret;
1816
1817 list_for_each_entry(call, &ftrace_events, list) {
1818 if (call->mod) {
1819 /*
1820 * Directories for events by modules need to
1821 * keep module ref counts when opened (as we don't
1822 * want the module to disappear when reading one
1823 * of these files). The file_ops keep account of
1824 * the module ref count.
1825 */
1826 file_ops = find_ftrace_file_ops(file_ops, call->mod);
1827 if (!file_ops)
1828 continue; /* Warn? */
1829 ret = __trace_add_new_mod_event(call, tr, file_ops);
1830 if (ret < 0)
1831 pr_warning("Could not create directory for event %s\n",
1832 call->name);
1833 continue;
1834 }
1835 ret = __trace_add_new_event(call, tr,
1836 &ftrace_event_id_fops,
1837 &ftrace_enable_fops,
1838 &ftrace_event_filter_fops,
1839 &ftrace_event_format_fops);
1840 if (ret < 0)
1841 pr_warning("Could not create directory for event %s\n",
1842 call->name);
1843 }
1844}
1845
1846#ifdef CONFIG_DYNAMIC_FTRACE
1847
1848/* Avoid typos */
1849#define ENABLE_EVENT_STR "enable_event"
1850#define DISABLE_EVENT_STR "disable_event"
1851
1852struct event_probe_data {
1853 struct ftrace_event_file *file;
1854 unsigned long count;
1855 int ref;
1856 bool enable;
1857};
1858
1859static struct ftrace_event_file *
1860find_event_file(struct trace_array *tr, const char *system, const char *event)
1861{
1862 struct ftrace_event_file *file;
1863 struct ftrace_event_call *call;
1864
1865 list_for_each_entry(file, &tr->events, list) {
1866
1867 call = file->event_call;
1868
1869 if (!call->name || !call->class || !call->class->reg)
1870 continue;
1871
1872 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
1873 continue;
1874
1875 if (strcmp(event, call->name) == 0 &&
1876 strcmp(system, call->class->system) == 0)
1877 return file;
1878 }
1879 return NULL;
1880}
1881
1882static void
1883event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1884{
1885 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1886 struct event_probe_data *data = *pdata;
1887
1888 if (!data)
1889 return;
1890
1891 if (data->enable)
1892 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
1893 else
1894 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
1895}
1896
1897static void
1898event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1899{
1900 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1901 struct event_probe_data *data = *pdata;
1902
1903 if (!data)
1904 return;
1905
1906 if (!data->count)
1907 return;
1908
1909 /* Skip if the event is in a state we want to switch to */
1910 if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
1911 return;
1912
1913 if (data->count != -1)
1914 (data->count)--;
1915
1916 event_enable_probe(ip, parent_ip, _data);
1917}
1918
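The two probes above only toggle FTRACE_EVENT_FL_SOFT_DISABLED_BIT; the suppression itself is expected to happen in the per-event record path generated by the event macros. A rough, illustrative-only sketch of that check (the helper name is hypothetical):

/* Sketch: how a soft-disabled event is expected to be skipped at record
 * time. The generated ftrace_raw_event_*() functions perform the
 * equivalent test against the ftrace_event_file flags.
 */
static bool example_event_soft_disabled(struct ftrace_event_file *file)
{
	/* The event stays registered, but records nothing while the bit is set. */
	return test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
}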
1919static int
1920event_enable_print(struct seq_file *m, unsigned long ip,
1921 struct ftrace_probe_ops *ops, void *_data)
1922{
1923 struct event_probe_data *data = _data;
1924
1925 seq_printf(m, "%ps:", (void *)ip);
1926
1927 seq_printf(m, "%s:%s:%s",
1928 data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
1929 data->file->event_call->class->system,
1930 data->file->event_call->name);
1931
1932 if (data->count == -1)
1933 seq_printf(m, ":unlimited\n");
1934 else
1935 seq_printf(m, ":count=%ld\n", data->count);
1936
1937 return 0;
1938}
1939
1940static int
1941event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip,
1942 void **_data)
1943{
1944 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1945 struct event_probe_data *data = *pdata;
1946
1947 data->ref++;
1948 return 0;
1949}
1950
1951static void
1952event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip,
1953 void **_data)
1954{
1955 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1956 struct event_probe_data *data = *pdata;
1957
1958 if (WARN_ON_ONCE(data->ref <= 0))
1959 return;
1960
1961 data->ref--;
1962 if (!data->ref) {
1963 /* Remove the SOFT_MODE flag */
1964 __ftrace_event_enable_disable(data->file, 0, 1);
1965 module_put(data->file->event_call->mod);
1966 kfree(data);
1967 }
1968 *pdata = NULL;
1969}
1970
1971static struct ftrace_probe_ops event_enable_probe_ops = {
1972 .func = event_enable_probe,
1973 .print = event_enable_print,
1974 .init = event_enable_init,
1975 .free = event_enable_free,
1976};
1977
1978static struct ftrace_probe_ops event_enable_count_probe_ops = {
1979 .func = event_enable_count_probe,
1980 .print = event_enable_print,
1981 .init = event_enable_init,
1982 .free = event_enable_free,
1983};
1984
1985static struct ftrace_probe_ops event_disable_probe_ops = {
1986 .func = event_enable_probe,
1987 .print = event_enable_print,
1988 .init = event_enable_init,
1989 .free = event_enable_free,
1990};
1991
1992static struct ftrace_probe_ops event_disable_count_probe_ops = {
1993 .func = event_enable_count_probe,
1994 .print = event_enable_print,
1995 .init = event_enable_init,
1996 .free = event_enable_free,
1997};
1998
1999static int
2000event_enable_func(struct ftrace_hash *hash,
2001 char *glob, char *cmd, char *param, int enabled)
2002{
2003 struct trace_array *tr = top_trace_array();
2004 struct ftrace_event_file *file;
2005 struct ftrace_probe_ops *ops;
2006 struct event_probe_data *data;
2007 const char *system;
2008 const char *event;
2009 char *number;
2010 bool enable;
2011 int ret;
2012
2013 /* hash funcs only work with set_ftrace_filter */
2014 if (!enabled)
2015 return -EINVAL;
2016
2017 if (!param)
2018 return -EINVAL;
2019
2020 system = strsep(&param, ":");
2021 if (!param)
2022 return -EINVAL;
2023
2024 event = strsep(&param, ":");
2025
2026 mutex_lock(&event_mutex);
2027
2028 ret = -EINVAL;
2029 file = find_event_file(tr, system, event);
2030 if (!file)
2031 goto out;
2032
2033 enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
2034
2035 if (enable)
2036 ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops;
2037 else
2038 ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
2039
2040 if (glob[0] == '!') {
2041 unregister_ftrace_function_probe_func(glob+1, ops);
2042 ret = 0;
2043 goto out;
2044 }
2045
2046 ret = -ENOMEM;
2047 data = kzalloc(sizeof(*data), GFP_KERNEL);
2048 if (!data)
2049 goto out;
2050
2051 data->enable = enable;
2052 data->count = -1;
2053 data->file = file;
2054
2055 if (!param)
2056 goto out_reg;
2057
2058 number = strsep(&param, ":");
2059
2060 ret = -EINVAL;
2061 if (!strlen(number))
2062 goto out_free;
2063
2064 /*
2065 * We use the callback data field (which is a pointer)
2066 * as our counter.
2067 */
2068 ret = kstrtoul(number, 0, &data->count);
2069 if (ret)
2070 goto out_free;
2071
2072 out_reg:
2073 /* Don't let event modules unload while probe registered */
2074 ret = try_module_get(file->event_call->mod);
2075 if (!ret) {
2076 ret = -EBUSY;
2077 goto out_free;
2078 }
2079
2080 ret = __ftrace_event_enable_disable(file, 1, 1);
2081 if (ret < 0)
2082 goto out_put;
2083 ret = register_ftrace_function_probe(glob, ops, data);
2084 /*
2085 * On success, the above returns the number of functions enabled,
2086 * but if it didn't find any functions it returns zero.
2087 * Consider no functions a failure too.
2088 */
2089 if (!ret) {
2090 ret = -ENOENT;
2091 goto out_disable;
2092 } else if (ret < 0)
2093 goto out_disable;
2094 /* Just return zero, not the number of enabled functions */
2095 ret = 0;
2096 out:
2097 mutex_unlock(&event_mutex);
2098 return ret;
2099
2100 out_disable:
2101 __ftrace_event_enable_disable(file, 0, 1);
2102 out_put:
2103 module_put(file->event_call->mod);
2104 out_free:
2105 kfree(data);
2106 goto out;
2107}
2108
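The command parsed by event_enable_func() is the one a user feeds to set_ftrace_filter as "<function>:enable_event:<system>:<event>[:<count>]" (or disable_event). As a minimal user-space sketch, assuming the usual debugfs mount point and the sched:sched_switch event (the path, function name, and event are example values only):

/* Hypothetical user-space sketch: arm "enable_event" on a traced function.
 * Assumes debugfs is mounted at /sys/kernel/debug and sched:sched_switch
 * exists; adjust both for a real system.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/set_ftrace_filter";
	/* Enable sched:sched_switch the next 2 times schedule() is hit. */
	const char *cmd = "schedule:enable_event:sched:sched_switch:2\n";
	/* O_APPEND adds the command without clearing existing filters. */
	int fd = open(path, O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");
	close(fd);
	return 0;
}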
2109static struct ftrace_func_command event_enable_cmd = {
2110 .name = ENABLE_EVENT_STR,
2111 .func = event_enable_func,
2112};
2113
2114static struct ftrace_func_command event_disable_cmd = {
2115 .name = DISABLE_EVENT_STR,
2116 .func = event_enable_func,
2117};
2118
2119static __init int register_event_cmds(void)
2120{
2121 int ret;
2122
2123 ret = register_ftrace_command(&event_enable_cmd);
2124 if (WARN_ON(ret < 0))
2125 return ret;
2126 ret = register_ftrace_command(&event_disable_cmd);
2127 if (WARN_ON(ret < 0))
2128 unregister_ftrace_command(&event_enable_cmd);
2129 return ret;
2130}
2131#else
2132static inline int register_event_cmds(void) { return 0; }
2133#endif /* CONFIG_DYNAMIC_FTRACE */
2134
2135/*
2136 * The top level array has already had its ftrace_event_file
2137 * descriptors created in order to allow for early events to
2138 * be recorded. This function is called after the debugfs has been
2139 * initialized, and we now have to create the files associated
2140 * with the events.
2141 */
2142static __init void
2143__trace_early_add_event_dirs(struct trace_array *tr)
2144{
2145 struct ftrace_event_file *file;
2146 int ret;
2147
2148
2149 list_for_each_entry(file, &tr->events, list) {
2150 ret = event_create_dir(tr->event_dir, file,
2151 &ftrace_event_id_fops,
2152 &ftrace_enable_fops,
2153 &ftrace_event_filter_fops,
2154 &ftrace_event_format_fops);
2155 if (ret < 0)
2156 pr_warning("Could not create directory for event %s\n",
2157 file->event_call->name);
2158 }
2159}
2160
2161/*
2162 * For early boot up, the top trace array needs to have
2163 * a list of events that can be enabled. This must be done before
2164 * the filesystem is set up in order to allow events to be traced
2165 * early.
2166 */
2167static __init void
2168__trace_early_add_events(struct trace_array *tr)
2169{
2170 struct ftrace_event_call *call;
2171 int ret;
2172
2173 list_for_each_entry(call, &ftrace_events, list) {
2174 /* Early boot up should not have any modules loaded */
2175 if (WARN_ON_ONCE(call->mod))
2176 continue;
2177
2178 ret = __trace_early_add_new_event(call, tr);
2179 if (ret < 0)
2180 pr_warning("Could not create early event %s\n",
2181 call->name);
2182 }
2183}
2184
2185/* Remove the event directory structure for a trace directory. */
2186static void
2187__trace_remove_event_dirs(struct trace_array *tr)
2188{
2189 struct ftrace_event_file *file, *next;
2190
2191 list_for_each_entry_safe(file, next, &tr->events, list) {
2192 list_del(&file->list);
2193 debugfs_remove_recursive(file->dir);
2194 remove_subsystem(file->system);
2195 kmem_cache_free(file_cachep, file);
2196 }
2197}
2198
2199static void
2200__add_event_to_tracers(struct ftrace_event_call *call,
2201 struct ftrace_module_file_ops *file_ops)
2202{
2203 struct trace_array *tr;
2204
2205 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
2206 if (file_ops)
2207 __trace_add_new_mod_event(call, tr, file_ops);
2208 else
2209 __trace_add_new_event(call, tr,
2210 &ftrace_event_id_fops,
2211 &ftrace_enable_fops,
2212 &ftrace_event_filter_fops,
2213 &ftrace_event_format_fops);
2214 }
2215}
2216
1454static struct notifier_block trace_module_nb = { 2217static struct notifier_block trace_module_nb = {
1455 .notifier_call = trace_module_notify, 2218 .notifier_call = trace_module_notify,
1456 .priority = 0, 2219 .priority = 0,
@@ -1464,15 +2227,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1464static __init int setup_trace_event(char *str) 2227static __init int setup_trace_event(char *str)
1465{ 2228{
1466 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); 2229 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1467 ring_buffer_expanded = 1; 2230 ring_buffer_expanded = true;
1468 tracing_selftest_disabled = 1; 2231 tracing_selftest_disabled = true;
1469 2232
1470 return 1; 2233 return 1;
1471} 2234}
1472__setup("trace_event=", setup_trace_event); 2235__setup("trace_event=", setup_trace_event);
1473 2236
2237/* Expects to have event_mutex held when called */
2238static int
2239create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
2240{
2241 struct dentry *d_events;
2242 struct dentry *entry;
2243
2244 entry = debugfs_create_file("set_event", 0644, parent,
2245 tr, &ftrace_set_event_fops);
2246 if (!entry) {
2247 pr_warning("Could not create debugfs 'set_event' entry\n");
2248 return -ENOMEM;
2249 }
2250
2251 d_events = debugfs_create_dir("events", parent);
2252 if (!d_events) {
2253 pr_warning("Could not create debugfs 'events' directory\n");
2254 return -ENOMEM;
2255 }
2256
2257 /* ring buffer internal formats */
2258 trace_create_file("header_page", 0444, d_events,
2259 ring_buffer_print_page_header,
2260 &ftrace_show_header_fops);
2261
2262 trace_create_file("header_event", 0444, d_events,
2263 ring_buffer_print_entry_header,
2264 &ftrace_show_header_fops);
2265
2266 trace_create_file("enable", 0644, d_events,
2267 tr, &ftrace_tr_enable_fops);
2268
2269 tr->event_dir = d_events;
2270
2271 return 0;
2272}
2273
2274/**
2275 * event_trace_add_tracer - add an instance of a trace_array to events
2276 * @parent: The parent dentry to place the files/directories for events in
2277 * @tr: The trace array associated with these events
2278 *
2279 * When a new instance is created, it needs to set up its events
2280 * directory, as well as other files associated with events. It also
2281 * creates the event hierarchy in the @parent/events directory.
2282 *
2283 * Returns 0 on success.
2284 */
2285int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
2286{
2287 int ret;
2288
2289 mutex_lock(&event_mutex);
2290
2291 ret = create_event_toplevel_files(parent, tr);
2292 if (ret)
2293 goto out_unlock;
2294
2295 down_write(&trace_event_sem);
2296 __trace_add_event_dirs(tr);
2297 up_write(&trace_event_sem);
2298
2299 out_unlock:
2300 mutex_unlock(&event_mutex);
2301
2302 return ret;
2303}
2304
2305/*
2306 * The top trace array already had its file descriptors created.
2307 * Now the files themselves need to be created.
2308 */
2309static __init int
2310early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
2311{
2312 int ret;
2313
2314 mutex_lock(&event_mutex);
2315
2316 ret = create_event_toplevel_files(parent, tr);
2317 if (ret)
2318 goto out_unlock;
2319
2320 down_write(&trace_event_sem);
2321 __trace_early_add_event_dirs(tr);
2322 up_write(&trace_event_sem);
2323
2324 out_unlock:
2325 mutex_unlock(&event_mutex);
2326
2327 return ret;
2328}
2329
2330int event_trace_del_tracer(struct trace_array *tr)
2331{
2332 /* Disable any running events */
2333 __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2334
2335 mutex_lock(&event_mutex);
2336
2337 down_write(&trace_event_sem);
2338 __trace_remove_event_dirs(tr);
2339 debugfs_remove_recursive(tr->event_dir);
2340 up_write(&trace_event_sem);
2341
2342 tr->event_dir = NULL;
2343
2344 mutex_unlock(&event_mutex);
2345
2346 return 0;
2347}
2348
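For orientation, the pairing that event_trace_add_tracer() and event_trace_del_tracer() are meant for is roughly as follows; the real callers live in trace.c's instance handling, so the helper names below are purely hypothetical:

/* Illustrative sketch only: how a trace instance is expected to wire up
 * and tear down its events directory. instance_dir and tr are assumed
 * to be created and owned by the instance code in trace.c.
 */
static int example_instance_setup(struct dentry *instance_dir,
				  struct trace_array *tr)
{
	/* Creates set_event, events/ and the per-event directories. */
	return event_trace_add_tracer(instance_dir, tr);
}

static void example_instance_teardown(struct trace_array *tr)
{
	/* Disables the instance's events and removes events/ again. */
	event_trace_del_tracer(tr);
}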
2349static __init int event_trace_memsetup(void)
2350{
2351 field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
2352 file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
2353 return 0;
2354}
2355
1474static __init int event_trace_enable(void) 2356static __init int event_trace_enable(void)
1475{ 2357{
2358 struct trace_array *tr = top_trace_array();
1476 struct ftrace_event_call **iter, *call; 2359 struct ftrace_event_call **iter, *call;
1477 char *buf = bootup_event_buf; 2360 char *buf = bootup_event_buf;
1478 char *token; 2361 char *token;
@@ -1486,6 +2369,14 @@ static __init int event_trace_enable(void)
1486 list_add(&call->list, &ftrace_events); 2369 list_add(&call->list, &ftrace_events);
1487 } 2370 }
1488 2371
2372 /*
2373 * We need the top trace array to have a working set of trace
2374 * points at early init, before the debug files and directories
2375 * are created. Create the file entries now, and attach them
2376 * to the actual file dentries later.
2377 */
2378 __trace_early_add_events(tr);
2379
1489 while (true) { 2380 while (true) {
1490 token = strsep(&buf, ","); 2381 token = strsep(&buf, ",");
1491 2382
@@ -1494,73 +2385,43 @@ static __init int event_trace_enable(void)
1494 if (!*token) 2385 if (!*token)
1495 continue; 2386 continue;
1496 2387
1497 ret = ftrace_set_clr_event(token, 1); 2388 ret = ftrace_set_clr_event(tr, token, 1);
1498 if (ret) 2389 if (ret)
1499 pr_warn("Failed to enable trace event: %s\n", token); 2390 pr_warn("Failed to enable trace event: %s\n", token);
1500 } 2391 }
1501 2392
1502 trace_printk_start_comm(); 2393 trace_printk_start_comm();
1503 2394
2395 register_event_cmds();
2396
1504 return 0; 2397 return 0;
1505} 2398}
1506 2399
1507static __init int event_trace_init(void) 2400static __init int event_trace_init(void)
1508{ 2401{
1509 struct ftrace_event_call *call; 2402 struct trace_array *tr;
1510 struct dentry *d_tracer; 2403 struct dentry *d_tracer;
1511 struct dentry *entry; 2404 struct dentry *entry;
1512 struct dentry *d_events;
1513 int ret; 2405 int ret;
1514 2406
2407 tr = top_trace_array();
2408
1515 d_tracer = tracing_init_dentry(); 2409 d_tracer = tracing_init_dentry();
1516 if (!d_tracer) 2410 if (!d_tracer)
1517 return 0; 2411 return 0;
1518 2412
1519 entry = debugfs_create_file("available_events", 0444, d_tracer, 2413 entry = debugfs_create_file("available_events", 0444, d_tracer,
1520 NULL, &ftrace_avail_fops); 2414 tr, &ftrace_avail_fops);
1521 if (!entry) 2415 if (!entry)
1522 pr_warning("Could not create debugfs " 2416 pr_warning("Could not create debugfs "
1523 "'available_events' entry\n"); 2417 "'available_events' entry\n");
1524 2418
1525 entry = debugfs_create_file("set_event", 0644, d_tracer,
1526 NULL, &ftrace_set_event_fops);
1527 if (!entry)
1528 pr_warning("Could not create debugfs "
1529 "'set_event' entry\n");
1530
1531 d_events = event_trace_events_dir();
1532 if (!d_events)
1533 return 0;
1534
1535 /* ring buffer internal formats */
1536 trace_create_file("header_page", 0444, d_events,
1537 ring_buffer_print_page_header,
1538 &ftrace_show_header_fops);
1539
1540 trace_create_file("header_event", 0444, d_events,
1541 ring_buffer_print_entry_header,
1542 &ftrace_show_header_fops);
1543
1544 trace_create_file("enable", 0644, d_events,
1545 NULL, &ftrace_system_enable_fops);
1546
1547 if (trace_define_common_fields()) 2419 if (trace_define_common_fields())
1548 pr_warning("tracing: Failed to allocate common fields"); 2420 pr_warning("tracing: Failed to allocate common fields");
1549 2421
1550 /* 2422 ret = early_event_add_tracer(d_tracer, tr);
1551 * Early initialization already enabled ftrace event. 2423 if (ret)
1552 * Now it's only necessary to create the event directory. 2424 return ret;
1553 */
1554 list_for_each_entry(call, &ftrace_events, list) {
1555
1556 ret = event_create_dir(call, d_events,
1557 &ftrace_event_id_fops,
1558 &ftrace_enable_fops,
1559 &ftrace_event_filter_fops,
1560 &ftrace_event_format_fops);
1561 if (ret < 0)
1562 event_remove(call);
1563 }
1564 2425
1565 ret = register_module_notifier(&trace_module_nb); 2426 ret = register_module_notifier(&trace_module_nb);
1566 if (ret) 2427 if (ret)
@@ -1568,6 +2429,7 @@ static __init int event_trace_init(void)
1568 2429
1569 return 0; 2430 return 0;
1570} 2431}
2432early_initcall(event_trace_memsetup);
1571core_initcall(event_trace_enable); 2433core_initcall(event_trace_enable);
1572fs_initcall(event_trace_init); 2434fs_initcall(event_trace_init);
1573 2435
@@ -1627,13 +2489,20 @@ static __init void event_test_stuff(void)
1627 */ 2489 */
1628static __init void event_trace_self_tests(void) 2490static __init void event_trace_self_tests(void)
1629{ 2491{
2492 struct ftrace_subsystem_dir *dir;
2493 struct ftrace_event_file *file;
1630 struct ftrace_event_call *call; 2494 struct ftrace_event_call *call;
1631 struct event_subsystem *system; 2495 struct event_subsystem *system;
2496 struct trace_array *tr;
1632 int ret; 2497 int ret;
1633 2498
2499 tr = top_trace_array();
2500
1634 pr_info("Running tests on trace events:\n"); 2501 pr_info("Running tests on trace events:\n");
1635 2502
1636 list_for_each_entry(call, &ftrace_events, list) { 2503 list_for_each_entry(file, &tr->events, list) {
2504
2505 call = file->event_call;
1637 2506
1638 /* Only test those that have a probe */ 2507 /* Only test those that have a probe */
1639 if (!call->class || !call->class->probe) 2508 if (!call->class || !call->class->probe)
@@ -1657,15 +2526,15 @@ static __init void event_trace_self_tests(void)
1657 * If an event is already enabled, someone is using 2526 * If an event is already enabled, someone is using
1658 * it and the self test should not be on. 2527 * it and the self test should not be on.
1659 */ 2528 */
1660 if (call->flags & TRACE_EVENT_FL_ENABLED) { 2529 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
1661 pr_warning("Enabled event during self test!\n"); 2530 pr_warning("Enabled event during self test!\n");
1662 WARN_ON_ONCE(1); 2531 WARN_ON_ONCE(1);
1663 continue; 2532 continue;
1664 } 2533 }
1665 2534
1666 ftrace_event_enable_disable(call, 1); 2535 ftrace_event_enable_disable(file, 1);
1667 event_test_stuff(); 2536 event_test_stuff();
1668 ftrace_event_enable_disable(call, 0); 2537 ftrace_event_enable_disable(file, 0);
1669 2538
1670 pr_cont("OK\n"); 2539 pr_cont("OK\n");
1671 } 2540 }
@@ -1674,7 +2543,9 @@ static __init void event_trace_self_tests(void)
1674 2543
1675 pr_info("Running tests on trace event systems:\n"); 2544 pr_info("Running tests on trace event systems:\n");
1676 2545
1677 list_for_each_entry(system, &event_subsystems, list) { 2546 list_for_each_entry(dir, &tr->systems, list) {
2547
2548 system = dir->subsystem;
1678 2549
1679 /* the ftrace system is special, skip it */ 2550 /* the ftrace system is special, skip it */
1680 if (strcmp(system->name, "ftrace") == 0) 2551 if (strcmp(system->name, "ftrace") == 0)
@@ -1682,7 +2553,7 @@ static __init void event_trace_self_tests(void)
1682 2553
1683 pr_info("Testing event system %s: ", system->name); 2554 pr_info("Testing event system %s: ", system->name);
1684 2555
1685 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); 2556 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
1686 if (WARN_ON_ONCE(ret)) { 2557 if (WARN_ON_ONCE(ret)) {
1687 pr_warning("error enabling system %s\n", 2558 pr_warning("error enabling system %s\n",
1688 system->name); 2559 system->name);
@@ -1691,7 +2562,7 @@ static __init void event_trace_self_tests(void)
1691 2562
1692 event_test_stuff(); 2563 event_test_stuff();
1693 2564
1694 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); 2565 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
1695 if (WARN_ON_ONCE(ret)) { 2566 if (WARN_ON_ONCE(ret)) {
1696 pr_warning("error disabling system %s\n", 2567 pr_warning("error disabling system %s\n",
1697 system->name); 2568 system->name);
@@ -1706,7 +2577,7 @@ static __init void event_trace_self_tests(void)
1706 pr_info("Running tests on all trace events:\n"); 2577 pr_info("Running tests on all trace events:\n");
1707 pr_info("Testing all events: "); 2578 pr_info("Testing all events: ");
1708 2579
1709 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); 2580 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
1710 if (WARN_ON_ONCE(ret)) { 2581 if (WARN_ON_ONCE(ret)) {
1711 pr_warning("error enabling all events\n"); 2582 pr_warning("error enabling all events\n");
1712 return; 2583 return;
@@ -1715,7 +2586,7 @@ static __init void event_trace_self_tests(void)
1715 event_test_stuff(); 2586 event_test_stuff();
1716 2587
1717 /* reset sysname */ 2588 /* reset sysname */
1718 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); 2589 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
1719 if (WARN_ON_ONCE(ret)) { 2590 if (WARN_ON_ONCE(ret)) {
1720 pr_warning("error disabling all events\n"); 2591 pr_warning("error disabling all events\n");
1721 return; 2592 return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e5b0ca8b8d4d..e1b653f7e1ca 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system,
658 mutex_unlock(&event_mutex); 658 mutex_unlock(&event_mutex);
659} 659}
660 660
661static struct ftrace_event_field *
662__find_event_field(struct list_head *head, char *name)
663{
664 struct ftrace_event_field *field;
665
666 list_for_each_entry(field, head, link) {
667 if (!strcmp(field->name, name))
668 return field;
669 }
670
671 return NULL;
672}
673
674static struct ftrace_event_field *
675find_event_field(struct ftrace_event_call *call, char *name)
676{
677 struct ftrace_event_field *field;
678 struct list_head *head;
679
680 field = __find_event_field(&ftrace_common_fields, name);
681 if (field)
682 return field;
683
684 head = trace_get_fields(call);
685 return __find_event_field(head, name);
686}
687
688static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 661static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
689{ 662{
690 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); 663 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
@@ -777,7 +750,11 @@ static int filter_set_pred(struct event_filter *filter,
777 750
778static void __free_preds(struct event_filter *filter) 751static void __free_preds(struct event_filter *filter)
779{ 752{
753 int i;
754
780 if (filter->preds) { 755 if (filter->preds) {
756 for (i = 0; i < filter->n_preds; i++)
757 kfree(filter->preds[i].ops);
781 kfree(filter->preds); 758 kfree(filter->preds);
782 filter->preds = NULL; 759 filter->preds = NULL;
783 } 760 }
@@ -1337,7 +1314,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
1337 return NULL; 1314 return NULL;
1338 } 1315 }
1339 1316
1340 field = find_event_field(call, operand1); 1317 field = trace_find_event_field(call, operand1);
1341 if (!field) { 1318 if (!field) {
1342 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); 1319 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
1343 return NULL; 1320 return NULL;
@@ -1907,16 +1884,17 @@ out_unlock:
1907 return err; 1884 return err;
1908} 1885}
1909 1886
1910int apply_subsystem_event_filter(struct event_subsystem *system, 1887int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1911 char *filter_string) 1888 char *filter_string)
1912{ 1889{
1890 struct event_subsystem *system = dir->subsystem;
1913 struct event_filter *filter; 1891 struct event_filter *filter;
1914 int err = 0; 1892 int err = 0;
1915 1893
1916 mutex_lock(&event_mutex); 1894 mutex_lock(&event_mutex);
1917 1895
1918 /* Make sure the system still has events */ 1896 /* Make sure the system still has events */
1919 if (!system->nr_events) { 1897 if (!dir->nr_events) {
1920 err = -ENODEV; 1898 err = -ENODEV;
1921 goto out_unlock; 1899 goto out_unlock;
1922 } 1900 }
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e039906b037d..d21a74670088 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void) \
129 129
130#undef FTRACE_ENTRY 130#undef FTRACE_ENTRY
131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ 131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
132int \ 132static int __init \
133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ 133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
134{ \ 134{ \
135 struct struct_name field; \ 135 struct struct_name field; \
@@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ 168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
169 regfn) \ 169 regfn) \
170 \ 170 \
171struct ftrace_event_class event_class_ftrace_##call = { \ 171struct ftrace_event_class __refdata event_class_ftrace_##call = { \
172 .system = __stringify(TRACE_SYSTEM), \ 172 .system = __stringify(TRACE_SYSTEM), \
173 .define_fields = ftrace_define_fields_##call, \ 173 .define_fields = ftrace_define_fields_##call, \
174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 601152523326..c4d6d7191988 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void);
28static int function_trace_init(struct trace_array *tr) 28static int function_trace_init(struct trace_array *tr)
29{ 29{
30 func_trace = tr; 30 func_trace = tr;
31 tr->cpu = get_cpu(); 31 tr->trace_buffer.cpu = get_cpu();
32 put_cpu(); 32 put_cpu();
33 33
34 tracing_start_cmdline_record(); 34 tracing_start_cmdline_record();
@@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr)
44 44
45static void function_trace_start(struct trace_array *tr) 45static void function_trace_start(struct trace_array *tr)
46{ 46{
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(&tr->trace_buffer);
48} 48}
49 49
50/* Our option */ 50/* Our option */
@@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
76 goto out; 76 goto out;
77 77
78 cpu = smp_processor_id(); 78 cpu = smp_processor_id();
79 data = tr->data[cpu]; 79 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
80 if (!atomic_read(&data->disabled)) { 80 if (!atomic_read(&data->disabled)) {
81 local_save_flags(flags); 81 local_save_flags(flags);
82 trace_function(tr, ip, parent_ip, flags, pc); 82 trace_function(tr, ip, parent_ip, flags, pc);
@@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
107 */ 107 */
108 local_irq_save(flags); 108 local_irq_save(flags);
109 cpu = raw_smp_processor_id(); 109 cpu = raw_smp_processor_id();
110 data = tr->data[cpu]; 110 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
111 disabled = atomic_inc_return(&data->disabled); 111 disabled = atomic_inc_return(&data->disabled);
112 112
113 if (likely(disabled == 1)) { 113 if (likely(disabled == 1)) {
@@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly =
214}; 214};
215 215
216#ifdef CONFIG_DYNAMIC_FTRACE 216#ifdef CONFIG_DYNAMIC_FTRACE
217static void 217static int update_count(void **data)
218ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
219{ 218{
220 long *count = (long *)data; 219 unsigned long *count = (long *)data;
221
222 if (tracing_is_on())
223 return;
224 220
225 if (!*count) 221 if (!*count)
226 return; 222 return 0;
227 223
228 if (*count != -1) 224 if (*count != -1)
229 (*count)--; 225 (*count)--;
230 226
231 tracing_on(); 227 return 1;
232} 228}
233 229
234static void 230static void
235ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) 231ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
236{ 232{
237 long *count = (long *)data; 233 if (tracing_is_on())
234 return;
235
236 if (update_count(data))
237 tracing_on();
238}
238 239
240static void
241ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
242{
239 if (!tracing_is_on()) 243 if (!tracing_is_on())
240 return; 244 return;
241 245
242 if (!*count) 246 if (update_count(data))
247 tracing_off();
248}
249
250static void
251ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
252{
253 if (tracing_is_on())
243 return; 254 return;
244 255
245 if (*count != -1) 256 tracing_on();
246 (*count)--; 257}
258
259static void
260ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
261{
262 if (!tracing_is_on())
263 return;
247 264
248 tracing_off(); 265 tracing_off();
249} 266}
250 267
251static int 268/*
252ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 269 * Skip 4:
253 struct ftrace_probe_ops *ops, void *data); 270 * ftrace_stacktrace()
271 * function_trace_probe_call()
272 * ftrace_ops_list_func()
273 * ftrace_call()
274 */
275#define STACK_SKIP 4
254 276
255static struct ftrace_probe_ops traceon_probe_ops = { 277static void
256 .func = ftrace_traceon, 278ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
257 .print = ftrace_trace_onoff_print, 279{
258}; 280 trace_dump_stack(STACK_SKIP);
281}
259 282
260static struct ftrace_probe_ops traceoff_probe_ops = { 283static void
261 .func = ftrace_traceoff, 284ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
262 .print = ftrace_trace_onoff_print, 285{
263}; 286 if (!tracing_is_on())
287 return;
288
289 if (update_count(data))
290 trace_dump_stack(STACK_SKIP);
291}
264 292
265static int 293static int
266ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 294ftrace_probe_print(const char *name, struct seq_file *m,
267 struct ftrace_probe_ops *ops, void *data) 295 unsigned long ip, void *data)
268{ 296{
269 long count = (long)data; 297 long count = (long)data;
270 298
271 seq_printf(m, "%ps:", (void *)ip); 299 seq_printf(m, "%ps:%s", (void *)ip, name);
272
273 if (ops == &traceon_probe_ops)
274 seq_printf(m, "traceon");
275 else
276 seq_printf(m, "traceoff");
277 300
278 if (count == -1) 301 if (count == -1)
279 seq_printf(m, ":unlimited\n"); 302 seq_printf(m, ":unlimited\n");
@@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
284} 307}
285 308
286static int 309static int
287ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) 310ftrace_traceon_print(struct seq_file *m, unsigned long ip,
311 struct ftrace_probe_ops *ops, void *data)
288{ 312{
289 struct ftrace_probe_ops *ops; 313 return ftrace_probe_print("traceon", m, ip, data);
290 314}
291 /* we register both traceon and traceoff to this callback */
292 if (strcmp(cmd, "traceon") == 0)
293 ops = &traceon_probe_ops;
294 else
295 ops = &traceoff_probe_ops;
296 315
297 unregister_ftrace_function_probe_func(glob, ops); 316static int
317ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
318 struct ftrace_probe_ops *ops, void *data)
319{
320 return ftrace_probe_print("traceoff", m, ip, data);
321}
298 322
299 return 0; 323static int
324ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
325 struct ftrace_probe_ops *ops, void *data)
326{
327 return ftrace_probe_print("stacktrace", m, ip, data);
300} 328}
301 329
330static struct ftrace_probe_ops traceon_count_probe_ops = {
331 .func = ftrace_traceon_count,
332 .print = ftrace_traceon_print,
333};
334
335static struct ftrace_probe_ops traceoff_count_probe_ops = {
336 .func = ftrace_traceoff_count,
337 .print = ftrace_traceoff_print,
338};
339
340static struct ftrace_probe_ops stacktrace_count_probe_ops = {
341 .func = ftrace_stacktrace_count,
342 .print = ftrace_stacktrace_print,
343};
344
345static struct ftrace_probe_ops traceon_probe_ops = {
346 .func = ftrace_traceon,
347 .print = ftrace_traceon_print,
348};
349
350static struct ftrace_probe_ops traceoff_probe_ops = {
351 .func = ftrace_traceoff,
352 .print = ftrace_traceoff_print,
353};
354
355static struct ftrace_probe_ops stacktrace_probe_ops = {
356 .func = ftrace_stacktrace,
357 .print = ftrace_stacktrace_print,
358};
359
302static int 360static int
303ftrace_trace_onoff_callback(struct ftrace_hash *hash, 361ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
304 char *glob, char *cmd, char *param, int enable) 362 struct ftrace_hash *hash, char *glob,
363 char *cmd, char *param, int enable)
305{ 364{
306 struct ftrace_probe_ops *ops;
307 void *count = (void *)-1; 365 void *count = (void *)-1;
308 char *number; 366 char *number;
309 int ret; 367 int ret;
@@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
312 if (!enable) 370 if (!enable)
313 return -EINVAL; 371 return -EINVAL;
314 372
315 if (glob[0] == '!') 373 if (glob[0] == '!') {
316 return ftrace_trace_onoff_unreg(glob+1, cmd, param); 374 unregister_ftrace_function_probe_func(glob+1, ops);
317 375 return 0;
318 /* we register both traceon and traceoff to this callback */ 376 }
319 if (strcmp(cmd, "traceon") == 0)
320 ops = &traceon_probe_ops;
321 else
322 ops = &traceoff_probe_ops;
323 377
324 if (!param) 378 if (!param)
325 goto out_reg; 379 goto out_reg;
@@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
343 return ret < 0 ? ret : 0; 397 return ret < 0 ? ret : 0;
344} 398}
345 399
400static int
401ftrace_trace_onoff_callback(struct ftrace_hash *hash,
402 char *glob, char *cmd, char *param, int enable)
403{
404 struct ftrace_probe_ops *ops;
405
406 /* we register both traceon and traceoff to this callback */
407 if (strcmp(cmd, "traceon") == 0)
408 ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
409 else
410 ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
411
412 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
413 param, enable);
414}
415
416static int
417ftrace_stacktrace_callback(struct ftrace_hash *hash,
418 char *glob, char *cmd, char *param, int enable)
419{
420 struct ftrace_probe_ops *ops;
421
422 ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
423
424 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
425 param, enable);
426}
427
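Both callbacks above hang off set_ftrace_filter. As a hedged example of exercising the new "stacktrace" variant (the tracefs path, the kfree symbol, and the count are example values):

/* Illustrative only: ask for a stack dump the next 3 times kfree() is hit. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/set_ftrace_filter", "a");

	if (!f) {
		perror("set_ftrace_filter");
		return 1;
	}
	/* "<function>:stacktrace[:<count>]" selects stacktrace_count_probe_ops
	 * when a count is given, stacktrace_probe_ops otherwise. */
	fprintf(f, "kfree:stacktrace:3\n");
	fclose(f);
	return 0;
}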
346static struct ftrace_func_command ftrace_traceon_cmd = { 428static struct ftrace_func_command ftrace_traceon_cmd = {
347 .name = "traceon", 429 .name = "traceon",
348 .func = ftrace_trace_onoff_callback, 430 .func = ftrace_trace_onoff_callback,
@@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {
353 .func = ftrace_trace_onoff_callback, 435 .func = ftrace_trace_onoff_callback,
354}; 436};
355 437
438static struct ftrace_func_command ftrace_stacktrace_cmd = {
439 .name = "stacktrace",
440 .func = ftrace_stacktrace_callback,
441};
442
356static int __init init_func_cmd_traceon(void) 443static int __init init_func_cmd_traceon(void)
357{ 444{
358 int ret; 445 int ret;
@@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void)
364 ret = register_ftrace_command(&ftrace_traceon_cmd); 451 ret = register_ftrace_command(&ftrace_traceon_cmd);
365 if (ret) 452 if (ret)
366 unregister_ftrace_command(&ftrace_traceoff_cmd); 453 unregister_ftrace_command(&ftrace_traceoff_cmd);
454
455 ret = register_ftrace_command(&ftrace_stacktrace_cmd);
456 if (ret) {
457 unregister_ftrace_command(&ftrace_traceoff_cmd);
458 unregister_ftrace_command(&ftrace_traceon_cmd);
459 }
367 return ret; 460 return ret;
368} 461}
369#else 462#else
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 39ada66389cc..8388bc99f2ee 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr,
218{ 218{
219 struct ftrace_event_call *call = &event_funcgraph_entry; 219 struct ftrace_event_call *call = &event_funcgraph_entry;
220 struct ring_buffer_event *event; 220 struct ring_buffer_event *event;
221 struct ring_buffer *buffer = tr->buffer; 221 struct ring_buffer *buffer = tr->trace_buffer.buffer;
222 struct ftrace_graph_ent_entry *entry; 222 struct ftrace_graph_ent_entry *entry;
223 223
224 if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) 224 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
265 265
266 local_irq_save(flags); 266 local_irq_save(flags);
267 cpu = raw_smp_processor_id(); 267 cpu = raw_smp_processor_id();
268 data = tr->data[cpu]; 268 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
269 disabled = atomic_inc_return(&data->disabled); 269 disabled = atomic_inc_return(&data->disabled);
270 if (likely(disabled == 1)) { 270 if (likely(disabled == 1)) {
271 pc = preempt_count(); 271 pc = preempt_count();
@@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr,
323{ 323{
324 struct ftrace_event_call *call = &event_funcgraph_exit; 324 struct ftrace_event_call *call = &event_funcgraph_exit;
325 struct ring_buffer_event *event; 325 struct ring_buffer_event *event;
326 struct ring_buffer *buffer = tr->buffer; 326 struct ring_buffer *buffer = tr->trace_buffer.buffer;
327 struct ftrace_graph_ret_entry *entry; 327 struct ftrace_graph_ret_entry *entry;
328 328
329 if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) 329 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
350 350
351 local_irq_save(flags); 351 local_irq_save(flags);
352 cpu = raw_smp_processor_id(); 352 cpu = raw_smp_processor_id();
353 data = tr->data[cpu]; 353 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
354 disabled = atomic_inc_return(&data->disabled); 354 disabled = atomic_inc_return(&data->disabled);
355 if (likely(disabled == 1)) { 355 if (likely(disabled == 1)) {
356 pc = preempt_count(); 356 pc = preempt_count();
@@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter,
560 * We need to consume the current entry to see 560 * We need to consume the current entry to see
561 * the next one. 561 * the next one.
562 */ 562 */
563 ring_buffer_consume(iter->tr->buffer, iter->cpu, 563 ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,
564 NULL, NULL); 564 NULL, NULL);
565 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 565 event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,
566 NULL, NULL); 566 NULL, NULL);
567 } 567 }
568 568
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 713a2cac4881..b19d065a28cb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -32,7 +32,8 @@ enum {
32 32
33static int trace_type __read_mostly; 33static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_flags;
36static bool function_enabled;
36 37
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph); 38static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph); 39static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,
121 if (!irqs_disabled_flags(*flags)) 122 if (!irqs_disabled_flags(*flags))
122 return 0; 123 return 0;
123 124
124 *data = tr->data[cpu]; 125 *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
125 disabled = atomic_inc_return(&(*data)->disabled); 126 disabled = atomic_inc_return(&(*data)->disabled);
126 127
127 if (likely(disabled == 1)) 128 if (likely(disabled == 1))
@@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
175 per_cpu(tracing_cpu, cpu) = 0; 176 per_cpu(tracing_cpu, cpu) = 0;
176 177
177 tracing_max_latency = 0; 178 tracing_max_latency = 0;
178 tracing_reset_online_cpus(irqsoff_trace); 179 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
179 180
180 return start_irqsoff_tracer(irqsoff_trace, set); 181 return start_irqsoff_tracer(irqsoff_trace, set);
181} 182}
@@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
380 if (per_cpu(tracing_cpu, cpu)) 381 if (per_cpu(tracing_cpu, cpu))
381 return; 382 return;
382 383
383 data = tr->data[cpu]; 384 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
384 385
385 if (unlikely(!data) || atomic_read(&data->disabled)) 386 if (unlikely(!data) || atomic_read(&data->disabled))
386 return; 387 return;
@@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
418 if (!tracer_enabled) 419 if (!tracer_enabled)
419 return; 420 return;
420 421
421 data = tr->data[cpu]; 422 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
422 423
423 if (unlikely(!data) || 424 if (unlikely(!data) ||
424 !data->critical_start || atomic_read(&data->disabled)) 425 !data->critical_start || atomic_read(&data->disabled))
@@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
528} 529}
529#endif /* CONFIG_PREEMPT_TRACER */ 530#endif /* CONFIG_PREEMPT_TRACER */
530 531
531static int start_irqsoff_tracer(struct trace_array *tr, int graph) 532static int register_irqsoff_function(int graph, int set)
532{ 533{
533 int ret = 0; 534 int ret;
534 535
535 if (!graph) 536 /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
536 ret = register_ftrace_function(&trace_ops); 537 if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
537 else 538 return 0;
539
540 if (graph)
538 ret = register_ftrace_graph(&irqsoff_graph_return, 541 ret = register_ftrace_graph(&irqsoff_graph_return,
539 &irqsoff_graph_entry); 542 &irqsoff_graph_entry);
543 else
544 ret = register_ftrace_function(&trace_ops);
545
546 if (!ret)
547 function_enabled = true;
548
549 return ret;
550}
551
552static void unregister_irqsoff_function(int graph)
553{
554 if (!function_enabled)
555 return;
556
557 if (graph)
558 unregister_ftrace_graph();
559 else
560 unregister_ftrace_function(&trace_ops);
561
562 function_enabled = false;
563}
564
565static void irqsoff_function_set(int set)
566{
567 if (set)
568 register_irqsoff_function(is_graph(), 1);
569 else
570 unregister_irqsoff_function(is_graph());
571}
572
573static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set)
574{
575 if (mask & TRACE_ITER_FUNCTION)
576 irqsoff_function_set(set);
577
578 return trace_keep_overwrite(tracer, mask, set);
579}
580
581static int start_irqsoff_tracer(struct trace_array *tr, int graph)
582{
583 int ret;
584
585 ret = register_irqsoff_function(graph, 0);
540 586
541 if (!ret && tracing_is_enabled()) 587 if (!ret && tracing_is_enabled())
542 tracer_enabled = 1; 588 tracer_enabled = 1;
@@ -550,22 +596,22 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
550{ 596{
551 tracer_enabled = 0; 597 tracer_enabled = 0;
552 598
553 if (!graph) 599 unregister_irqsoff_function(graph);
554 unregister_ftrace_function(&trace_ops);
555 else
556 unregister_ftrace_graph();
557} 600}
558 601
559static void __irqsoff_tracer_init(struct trace_array *tr) 602static void __irqsoff_tracer_init(struct trace_array *tr)
560{ 603{
561 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; 604 save_flags = trace_flags;
562 trace_flags |= TRACE_ITER_LATENCY_FMT; 605
606 /* non overwrite screws up the latency tracers */
607 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
608 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
563 609
564 tracing_max_latency = 0; 610 tracing_max_latency = 0;
565 irqsoff_trace = tr; 611 irqsoff_trace = tr;
566 /* make sure that the tracer is visible */ 612 /* make sure that the tracer is visible */
567 smp_wmb(); 613 smp_wmb();
568 tracing_reset_online_cpus(tr); 614 tracing_reset_online_cpus(&tr->trace_buffer);
569 615
570 if (start_irqsoff_tracer(tr, is_graph())) 616 if (start_irqsoff_tracer(tr, is_graph()))
571 printk(KERN_ERR "failed to start irqsoff tracer\n"); 617 printk(KERN_ERR "failed to start irqsoff tracer\n");
@@ -573,10 +619,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
573 619
574static void irqsoff_tracer_reset(struct trace_array *tr) 620static void irqsoff_tracer_reset(struct trace_array *tr)
575{ 621{
622 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
623 int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
624
576 stop_irqsoff_tracer(tr, is_graph()); 625 stop_irqsoff_tracer(tr, is_graph());
577 626
578 if (!save_lat_flag) 627 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
579 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 628 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
580} 629}
581 630
582static void irqsoff_tracer_start(struct trace_array *tr) 631static void irqsoff_tracer_start(struct trace_array *tr)
@@ -609,6 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly =
609 .print_line = irqsoff_print_line, 658 .print_line = irqsoff_print_line,
610 .flags = &tracer_flags, 659 .flags = &tracer_flags,
611 .set_flag = irqsoff_set_flag, 660 .set_flag = irqsoff_set_flag,
661 .flag_changed = irqsoff_flag_changed,
612#ifdef CONFIG_FTRACE_SELFTEST 662#ifdef CONFIG_FTRACE_SELFTEST
613 .selftest = trace_selftest_startup_irqsoff, 663 .selftest = trace_selftest_startup_irqsoff,
614#endif 664#endif
@@ -642,6 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly =
642 .print_line = irqsoff_print_line, 692 .print_line = irqsoff_print_line,
643 .flags = &tracer_flags, 693 .flags = &tracer_flags,
644 .set_flag = irqsoff_set_flag, 694 .set_flag = irqsoff_set_flag,
695 .flag_changed = irqsoff_flag_changed,
645#ifdef CONFIG_FTRACE_SELFTEST 696#ifdef CONFIG_FTRACE_SELFTEST
646 .selftest = trace_selftest_startup_preemptoff, 697 .selftest = trace_selftest_startup_preemptoff,
647#endif 698#endif
@@ -677,6 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
677 .print_line = irqsoff_print_line, 728 .print_line = irqsoff_print_line,
678 .flags = &tracer_flags, 729 .flags = &tracer_flags,
679 .set_flag = irqsoff_set_flag, 730 .set_flag = irqsoff_set_flag,
731 .flag_changed = irqsoff_flag_changed,
680#ifdef CONFIG_FTRACE_SELFTEST 732#ifdef CONFIG_FTRACE_SELFTEST
681 .selftest = trace_selftest_startup_preemptirqsoff, 733 .selftest = trace_selftest_startup_preemptirqsoff,
682#endif 734#endif
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 3c5c5dfea0b3..bd90e1b06088 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
26 trace_init_global_iter(&iter); 26 trace_init_global_iter(&iter);
27 27
28 for_each_tracing_cpu(cpu) { 28 for_each_tracing_cpu(cpu) {
29 atomic_inc(&iter.tr->data[cpu]->disabled); 29 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
30 } 30 }
31 31
32 old_userobj = trace_flags; 32 old_userobj = trace_flags;
@@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
43 iter.iter_flags |= TRACE_FILE_LAT_FMT; 43 iter.iter_flags |= TRACE_FILE_LAT_FMT;
44 iter.pos = -1; 44 iter.pos = -1;
45 45
46 if (cpu_file == TRACE_PIPE_ALL_CPU) { 46 if (cpu_file == RING_BUFFER_ALL_CPUS) {
47 for_each_tracing_cpu(cpu) { 47 for_each_tracing_cpu(cpu) {
48 iter.buffer_iter[cpu] = 48 iter.buffer_iter[cpu] =
49 ring_buffer_read_prepare(iter.tr->buffer, cpu); 49 ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
50 ring_buffer_read_start(iter.buffer_iter[cpu]); 50 ring_buffer_read_start(iter.buffer_iter[cpu]);
51 tracing_iter_reset(&iter, cpu); 51 tracing_iter_reset(&iter, cpu);
52 } 52 }
53 } else { 53 } else {
54 iter.cpu_file = cpu_file; 54 iter.cpu_file = cpu_file;
55 iter.buffer_iter[cpu_file] = 55 iter.buffer_iter[cpu_file] =
56 ring_buffer_read_prepare(iter.tr->buffer, cpu_file); 56 ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]); 57 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file); 58 tracing_iter_reset(&iter, cpu_file);
59 } 59 }
@@ -83,7 +83,7 @@ out:
83 trace_flags = old_userobj; 83 trace_flags = old_userobj;
84 84
85 for_each_tracing_cpu(cpu) { 85 for_each_tracing_cpu(cpu) {
86 atomic_dec(&iter.tr->data[cpu]->disabled); 86 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
87 } 87 }
88 88
89 for_each_tracing_cpu(cpu) 89 for_each_tracing_cpu(cpu)
@@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)
115 !cpu_online(cpu_file)) 115 !cpu_online(cpu_file))
116 return KDB_BADINT; 116 return KDB_BADINT;
117 } else { 117 } else {
118 cpu_file = TRACE_PIPE_ALL_CPU; 118 cpu_file = RING_BUFFER_ALL_CPUS;
119 } 119 }
120 120
121 kdb_trap_printk++; 121 kdb_trap_printk++;
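
The kdb dump path now reaches per-CPU state through per_cpu_ptr(iter.trace_buffer->data, cpu) and uses RING_BUFFER_ALL_CPUS in place of the old TRACE_PIPE_ALL_CPU, but the shape of the routine is unchanged: bump every CPU's disabled counter, read either one buffer or all of them, then drop the counters again. A small C11 sketch of that bracket-the-dump pattern (NR_CPUS, ALL_CPUS and the dump helper are invented for the example):

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS  4
#define ALL_CPUS (-1)                   /* stand-in for RING_BUFFER_ALL_CPUS */

/* One "disabled" counter per CPU, as in struct trace_array_cpu. */
static atomic_int disabled[NR_CPUS];

static void dump_one_cpu(int cpu)
{
        printf("dumping cpu %d (disabled=%d)\n", cpu, atomic_load(&disabled[cpu]));
}

static void ftrace_dump_buf(int cpu_file)
{
        int cpu;

        /* Quiesce tracing on every CPU before touching the buffers. */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                atomic_fetch_add(&disabled[cpu], 1);

        if (cpu_file == ALL_CPUS)
                for (cpu = 0; cpu < NR_CPUS; cpu++)
                        dump_one_cpu(cpu);
        else
                dump_one_cpu(cpu_file);

        /* Re-enable tracing once the dump has finished. */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                atomic_fetch_sub(&disabled[cpu], 1);
}

int main(void)
{
        ftrace_dump_buf(ALL_CPUS);
        ftrace_dump_buf(2);
        return 0;
}
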
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1865d5f76538..9f46e98ba8f2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -27,7 +27,6 @@
27/** 27/**
28 * Kprobe event core functions 28 * Kprobe event core functions
29 */ 29 */
30
31struct trace_probe { 30struct trace_probe {
32 struct list_head list; 31 struct list_head list;
33 struct kretprobe rp; /* Use rp.kp for kprobe use */ 32 struct kretprobe rp; /* Use rp.kp for kprobe use */
@@ -36,6 +35,7 @@ struct trace_probe {
36 const char *symbol; /* symbol name */ 35 const char *symbol; /* symbol name */
37 struct ftrace_event_class class; 36 struct ftrace_event_class class;
38 struct ftrace_event_call call; 37 struct ftrace_event_call call;
38 struct ftrace_event_file * __rcu *files;
39 ssize_t size; /* trace entry size */ 39 ssize_t size; /* trace entry size */
40 unsigned int nr_args; 40 unsigned int nr_args;
41 struct probe_arg args[]; 41 struct probe_arg args[];
@@ -46,7 +46,7 @@ struct trace_probe {
46 (sizeof(struct probe_arg) * (n))) 46 (sizeof(struct probe_arg) * (n)))
47 47
48 48
49static __kprobes int trace_probe_is_return(struct trace_probe *tp) 49static __kprobes bool trace_probe_is_return(struct trace_probe *tp)
50{ 50{
51 return tp->rp.handler != NULL; 51 return tp->rp.handler != NULL;
52} 52}
@@ -183,12 +183,63 @@ static struct trace_probe *find_trace_probe(const char *event,
183 return NULL; 183 return NULL;
184} 184}
185 185
186/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ 186static int trace_probe_nr_files(struct trace_probe *tp)
187static int enable_trace_probe(struct trace_probe *tp, int flag) 187{
188 struct ftrace_event_file **file;
189 int ret = 0;
190
191 /*
192 * Since all tp->files updater is protected by probe_enable_lock,
193 * we don't need to lock an rcu_read_lock.
194 */
195 file = rcu_dereference_raw(tp->files);
196 if (file)
197 while (*(file++))
198 ret++;
199
200 return ret;
201}
202
203static DEFINE_MUTEX(probe_enable_lock);
204
205/*
206 * Enable trace_probe
207 * if the file is NULL, enable "perf" handler, or enable "trace" handler.
208 */
209static int
210enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
188{ 211{
189 int ret = 0; 212 int ret = 0;
190 213
191 tp->flags |= flag; 214 mutex_lock(&probe_enable_lock);
215
216 if (file) {
217 struct ftrace_event_file **new, **old;
218 int n = trace_probe_nr_files(tp);
219
220 old = rcu_dereference_raw(tp->files);
221 /* 1 is for new one and 1 is for stopper */
222 new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *),
223 GFP_KERNEL);
224 if (!new) {
225 ret = -ENOMEM;
226 goto out_unlock;
227 }
228 memcpy(new, old, n * sizeof(struct ftrace_event_file *));
229 new[n] = file;
230 /* The last one keeps a NULL */
231
232 rcu_assign_pointer(tp->files, new);
233 tp->flags |= TP_FLAG_TRACE;
234
235 if (old) {
236 /* Make sure the probe is done with old files */
237 synchronize_sched();
238 kfree(old);
239 }
240 } else
241 tp->flags |= TP_FLAG_PROFILE;
242
192 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && 243 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&
193 !trace_probe_has_gone(tp)) { 244 !trace_probe_has_gone(tp)) {
194 if (trace_probe_is_return(tp)) 245 if (trace_probe_is_return(tp))
@@ -197,19 +248,90 @@ static int enable_trace_probe(struct trace_probe *tp, int flag)
197 ret = enable_kprobe(&tp->rp.kp); 248 ret = enable_kprobe(&tp->rp.kp);
198 } 249 }
199 250
251 out_unlock:
252 mutex_unlock(&probe_enable_lock);
253
200 return ret; 254 return ret;
201} 255}
202 256
203/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ 257static int
204static void disable_trace_probe(struct trace_probe *tp, int flag) 258trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file)
259{
260 struct ftrace_event_file **files;
261 int i;
262
263 /*
264 * Since all tp->files updater is protected by probe_enable_lock,
265 * we don't need to lock an rcu_read_lock.
266 */
267 files = rcu_dereference_raw(tp->files);
268 if (files) {
269 for (i = 0; files[i]; i++)
270 if (files[i] == file)
271 return i;
272 }
273
274 return -1;
275}
276
277/*
278 * Disable trace_probe
279 * if the file is NULL, disable "perf" handler, or disable "trace" handler.
280 */
281static int
282disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
205{ 283{
206 tp->flags &= ~flag; 284 int ret = 0;
285
286 mutex_lock(&probe_enable_lock);
287
288 if (file) {
289 struct ftrace_event_file **new, **old;
290 int n = trace_probe_nr_files(tp);
291 int i, j;
292
293 old = rcu_dereference_raw(tp->files);
294 if (n == 0 || trace_probe_file_index(tp, file) < 0) {
295 ret = -EINVAL;
296 goto out_unlock;
297 }
298
299 if (n == 1) { /* Remove the last file */
300 tp->flags &= ~TP_FLAG_TRACE;
301 new = NULL;
302 } else {
303 new = kzalloc(n * sizeof(struct ftrace_event_file *),
304 GFP_KERNEL);
305 if (!new) {
306 ret = -ENOMEM;
307 goto out_unlock;
308 }
309
310 /* This copy & check loop copies the NULL stopper too */
311 for (i = 0, j = 0; j < n && i < n + 1; i++)
312 if (old[i] != file)
313 new[j++] = old[i];
314 }
315
316 rcu_assign_pointer(tp->files, new);
317
318 /* Make sure the probe is done with old files */
319 synchronize_sched();
320 kfree(old);
321 } else
322 tp->flags &= ~TP_FLAG_PROFILE;
323
207 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { 324 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {
208 if (trace_probe_is_return(tp)) 325 if (trace_probe_is_return(tp))
209 disable_kretprobe(&tp->rp); 326 disable_kretprobe(&tp->rp);
210 else 327 else
211 disable_kprobe(&tp->rp.kp); 328 disable_kprobe(&tp->rp.kp);
212 } 329 }
330
331 out_unlock:
332 mutex_unlock(&probe_enable_lock);
333
334 return ret;
213} 335}
214 336
215/* Internal register function - just handle k*probes and flags */ 337/* Internal register function - just handle k*probes and flags */
@@ -723,9 +845,10 @@ static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
723} 845}
724 846
725/* Kprobe handler */ 847/* Kprobe handler */
726static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 848static __kprobes void
849__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
850 struct ftrace_event_file *ftrace_file)
727{ 851{
728 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
729 struct kprobe_trace_entry_head *entry; 852 struct kprobe_trace_entry_head *entry;
730 struct ring_buffer_event *event; 853 struct ring_buffer_event *event;
731 struct ring_buffer *buffer; 854 struct ring_buffer *buffer;
@@ -733,7 +856,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
733 unsigned long irq_flags; 856 unsigned long irq_flags;
734 struct ftrace_event_call *call = &tp->call; 857 struct ftrace_event_call *call = &tp->call;
735 858
736 tp->nhit++; 859 WARN_ON(call != ftrace_file->event_call);
860
861 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
862 return;
737 863
738 local_save_flags(irq_flags); 864 local_save_flags(irq_flags);
739 pc = preempt_count(); 865 pc = preempt_count();
@@ -741,13 +867,14 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
741 dsize = __get_data_size(tp, regs); 867 dsize = __get_data_size(tp, regs);
742 size = sizeof(*entry) + tp->size + dsize; 868 size = sizeof(*entry) + tp->size + dsize;
743 869
744 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 870 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
745 size, irq_flags, pc); 871 call->event.type,
872 size, irq_flags, pc);
746 if (!event) 873 if (!event)
747 return; 874 return;
748 875
749 entry = ring_buffer_event_data(event); 876 entry = ring_buffer_event_data(event);
750 entry->ip = (unsigned long)kp->addr; 877 entry->ip = (unsigned long)tp->rp.kp.addr;
751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 878 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
752 879
753 if (!filter_current_check_discard(buffer, call, entry, event)) 880 if (!filter_current_check_discard(buffer, call, entry, event))
@@ -755,11 +882,31 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
755 irq_flags, pc, regs); 882 irq_flags, pc, regs);
756} 883}
757 884
885static __kprobes void
886kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
887{
888 /*
889 * Note: preempt is already disabled around the kprobe handler.
890 * However, we still need an smp_read_barrier_depends() corresponding
891 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
892 */
893 struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
894
895 if (unlikely(!file))
896 return;
897
898 while (*file) {
899 __kprobe_trace_func(tp, regs, *file);
900 file++;
901 }
902}
903
758/* Kretprobe handler */ 904/* Kretprobe handler */
759static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, 905static __kprobes void
760 struct pt_regs *regs) 906__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
907 struct pt_regs *regs,
908 struct ftrace_event_file *ftrace_file)
761{ 909{
762 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
763 struct kretprobe_trace_entry_head *entry; 910 struct kretprobe_trace_entry_head *entry;
764 struct ring_buffer_event *event; 911 struct ring_buffer_event *event;
765 struct ring_buffer *buffer; 912 struct ring_buffer *buffer;
@@ -767,14 +914,20 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
767 unsigned long irq_flags; 914 unsigned long irq_flags;
768 struct ftrace_event_call *call = &tp->call; 915 struct ftrace_event_call *call = &tp->call;
769 916
917 WARN_ON(call != ftrace_file->event_call);
918
919 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
920 return;
921
770 local_save_flags(irq_flags); 922 local_save_flags(irq_flags);
771 pc = preempt_count(); 923 pc = preempt_count();
772 924
773 dsize = __get_data_size(tp, regs); 925 dsize = __get_data_size(tp, regs);
774 size = sizeof(*entry) + tp->size + dsize; 926 size = sizeof(*entry) + tp->size + dsize;
775 927
776 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 928 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
777 size, irq_flags, pc); 929 call->event.type,
930 size, irq_flags, pc);
778 if (!event) 931 if (!event)
779 return; 932 return;
780 933
@@ -788,8 +941,28 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
788 irq_flags, pc, regs); 941 irq_flags, pc, regs);
789} 942}
790 943
944static __kprobes void
945kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
946 struct pt_regs *regs)
947{
948 /*
949 * Note: preempt is already disabled around the kprobe handler.
950 * However, we still need an smp_read_barrier_depends() corresponding
951 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
952 */
953 struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
954
955 if (unlikely(!file))
956 return;
957
958 while (*file) {
959 __kretprobe_trace_func(tp, ri, regs, *file);
960 file++;
961 }
962}
963
791/* Event entry printers */ 964/* Event entry printers */
792enum print_line_t 965static enum print_line_t
793print_kprobe_event(struct trace_iterator *iter, int flags, 966print_kprobe_event(struct trace_iterator *iter, int flags,
794 struct trace_event *event) 967 struct trace_event *event)
795{ 968{
@@ -825,7 +998,7 @@ partial:
825 return TRACE_TYPE_PARTIAL_LINE; 998 return TRACE_TYPE_PARTIAL_LINE;
826} 999}
827 1000
828enum print_line_t 1001static enum print_line_t
829print_kretprobe_event(struct trace_iterator *iter, int flags, 1002print_kretprobe_event(struct trace_iterator *iter, int flags,
830 struct trace_event *event) 1003 struct trace_event *event)
831{ 1004{
@@ -975,10 +1148,9 @@ static int set_print_fmt(struct trace_probe *tp)
975#ifdef CONFIG_PERF_EVENTS 1148#ifdef CONFIG_PERF_EVENTS
976 1149
977/* Kprobe profile handler */ 1150/* Kprobe profile handler */
978static __kprobes void kprobe_perf_func(struct kprobe *kp, 1151static __kprobes void
979 struct pt_regs *regs) 1152kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
980{ 1153{
981 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
982 struct ftrace_event_call *call = &tp->call; 1154 struct ftrace_event_call *call = &tp->call;
983 struct kprobe_trace_entry_head *entry; 1155 struct kprobe_trace_entry_head *entry;
984 struct hlist_head *head; 1156 struct hlist_head *head;
@@ -997,7 +1169,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
997 if (!entry) 1169 if (!entry)
998 return; 1170 return;
999 1171
1000 entry->ip = (unsigned long)kp->addr; 1172 entry->ip = (unsigned long)tp->rp.kp.addr;
1001 memset(&entry[1], 0, dsize); 1173 memset(&entry[1], 0, dsize);
1002 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1174 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1003 1175
@@ -1007,10 +1179,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1007} 1179}
1008 1180
1009/* Kretprobe profile handler */ 1181/* Kretprobe profile handler */
1010static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, 1182static __kprobes void
1011 struct pt_regs *regs) 1183kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1184 struct pt_regs *regs)
1012{ 1185{
1013 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1014 struct ftrace_event_call *call = &tp->call; 1186 struct ftrace_event_call *call = &tp->call;
1015 struct kretprobe_trace_entry_head *entry; 1187 struct kretprobe_trace_entry_head *entry;
1016 struct hlist_head *head; 1188 struct hlist_head *head;
@@ -1044,20 +1216,19 @@ int kprobe_register(struct ftrace_event_call *event,
1044 enum trace_reg type, void *data) 1216 enum trace_reg type, void *data)
1045{ 1217{
1046 struct trace_probe *tp = (struct trace_probe *)event->data; 1218 struct trace_probe *tp = (struct trace_probe *)event->data;
1219 struct ftrace_event_file *file = data;
1047 1220
1048 switch (type) { 1221 switch (type) {
1049 case TRACE_REG_REGISTER: 1222 case TRACE_REG_REGISTER:
1050 return enable_trace_probe(tp, TP_FLAG_TRACE); 1223 return enable_trace_probe(tp, file);
1051 case TRACE_REG_UNREGISTER: 1224 case TRACE_REG_UNREGISTER:
1052 disable_trace_probe(tp, TP_FLAG_TRACE); 1225 return disable_trace_probe(tp, file);
1053 return 0;
1054 1226
1055#ifdef CONFIG_PERF_EVENTS 1227#ifdef CONFIG_PERF_EVENTS
1056 case TRACE_REG_PERF_REGISTER: 1228 case TRACE_REG_PERF_REGISTER:
1057 return enable_trace_probe(tp, TP_FLAG_PROFILE); 1229 return enable_trace_probe(tp, NULL);
1058 case TRACE_REG_PERF_UNREGISTER: 1230 case TRACE_REG_PERF_UNREGISTER:
1059 disable_trace_probe(tp, TP_FLAG_PROFILE); 1231 return disable_trace_probe(tp, NULL);
1060 return 0;
1061 case TRACE_REG_PERF_OPEN: 1232 case TRACE_REG_PERF_OPEN:
1062 case TRACE_REG_PERF_CLOSE: 1233 case TRACE_REG_PERF_CLOSE:
1063 case TRACE_REG_PERF_ADD: 1234 case TRACE_REG_PERF_ADD:
@@ -1073,11 +1244,13 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1073{ 1244{
1074 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1245 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1075 1246
1247 tp->nhit++;
1248
1076 if (tp->flags & TP_FLAG_TRACE) 1249 if (tp->flags & TP_FLAG_TRACE)
1077 kprobe_trace_func(kp, regs); 1250 kprobe_trace_func(tp, regs);
1078#ifdef CONFIG_PERF_EVENTS 1251#ifdef CONFIG_PERF_EVENTS
1079 if (tp->flags & TP_FLAG_PROFILE) 1252 if (tp->flags & TP_FLAG_PROFILE)
1080 kprobe_perf_func(kp, regs); 1253 kprobe_perf_func(tp, regs);
1081#endif 1254#endif
1082 return 0; /* We don't tweek kernel, so just return 0 */ 1255 return 0; /* We don't tweek kernel, so just return 0 */
1083} 1256}
@@ -1087,11 +1260,13 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1087{ 1260{
1088 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1261 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1089 1262
1263 tp->nhit++;
1264
1090 if (tp->flags & TP_FLAG_TRACE) 1265 if (tp->flags & TP_FLAG_TRACE)
1091 kretprobe_trace_func(ri, regs); 1266 kretprobe_trace_func(tp, ri, regs);
1092#ifdef CONFIG_PERF_EVENTS 1267#ifdef CONFIG_PERF_EVENTS
1093 if (tp->flags & TP_FLAG_PROFILE) 1268 if (tp->flags & TP_FLAG_PROFILE)
1094 kretprobe_perf_func(ri, regs); 1269 kretprobe_perf_func(tp, ri, regs);
1095#endif 1270#endif
1096 return 0; /* We don't tweek kernel, so just return 0 */ 1271 return 0; /* We don't tweek kernel, so just return 0 */
1097} 1272}
@@ -1189,11 +1364,24 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
1189 return a1 + a2 + a3 + a4 + a5 + a6; 1364 return a1 + a2 + a3 + a4 + a5 + a6;
1190} 1365}
1191 1366
1367static struct ftrace_event_file *
1368find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr)
1369{
1370 struct ftrace_event_file *file;
1371
1372 list_for_each_entry(file, &tr->events, list)
1373 if (file->event_call == &tp->call)
1374 return file;
1375
1376 return NULL;
1377}
1378
1192static __init int kprobe_trace_self_tests_init(void) 1379static __init int kprobe_trace_self_tests_init(void)
1193{ 1380{
1194 int ret, warn = 0; 1381 int ret, warn = 0;
1195 int (*target)(int, int, int, int, int, int); 1382 int (*target)(int, int, int, int, int, int);
1196 struct trace_probe *tp; 1383 struct trace_probe *tp;
1384 struct ftrace_event_file *file;
1197 1385
1198 target = kprobe_trace_selftest_target; 1386 target = kprobe_trace_selftest_target;
1199 1387
@@ -1203,31 +1391,43 @@ static __init int kprobe_trace_self_tests_init(void)
1203 "$stack $stack0 +0($stack)", 1391 "$stack $stack0 +0($stack)",
1204 create_trace_probe); 1392 create_trace_probe);
1205 if (WARN_ON_ONCE(ret)) { 1393 if (WARN_ON_ONCE(ret)) {
1206 pr_warning("error on probing function entry.\n"); 1394 pr_warn("error on probing function entry.\n");
1207 warn++; 1395 warn++;
1208 } else { 1396 } else {
1209 /* Enable trace point */ 1397 /* Enable trace point */
1210 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); 1398 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
1211 if (WARN_ON_ONCE(tp == NULL)) { 1399 if (WARN_ON_ONCE(tp == NULL)) {
1212 pr_warning("error on getting new probe.\n"); 1400 pr_warn("error on getting new probe.\n");
1213 warn++; 1401 warn++;
1214 } else 1402 } else {
1215 enable_trace_probe(tp, TP_FLAG_TRACE); 1403 file = find_trace_probe_file(tp, top_trace_array());
1404 if (WARN_ON_ONCE(file == NULL)) {
1405 pr_warn("error on getting probe file.\n");
1406 warn++;
1407 } else
1408 enable_trace_probe(tp, file);
1409 }
1216 } 1410 }
1217 1411
1218 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " 1412 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
1219 "$retval", create_trace_probe); 1413 "$retval", create_trace_probe);
1220 if (WARN_ON_ONCE(ret)) { 1414 if (WARN_ON_ONCE(ret)) {
1221 pr_warning("error on probing function return.\n"); 1415 pr_warn("error on probing function return.\n");
1222 warn++; 1416 warn++;
1223 } else { 1417 } else {
1224 /* Enable trace point */ 1418 /* Enable trace point */
1225 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); 1419 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
1226 if (WARN_ON_ONCE(tp == NULL)) { 1420 if (WARN_ON_ONCE(tp == NULL)) {
1227 pr_warning("error on getting new probe.\n"); 1421 pr_warn("error on getting 2nd new probe.\n");
1228 warn++; 1422 warn++;
1229 } else 1423 } else {
1230 enable_trace_probe(tp, TP_FLAG_TRACE); 1424 file = find_trace_probe_file(tp, top_trace_array());
1425 if (WARN_ON_ONCE(file == NULL)) {
1426 pr_warn("error on getting probe file.\n");
1427 warn++;
1428 } else
1429 enable_trace_probe(tp, file);
1430 }
1231 } 1431 }
1232 1432
1233 if (warn) 1433 if (warn)
@@ -1238,27 +1438,39 @@ static __init int kprobe_trace_self_tests_init(void)
1238 /* Disable trace points before removing it */ 1438 /* Disable trace points before removing it */
1239 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); 1439 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
1240 if (WARN_ON_ONCE(tp == NULL)) { 1440 if (WARN_ON_ONCE(tp == NULL)) {
1241 pr_warning("error on getting test probe.\n"); 1441 pr_warn("error on getting test probe.\n");
1242 warn++; 1442 warn++;
1243 } else 1443 } else {
1244 disable_trace_probe(tp, TP_FLAG_TRACE); 1444 file = find_trace_probe_file(tp, top_trace_array());
1445 if (WARN_ON_ONCE(file == NULL)) {
1446 pr_warn("error on getting probe file.\n");
1447 warn++;
1448 } else
1449 disable_trace_probe(tp, file);
1450 }
1245 1451
1246 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); 1452 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
1247 if (WARN_ON_ONCE(tp == NULL)) { 1453 if (WARN_ON_ONCE(tp == NULL)) {
1248 pr_warning("error on getting 2nd test probe.\n"); 1454 pr_warn("error on getting 2nd test probe.\n");
1249 warn++; 1455 warn++;
1250 } else 1456 } else {
1251 disable_trace_probe(tp, TP_FLAG_TRACE); 1457 file = find_trace_probe_file(tp, top_trace_array());
1458 if (WARN_ON_ONCE(file == NULL)) {
1459 pr_warn("error on getting probe file.\n");
1460 warn++;
1461 } else
1462 disable_trace_probe(tp, file);
1463 }
1252 1464
1253 ret = traceprobe_command("-:testprobe", create_trace_probe); 1465 ret = traceprobe_command("-:testprobe", create_trace_probe);
1254 if (WARN_ON_ONCE(ret)) { 1466 if (WARN_ON_ONCE(ret)) {
1255 pr_warning("error on deleting a probe.\n"); 1467 pr_warn("error on deleting a probe.\n");
1256 warn++; 1468 warn++;
1257 } 1469 }
1258 1470
1259 ret = traceprobe_command("-:testprobe2", create_trace_probe); 1471 ret = traceprobe_command("-:testprobe2", create_trace_probe);
1260 if (WARN_ON_ONCE(ret)) { 1472 if (WARN_ON_ONCE(ret)) {
1261 pr_warning("error on deleting a probe.\n"); 1473 pr_warn("error on deleting a probe.\n");
1262 warn++; 1474 warn++;
1263 } 1475 }
1264 1476
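
The bulk of the trace_kprobe.c change teaches each trace_probe to carry an RCU-managed, NULL-terminated array of ftrace_event_file pointers so a kprobe event can be enabled per trace instance: enable_trace_probe() allocates a copy with room for the new file plus the NULL stopper and publishes it with rcu_assign_pointer(), disable_trace_probe() builds a shrunken copy (or drops the array entirely when the last file goes away), and both free the old array only after synchronize_sched() so in-flight probe hits never touch freed memory. The sketch below reproduces just the copy-on-update array logic in plain C; the RCU publication, the deferred free and probe_enable_lock are only noted in comments, and struct event_file stands in for struct ftrace_event_file.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for struct ftrace_event_file; only pointer identity matters. */
struct event_file { const char *name; };

/*
 * NULL-terminated array of enabled files, swapped wholesale on every
 * update.  The kernel publishes the new array with rcu_assign_pointer()
 * and frees the old one after synchronize_sched(); here we free eagerly,
 * and all updates would be serialized by probe_enable_lock.
 */
static struct event_file **files;

static int nr_files(void)
{
        int n = 0;

        if (files)
                while (files[n])
                        n++;
        return n;
}

static int find_file(struct event_file *file)
{
        int i, n = nr_files();

        for (i = 0; i < n; i++)
                if (files[i] == file)
                        return i;
        return -1;
}

static int add_file(struct event_file *file)
{
        int n = nr_files();
        /* one slot for the new entry, one for the NULL stopper */
        struct event_file **new = calloc(n + 2, sizeof(*new));

        if (!new)
                return -1;
        if (files)
                memcpy(new, files, n * sizeof(*new));
        new[n] = file;                  /* new[n + 1] stays NULL */
        free(files);                    /* kernel: deferred via RCU */
        files = new;
        return 0;
}

static int remove_file(struct event_file *file)
{
        int n = nr_files(), i, j;
        struct event_file **new = NULL;

        if (n == 0 || find_file(file) < 0)
                return -1;
        if (n > 1) {                    /* n == 1: removing the last file */
                new = calloc(n, sizeof(*new));  /* n - 1 entries plus NULL */
                if (!new)
                        return -1;
                for (i = 0, j = 0; i < n; i++)
                        if (files[i] != file)
                                new[j++] = files[i];
        }
        free(files);
        files = new;
        return 0;
}

int main(void)
{
        struct event_file a = { "instance-a" }, b = { "instance-b" };

        add_file(&a);
        add_file(&b);
        remove_file(&a);
        printf("%d file(s) left, first: %s\n", nr_files(), files[0]->name);
        return 0;
}

The probe handlers then walk the array without taking a lock: kprobe_trace_func() and kretprobe_trace_func() iterate tp->files and call the per-file __kprobe_trace_func()/__kretprobe_trace_func() once for each enabled instance, and simply return when the pointer is NULL (a perf-only probe).
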
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fd3c8aae55e5..a5e8f4878bfa 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)
31 overrun_detected = false; 31 overrun_detected = false;
32 prev_overruns = 0; 32 prev_overruns = 0;
33 33
34 tracing_reset_online_cpus(tr); 34 tracing_reset_online_cpus(&tr->trace_buffer);
35} 35}
36 36
37static int mmio_trace_init(struct trace_array *tr) 37static int mmio_trace_init(struct trace_array *tr)
@@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)
128static unsigned long count_overruns(struct trace_iterator *iter) 128static unsigned long count_overruns(struct trace_iterator *iter)
129{ 129{
130 unsigned long cnt = atomic_xchg(&dropped_count, 0); 130 unsigned long cnt = atomic_xchg(&dropped_count, 0);
131 unsigned long over = ring_buffer_overruns(iter->tr->buffer); 131 unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);
132 132
133 if (over > prev_overruns) 133 if (over > prev_overruns)
134 cnt += over - prev_overruns; 134 cnt += over - prev_overruns;
@@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
309 struct mmiotrace_rw *rw) 309 struct mmiotrace_rw *rw)
310{ 310{
311 struct ftrace_event_call *call = &event_mmiotrace_rw; 311 struct ftrace_event_call *call = &event_mmiotrace_rw;
312 struct ring_buffer *buffer = tr->buffer; 312 struct ring_buffer *buffer = tr->trace_buffer.buffer;
313 struct ring_buffer_event *event; 313 struct ring_buffer_event *event;
314 struct trace_mmiotrace_rw *entry; 314 struct trace_mmiotrace_rw *entry;
315 int pc = preempt_count(); 315 int pc = preempt_count();
@@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
330void mmio_trace_rw(struct mmiotrace_rw *rw) 330void mmio_trace_rw(struct mmiotrace_rw *rw)
331{ 331{
332 struct trace_array *tr = mmio_trace_array; 332 struct trace_array *tr = mmio_trace_array;
333 struct trace_array_cpu *data = tr->data[smp_processor_id()]; 333 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
334 __trace_mmiotrace_rw(tr, data, rw); 334 __trace_mmiotrace_rw(tr, data, rw);
335} 335}
336 336
@@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
339 struct mmiotrace_map *map) 339 struct mmiotrace_map *map)
340{ 340{
341 struct ftrace_event_call *call = &event_mmiotrace_map; 341 struct ftrace_event_call *call = &event_mmiotrace_map;
342 struct ring_buffer *buffer = tr->buffer; 342 struct ring_buffer *buffer = tr->trace_buffer.buffer;
343 struct ring_buffer_event *event; 343 struct ring_buffer_event *event;
344 struct trace_mmiotrace_map *entry; 344 struct trace_mmiotrace_map *entry;
345 int pc = preempt_count(); 345 int pc = preempt_count();
@@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
363 struct trace_array_cpu *data; 363 struct trace_array_cpu *data;
364 364
365 preempt_disable(); 365 preempt_disable();
366 data = tr->data[smp_processor_id()]; 366 data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
367 __trace_mmiotrace_map(tr, data, map); 367 __trace_mmiotrace_map(tr, data, map);
368 preempt_enable(); 368 preempt_enable();
369} 369}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 697e88d13907..bb922d9ee51b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
14/* must be a power of 2 */ 14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128 15#define EVENT_HASHSIZE 128
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_sem);
18 18
19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
20 20
@@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
37 return ret; 37 return ret;
38} 38}
39 39
40enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
41{
42 struct trace_seq *s = &iter->seq;
43 struct trace_entry *entry = iter->ent;
44 struct bputs_entry *field;
45 int ret;
46
47 trace_assign_type(field, entry);
48
49 ret = trace_seq_puts(s, field->str);
50 if (!ret)
51 return TRACE_TYPE_PARTIAL_LINE;
52
53 return TRACE_TYPE_HANDLED;
54}
55
40enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 56enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
41{ 57{
42 struct trace_seq *s = &iter->seq; 58 struct trace_seq *s = &iter->seq;
@@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
397} 413}
398EXPORT_SYMBOL(ftrace_print_hex_seq); 414EXPORT_SYMBOL(ftrace_print_hex_seq);
399 415
416int ftrace_raw_output_prep(struct trace_iterator *iter,
417 struct trace_event *trace_event)
418{
419 struct ftrace_event_call *event;
420 struct trace_seq *s = &iter->seq;
421 struct trace_seq *p = &iter->tmp_seq;
422 struct trace_entry *entry;
423 int ret;
424
425 event = container_of(trace_event, struct ftrace_event_call, event);
426 entry = iter->ent;
427
428 if (entry->type != event->event.type) {
429 WARN_ON_ONCE(1);
430 return TRACE_TYPE_UNHANDLED;
431 }
432
433 trace_seq_init(p);
434 ret = trace_seq_printf(s, "%s: ", event->name);
435 if (!ret)
436 return TRACE_TYPE_PARTIAL_LINE;
437
438 return 0;
439}
440EXPORT_SYMBOL(ftrace_raw_output_prep);
441
400#ifdef CONFIG_KRETPROBES 442#ifdef CONFIG_KRETPROBES
401static inline const char *kretprobed(const char *name) 443static inline const char *kretprobed(const char *name)
402{ 444{
@@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
617{ 659{
618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; 660 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; 661 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
620 unsigned long long abs_ts = iter->ts - iter->tr->time_start; 662 unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
621 unsigned long long rel_ts = next_ts - iter->ts; 663 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq; 664 struct trace_seq *s = &iter->seq;
623 665
@@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list)
783 825
784void trace_event_read_lock(void) 826void trace_event_read_lock(void)
785{ 827{
786 down_read(&trace_event_mutex); 828 down_read(&trace_event_sem);
787} 829}
788 830
789void trace_event_read_unlock(void) 831void trace_event_read_unlock(void)
790{ 832{
791 up_read(&trace_event_mutex); 833 up_read(&trace_event_sem);
792} 834}
793 835
794/** 836/**
@@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event)
811 unsigned key; 853 unsigned key;
812 int ret = 0; 854 int ret = 0;
813 855
814 down_write(&trace_event_mutex); 856 down_write(&trace_event_sem);
815 857
816 if (WARN_ON(!event)) 858 if (WARN_ON(!event))
817 goto out; 859 goto out;
@@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event)
866 908
867 ret = event->type; 909 ret = event->type;
868 out: 910 out:
869 up_write(&trace_event_mutex); 911 up_write(&trace_event_sem);
870 912
871 return ret; 913 return ret;
872} 914}
873EXPORT_SYMBOL_GPL(register_ftrace_event); 915EXPORT_SYMBOL_GPL(register_ftrace_event);
874 916
875/* 917/*
876 * Used by module code with the trace_event_mutex held for write. 918 * Used by module code with the trace_event_sem held for write.
877 */ 919 */
878int __unregister_ftrace_event(struct trace_event *event) 920int __unregister_ftrace_event(struct trace_event *event)
879{ 921{
@@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event)
888 */ 930 */
889int unregister_ftrace_event(struct trace_event *event) 931int unregister_ftrace_event(struct trace_event *event)
890{ 932{
891 down_write(&trace_event_mutex); 933 down_write(&trace_event_sem);
892 __unregister_ftrace_event(event); 934 __unregister_ftrace_event(event);
893 up_write(&trace_event_mutex); 935 up_write(&trace_event_sem);
894 936
895 return 0; 937 return 0;
896} 938}
@@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = {
1217 .funcs = &trace_user_stack_funcs, 1259 .funcs = &trace_user_stack_funcs,
1218}; 1260};
1219 1261
1262/* TRACE_BPUTS */
1263static enum print_line_t
1264trace_bputs_print(struct trace_iterator *iter, int flags,
1265 struct trace_event *event)
1266{
1267 struct trace_entry *entry = iter->ent;
1268 struct trace_seq *s = &iter->seq;
1269 struct bputs_entry *field;
1270
1271 trace_assign_type(field, entry);
1272
1273 if (!seq_print_ip_sym(s, field->ip, flags))
1274 goto partial;
1275
1276 if (!trace_seq_puts(s, ": "))
1277 goto partial;
1278
1279 if (!trace_seq_puts(s, field->str))
1280 goto partial;
1281
1282 return TRACE_TYPE_HANDLED;
1283
1284 partial:
1285 return TRACE_TYPE_PARTIAL_LINE;
1286}
1287
1288
1289static enum print_line_t
1290trace_bputs_raw(struct trace_iterator *iter, int flags,
1291 struct trace_event *event)
1292{
1293 struct bputs_entry *field;
1294 struct trace_seq *s = &iter->seq;
1295
1296 trace_assign_type(field, iter->ent);
1297
1298 if (!trace_seq_printf(s, ": %lx : ", field->ip))
1299 goto partial;
1300
1301 if (!trace_seq_puts(s, field->str))
1302 goto partial;
1303
1304 return TRACE_TYPE_HANDLED;
1305
1306 partial:
1307 return TRACE_TYPE_PARTIAL_LINE;
1308}
1309
1310static struct trace_event_functions trace_bputs_funcs = {
1311 .trace = trace_bputs_print,
1312 .raw = trace_bputs_raw,
1313};
1314
1315static struct trace_event trace_bputs_event = {
1316 .type = TRACE_BPUTS,
1317 .funcs = &trace_bputs_funcs,
1318};
1319
1220/* TRACE_BPRINT */ 1320/* TRACE_BPRINT */
1221static enum print_line_t 1321static enum print_line_t
1222trace_bprint_print(struct trace_iterator *iter, int flags, 1322trace_bprint_print(struct trace_iterator *iter, int flags,
@@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = {
1329 &trace_wake_event, 1429 &trace_wake_event,
1330 &trace_stack_event, 1430 &trace_stack_event,
1331 &trace_user_stack_event, 1431 &trace_user_stack_event,
1432 &trace_bputs_event,
1332 &trace_bprint_event, 1433 &trace_bprint_event,
1333 &trace_print_event, 1434 &trace_print_event,
1334 NULL 1435 NULL
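
trace_output.c gains the printers for the new TRACE_BPUTS entry type introduced elsewhere in this series (a trace_printk() variant that records only an instruction pointer and a pointer to a constant string) and registers trace_bputs_event alongside the existing bprint/print events. Each printer follows the usual convention: append pieces to the iterator's trace_seq and return TRACE_TYPE_PARTIAL_LINE as soon as the sequence runs out of room. A compact, self-contained imitation of that convention (struct seq, seq_puts() and the entry layout here are simplified stand-ins, not the kernel types):

#include <stdio.h>
#include <string.h>

/* Return-value convention mirrored from enum print_line_t. */
enum print_line { HANDLED, PARTIAL_LINE };

/* Minimal stand-in for a bputs entry: an address plus a constant string. */
struct bputs_entry {
        unsigned long ip;
        const char *str;
};

/* A bounded output sequence, loosely modelled on struct trace_seq. */
struct seq {
        char buf[64];
        size_t len;
};

static int seq_puts(struct seq *s, const char *str)
{
        size_t n = strlen(str);

        if (s->len + n >= sizeof(s->buf))
                return 0;               /* no room left: caller reports a partial line */
        memcpy(s->buf + s->len, str, n);
        s->len += n;
        s->buf[s->len] = '\0';
        return 1;
}

/* Text formatter, in the spirit of trace_bputs_print(): "<ip>: <string>". */
static enum print_line bputs_print(struct seq *s, const struct bputs_entry *f)
{
        char ip[32];

        snprintf(ip, sizeof(ip), "%#lx", f->ip);
        if (!seq_puts(s, ip) || !seq_puts(s, ": ") || !seq_puts(s, f->str))
                return PARTIAL_LINE;
        return HANDLED;
}

int main(void)
{
        struct bputs_entry e = { 0xdeadbeef, "hello from trace_bputs\n" };
        struct seq s = { .len = 0 };

        if (bputs_print(&s, &e) == HANDLED)
                fputs(s.buf, stdout);
        return 0;
}
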
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index c038eba0492b..127a9d8c8357 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -5,6 +5,8 @@
5#include "trace.h" 5#include "trace.h"
6 6
7extern enum print_line_t 7extern enum print_line_t
8trace_print_bputs_msg_only(struct trace_iterator *iter);
9extern enum print_line_t
8trace_print_bprintk_msg_only(struct trace_iterator *iter); 10trace_print_bprintk_msg_only(struct trace_iterator *iter);
9extern enum print_line_t 11extern enum print_line_t
10trace_print_printk_msg_only(struct trace_iterator *iter); 12trace_print_printk_msg_only(struct trace_iterator *iter);
@@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 33
32/* used by module unregistering */ 34/* used by module unregistering */
33extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
34extern struct rw_semaphore trace_event_mutex; 36extern struct rw_semaphore trace_event_sem;
35 37
36#define MAX_MEMHEX_BYTES 8 38#define MAX_MEMHEX_BYTES 8
37#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) 39#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3374c792ccd8..4e98e3b257a3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
28 unsigned long flags, int pc) 28 unsigned long flags, int pc)
29{ 29{
30 struct ftrace_event_call *call = &event_context_switch; 30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer; 31 struct ring_buffer *buffer = tr->trace_buffer.buffer;
32 struct ring_buffer_event *event; 32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry; 33 struct ctx_switch_entry *entry;
34 34
@@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
69 pc = preempt_count(); 69 pc = preempt_count();
70 local_irq_save(flags); 70 local_irq_save(flags);
71 cpu = raw_smp_processor_id(); 71 cpu = raw_smp_processor_id();
72 data = ctx_trace->data[cpu]; 72 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
73 73
74 if (likely(!atomic_read(&data->disabled))) 74 if (likely(!atomic_read(&data->disabled)))
75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); 75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
@@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
86 struct ftrace_event_call *call = &event_wakeup; 86 struct ftrace_event_call *call = &event_wakeup;
87 struct ring_buffer_event *event; 87 struct ring_buffer_event *event;
88 struct ctx_switch_entry *entry; 88 struct ctx_switch_entry *entry;
89 struct ring_buffer *buffer = tr->buffer; 89 struct ring_buffer *buffer = tr->trace_buffer.buffer;
90 90
91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, 91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
92 sizeof(*entry), flags, pc); 92 sizeof(*entry), flags, pc);
@@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
123 pc = preempt_count(); 123 pc = preempt_count();
124 local_irq_save(flags); 124 local_irq_save(flags);
125 cpu = raw_smp_processor_id(); 125 cpu = raw_smp_processor_id();
126 data = ctx_trace->data[cpu]; 126 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
127 127
128 if (likely(!atomic_read(&data->disabled))) 128 if (likely(!atomic_read(&data->disabled)))
129 tracing_sched_wakeup_trace(ctx_trace, wakee, current, 129 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 75aa97fbe1a1..fee77e15d815 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -36,7 +36,8 @@ static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace); 36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace); 37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
38 38
39static int save_lat_flag; 39static int save_flags;
40static bool function_enabled;
40 41
41#define TRACE_DISPLAY_GRAPH 1 42#define TRACE_DISPLAY_GRAPH 1
42 43
@@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
89 if (cpu != wakeup_current_cpu) 90 if (cpu != wakeup_current_cpu)
90 goto out_enable; 91 goto out_enable;
91 92
92 *data = tr->data[cpu]; 93 *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
93 disabled = atomic_inc_return(&(*data)->disabled); 94 disabled = atomic_inc_return(&(*data)->disabled);
94 if (unlikely(disabled != 1)) 95 if (unlikely(disabled != 1))
95 goto out; 96 goto out;
@@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly =
134}; 135};
135#endif /* CONFIG_FUNCTION_TRACER */ 136#endif /* CONFIG_FUNCTION_TRACER */
136 137
137static int start_func_tracer(int graph) 138static int register_wakeup_function(int graph, int set)
138{ 139{
139 int ret; 140 int ret;
140 141
141 if (!graph) 142 /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
142 ret = register_ftrace_function(&trace_ops); 143 if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
143 else 144 return 0;
145
146 if (graph)
144 ret = register_ftrace_graph(&wakeup_graph_return, 147 ret = register_ftrace_graph(&wakeup_graph_return,
145 &wakeup_graph_entry); 148 &wakeup_graph_entry);
149 else
150 ret = register_ftrace_function(&trace_ops);
151
152 if (!ret)
153 function_enabled = true;
154
155 return ret;
156}
157
158static void unregister_wakeup_function(int graph)
159{
160 if (!function_enabled)
161 return;
162
163 if (graph)
164 unregister_ftrace_graph();
165 else
166 unregister_ftrace_function(&trace_ops);
167
168 function_enabled = false;
169}
170
171static void wakeup_function_set(int set)
172{
173 if (set)
174 register_wakeup_function(is_graph(), 1);
175 else
176 unregister_wakeup_function(is_graph());
177}
178
179static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set)
180{
181 if (mask & TRACE_ITER_FUNCTION)
182 wakeup_function_set(set);
183
184 return trace_keep_overwrite(tracer, mask, set);
185}
186
187static int start_func_tracer(int graph)
188{
189 int ret;
190
191 ret = register_wakeup_function(graph, 0);
146 192
147 if (!ret && tracing_is_enabled()) 193 if (!ret && tracing_is_enabled())
148 tracer_enabled = 1; 194 tracer_enabled = 1;
@@ -156,10 +202,7 @@ static void stop_func_tracer(int graph)
156{ 202{
157 tracer_enabled = 0; 203 tracer_enabled = 0;
158 204
159 if (!graph) 205 unregister_wakeup_function(graph);
160 unregister_ftrace_function(&trace_ops);
161 else
162 unregister_ftrace_graph();
163} 206}
164 207
165#ifdef CONFIG_FUNCTION_GRAPH_TRACER 208#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore,
353 396
354 /* disable local data, not wakeup_cpu data */ 397 /* disable local data, not wakeup_cpu data */
355 cpu = raw_smp_processor_id(); 398 cpu = raw_smp_processor_id();
356 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 399 disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
357 if (likely(disabled != 1)) 400 if (likely(disabled != 1))
358 goto out; 401 goto out;
359 402
@@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore,
365 goto out_unlock; 408 goto out_unlock;
366 409
367 /* The task we are waiting for is waking up */ 410 /* The task we are waiting for is waking up */
368 data = wakeup_trace->data[wakeup_cpu]; 411 data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
369 412
370 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 413 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
371 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 414 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
@@ -387,7 +430,7 @@ out_unlock:
387 arch_spin_unlock(&wakeup_lock); 430 arch_spin_unlock(&wakeup_lock);
388 local_irq_restore(flags); 431 local_irq_restore(flags);
389out: 432out:
390 atomic_dec(&wakeup_trace->data[cpu]->disabled); 433 atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
391} 434}
392 435
393static void __wakeup_reset(struct trace_array *tr) 436static void __wakeup_reset(struct trace_array *tr)
@@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr)
405{ 448{
406 unsigned long flags; 449 unsigned long flags;
407 450
408 tracing_reset_online_cpus(tr); 451 tracing_reset_online_cpus(&tr->trace_buffer);
409 452
410 local_irq_save(flags); 453 local_irq_save(flags);
411 arch_spin_lock(&wakeup_lock); 454 arch_spin_lock(&wakeup_lock);
@@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
435 return; 478 return;
436 479
437 pc = preempt_count(); 480 pc = preempt_count();
438 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 481 disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
439 if (unlikely(disabled != 1)) 482 if (unlikely(disabled != 1))
440 goto out; 483 goto out;
441 484
@@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
458 501
459 local_save_flags(flags); 502 local_save_flags(flags);
460 503
461 data = wakeup_trace->data[wakeup_cpu]; 504 data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
462 data->preempt_timestamp = ftrace_now(cpu); 505 data->preempt_timestamp = ftrace_now(cpu);
463 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); 506 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
464 507
@@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
472out_locked: 515out_locked:
473 arch_spin_unlock(&wakeup_lock); 516 arch_spin_unlock(&wakeup_lock);
474out: 517out:
475 atomic_dec(&wakeup_trace->data[cpu]->disabled); 518 atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
476} 519}
477 520
478static void start_wakeup_tracer(struct trace_array *tr) 521static void start_wakeup_tracer(struct trace_array *tr)
@@ -540,8 +583,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)
540 583
541static int __wakeup_tracer_init(struct trace_array *tr) 584static int __wakeup_tracer_init(struct trace_array *tr)
542{ 585{
543 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; 586 save_flags = trace_flags;
544 trace_flags |= TRACE_ITER_LATENCY_FMT; 587
588 /* non overwrite screws up the latency tracers */
589 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
590 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
545 591
546 tracing_max_latency = 0; 592 tracing_max_latency = 0;
547 wakeup_trace = tr; 593 wakeup_trace = tr;
@@ -563,12 +609,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
563 609
564static void wakeup_tracer_reset(struct trace_array *tr) 610static void wakeup_tracer_reset(struct trace_array *tr)
565{ 611{
612 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
613 int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
614
566 stop_wakeup_tracer(tr); 615 stop_wakeup_tracer(tr);
567 /* make sure we put back any tasks we are tracing */ 616 /* make sure we put back any tasks we are tracing */
568 wakeup_reset(tr); 617 wakeup_reset(tr);
569 618
570 if (!save_lat_flag) 619 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
571 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 620 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
572} 621}
573 622
574static void wakeup_tracer_start(struct trace_array *tr) 623static void wakeup_tracer_start(struct trace_array *tr)
@@ -594,6 +643,7 @@ static struct tracer wakeup_tracer __read_mostly =
594 .print_line = wakeup_print_line, 643 .print_line = wakeup_print_line,
595 .flags = &tracer_flags, 644 .flags = &tracer_flags,
596 .set_flag = wakeup_set_flag, 645 .set_flag = wakeup_set_flag,
646 .flag_changed = wakeup_flag_changed,
597#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
598 .selftest = trace_selftest_startup_wakeup, 648 .selftest = trace_selftest_startup_wakeup,
599#endif 649#endif
@@ -615,6 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
615 .print_line = wakeup_print_line, 665 .print_line = wakeup_print_line,
616 .flags = &tracer_flags, 666 .flags = &tracer_flags,
617 .set_flag = wakeup_set_flag, 667 .set_flag = wakeup_set_flag,
668 .flag_changed = wakeup_flag_changed,
618#ifdef CONFIG_FTRACE_SELFTEST 669#ifdef CONFIG_FTRACE_SELFTEST
619 .selftest = trace_selftest_startup_wakeup, 670 .selftest = trace_selftest_startup_wakeup,
620#endif 671#endif
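
As with irqsoff, the wakeup tracer now snapshots trace_flags and restores LATENCY_FMT/OVERWRITE on reset, and it grows a register_wakeup_function()/unregister_wakeup_function() pair guarded by a function_enabled bool so the function (or function-graph) probe is attached only when the user actually has the function option on, and is never registered or unregistered twice; wakeup_flag_changed() reacts to the option flipping at runtime and chains to trace_keep_overwrite() for the overwrite bit. A hedged sketch of just the guard logic (the probe calls and the flag bit are placeholders, not kernel interfaces):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag bit and state; not the kernel's definitions. */
#define ITER_FUNCTION (1u << 0)

static unsigned int trace_flags = ITER_FUNCTION;
static bool function_enabled;

static int register_probe(void)    { puts("probe registered");   return 0; }
static void unregister_probe(void) { puts("probe unregistered"); }

/* 'set' is true when ITER_FUNCTION is about to be turned on. */
static int register_wakeup_function(bool set)
{
        /* Already registered, or the user has function tracing off: no-op. */
        if (function_enabled || (!set && !(trace_flags & ITER_FUNCTION)))
                return 0;

        if (register_probe())
                return -1;
        function_enabled = true;
        return 0;
}

static void unregister_wakeup_function(void)
{
        if (!function_enabled)
                return;
        unregister_probe();
        function_enabled = false;
}

/* Called when the user flips the function option at runtime. */
static void wakeup_flag_changed(unsigned int mask, bool set)
{
        if (mask & ITER_FUNCTION) {
                if (set)
                        register_wakeup_function(true);
                else
                        unregister_wakeup_function();
        }
}

int main(void)
{
        register_wakeup_function(false);            /* tracer start: flag already on */
        wakeup_flag_changed(ITER_FUNCTION, false);  /* user turns it off */
        wakeup_flag_changed(ITER_FUNCTION, false);  /* second call is a no-op */
        return 0;
}
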
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 51c819c12c29..2901e3b88590 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)
21 return 0; 21 return 0;
22} 22}
23 23
24static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) 24static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
25{ 25{
26 struct ring_buffer_event *event; 26 struct ring_buffer_event *event;
27 struct trace_entry *entry; 27 struct trace_entry *entry;
28 unsigned int loops = 0; 28 unsigned int loops = 0;
29 29
30 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { 30 while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {
31 entry = ring_buffer_event_data(event); 31 entry = ring_buffer_event_data(event);
32 32
33 /* 33 /*
@@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
58 * Test the trace buffer to see if all the elements 58 * Test the trace buffer to see if all the elements
59 * are still sane. 59 * are still sane.
60 */ 60 */
61static int trace_test_buffer(struct trace_array *tr, unsigned long *count) 61static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
62{ 62{
63 unsigned long flags, cnt = 0; 63 unsigned long flags, cnt = 0;
64 int cpu, ret = 0; 64 int cpu, ret = 0;
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
67 local_irq_save(flags); 67 local_irq_save(flags);
68 arch_spin_lock(&ftrace_max_lock); 68 arch_spin_lock(&ftrace_max_lock);
69 69
70 cnt = ring_buffer_entries(tr->buffer); 70 cnt = ring_buffer_entries(buf->buffer);
71 71
72 /* 72 /*
73 * The trace_test_buffer_cpu runs a while loop to consume all data. 73 * The trace_test_buffer_cpu runs a while loop to consume all data.
@@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
78 */ 78 */
79 tracing_off(); 79 tracing_off();
80 for_each_possible_cpu(cpu) { 80 for_each_possible_cpu(cpu) {
81 ret = trace_test_buffer_cpu(tr, cpu); 81 ret = trace_test_buffer_cpu(buf, cpu);
82 if (ret) 82 if (ret)
83 break; 83 break;
84 } 84 }
@@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
355 msleep(100); 355 msleep(100);
356 356
357 /* we should have nothing in the buffer */ 357 /* we should have nothing in the buffer */
358 ret = trace_test_buffer(tr, &count); 358 ret = trace_test_buffer(&tr->trace_buffer, &count);
359 if (ret) 359 if (ret)
360 goto out; 360 goto out;
361 361
@@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
376 ftrace_enabled = 0; 376 ftrace_enabled = 0;
377 377
378 /* check the trace buffer */ 378 /* check the trace buffer */
379 ret = trace_test_buffer(tr, &count); 379 ret = trace_test_buffer(&tr->trace_buffer, &count);
380 tracing_start(); 380 tracing_start();
381 381
382 /* we should only have one item */ 382 /* we should only have one item */
@@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
666 ftrace_enabled = 0; 666 ftrace_enabled = 0;
667 667
668 /* check the trace buffer */ 668 /* check the trace buffer */
669 ret = trace_test_buffer(tr, &count); 669 ret = trace_test_buffer(&tr->trace_buffer, &count);
670 trace->reset(tr); 670 trace->reset(tr);
671 tracing_start(); 671 tracing_start();
672 672
@@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
703/* Maximum number of functions to trace before diagnosing a hang */ 703/* Maximum number of functions to trace before diagnosing a hang */
704#define GRAPH_MAX_FUNC_TEST 100000000 704#define GRAPH_MAX_FUNC_TEST 100000000
705 705
706static void
707__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
708static unsigned int graph_hang_thresh; 706static unsigned int graph_hang_thresh;
709 707
710/* Wrap the real function entry probe to avoid possible hanging */ 708/* Wrap the real function entry probe to avoid possible hanging */
@@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
714 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { 712 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
715 ftrace_graph_stop(); 713 ftrace_graph_stop();
716 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 714 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
717 if (ftrace_dump_on_oops) 715 if (ftrace_dump_on_oops) {
718 __ftrace_dump(false, DUMP_ALL); 716 ftrace_dump(DUMP_ALL);
717 /* ftrace_dump() disables tracing */
718 tracing_on();
719 }
719 return 0; 720 return 0;
720 } 721 }
721 722
@@ -737,7 +738,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
737 * Simulate the init() callback but we attach a watchdog callback 738 * Simulate the init() callback but we attach a watchdog callback
738 * to detect and recover from possible hangs 739 * to detect and recover from possible hangs
739 */ 740 */
740 tracing_reset_online_cpus(tr); 741 tracing_reset_online_cpus(&tr->trace_buffer);
741 set_graph_array(tr); 742 set_graph_array(tr);
742 ret = register_ftrace_graph(&trace_graph_return, 743 ret = register_ftrace_graph(&trace_graph_return,
743 &trace_graph_entry_watchdog); 744 &trace_graph_entry_watchdog);
@@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
760 tracing_stop(); 761 tracing_stop();
761 762
762 /* check the trace buffer */ 763 /* check the trace buffer */
763 ret = trace_test_buffer(tr, &count); 764 ret = trace_test_buffer(&tr->trace_buffer, &count);
764 765
765 trace->reset(tr); 766 trace->reset(tr);
766 tracing_start(); 767 tracing_start();
@@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
815 /* stop the tracing. */ 816 /* stop the tracing. */
816 tracing_stop(); 817 tracing_stop();
817 /* check both trace buffers */ 818 /* check both trace buffers */
818 ret = trace_test_buffer(tr, NULL); 819 ret = trace_test_buffer(&tr->trace_buffer, NULL);
819 if (!ret) 820 if (!ret)
820 ret = trace_test_buffer(&max_tr, &count); 821 ret = trace_test_buffer(&tr->max_buffer, &count);
821 trace->reset(tr); 822 trace->reset(tr);
822 tracing_start(); 823 tracing_start();
823 824
@@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
877 /* stop the tracing. */ 878 /* stop the tracing. */
878 tracing_stop(); 879 tracing_stop();
879 /* check both trace buffers */ 880 /* check both trace buffers */
880 ret = trace_test_buffer(tr, NULL); 881 ret = trace_test_buffer(&tr->trace_buffer, NULL);
881 if (!ret) 882 if (!ret)
882 ret = trace_test_buffer(&max_tr, &count); 883 ret = trace_test_buffer(&tr->max_buffer, &count);
883 trace->reset(tr); 884 trace->reset(tr);
884 tracing_start(); 885 tracing_start();
885 886
@@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
943 /* stop the tracing. */ 944 /* stop the tracing. */
944 tracing_stop(); 945 tracing_stop();
945 /* check both trace buffers */ 946 /* check both trace buffers */
946 ret = trace_test_buffer(tr, NULL); 947 ret = trace_test_buffer(&tr->trace_buffer, NULL);
947 if (ret) 948 if (ret)
948 goto out; 949 goto out;
949 950
950 ret = trace_test_buffer(&max_tr, &count); 951 ret = trace_test_buffer(&tr->max_buffer, &count);
951 if (ret) 952 if (ret)
952 goto out; 953 goto out;
953 954
@@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
973 /* stop the tracing. */ 974 /* stop the tracing. */
974 tracing_stop(); 975 tracing_stop();
975 /* check both trace buffers */ 976 /* check both trace buffers */
976 ret = trace_test_buffer(tr, NULL); 977 ret = trace_test_buffer(&tr->trace_buffer, NULL);
977 if (ret) 978 if (ret)
978 goto out; 979 goto out;
979 980
980 ret = trace_test_buffer(&max_tr, &count); 981 ret = trace_test_buffer(&tr->max_buffer, &count);
981 982
982 if (!ret && !count) { 983 if (!ret && !count) {
983 printk(KERN_CONT ".. no entries found .."); 984 printk(KERN_CONT ".. no entries found ..");
@@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1084 /* stop the tracing. */ 1085 /* stop the tracing. */
1085 tracing_stop(); 1086 tracing_stop();
1086 /* check both trace buffers */ 1087 /* check both trace buffers */
1087 ret = trace_test_buffer(tr, NULL); 1088 ret = trace_test_buffer(&tr->trace_buffer, NULL);
1088 printk("ret = %d\n", ret); 1089 printk("ret = %d\n", ret);
1089 if (!ret) 1090 if (!ret)
1090 ret = trace_test_buffer(&max_tr, &count); 1091 ret = trace_test_buffer(&tr->max_buffer, &count);
1091 1092
1092 1093
1093 trace->reset(tr); 1094 trace->reset(tr);
@@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
1126 /* stop the tracing. */ 1127 /* stop the tracing. */
1127 tracing_stop(); 1128 tracing_stop();
1128 /* check the trace buffer */ 1129 /* check the trace buffer */
1129 ret = trace_test_buffer(tr, &count); 1130 ret = trace_test_buffer(&tr->trace_buffer, &count);
1130 trace->reset(tr); 1131 trace->reset(tr);
1131 tracing_start(); 1132 tracing_start();
1132 1133
@@ -1158,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
1158 /* stop the tracing. */ 1159 /* stop the tracing. */
1159 tracing_stop(); 1160 tracing_stop();
1160 /* check the trace buffer */ 1161 /* check the trace buffer */
1161 ret = trace_test_buffer(tr, &count); 1162 ret = trace_test_buffer(&tr->trace_buffer, &count);
1162 trace->reset(tr); 1163 trace->reset(tr);
1163 tracing_start(); 1164 tracing_start();
1164 1165
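
The selftest changes are mostly a type change: trace_test_buffer() now takes a struct trace_buffer rather than a trace_array, so the same checker can validate both tr->trace_buffer and the latency snapshot in tr->max_buffer (which replaces the old global max_tr). A toy version of that one-checker, two-buffers arrangement (the structs and entry codes are invented for the example):

#include <stdio.h>

/* Minimal stand-in for a per-tracer buffer of entry type codes. */
struct trace_buffer {
        const int *entries;
        int nr;
};

struct trace_array {
        struct trace_buffer trace_buffer;       /* live buffer */
        struct trace_buffer max_buffer;         /* latency snapshot */
};

static int trace_valid_entry(int type)
{
        return type == 1 || type == 2;          /* e.g. "FN" and "CTX" entries */
}

/* Consume one buffer, counting entries and flagging corruption. */
static int trace_test_buffer(const struct trace_buffer *buf, unsigned long *count)
{
        unsigned long cnt = 0;
        int i;

        for (i = 0; i < buf->nr; i++) {
                if (!trace_valid_entry(buf->entries[i])) {
                        fprintf(stderr, "bad entry %d\n", buf->entries[i]);
                        return -1;
                }
                cnt++;
        }
        if (count)
                *count = cnt;
        return 0;
}

int main(void)
{
        static const int live[] = { 1, 2, 1 }, snap[] = { 2 };
        struct trace_array tr = {
                .trace_buffer = { live, 3 },
                .max_buffer   = { snap, 1 },
        };
        unsigned long count;

        /* The same helper now checks both buffers, as the selftests do. */
        if (!trace_test_buffer(&tr.trace_buffer, NULL) &&
            !trace_test_buffer(&tr.max_buffer, &count))
                printf("entries in max buffer: %lu\n", count);
        return 0;
}
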
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 42ca822fc701..b20428c5efe2 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -20,13 +20,24 @@
20 20
21#define STACK_TRACE_ENTRIES 500 21#define STACK_TRACE_ENTRIES 500
22 22
23#ifdef CC_USING_FENTRY
24# define fentry 1
25#else
26# define fentry 0
27#endif
28
23static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = 29static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
24 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; 30 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
25static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; 31static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
26 32
33/*
34 * Reserve one entry for the passed in ip. This will allow
35 * us to remove most or all of the stack size overhead
36 * added by the stack tracer itself.
37 */
27static struct stack_trace max_stack_trace = { 38static struct stack_trace max_stack_trace = {
28 .max_entries = STACK_TRACE_ENTRIES, 39 .max_entries = STACK_TRACE_ENTRIES - 1,
29 .entries = stack_dump_trace, 40 .entries = &stack_dump_trace[1],
30}; 41};
31 42
32static unsigned long max_stack_size; 43static unsigned long max_stack_size;
@@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
39int stack_tracer_enabled; 50int stack_tracer_enabled;
40static int last_stack_tracer_enabled; 51static int last_stack_tracer_enabled;
41 52
42static inline void check_stack(void) 53static inline void
54check_stack(unsigned long ip, unsigned long *stack)
43{ 55{
44 unsigned long this_size, flags; 56 unsigned long this_size, flags;
45 unsigned long *p, *top, *start; 57 unsigned long *p, *top, *start;
58 static int tracer_frame;
59 int frame_size = ACCESS_ONCE(tracer_frame);
46 int i; 60 int i;
47 61
48 this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); 62 this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
49 this_size = THREAD_SIZE - this_size; 63 this_size = THREAD_SIZE - this_size;
64 /* Remove the frame of the tracer */
65 this_size -= frame_size;
50 66
51 if (this_size <= max_stack_size) 67 if (this_size <= max_stack_size)
52 return; 68 return;
53 69
54 /* we do not handle interrupt stacks yet */ 70 /* we do not handle interrupt stacks yet */
55 if (!object_is_on_stack(&this_size)) 71 if (!object_is_on_stack(stack))
56 return; 72 return;
57 73
58 local_irq_save(flags); 74 local_irq_save(flags);
59 arch_spin_lock(&max_stack_lock); 75 arch_spin_lock(&max_stack_lock);
60 76
77 /* In case another CPU set the tracer_frame on us */
78 if (unlikely(!frame_size))
79 this_size -= tracer_frame;
80
61 /* a race could have already updated it */ 81 /* a race could have already updated it */
62 if (this_size <= max_stack_size) 82 if (this_size <= max_stack_size)
63 goto out; 83 goto out;
@@ -70,10 +90,18 @@ static inline void check_stack(void)
70 save_stack_trace(&max_stack_trace); 90 save_stack_trace(&max_stack_trace);
71 91
72 /* 92 /*
93 * Add the passed in ip from the function tracer.
94 * Searching for this on the stack will skip over
95 * most of the overhead from the stack tracer itself.
96 */
97 stack_dump_trace[0] = ip;
98 max_stack_trace.nr_entries++;
99
100 /*
73 * Now find where in the stack these are. 101 * Now find where in the stack these are.
74 */ 102 */
75 i = 0; 103 i = 0;
76 start = &this_size; 104 start = stack;
77 top = (unsigned long *) 105 top = (unsigned long *)
78 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); 106 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
79 107
@@ -97,6 +125,18 @@ static inline void check_stack(void)
97 found = 1; 125 found = 1;
98 /* Start the search from here */ 126 /* Start the search from here */
99 start = p + 1; 127 start = p + 1;
128 /*
129 * We do not want to show the overhead
130 * of the stack tracer stack in the
131 * max stack. If we haven't figured
132 * out what that is, then figure it out
133 * now.
134 */
135 if (unlikely(!tracer_frame) && i == 1) {
136 tracer_frame = (p - stack) *
137 sizeof(unsigned long);
138 max_stack_size -= tracer_frame;
139 }
100 } 140 }
101 } 141 }
102 142
@@ -113,6 +153,7 @@ static void
113stack_trace_call(unsigned long ip, unsigned long parent_ip, 153stack_trace_call(unsigned long ip, unsigned long parent_ip,
114 struct ftrace_ops *op, struct pt_regs *pt_regs) 154 struct ftrace_ops *op, struct pt_regs *pt_regs)
115{ 155{
156 unsigned long stack;
116 int cpu; 157 int cpu;
117 158
118 preempt_disable_notrace(); 159 preempt_disable_notrace();
@@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
122 if (per_cpu(trace_active, cpu)++ != 0) 163 if (per_cpu(trace_active, cpu)++ != 0)
123 goto out; 164 goto out;
124 165
125 check_stack(); 166 /*
167 * When fentry is used, the traced function does not get
168 * its stack frame set up, and we lose the parent.
169 * The ip is pretty useless because the function tracer
170 * was called before that function set up its stack frame.
171 * In this case, we use the parent ip.
172 *
173 * By adding the return address of either the parent ip
174 * or the current ip we can disregard most of the stack usage
175 * caused by the stack tracer itself.
176 *
177 * The function tracer always reports the address of where the
178 * mcount call was, but the stack will hold the return address.
179 */
180 if (fentry)
181 ip = parent_ip;
182 else
183 ip += MCOUNT_INSN_SIZE;
184
185 check_stack(ip, &stack);
126 186
127 out: 187 out:
128 per_cpu(trace_active, cpu)--; 188 per_cpu(trace_active, cpu)--;
@@ -322,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {
322 .open = stack_trace_filter_open, 382 .open = stack_trace_filter_open,
323 .read = seq_read, 383 .read = seq_read,
324 .write = ftrace_filter_write, 384 .write = ftrace_filter_write,
325 .llseek = ftrace_regex_lseek, 385 .llseek = ftrace_filter_lseek,
326 .release = ftrace_regex_release, 386 .release = ftrace_regex_release,
327}; 387};
328 388
@@ -371,6 +431,8 @@ static __init int stack_trace_init(void)
371 struct dentry *d_tracer; 431 struct dentry *d_tracer;
372 432
373 d_tracer = tracing_init_dentry(); 433 d_tracer = tracing_init_dentry();
434 if (!d_tracer)
435 return 0;
374 436
375 trace_create_file("stack_max_size", 0644, d_tracer, 437 trace_create_file("stack_max_size", 0644, d_tracer,
376 &max_stack_size, &stack_max_size_fops); 438 &max_stack_size, &stack_max_size_fops);
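
The stack-tracer hunks above estimate the current depth from a stack address and, once tracer_frame has been measured, subtract the tracer's own frame so it does not inflate the reported maximum. A standalone C sketch of just that arithmetic; DEMO_THREAD_SIZE and the demo_* names are invented and this is not kernel code.

#include <stdio.h>

#define DEMO_THREAD_SIZE (16 * 1024UL)	/* stand-in for THREAD_SIZE */

/*
 * With a downward-growing stack, the offset of 'sp' inside its
 * THREAD_SIZE-aligned area is the space still free, so the amount in
 * use is THREAD_SIZE minus that offset; the tracer's own frame size is
 * then discounted.
 */
static unsigned long demo_stack_depth(unsigned long sp, unsigned long tracer_frame)
{
	unsigned long used = DEMO_THREAD_SIZE - (sp & (DEMO_THREAD_SIZE - 1));

	return used > tracer_frame ? used - tracer_frame : used;
}

int main(void)
{
	unsigned long marker;

	/* only meaningful for a THREAD_SIZE-aligned stack; this just shows the math */
	printf("approx. depth: %lu bytes\n",
	       demo_stack_depth((unsigned long)&marker, 0));
	return 0;
}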
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 96cffb269e73..847f88a6194b 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -307,6 +307,8 @@ static int tracing_stat_init(void)
307 struct dentry *d_tracing; 307 struct dentry *d_tracing;
308 308
309 d_tracing = tracing_init_dentry(); 309 d_tracing = tracing_init_dentry();
310 if (!d_tracing)
311 return 0;
310 312
311 stat_dir = debugfs_create_dir("trace_stat", d_tracing); 313 stat_dir = debugfs_create_dir("trace_stat", d_tracing);
312 if (!stat_dir) 314 if (!stat_dir)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7a809e321058..8f2ac73c7a5f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -12,10 +12,6 @@
12#include "trace.h" 12#include "trace.h"
13 13
14static DEFINE_MUTEX(syscall_trace_lock); 14static DEFINE_MUTEX(syscall_trace_lock);
15static int sys_refcount_enter;
16static int sys_refcount_exit;
17static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
18static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
19 15
20static int syscall_enter_register(struct ftrace_event_call *event, 16static int syscall_enter_register(struct ftrace_event_call *event,
21 enum trace_reg type, void *data); 17 enum trace_reg type, void *data);
@@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
41 /* 37 /*
42 * Only compare after the "sys" prefix. Archs that use 38 * Only compare after the "sys" prefix. Archs that use
43 * syscall wrappers may have syscalls symbols aliases prefixed 39 * syscall wrappers may have syscalls symbols aliases prefixed
44 * with "SyS" instead of "sys", leading to an unwanted 40 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted
45 * mismatch. 41 * mismatch.
46 */ 42 */
47 return !strcmp(sym + 3, name + 3); 43 return !strcmp(sym + 3, name + 3);
@@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
265 kfree(call->print_fmt); 261 kfree(call->print_fmt);
266} 262}
267 263
268static int syscall_enter_define_fields(struct ftrace_event_call *call) 264static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
269{ 265{
270 struct syscall_trace_enter trace; 266 struct syscall_trace_enter trace;
271 struct syscall_metadata *meta = call->data; 267 struct syscall_metadata *meta = call->data;
@@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)
288 return ret; 284 return ret;
289} 285}
290 286
291static int syscall_exit_define_fields(struct ftrace_event_call *call) 287static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
292{ 288{
293 struct syscall_trace_exit trace; 289 struct syscall_trace_exit trace;
294 int ret; 290 int ret;
@@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
303 return ret; 299 return ret;
304} 300}
305 301
306static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 302static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
307{ 303{
304 struct trace_array *tr = data;
308 struct syscall_trace_enter *entry; 305 struct syscall_trace_enter *entry;
309 struct syscall_metadata *sys_data; 306 struct syscall_metadata *sys_data;
310 struct ring_buffer_event *event; 307 struct ring_buffer_event *event;
@@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
315 syscall_nr = trace_get_syscall_nr(current, regs); 312 syscall_nr = trace_get_syscall_nr(current, regs);
316 if (syscall_nr < 0) 313 if (syscall_nr < 0)
317 return; 314 return;
318 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 315 if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))
319 return; 316 return;
320 317
321 sys_data = syscall_nr_to_meta(syscall_nr); 318 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
324 321
325 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 322 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
326 323
327 event = trace_current_buffer_lock_reserve(&buffer, 324 buffer = tr->trace_buffer.buffer;
325 event = trace_buffer_lock_reserve(buffer,
328 sys_data->enter_event->event.type, size, 0, 0); 326 sys_data->enter_event->event.type, size, 0, 0);
329 if (!event) 327 if (!event)
330 return; 328 return;
@@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
338 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 336 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
339} 337}
340 338
341static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 339static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
342{ 340{
341 struct trace_array *tr = data;
343 struct syscall_trace_exit *entry; 342 struct syscall_trace_exit *entry;
344 struct syscall_metadata *sys_data; 343 struct syscall_metadata *sys_data;
345 struct ring_buffer_event *event; 344 struct ring_buffer_event *event;
@@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
349 syscall_nr = trace_get_syscall_nr(current, regs); 348 syscall_nr = trace_get_syscall_nr(current, regs);
350 if (syscall_nr < 0) 349 if (syscall_nr < 0)
351 return; 350 return;
352 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 351 if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))
353 return; 352 return;
354 353
355 sys_data = syscall_nr_to_meta(syscall_nr); 354 sys_data = syscall_nr_to_meta(syscall_nr);
356 if (!sys_data) 355 if (!sys_data)
357 return; 356 return;
358 357
359 event = trace_current_buffer_lock_reserve(&buffer, 358 buffer = tr->trace_buffer.buffer;
359 event = trace_buffer_lock_reserve(buffer,
360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
361 if (!event) 361 if (!event)
362 return; 362 return;
@@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
370 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 370 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
371} 371}
372 372
373static int reg_event_syscall_enter(struct ftrace_event_call *call) 373static int reg_event_syscall_enter(struct ftrace_event_file *file,
374 struct ftrace_event_call *call)
374{ 375{
376 struct trace_array *tr = file->tr;
375 int ret = 0; 377 int ret = 0;
376 int num; 378 int num;
377 379
@@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call)
379 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 381 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
380 return -ENOSYS; 382 return -ENOSYS;
381 mutex_lock(&syscall_trace_lock); 383 mutex_lock(&syscall_trace_lock);
382 if (!sys_refcount_enter) 384 if (!tr->sys_refcount_enter)
383 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); 385 ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
384 if (!ret) { 386 if (!ret) {
385 set_bit(num, enabled_enter_syscalls); 387 set_bit(num, tr->enabled_enter_syscalls);
386 sys_refcount_enter++; 388 tr->sys_refcount_enter++;
387 } 389 }
388 mutex_unlock(&syscall_trace_lock); 390 mutex_unlock(&syscall_trace_lock);
389 return ret; 391 return ret;
390} 392}
391 393
392static void unreg_event_syscall_enter(struct ftrace_event_call *call) 394static void unreg_event_syscall_enter(struct ftrace_event_file *file,
395 struct ftrace_event_call *call)
393{ 396{
397 struct trace_array *tr = file->tr;
394 int num; 398 int num;
395 399
396 num = ((struct syscall_metadata *)call->data)->syscall_nr; 400 num = ((struct syscall_metadata *)call->data)->syscall_nr;
397 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 401 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
398 return; 402 return;
399 mutex_lock(&syscall_trace_lock); 403 mutex_lock(&syscall_trace_lock);
400 sys_refcount_enter--; 404 tr->sys_refcount_enter--;
401 clear_bit(num, enabled_enter_syscalls); 405 clear_bit(num, tr->enabled_enter_syscalls);
402 if (!sys_refcount_enter) 406 if (!tr->sys_refcount_enter)
403 unregister_trace_sys_enter(ftrace_syscall_enter, NULL); 407 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
404 mutex_unlock(&syscall_trace_lock); 408 mutex_unlock(&syscall_trace_lock);
405} 409}
406 410
407static int reg_event_syscall_exit(struct ftrace_event_call *call) 411static int reg_event_syscall_exit(struct ftrace_event_file *file,
412 struct ftrace_event_call *call)
408{ 413{
414 struct trace_array *tr = file->tr;
409 int ret = 0; 415 int ret = 0;
410 int num; 416 int num;
411 417
@@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call)
413 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 419 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
414 return -ENOSYS; 420 return -ENOSYS;
415 mutex_lock(&syscall_trace_lock); 421 mutex_lock(&syscall_trace_lock);
416 if (!sys_refcount_exit) 422 if (!tr->sys_refcount_exit)
417 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); 423 ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
418 if (!ret) { 424 if (!ret) {
419 set_bit(num, enabled_exit_syscalls); 425 set_bit(num, tr->enabled_exit_syscalls);
420 sys_refcount_exit++; 426 tr->sys_refcount_exit++;
421 } 427 }
422 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
423 return ret; 429 return ret;
424} 430}
425 431
426static void unreg_event_syscall_exit(struct ftrace_event_call *call) 432static void unreg_event_syscall_exit(struct ftrace_event_file *file,
433 struct ftrace_event_call *call)
427{ 434{
435 struct trace_array *tr = file->tr;
428 int num; 436 int num;
429 437
430 num = ((struct syscall_metadata *)call->data)->syscall_nr; 438 num = ((struct syscall_metadata *)call->data)->syscall_nr;
431 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 439 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
432 return; 440 return;
433 mutex_lock(&syscall_trace_lock); 441 mutex_lock(&syscall_trace_lock);
434 sys_refcount_exit--; 442 tr->sys_refcount_exit--;
435 clear_bit(num, enabled_exit_syscalls); 443 clear_bit(num, tr->enabled_exit_syscalls);
436 if (!sys_refcount_exit) 444 if (!tr->sys_refcount_exit)
437 unregister_trace_sys_exit(ftrace_syscall_exit, NULL); 445 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
438 mutex_unlock(&syscall_trace_lock); 446 mutex_unlock(&syscall_trace_lock);
439} 447}
440 448
@@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
471 .trace = print_syscall_exit, 479 .trace = print_syscall_exit,
472}; 480};
473 481
474struct ftrace_event_class event_class_syscall_enter = { 482struct ftrace_event_class __refdata event_class_syscall_enter = {
475 .system = "syscalls", 483 .system = "syscalls",
476 .reg = syscall_enter_register, 484 .reg = syscall_enter_register,
477 .define_fields = syscall_enter_define_fields, 485 .define_fields = syscall_enter_define_fields,
@@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = {
479 .raw_init = init_syscall_trace, 487 .raw_init = init_syscall_trace,
480}; 488};
481 489
482struct ftrace_event_class event_class_syscall_exit = { 490struct ftrace_event_class __refdata event_class_syscall_exit = {
483 .system = "syscalls", 491 .system = "syscalls",
484 .reg = syscall_exit_register, 492 .reg = syscall_exit_register,
485 .define_fields = syscall_exit_define_fields, 493 .define_fields = syscall_exit_define_fields,
@@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
685static int syscall_enter_register(struct ftrace_event_call *event, 693static int syscall_enter_register(struct ftrace_event_call *event,
686 enum trace_reg type, void *data) 694 enum trace_reg type, void *data)
687{ 695{
696 struct ftrace_event_file *file = data;
697
688 switch (type) { 698 switch (type) {
689 case TRACE_REG_REGISTER: 699 case TRACE_REG_REGISTER:
690 return reg_event_syscall_enter(event); 700 return reg_event_syscall_enter(file, event);
691 case TRACE_REG_UNREGISTER: 701 case TRACE_REG_UNREGISTER:
692 unreg_event_syscall_enter(event); 702 unreg_event_syscall_enter(file, event);
693 return 0; 703 return 0;
694 704
695#ifdef CONFIG_PERF_EVENTS 705#ifdef CONFIG_PERF_EVENTS
@@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,
711static int syscall_exit_register(struct ftrace_event_call *event, 721static int syscall_exit_register(struct ftrace_event_call *event,
712 enum trace_reg type, void *data) 722 enum trace_reg type, void *data)
713{ 723{
724 struct ftrace_event_file *file = data;
725
714 switch (type) { 726 switch (type) {
715 case TRACE_REG_REGISTER: 727 case TRACE_REG_REGISTER:
716 return reg_event_syscall_exit(event); 728 return reg_event_syscall_exit(file, event);
717 case TRACE_REG_UNREGISTER: 729 case TRACE_REG_UNREGISTER:
718 unreg_event_syscall_exit(event); 730 unreg_event_syscall_exit(file, event);
719 return 0; 731 return 0;
720 732
721#ifdef CONFIG_PERF_EVENTS 733#ifdef CONFIG_PERF_EVENTS
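
The trace_syscalls.c changes above move the enable bitmaps and refcounts from globals into the trace_array, so each instance registers the shared tracepoint callback on its first enabled syscall and unregisters it on the last. A standalone C sketch of that bookkeeping; the demo_* names are invented and a plain bool array stands in for the kernel bitmap.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_NR_SYSCALLS 64

struct demo_instance {
	bool enabled[DEMO_NR_SYSCALLS];	/* stands in for the per-instance bitmap */
	int  refcount;			/* how many syscalls are currently enabled */
};

static void demo_enable(struct demo_instance *inst, int nr)
{
	if (!inst->refcount)
		printf("first enable: register the shared callback for this instance\n");
	inst->enabled[nr] = true;
	inst->refcount++;
}

static void demo_disable(struct demo_instance *inst, int nr)
{
	inst->enabled[nr] = false;
	if (!--inst->refcount)
		printf("last disable: unregister the shared callback\n");
}

int main(void)
{
	struct demo_instance inst = { .refcount = 0 };

	demo_enable(&inst, 1);
	demo_enable(&inst, 2);
	demo_disable(&inst, 1);
	demo_disable(&inst, 2);
	return 0;
}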
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8dad2a92dee9..32494fb0ee64 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,6 +28,18 @@
28 28
29#define UPROBE_EVENT_SYSTEM "uprobes" 29#define UPROBE_EVENT_SYSTEM "uprobes"
30 30
31struct uprobe_trace_entry_head {
32 struct trace_entry ent;
33 unsigned long vaddr[];
34};
35
36#define SIZEOF_TRACE_ENTRY(is_return) \
37 (sizeof(struct uprobe_trace_entry_head) + \
38 sizeof(unsigned long) * (is_return ? 2 : 1))
39
40#define DATAOF_TRACE_ENTRY(entry, is_return) \
41 ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
42
31struct trace_uprobe_filter { 43struct trace_uprobe_filter {
32 rwlock_t rwlock; 44 rwlock_t rwlock;
33 int nr_systemwide; 45 int nr_systemwide;
@@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock);
64static LIST_HEAD(uprobe_list); 76static LIST_HEAD(uprobe_list);
65 77
66static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 78static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
79static int uretprobe_dispatcher(struct uprobe_consumer *con,
80 unsigned long func, struct pt_regs *regs);
67 81
68static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) 82static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
69{ 83{
@@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
77 return !filter->nr_systemwide && list_empty(&filter->perf_events); 91 return !filter->nr_systemwide && list_empty(&filter->perf_events);
78} 92}
79 93
94static inline bool is_ret_probe(struct trace_uprobe *tu)
95{
96 return tu->consumer.ret_handler != NULL;
97}
98
80/* 99/*
81 * Allocate new trace_uprobe and initialize it (including uprobes). 100 * Allocate new trace_uprobe and initialize it (including uprobes).
82 */ 101 */
83static struct trace_uprobe * 102static struct trace_uprobe *
84alloc_trace_uprobe(const char *group, const char *event, int nargs) 103alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
85{ 104{
86 struct trace_uprobe *tu; 105 struct trace_uprobe *tu;
87 106
@@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
106 125
107 INIT_LIST_HEAD(&tu->list); 126 INIT_LIST_HEAD(&tu->list);
108 tu->consumer.handler = uprobe_dispatcher; 127 tu->consumer.handler = uprobe_dispatcher;
128 if (is_ret)
129 tu->consumer.ret_handler = uretprobe_dispatcher;
109 init_trace_uprobe_filter(&tu->filter); 130 init_trace_uprobe_filter(&tu->filter);
110 return tu; 131 return tu;
111 132
@@ -180,7 +201,7 @@ end:
180 201
181/* 202/*
182 * Argument syntax: 203 * Argument syntax:
183 * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] 204 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS]
184 * 205 *
185 * - Remove uprobe: -:[GRP/]EVENT 206 * - Remove uprobe: -:[GRP/]EVENT
186 */ 207 */
@@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv)
192 char buf[MAX_EVENT_NAME_LEN]; 213 char buf[MAX_EVENT_NAME_LEN];
193 struct path path; 214 struct path path;
194 unsigned long offset; 215 unsigned long offset;
195 bool is_delete; 216 bool is_delete, is_return;
196 int i, ret; 217 int i, ret;
197 218
198 inode = NULL; 219 inode = NULL;
199 ret = 0; 220 ret = 0;
200 is_delete = false; 221 is_delete = false;
222 is_return = false;
201 event = NULL; 223 event = NULL;
202 group = NULL; 224 group = NULL;
203 225
204 /* argc must be >= 1 */ 226 /* argc must be >= 1 */
205 if (argv[0][0] == '-') 227 if (argv[0][0] == '-')
206 is_delete = true; 228 is_delete = true;
229 else if (argv[0][0] == 'r')
230 is_return = true;
207 else if (argv[0][0] != 'p') { 231 else if (argv[0][0] != 'p') {
208 pr_info("Probe definition must be started with 'p' or '-'.\n"); 232 pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
209 return -EINVAL; 233 return -EINVAL;
210 } 234 }
211 235
@@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv)
303 kfree(tail); 327 kfree(tail);
304 } 328 }
305 329
306 tu = alloc_trace_uprobe(group, event, argc); 330 tu = alloc_trace_uprobe(group, event, argc, is_return);
307 if (IS_ERR(tu)) { 331 if (IS_ERR(tu)) {
308 pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); 332 pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
309 ret = PTR_ERR(tu); 333 ret = PTR_ERR(tu);
@@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
414static int probes_seq_show(struct seq_file *m, void *v) 438static int probes_seq_show(struct seq_file *m, void *v)
415{ 439{
416 struct trace_uprobe *tu = v; 440 struct trace_uprobe *tu = v;
441 char c = is_ret_probe(tu) ? 'r' : 'p';
417 int i; 442 int i;
418 443
419 seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); 444 seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name);
420 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); 445 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
421 446
422 for (i = 0; i < tu->nr_args; i++) 447 for (i = 0; i < tu->nr_args; i++)
@@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = {
485 .release = seq_release, 510 .release = seq_release,
486}; 511};
487 512
488/* uprobe handler */ 513static void uprobe_trace_print(struct trace_uprobe *tu,
489static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 514 unsigned long func, struct pt_regs *regs)
490{ 515{
491 struct uprobe_trace_entry_head *entry; 516 struct uprobe_trace_entry_head *entry;
492 struct ring_buffer_event *event; 517 struct ring_buffer_event *event;
493 struct ring_buffer *buffer; 518 struct ring_buffer *buffer;
494 u8 *data; 519 void *data;
495 int size, i, pc; 520 int size, i;
496 unsigned long irq_flags;
497 struct ftrace_event_call *call = &tu->call; 521 struct ftrace_event_call *call = &tu->call;
498 522
499 local_save_flags(irq_flags); 523 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
500 pc = preempt_count();
501
502 size = sizeof(*entry) + tu->size;
503
504 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 524 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
505 size, irq_flags, pc); 525 size + tu->size, 0, 0);
506 if (!event) 526 if (!event)
507 return 0; 527 return;
508 528
509 entry = ring_buffer_event_data(event); 529 entry = ring_buffer_event_data(event);
510 entry->ip = instruction_pointer(task_pt_regs(current)); 530 if (is_ret_probe(tu)) {
511 data = (u8 *)&entry[1]; 531 entry->vaddr[0] = func;
532 entry->vaddr[1] = instruction_pointer(regs);
533 data = DATAOF_TRACE_ENTRY(entry, true);
534 } else {
535 entry->vaddr[0] = instruction_pointer(regs);
536 data = DATAOF_TRACE_ENTRY(entry, false);
537 }
538
512 for (i = 0; i < tu->nr_args; i++) 539 for (i = 0; i < tu->nr_args; i++)
513 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 540 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
514 541
515 if (!filter_current_check_discard(buffer, call, entry, event)) 542 if (!filter_current_check_discard(buffer, call, entry, event))
516 trace_buffer_unlock_commit(buffer, event, irq_flags, pc); 543 trace_buffer_unlock_commit(buffer, event, 0, 0);
544}
517 545
546/* uprobe handler */
547static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
548{
549 if (!is_ret_probe(tu))
550 uprobe_trace_print(tu, 0, regs);
518 return 0; 551 return 0;
519} 552}
520 553
554static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
555 struct pt_regs *regs)
556{
557 uprobe_trace_print(tu, func, regs);
558}
559
521/* Event entry printers */ 560/* Event entry printers */
522static enum print_line_t 561static enum print_line_t
523print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) 562print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
524{ 563{
525 struct uprobe_trace_entry_head *field; 564 struct uprobe_trace_entry_head *entry;
526 struct trace_seq *s = &iter->seq; 565 struct trace_seq *s = &iter->seq;
527 struct trace_uprobe *tu; 566 struct trace_uprobe *tu;
528 u8 *data; 567 u8 *data;
529 int i; 568 int i;
530 569
531 field = (struct uprobe_trace_entry_head *)iter->ent; 570 entry = (struct uprobe_trace_entry_head *)iter->ent;
532 tu = container_of(event, struct trace_uprobe, call.event); 571 tu = container_of(event, struct trace_uprobe, call.event);
533 572
534 if (!trace_seq_printf(s, "%s: (", tu->call.name)) 573 if (is_ret_probe(tu)) {
535 goto partial; 574 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name,
536 575 entry->vaddr[1], entry->vaddr[0]))
537 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) 576 goto partial;
538 goto partial; 577 data = DATAOF_TRACE_ENTRY(entry, true);
539 578 } else {
540 if (!trace_seq_puts(s, ")")) 579 if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name,
541 goto partial; 580 entry->vaddr[0]))
581 goto partial;
582 data = DATAOF_TRACE_ENTRY(entry, false);
583 }
542 584
543 data = (u8 *)&field[1];
544 for (i = 0; i < tu->nr_args; i++) { 585 for (i = 0; i < tu->nr_args; i++) {
545 if (!tu->args[i].type->print(s, tu->args[i].name, 586 if (!tu->args[i].type->print(s, tu->args[i].name,
546 data + tu->args[i].offset, field)) 587 data + tu->args[i].offset, entry))
547 goto partial; 588 goto partial;
548 } 589 }
549 590
@@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag)
595 636
596static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 637static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
597{ 638{
598 int ret, i; 639 int ret, i, size;
599 struct uprobe_trace_entry_head field; 640 struct uprobe_trace_entry_head field;
600 struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; 641 struct trace_uprobe *tu = event_call->data;
601 642
602 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 643 if (is_ret_probe(tu)) {
644 DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
645 DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
646 size = SIZEOF_TRACE_ENTRY(true);
647 } else {
648 DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
649 size = SIZEOF_TRACE_ENTRY(false);
650 }
603 /* Set argument names as fields */ 651 /* Set argument names as fields */
604 for (i = 0; i < tu->nr_args; i++) { 652 for (i = 0; i < tu->nr_args; i++) {
605 ret = trace_define_field(event_call, tu->args[i].type->fmttype, 653 ret = trace_define_field(event_call, tu->args[i].type->fmttype,
606 tu->args[i].name, 654 tu->args[i].name,
607 sizeof(field) + tu->args[i].offset, 655 size + tu->args[i].offset,
608 tu->args[i].type->size, 656 tu->args[i].type->size,
609 tu->args[i].type->is_signed, 657 tu->args[i].type->is_signed,
610 FILTER_OTHER); 658 FILTER_OTHER);
@@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
622 int i; 670 int i;
623 int pos = 0; 671 int pos = 0;
624 672
625 fmt = "(%lx)"; 673 if (is_ret_probe(tu)) {
626 arg = "REC->" FIELD_STRING_IP; 674 fmt = "(%lx <- %lx)";
675 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
676 } else {
677 fmt = "(%lx)";
678 arg = "REC->" FIELD_STRING_IP;
679 }
627 680
628 /* When len=0, we just calculate the needed length */ 681 /* When len=0, we just calculate the needed length */
629 682
@@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
752 return ret; 805 return ret;
753} 806}
754 807
755/* uprobe profile handler */ 808static void uprobe_perf_print(struct trace_uprobe *tu,
756static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 809 unsigned long func, struct pt_regs *regs)
757{ 810{
758 struct ftrace_event_call *call = &tu->call; 811 struct ftrace_event_call *call = &tu->call;
759 struct uprobe_trace_entry_head *entry; 812 struct uprobe_trace_entry_head *entry;
760 struct hlist_head *head; 813 struct hlist_head *head;
761 u8 *data; 814 void *data;
762 int size, __size, i; 815 int size, rctx, i;
763 int rctx;
764 816
765 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) 817 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
766 return UPROBE_HANDLER_REMOVE; 818 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
767
768 __size = sizeof(*entry) + tu->size;
769 size = ALIGN(__size + sizeof(u32), sizeof(u64));
770 size -= sizeof(u32);
771 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 819 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
772 return 0; 820 return;
773 821
774 preempt_disable(); 822 preempt_disable();
823 head = this_cpu_ptr(call->perf_events);
824 if (hlist_empty(head))
825 goto out;
775 826
776 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 827 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
777 if (!entry) 828 if (!entry)
778 goto out; 829 goto out;
779 830
780 entry->ip = instruction_pointer(task_pt_regs(current)); 831 if (is_ret_probe(tu)) {
781 data = (u8 *)&entry[1]; 832 entry->vaddr[0] = func;
833 entry->vaddr[1] = instruction_pointer(regs);
834 data = DATAOF_TRACE_ENTRY(entry, true);
835 } else {
836 entry->vaddr[0] = instruction_pointer(regs);
837 data = DATAOF_TRACE_ENTRY(entry, false);
838 }
839
782 for (i = 0; i < tu->nr_args; i++) 840 for (i = 0; i < tu->nr_args; i++)
783 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 841 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
784 842
785 head = this_cpu_ptr(call->perf_events); 843 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
786 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
787
788 out: 844 out:
789 preempt_enable(); 845 preempt_enable();
846}
847
848/* uprobe profile handler */
849static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
850{
851 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
852 return UPROBE_HANDLER_REMOVE;
853
854 if (!is_ret_probe(tu))
855 uprobe_perf_print(tu, 0, regs);
790 return 0; 856 return 0;
791} 857}
858
859static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
860 struct pt_regs *regs)
861{
862 uprobe_perf_print(tu, func, regs);
863}
792#endif /* CONFIG_PERF_EVENTS */ 864#endif /* CONFIG_PERF_EVENTS */
793 865
794static 866static
795int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) 867int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
796{ 868{
797 struct trace_uprobe *tu = (struct trace_uprobe *)event->data; 869 struct trace_uprobe *tu = event->data;
798 870
799 switch (type) { 871 switch (type) {
800 case TRACE_REG_REGISTER: 872 case TRACE_REG_REGISTER:
@@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
843 return ret; 915 return ret;
844} 916}
845 917
918static int uretprobe_dispatcher(struct uprobe_consumer *con,
919 unsigned long func, struct pt_regs *regs)
920{
921 struct trace_uprobe *tu;
922
923 tu = container_of(con, struct trace_uprobe, consumer);
924
925 if (tu->flags & TP_FLAG_TRACE)
926 uretprobe_trace_func(tu, func, regs);
927
928#ifdef CONFIG_PERF_EVENTS
929 if (tu->flags & TP_FLAG_PROFILE)
930 uretprobe_perf_func(tu, func, regs);
931#endif
932 return 0;
933}
934
846static struct trace_event_functions uprobe_funcs = { 935static struct trace_event_functions uprobe_funcs = {
847 .trace = print_uprobe_event 936 .trace = print_uprobe_event
848}; 937};
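
The trace_uprobe.c changes above introduce SIZEOF_TRACE_ENTRY()/DATAOF_TRACE_ENTRY() so one entry layout serves both 'p' and 'r' probes: a header, then one recorded address (or two for a return probe), then the fetched argument bytes. A standalone C sketch of that layout; the demo_* names are invented, and where the kernel macro uses void-pointer arithmetic (a GCC extension) this sketch casts to char *.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_entry_head {
	int           type;	/* stand-in for struct trace_entry */
	unsigned long vaddr[];	/* one address for 'p' probes, two for 'r' */
};

/* header plus one or two recorded addresses */
#define DEMO_SIZEOF_ENTRY(is_return) \
	(sizeof(struct demo_entry_head) + sizeof(unsigned long) * ((is_return) ? 2 : 1))

/* fetched argument bytes live immediately after the addresses */
#define DEMO_DATAOF_ENTRY(entry, is_return) \
	((char *)(entry) + DEMO_SIZEOF_ENTRY(is_return))

int main(void)
{
	size_t args_size = 16;	/* pretend payload produced by the fetch functions */
	struct demo_entry_head *e = calloc(1, DEMO_SIZEOF_ENTRY(1) + args_size);

	if (!e)
		return 1;
	e->vaddr[0] = 0x1000;	/* address of the probed function */
	e->vaddr[1] = 0x2000;	/* return address */
	memset(DEMO_DATAOF_ENTRY(e, 1), 0xab, args_size);
	printf("return-probe entry: %zu header bytes + %zu arg bytes\n",
	       DEMO_SIZEOF_ENTRY(1), args_size);
	free(e);
	return 0;
}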
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 0c05a4592047..29f26540e9c9 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,
112 int nr_probes = 0; 112 int nr_probes = 0;
113 struct tracepoint_func *old, *new; 113 struct tracepoint_func *old, *new;
114 114
115 WARN_ON(!probe); 115 if (WARN_ON(!probe))
116 return ERR_PTR(-EINVAL);
116 117
117 debug_print_probes(entry); 118 debug_print_probes(entry);
118 old = entry->funcs; 119 old = entry->funcs;
@@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
152 153
153 debug_print_probes(entry); 154 debug_print_probes(entry);
154 /* (N -> M), (N > 1, M >= 0) probes */ 155 /* (N -> M), (N > 1, M >= 0) probes */
155 for (nr_probes = 0; old[nr_probes].func; nr_probes++) { 156 if (probe) {
156 if (!probe || 157 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
157 (old[nr_probes].func == probe && 158 if (old[nr_probes].func == probe &&
158 old[nr_probes].data == data)) 159 old[nr_probes].data == data)
159 nr_del++; 160 nr_del++;
161 }
160 } 162 }
161 163
164 /*
165 * If probe is NULL, then nr_probes = nr_del = 0, and then the
166 * entire entry will be removed.
167 */
162 if (nr_probes - nr_del == 0) { 168 if (nr_probes - nr_del == 0) {
163 /* N -> 0, (N > 1) */ 169 /* N -> 0, (N > 1) */
164 entry->funcs = NULL; 170 entry->funcs = NULL;
@@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
173 if (new == NULL) 179 if (new == NULL)
174 return ERR_PTR(-ENOMEM); 180 return ERR_PTR(-ENOMEM);
175 for (i = 0; old[i].func; i++) 181 for (i = 0; old[i].func; i++)
176 if (probe && 182 if (old[i].func != probe || old[i].data != data)
177 (old[i].func != probe || old[i].data != data))
178 new[j++] = old[i]; 183 new[j++] = old[i];
179 new[nr_probes - nr_del].func = NULL; 184 new[nr_probes - nr_del].func = NULL;
180 entry->refcount = nr_probes - nr_del; 185 entry->refcount = nr_probes - nr_del;
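
The tracepoint.c hunks above rewrite the removal loop so a NULL probe means "remove every probe": with probe == NULL both nr_probes and nr_del stay zero, and the nr_probes - nr_del == 0 branch frees the whole entry. A standalone C sketch of that counting logic with invented demo_* names.

#include <stddef.h>
#include <stdio.h>

struct demo_func { void (*func)(void); void *data; };

/* returns how many entries would remain after removing (probe, data) */
static int demo_remaining(const struct demo_func *old, void (*probe)(void), void *data)
{
	int nr_probes = 0, nr_del = 0;

	if (probe) {
		for (nr_probes = 0; old[nr_probes].func; nr_probes++)
			if (old[nr_probes].func == probe && old[nr_probes].data == data)
				nr_del++;
	}
	/* probe == NULL: nr_probes == nr_del == 0, so the whole entry goes away */
	return nr_probes - nr_del;
}

static void demo_a(void) { }
static void demo_b(void) { }

int main(void)
{
	struct demo_func funcs[] = { { demo_a, NULL }, { demo_b, NULL }, { NULL, NULL } };

	printf("remove demo_a -> %d left\n", demo_remaining(funcs, demo_a, NULL));
	printf("remove NULL   -> %d left\n", demo_remaining(funcs, NULL, NULL));
	return 0;
}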
diff --git a/kernel/uid16.c b/kernel/uid16.c
index d7948eb10225..f6c83d7ef000 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -18,67 +18,43 @@
18 18
19SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) 19SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
20{ 20{
21 long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); 21 return sys_chown(filename, low2highuid(user), low2highgid(group));
22 /* avoid REGPARM breakage on x86: */
23 asmlinkage_protect(3, ret, filename, user, group);
24 return ret;
25} 22}
26 23
27SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) 24SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
28{ 25{
29 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); 26 return sys_lchown(filename, low2highuid(user), low2highgid(group));
30 /* avoid REGPARM breakage on x86: */
31 asmlinkage_protect(3, ret, filename, user, group);
32 return ret;
33} 27}
34 28
35SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) 29SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
36{ 30{
37 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); 31 return sys_fchown(fd, low2highuid(user), low2highgid(group));
38 /* avoid REGPARM breakage on x86: */
39 asmlinkage_protect(3, ret, fd, user, group);
40 return ret;
41} 32}
42 33
43SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) 34SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
44{ 35{
45 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); 36 return sys_setregid(low2highgid(rgid), low2highgid(egid));
46 /* avoid REGPARM breakage on x86: */
47 asmlinkage_protect(2, ret, rgid, egid);
48 return ret;
49} 37}
50 38
51SYSCALL_DEFINE1(setgid16, old_gid_t, gid) 39SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
52{ 40{
53 long ret = sys_setgid(low2highgid(gid)); 41 return sys_setgid(low2highgid(gid));
54 /* avoid REGPARM breakage on x86: */
55 asmlinkage_protect(1, ret, gid);
56 return ret;
57} 42}
58 43
59SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) 44SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
60{ 45{
61 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); 46 return sys_setreuid(low2highuid(ruid), low2highuid(euid));
62 /* avoid REGPARM breakage on x86: */
63 asmlinkage_protect(2, ret, ruid, euid);
64 return ret;
65} 47}
66 48
67SYSCALL_DEFINE1(setuid16, old_uid_t, uid) 49SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
68{ 50{
69 long ret = sys_setuid(low2highuid(uid)); 51 return sys_setuid(low2highuid(uid));
70 /* avoid REGPARM breakage on x86: */
71 asmlinkage_protect(1, ret, uid);
72 return ret;
73} 52}
74 53
75SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) 54SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
76{ 55{
77 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), 56 return sys_setresuid(low2highuid(ruid), low2highuid(euid),
78 low2highuid(suid)); 57 low2highuid(suid));
79 /* avoid REGPARM breakage on x86: */
80 asmlinkage_protect(3, ret, ruid, euid, suid);
81 return ret;
82} 58}
83 59
84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) 60SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
@@ -100,11 +76,8 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid
100 76
101SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) 77SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
102{ 78{
103 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), 79 return sys_setresgid(low2highgid(rgid), low2highgid(egid),
104 low2highgid(sgid)); 80 low2highgid(sgid));
105 /* avoid REGPARM breakage on x86: */
106 asmlinkage_protect(3, ret, rgid, egid, sgid);
107 return ret;
108} 81}
109 82
110 83
@@ -127,18 +100,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid
127 100
128SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) 101SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
129{ 102{
130 long ret = sys_setfsuid(low2highuid(uid)); 103 return sys_setfsuid(low2highuid(uid));
131 /* avoid REGPARM breakage on x86: */
132 asmlinkage_protect(1, ret, uid);
133 return ret;
134} 104}
135 105
136SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) 106SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
137{ 107{
138 long ret = sys_setfsgid(low2highgid(gid)); 108 return sys_setfsgid(low2highgid(gid));
139 /* avoid REGPARM breakage on x86: */
140 asmlinkage_protect(1, ret, gid);
141 return ret;
142} 109}
143 110
144static int groups16_to_user(old_gid_t __user *grouplist, 111static int groups16_to_user(old_gid_t __user *grouplist,
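
The uid16.c hunks above reduce each 16-bit syscall to a thin wrapper that widens the legacy ids and tail-calls the full-width syscall. A standalone C sketch of that wrapper pattern, mirroring low2highuid()'s preservation of the -1 "leave unchanged" sentinel; the demo_* names are invented.

#include <stdio.h>

typedef unsigned short demo_old_uid_t;	/* legacy 16-bit uid */
typedef unsigned int   demo_uid_t;	/* full-width uid */

static demo_uid_t demo_low2high(demo_old_uid_t uid)
{
	/* 0xffff meant "no change" and must stay -1 when widened */
	return uid == (demo_old_uid_t)-1 ? (demo_uid_t)-1 : (demo_uid_t)uid;
}

static int demo_setuid(demo_uid_t uid)
{
	printf("setuid(%u)\n", uid);
	return 0;
}

/* the 16-bit entry point is just a widening wrapper now */
static int demo_setuid16(demo_old_uid_t uid)
{
	return demo_setuid(demo_low2high(uid));
}

int main(void)
{
	demo_setuid16(1000);
	demo_setuid16((demo_old_uid_t)-1);
	return 0;
}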
diff --git a/kernel/user.c b/kernel/user.c
index e81978e8c03b..69b4c3d48cde 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/export.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include <linux/proc_fs.h> 19#include <linux/proc_ns.h>
20 20
21/* 21/*
22 * userns count is 1 for root user, 1 for init_uts_ns, 22 * userns count is 1 for root user, 1 for init_uts_ns,
@@ -51,6 +51,8 @@ struct user_namespace init_user_ns = {
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
54 .may_mount_sysfs = true,
55 .may_mount_proc = true,
54}; 56};
55EXPORT_SYMBOL_GPL(init_user_ns); 57EXPORT_SYMBOL_GPL(init_user_ns);
56 58
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b14f4d342043..d8c30db06c5b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,7 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/proc_fs.h> 12#include <linux/proc_ns.h>
13#include <linux/highuid.h> 13#include <linux/highuid.h>
14#include <linux/cred.h> 14#include <linux/cred.h>
15#include <linux/securebits.h> 15#include <linux/securebits.h>
@@ -25,7 +25,8 @@
25 25
26static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
27 27
28static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 28static bool new_idmap_permitted(const struct file *file,
29 struct user_namespace *ns, int cap_setid,
29 struct uid_gid_map *map); 30 struct uid_gid_map *map);
30 31
31static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) 32static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
@@ -61,6 +62,15 @@ int create_user_ns(struct cred *new)
61 kgid_t group = new->egid; 62 kgid_t group = new->egid;
62 int ret; 63 int ret;
63 64
65 /*
66 * Verify that we can not violate the policy of which files
67 * may be accessed that is specified by the root directory,
68 * by verifing that the root directory is at the root of the
69 * mount namespace which allows all files to be accessed.
70 */
71 if (current_chrooted())
72 return -EPERM;
73
64 /* The creator needs a mapping in the parent user namespace 74 /* The creator needs a mapping in the parent user namespace
65 * or else we won't be able to reasonably tell userspace who 75 * or else we won't be able to reasonably tell userspace who
66 * created a user_namespace. 76 * created a user_namespace.
@@ -87,6 +97,8 @@ int create_user_ns(struct cred *new)
87 97
88 set_cred_user_ns(new, ns); 98 set_cred_user_ns(new, ns);
89 99
100 update_mnt_policy(ns);
101
90 return 0; 102 return 0;
91} 103}
92 104
@@ -601,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf,
601 if (map->nr_extents != 0) 613 if (map->nr_extents != 0)
602 goto out; 614 goto out;
603 615
604 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID 616 /*
605 * over the user namespace in order to set the id mapping. 617 * Adjusting namespace settings requires capabilities on the target.
606 */ 618 */
607 if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) 619 if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
608 goto out; 620 goto out;
609 621
610 /* Get a buffer */ 622 /* Get a buffer */
@@ -689,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
689 701
690 ret = -EPERM; 702 ret = -EPERM;
691 /* Validate the user is allowed to use user id's mapped to. */ 703 /* Validate the user is allowed to use user id's mapped to. */
692 if (!new_idmap_permitted(ns, cap_setid, &new_map)) 704 if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
693 goto out; 705 goto out;
694 706
695 /* Map the lower ids from the parent user namespace to the 707 /* Map the lower ids from the parent user namespace to the
@@ -776,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
776 &ns->projid_map, &ns->parent->projid_map); 788 &ns->projid_map, &ns->parent->projid_map);
777} 789}
778 790
779static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 791static bool new_idmap_permitted(const struct file *file,
792 struct user_namespace *ns, int cap_setid,
780 struct uid_gid_map *new_map) 793 struct uid_gid_map *new_map)
781{ 794{
782 /* Allow mapping to your own filesystem ids */ 795 /* Allow mapping to your own filesystem ids */
@@ -784,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
784 u32 id = new_map->extent[0].lower_first; 797 u32 id = new_map->extent[0].lower_first;
785 if (cap_setid == CAP_SETUID) { 798 if (cap_setid == CAP_SETUID) {
786 kuid_t uid = make_kuid(ns->parent, id); 799 kuid_t uid = make_kuid(ns->parent, id);
787 if (uid_eq(uid, current_fsuid())) 800 if (uid_eq(uid, file->f_cred->fsuid))
788 return true; 801 return true;
789 } 802 }
790 else if (cap_setid == CAP_SETGID) { 803 else if (cap_setid == CAP_SETGID) {
791 kgid_t gid = make_kgid(ns->parent, id); 804 kgid_t gid = make_kgid(ns->parent, id);
792 if (gid_eq(gid, current_fsgid())) 805 if (gid_eq(gid, file->f_cred->fsgid))
793 return true; 806 return true;
794 } 807 }
795 } 808 }
@@ -800,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
800 813
801 /* Allow the specified ids if we have the appropriate capability 814 /* Allow the specified ids if we have the appropriate capability
802 * (CAP_SETUID or CAP_SETGID) over the parent user namespace. 815 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
 816 * And the opener of the id file also had the appropriate capability.
803 */ 817 */
804 if (ns_capable(ns->parent, cap_setid)) 818 if (ns_capable(ns->parent, cap_setid) &&
819 file_ns_capable(file, ns->parent, cap_setid))
805 return true; 820 return true;
806 821
807 return false; 822 return false;
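
The user_namespace.c changes above thread the struct file into new_idmap_permitted() so the checks use the credentials captured when the map file was opened (file->f_cred) rather than those of whoever writes to it later. A standalone C sketch of that idea with made-up demo_* types; it is not the kernel's actual check.

#include <stdbool.h>
#include <stdio.h>

struct demo_cred { unsigned int fsuid; };
struct demo_file { struct demo_cred opener; };	/* captured at open() time */

static bool demo_map_permitted(const struct demo_file *file, unsigned int mapped_uid)
{
	/* only allow mapping an id the *opener* already owns */
	return file->opener.fsuid == mapped_uid;
}

int main(void)
{
	struct demo_file f = { .opener = { .fsuid = 1000 } };

	printf("map 1000: %s\n", demo_map_permitted(&f, 1000) ? "yes" : "no");
	printf("map 0:    %s\n", demo_map_permitted(&f, 0)    ? "yes" : "no");
	return 0;
}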
diff --git a/kernel/utsname.c b/kernel/utsname.c
index a47fc5de3113..2fc8576efaa8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,7 +15,7 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h> 18#include <linux/proc_ns.h>
19 19
20static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
21{ 21{
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4a944676358e..05039e348f07 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -517,6 +517,11 @@ int proc_dowatchdog(struct ctl_table *table, int write,
517 return ret; 517 return ret;
518 518
519 set_sample_period(); 519 set_sample_period();
520 /*
 521 * The watchdog threads must not be enabled while the watchdog is
 522 * disabled; the 'watchdog_disabled' check in the
 523 * watchdog_*_all_cpus() functions takes care of this.
524 */
520 if (watchdog_enabled && watchdog_thresh) 525 if (watchdog_enabled && watchdog_thresh)
521 watchdog_enable_all_cpus(); 526 watchdog_enable_all_cpus();
522 else 527 else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 55fac5b991b7..ee8e29a2320c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,7 +41,12 @@
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/jhash.h>
44#include <linux/hashtable.h> 45#include <linux/hashtable.h>
46#include <linux/rculist.h>
47#include <linux/nodemask.h>
48#include <linux/moduleparam.h>
49#include <linux/uaccess.h>
45 50
46#include "workqueue_internal.h" 51#include "workqueue_internal.h"
47 52
@@ -58,12 +63,11 @@ enum {
58 * %WORKER_UNBOUND set and concurrency management disabled, and may 63 * %WORKER_UNBOUND set and concurrency management disabled, and may
59 * be executing on any CPU. The pool behaves as an unbound one. 64 * be executing on any CPU. The pool behaves as an unbound one.
60 * 65 *
61 * Note that DISASSOCIATED can be flipped only while holding 66 * Note that DISASSOCIATED should be flipped only while holding
62 * assoc_mutex to avoid changing binding state while 67 * manager_mutex to avoid changing binding state while
63 * create_worker() is in progress. 68 * create_worker() is in progress.
64 */ 69 */
65 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 70 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
66 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
67 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 71 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
68 POOL_FREEZING = 1 << 3, /* freeze in progress */ 72 POOL_FREEZING = 1 << 3, /* freeze in progress */
69 73
@@ -74,12 +78,14 @@ enum {
74 WORKER_PREP = 1 << 3, /* preparing to run works */ 78 WORKER_PREP = 1 << 3, /* preparing to run works */
75 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 79 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
76 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 80 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
81 WORKER_REBOUND = 1 << 8, /* worker was rebound */
77 82
78 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | 83 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE |
79 WORKER_CPU_INTENSIVE, 84 WORKER_UNBOUND | WORKER_REBOUND,
80 85
81 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ 86 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
82 87
88 UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
83 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 89 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
84 90
85 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 91 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
@@ -97,6 +103,8 @@ enum {
97 */ 103 */
98 RESCUER_NICE_LEVEL = -20, 104 RESCUER_NICE_LEVEL = -20,
99 HIGHPRI_NICE_LEVEL = -20, 105 HIGHPRI_NICE_LEVEL = -20,
106
107 WQ_NAME_LEN = 24,
100}; 108};
101 109
102/* 110/*
@@ -115,16 +123,26 @@ enum {
115 * cpu or grabbing pool->lock is enough for read access. If 123 * cpu or grabbing pool->lock is enough for read access. If
116 * POOL_DISASSOCIATED is set, it's identical to L. 124 * POOL_DISASSOCIATED is set, it's identical to L.
117 * 125 *
118 * F: wq->flush_mutex protected. 126 * MG: pool->manager_mutex and pool->lock protected. Writes require both
127 * locks. Reads can happen under either lock.
128 *
129 * PL: wq_pool_mutex protected.
130 *
131 * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
132 *
133 * WQ: wq->mutex protected.
119 * 134 *
120 * W: workqueue_lock protected. 135 * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
136 *
137 * MD: wq_mayday_lock protected.
121 */ 138 */
122 139
123/* struct worker is defined in workqueue_internal.h */ 140/* struct worker is defined in workqueue_internal.h */
124 141
125struct worker_pool { 142struct worker_pool {
126 spinlock_t lock; /* the pool lock */ 143 spinlock_t lock; /* the pool lock */
127 unsigned int cpu; /* I: the associated cpu */ 144 int cpu; /* I: the associated cpu */
145 int node; /* I: the associated node ID */
128 int id; /* I: pool ID */ 146 int id; /* I: pool ID */
129 unsigned int flags; /* X: flags */ 147 unsigned int flags; /* X: flags */
130 148
@@ -138,12 +156,18 @@ struct worker_pool {
138 struct timer_list idle_timer; /* L: worker idle timeout */ 156 struct timer_list idle_timer; /* L: worker idle timeout */
139 struct timer_list mayday_timer; /* L: SOS timer for workers */ 157 struct timer_list mayday_timer; /* L: SOS timer for workers */
140 158
 141 /* workers are chained either in busy_hash or idle_list */ 159 /* a worker is either on busy_hash or idle_list, or the manager */
142 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); 160 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
143 /* L: hash of busy workers */ 161 /* L: hash of busy workers */
144 162
145 struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ 163 /* see manage_workers() for details on the two manager mutexes */
146 struct ida worker_ida; /* L: for worker IDs */ 164 struct mutex manager_arb; /* manager arbitration */
165 struct mutex manager_mutex; /* manager exclusion */
166 struct idr worker_idr; /* MG: worker IDs and iteration */
167
168 struct workqueue_attrs *attrs; /* I: worker attributes */
169 struct hlist_node hash_node; /* PL: unbound_pool_hash node */
170 int refcnt; /* PL: refcnt for unbound pools */
147 171
148 /* 172 /*
149 * The current concurrency level. As it's likely to be accessed 173 * The current concurrency level. As it's likely to be accessed
@@ -151,6 +175,12 @@ struct worker_pool {
151 * cacheline. 175 * cacheline.
152 */ 176 */
153 atomic_t nr_running ____cacheline_aligned_in_smp; 177 atomic_t nr_running ____cacheline_aligned_in_smp;
178
179 /*
180 * Destruction of pool is sched-RCU protected to allow dereferences
181 * from get_work_pool().
182 */
183 struct rcu_head rcu;
154} ____cacheline_aligned_in_smp; 184} ____cacheline_aligned_in_smp;
155 185
156/* 186/*
@@ -164,77 +194,109 @@ struct pool_workqueue {
164 struct workqueue_struct *wq; /* I: the owning workqueue */ 194 struct workqueue_struct *wq; /* I: the owning workqueue */
165 int work_color; /* L: current color */ 195 int work_color; /* L: current color */
166 int flush_color; /* L: flushing color */ 196 int flush_color; /* L: flushing color */
197 int refcnt; /* L: reference count */
167 int nr_in_flight[WORK_NR_COLORS]; 198 int nr_in_flight[WORK_NR_COLORS];
168 /* L: nr of in_flight works */ 199 /* L: nr of in_flight works */
169 int nr_active; /* L: nr of active works */ 200 int nr_active; /* L: nr of active works */
170 int max_active; /* L: max active works */ 201 int max_active; /* L: max active works */
171 struct list_head delayed_works; /* L: delayed works */ 202 struct list_head delayed_works; /* L: delayed works */
172}; 203 struct list_head pwqs_node; /* WR: node on wq->pwqs */
204 struct list_head mayday_node; /* MD: node on wq->maydays */
205
206 /*
207 * Release of unbound pwq is punted to system_wq. See put_pwq()
208 * and pwq_unbound_release_workfn() for details. pool_workqueue
209 * itself is also sched-RCU protected so that the first pwq can be
210 * determined without grabbing wq->mutex.
211 */
212 struct work_struct unbound_release_work;
213 struct rcu_head rcu;
214} __aligned(1 << WORK_STRUCT_FLAG_BITS);
173 215
174/* 216/*
175 * Structure used to wait for workqueue flush. 217 * Structure used to wait for workqueue flush.
176 */ 218 */
177struct wq_flusher { 219struct wq_flusher {
178 struct list_head list; /* F: list of flushers */ 220 struct list_head list; /* WQ: list of flushers */
179 int flush_color; /* F: flush color waiting for */ 221 int flush_color; /* WQ: flush color waiting for */
180 struct completion done; /* flush completion */ 222 struct completion done; /* flush completion */
181}; 223};
182 224
183/* 225struct wq_device;
184 * All cpumasks are assumed to be always set on UP and thus can't be
185 * used to determine whether there's something to be done.
186 */
187#ifdef CONFIG_SMP
188typedef cpumask_var_t mayday_mask_t;
189#define mayday_test_and_set_cpu(cpu, mask) \
190 cpumask_test_and_set_cpu((cpu), (mask))
191#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
192#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
193#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
194#define free_mayday_mask(mask) free_cpumask_var((mask))
195#else
196typedef unsigned long mayday_mask_t;
197#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
198#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
199#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
200#define alloc_mayday_mask(maskp, gfp) true
201#define free_mayday_mask(mask) do { } while (0)
202#endif
203 226
204/* 227/*
205 * The externally visible workqueue abstraction is an array of 228 * The externally visible workqueue. It relays the issued work items to
206 * per-CPU workqueues: 229 * the appropriate worker_pool through its pool_workqueues.
207 */ 230 */
208struct workqueue_struct { 231struct workqueue_struct {
209 unsigned int flags; /* W: WQ_* flags */ 232 struct list_head pwqs; /* WR: all pwqs of this wq */
210 union { 233 struct list_head list; /* PL: list of all workqueues */
211 struct pool_workqueue __percpu *pcpu; 234
212 struct pool_workqueue *single; 235 struct mutex mutex; /* protects this wq */
213 unsigned long v; 236 int work_color; /* WQ: current work color */
214 } pool_wq; /* I: pwq's */ 237 int flush_color; /* WQ: current flush color */
215 struct list_head list; /* W: list of all workqueues */
216
217 struct mutex flush_mutex; /* protects wq flushing */
218 int work_color; /* F: current work color */
219 int flush_color; /* F: current flush color */
220 atomic_t nr_pwqs_to_flush; /* flush in progress */ 238 atomic_t nr_pwqs_to_flush; /* flush in progress */
221 struct wq_flusher *first_flusher; /* F: first flusher */ 239 struct wq_flusher *first_flusher; /* WQ: first flusher */
222 struct list_head flusher_queue; /* F: flush waiters */ 240 struct list_head flusher_queue; /* WQ: flush waiters */
223 struct list_head flusher_overflow; /* F: flush overflow list */ 241 struct list_head flusher_overflow; /* WQ: flush overflow list */
224 242
225 mayday_mask_t mayday_mask; /* cpus requesting rescue */ 243 struct list_head maydays; /* MD: pwqs requesting rescue */
226 struct worker *rescuer; /* I: rescue worker */ 244 struct worker *rescuer; /* I: rescue worker */
227 245
228 int nr_drainers; /* W: drain in progress */ 246 int nr_drainers; /* WQ: drain in progress */
229 int saved_max_active; /* W: saved pwq max_active */ 247 int saved_max_active; /* WQ: saved pwq max_active */
248
249 struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */
250 struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */
251
252#ifdef CONFIG_SYSFS
253 struct wq_device *wq_dev; /* I: for sysfs interface */
254#endif
230#ifdef CONFIG_LOCKDEP 255#ifdef CONFIG_LOCKDEP
231 struct lockdep_map lockdep_map; 256 struct lockdep_map lockdep_map;
232#endif 257#endif
233 char name[]; /* I: workqueue name */ 258 char name[WQ_NAME_LEN]; /* I: workqueue name */
259
260 /* hot fields used during command issue, aligned to cacheline */
261 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
262 struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
263 struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
234}; 264};
235 265
266static struct kmem_cache *pwq_cache;
267
268static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
269static cpumask_var_t *wq_numa_possible_cpumask;
270 /* possible CPUs of each node */
271
272static bool wq_disable_numa;
273module_param_named(disable_numa, wq_disable_numa, bool, 0444);
274
275static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
276
277/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
278static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
279
280static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
281static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
282
283static LIST_HEAD(workqueues); /* PL: list of all workqueues */
284static bool workqueue_freezing; /* PL: have wqs started freezing? */
285
286/* the per-cpu worker pools */
287static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
288 cpu_worker_pools);
289
290static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
291
292/* PL: hash of all unbound pools keyed by pool->attrs */
293static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
294
295/* I: attributes used when instantiating standard unbound pools on demand */
296static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
297
236struct workqueue_struct *system_wq __read_mostly; 298struct workqueue_struct *system_wq __read_mostly;
237EXPORT_SYMBOL_GPL(system_wq); 299EXPORT_SYMBOL(system_wq);
238struct workqueue_struct *system_highpri_wq __read_mostly; 300struct workqueue_struct *system_highpri_wq __read_mostly;
239EXPORT_SYMBOL_GPL(system_highpri_wq); 301EXPORT_SYMBOL_GPL(system_highpri_wq);
240struct workqueue_struct *system_long_wq __read_mostly; 302struct workqueue_struct *system_long_wq __read_mostly;
@@ -244,64 +306,87 @@ EXPORT_SYMBOL_GPL(system_unbound_wq);
244struct workqueue_struct *system_freezable_wq __read_mostly; 306struct workqueue_struct *system_freezable_wq __read_mostly;
245EXPORT_SYMBOL_GPL(system_freezable_wq); 307EXPORT_SYMBOL_GPL(system_freezable_wq);
246 308
309static int worker_thread(void *__worker);
310static void copy_workqueue_attrs(struct workqueue_attrs *to,
311 const struct workqueue_attrs *from);
312
247#define CREATE_TRACE_POINTS 313#define CREATE_TRACE_POINTS
248#include <trace/events/workqueue.h> 314#include <trace/events/workqueue.h>
249 315
250#define for_each_std_worker_pool(pool, cpu) \ 316#define assert_rcu_or_pool_mutex() \
251 for ((pool) = &std_worker_pools(cpu)[0]; \ 317 rcu_lockdep_assert(rcu_read_lock_sched_held() || \
252 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) 318 lockdep_is_held(&wq_pool_mutex), \
319 "sched RCU or wq_pool_mutex should be held")
253 320
254#define for_each_busy_worker(worker, i, pool) \ 321#define assert_rcu_or_wq_mutex(wq) \
255 hash_for_each(pool->busy_hash, i, worker, hentry) 322 rcu_lockdep_assert(rcu_read_lock_sched_held() || \
323 lockdep_is_held(&wq->mutex), \
324 "sched RCU or wq->mutex should be held")
256 325
257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, 326#ifdef CONFIG_LOCKDEP
258 unsigned int sw) 327#define assert_manager_or_pool_lock(pool) \
259{ 328 WARN_ONCE(debug_locks && \
260 if (cpu < nr_cpu_ids) { 329 !lockdep_is_held(&(pool)->manager_mutex) && \
261 if (sw & 1) { 330 !lockdep_is_held(&(pool)->lock), \
262 cpu = cpumask_next(cpu, mask); 331 "pool->manager_mutex or ->lock should be held")
263 if (cpu < nr_cpu_ids) 332#else
264 return cpu; 333#define assert_manager_or_pool_lock(pool) do { } while (0)
265 } 334#endif
266 if (sw & 2)
267 return WORK_CPU_UNBOUND;
268 }
269 return WORK_CPU_END;
270}
271 335
272static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, 336#define for_each_cpu_worker_pool(pool, cpu) \
273 struct workqueue_struct *wq) 337 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
274{ 338 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
275 return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); 339 (pool)++)
276}
277 340
278/* 341/**
279 * CPU iterators 342 * for_each_pool - iterate through all worker_pools in the system
343 * @pool: iteration cursor
344 * @pi: integer used for iteration
280 * 345 *
281 * An extra cpu number is defined using an invalid cpu number 346 * This must be called either with wq_pool_mutex held or sched RCU read
282 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any 347 * locked. If the pool needs to be used beyond the locking in effect, the
283 * specific CPU. The following iterators are similar to for_each_*_cpu() 348 * caller is responsible for guaranteeing that the pool stays online.
284 * iterators but also considers the unbound CPU.
285 * 349 *
286 * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND 350 * The if/else clause exists only for the lockdep assertion and can be
287 * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND 351 * ignored.
288 * for_each_pwq_cpu() : possible CPUs for bound workqueues,
289 * WORK_CPU_UNBOUND for unbound workqueues
290 */ 352 */
291#define for_each_wq_cpu(cpu) \ 353#define for_each_pool(pool, pi) \
292 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ 354 idr_for_each_entry(&worker_pool_idr, pool, pi) \
293 (cpu) < WORK_CPU_END; \ 355 if (({ assert_rcu_or_pool_mutex(); false; })) { } \
294 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) 356 else
295 357
296#define for_each_online_wq_cpu(cpu) \ 358/**
297 for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ 359 * for_each_pool_worker - iterate through all workers of a worker_pool
298 (cpu) < WORK_CPU_END; \ 360 * @worker: iteration cursor
299 (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) 361 * @wi: integer used for iteration
362 * @pool: worker_pool to iterate workers of
363 *
364 * This must be called with either @pool->manager_mutex or ->lock held.
365 *
366 * The if/else clause exists only for the lockdep assertion and can be
367 * ignored.
368 */
369#define for_each_pool_worker(worker, wi, pool) \
370 idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \
371 if (({ assert_manager_or_pool_lock((pool)); false; })) { } \
372 else
300 373
301#define for_each_pwq_cpu(cpu, wq) \ 374/**
302 for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ 375 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
303 (cpu) < WORK_CPU_END; \ 376 * @pwq: iteration cursor
304 (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) 377 * @wq: the target workqueue
378 *
379 * This must be called either with wq->mutex held or sched RCU read locked.
380 * If the pwq needs to be used beyond the locking in effect, the caller is
381 * responsible for guaranteeing that the pwq stays online.
382 *
383 * The if/else clause exists only for the lockdep assertion and can be
384 * ignored.
385 */
386#define for_each_pwq(pwq, wq) \
387 list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
388 if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
389 else
305 390
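
The `if (({ ... false; })) { } else` shape shared by for_each_pool(), for_each_pool_worker() and for_each_pwq() is easy to misparse. As a standalone illustration (ordinary GCC C with a stand-in assert_ctx(); not kernel code), the statement expression runs the assertion once per iteration and always evaluates to false, so the caller's loop body lands on the trailing `else`:

#include <stdbool.h>
#include <stdio.h>

#define assert_ctx()	fprintf(stderr, "locking context checked\n")

/* same shape as for_each_pwq(): assert, yield false, hand control to else */
#define for_each_idx(i, n) \
	for ((i) = 0; (i) < (n); (i)++) \
		if (({ assert_ctx(); false; })) { } \
		else

int main(void)
{
	int i;

	for_each_idx(i, 3)
		printf("body ran for i=%d\n", i);	/* binds to the trailing else */
	return 0;
}

The empty `{ }` branch is never taken, so the construct adds nothing beyond the lockdep check and leaves the iteration semantics untouched, exactly as the comments above say.
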
306#ifdef CONFIG_DEBUG_OBJECTS_WORK 391#ifdef CONFIG_DEBUG_OBJECTS_WORK
307 392
@@ -419,77 +504,35 @@ static inline void debug_work_activate(struct work_struct *work) { }
419static inline void debug_work_deactivate(struct work_struct *work) { } 504static inline void debug_work_deactivate(struct work_struct *work) { }
420#endif 505#endif
421 506
422/* Serializes the accesses to the list of workqueues. */
423static DEFINE_SPINLOCK(workqueue_lock);
424static LIST_HEAD(workqueues);
425static bool workqueue_freezing; /* W: have wqs started freezing? */
426
427/*
428 * The CPU and unbound standard worker pools. The unbound ones have
429 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
430 */
431static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
432 cpu_std_worker_pools);
433static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
434
435/* idr of all pools */
436static DEFINE_MUTEX(worker_pool_idr_mutex);
437static DEFINE_IDR(worker_pool_idr);
438
439static int worker_thread(void *__worker);
440
441static struct worker_pool *std_worker_pools(int cpu)
442{
443 if (cpu != WORK_CPU_UNBOUND)
444 return per_cpu(cpu_std_worker_pools, cpu);
445 else
446 return unbound_std_worker_pools;
447}
448
449static int std_worker_pool_pri(struct worker_pool *pool)
450{
451 return pool - std_worker_pools(pool->cpu);
452}
453
454/* allocate ID and assign it to @pool */ 507/* allocate ID and assign it to @pool */
455static int worker_pool_assign_id(struct worker_pool *pool) 508static int worker_pool_assign_id(struct worker_pool *pool)
456{ 509{
457 int ret; 510 int ret;
458 511
459 mutex_lock(&worker_pool_idr_mutex); 512 lockdep_assert_held(&wq_pool_mutex);
513
460 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); 514 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
461 if (ret >= 0) 515 if (ret >= 0) {
462 pool->id = ret; 516 pool->id = ret;
463 mutex_unlock(&worker_pool_idr_mutex); 517 return 0;
464 518 }
465 return ret < 0 ? ret : 0; 519 return ret;
466} 520}
467 521
468/* 522/**
469 * Lookup worker_pool by id. The idr currently is built during boot and 523 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
470 * never modified. Don't worry about locking for now. 524 * @wq: the target workqueue
525 * @node: the node ID
526 *
527 * This must be called either with pwq_lock held or sched RCU read locked.
528 * If the pwq needs to be used beyond the locking in effect, the caller is
529 * responsible for guaranteeing that the pwq stays online.
471 */ 530 */
472static struct worker_pool *worker_pool_by_id(int pool_id) 531static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
532 int node)
473{ 533{
474 return idr_find(&worker_pool_idr, pool_id); 534 assert_rcu_or_wq_mutex(wq);
475} 535 return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
476
477static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
478{
479 struct worker_pool *pools = std_worker_pools(cpu);
480
481 return &pools[highpri];
482}
483
484static struct pool_workqueue *get_pwq(unsigned int cpu,
485 struct workqueue_struct *wq)
486{
487 if (!(wq->flags & WQ_UNBOUND)) {
488 if (likely(cpu < nr_cpu_ids))
489 return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
490 } else if (likely(cpu == WORK_CPU_UNBOUND))
491 return wq->pool_wq.single;
492 return NULL;
493} 536}
494 537
495static unsigned int work_color_to_flags(int color) 538static unsigned int work_color_to_flags(int color)
@@ -531,7 +574,7 @@ static int work_next_color(int color)
531static inline void set_work_data(struct work_struct *work, unsigned long data, 574static inline void set_work_data(struct work_struct *work, unsigned long data,
532 unsigned long flags) 575 unsigned long flags)
533{ 576{
534 BUG_ON(!work_pending(work)); 577 WARN_ON_ONCE(!work_pending(work));
535 atomic_long_set(&work->data, data | flags | work_static(work)); 578 atomic_long_set(&work->data, data | flags | work_static(work));
536} 579}
537 580
@@ -583,13 +626,23 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
583 * @work: the work item of interest 626 * @work: the work item of interest
584 * 627 *
585 * Return the worker_pool @work was last associated with. %NULL if none. 628 * Return the worker_pool @work was last associated with. %NULL if none.
629 *
630 * Pools are created and destroyed under wq_pool_mutex, and allows read
631 * access under sched-RCU read lock. As such, this function should be
632 * called under wq_pool_mutex or with preemption disabled.
633 *
634 * All fields of the returned pool are accessible as long as the above
635 * mentioned locking is in effect. If the returned pool needs to be used
636 * beyond the critical section, the caller is responsible for ensuring the
637 * returned pool is and stays online.
586 */ 638 */
587static struct worker_pool *get_work_pool(struct work_struct *work) 639static struct worker_pool *get_work_pool(struct work_struct *work)
588{ 640{
589 unsigned long data = atomic_long_read(&work->data); 641 unsigned long data = atomic_long_read(&work->data);
590 struct worker_pool *pool;
591 int pool_id; 642 int pool_id;
592 643
644 assert_rcu_or_pool_mutex();
645
593 if (data & WORK_STRUCT_PWQ) 646 if (data & WORK_STRUCT_PWQ)
594 return ((struct pool_workqueue *) 647 return ((struct pool_workqueue *)
595 (data & WORK_STRUCT_WQ_DATA_MASK))->pool; 648 (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
@@ -598,9 +651,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
598 if (pool_id == WORK_OFFQ_POOL_NONE) 651 if (pool_id == WORK_OFFQ_POOL_NONE)
599 return NULL; 652 return NULL;
600 653
601 pool = worker_pool_by_id(pool_id); 654 return idr_find(&worker_pool_idr, pool_id);
602 WARN_ON_ONCE(!pool);
603 return pool;
604} 655}
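
Given the rule documented above (hold wq_pool_mutex or stay inside a sched-RCU read-side critical section), a user of get_work_pool() inside this file would look roughly like the sketch below; print_last_pool_id() is hypothetical and exists only to show the locking contract:

/* hypothetical in-file helper: peek at the pool @work last ran on */
static void print_last_pool_id(struct work_struct *work)
{
	struct worker_pool *pool;

	rcu_read_lock_sched();		/* satisfies assert_rcu_or_pool_mutex() */
	pool = get_work_pool(work);
	if (pool)
		pr_info("work %p last ran on pool %d\n", work, pool->id);
	rcu_read_unlock_sched();	/* @pool may be freed once this returns */
}
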
605 656
606/** 657/**
@@ -689,7 +740,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
689/* Do we have too many workers and should some go away? */ 740/* Do we have too many workers and should some go away? */
690static bool too_many_workers(struct worker_pool *pool) 741static bool too_many_workers(struct worker_pool *pool)
691{ 742{
692 bool managing = pool->flags & POOL_MANAGING_WORKERS; 743 bool managing = mutex_is_locked(&pool->manager_arb);
693 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 744 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
694 int nr_busy = pool->nr_workers - nr_idle; 745 int nr_busy = pool->nr_workers - nr_idle;
695 746
@@ -744,7 +795,7 @@ static void wake_up_worker(struct worker_pool *pool)
744 * CONTEXT: 795 * CONTEXT:
745 * spin_lock_irq(rq->lock) 796 * spin_lock_irq(rq->lock)
746 */ 797 */
747void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) 798void wq_worker_waking_up(struct task_struct *task, int cpu)
748{ 799{
749 struct worker *worker = kthread_data(task); 800 struct worker *worker = kthread_data(task);
750 801
@@ -769,8 +820,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
769 * RETURNS: 820 * RETURNS:
770 * Worker task on @cpu to wake up, %NULL if none. 821 * Worker task on @cpu to wake up, %NULL if none.
771 */ 822 */
772struct task_struct *wq_worker_sleeping(struct task_struct *task, 823struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
773 unsigned int cpu)
774{ 824{
775 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 825 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
776 struct worker_pool *pool; 826 struct worker_pool *pool;
@@ -786,7 +836,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
786 pool = worker->pool; 836 pool = worker->pool;
787 837
788 /* this can only happen on the local cpu */ 838 /* this can only happen on the local cpu */
789 BUG_ON(cpu != raw_smp_processor_id()); 839 if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
840 return NULL;
790 841
791 /* 842 /*
792 * The counterpart of the following dec_and_test, implied mb, 843 * The counterpart of the following dec_and_test, implied mb,
@@ -891,13 +942,12 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
891 * recycled work item as currently executing and make it wait until the 942 * recycled work item as currently executing and make it wait until the
892 * current execution finishes, introducing an unwanted dependency. 943 * current execution finishes, introducing an unwanted dependency.
893 * 944 *
894 * This function checks the work item address, work function and workqueue 945 * This function checks the work item address and work function to avoid
895 * to avoid false positives. Note that this isn't complete as one may 946 * false positives. Note that this isn't complete as one may construct a
896 * construct a work function which can introduce dependency onto itself 947 * work function which can introduce dependency onto itself through a
897 * through a recycled work item. Well, if somebody wants to shoot oneself 948 * recycled work item. Well, if somebody wants to shoot oneself in the
898 * in the foot that badly, there's only so much we can do, and if such 949 * foot that badly, there's only so much we can do, and if such deadlock
899 * deadlock actually occurs, it should be easy to locate the culprit work 950 * actually occurs, it should be easy to locate the culprit work function.
900 * function.
901 * 951 *
902 * CONTEXT: 952 * CONTEXT:
903 * spin_lock_irq(pool->lock). 953 * spin_lock_irq(pool->lock).
@@ -961,6 +1011,64 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
961 *nextp = n; 1011 *nextp = n;
962} 1012}
963 1013
1014/**
1015 * get_pwq - get an extra reference on the specified pool_workqueue
1016 * @pwq: pool_workqueue to get
1017 *
1018 * Obtain an extra reference on @pwq. The caller should guarantee that
1019 * @pwq has positive refcnt and be holding the matching pool->lock.
1020 */
1021static void get_pwq(struct pool_workqueue *pwq)
1022{
1023 lockdep_assert_held(&pwq->pool->lock);
1024 WARN_ON_ONCE(pwq->refcnt <= 0);
1025 pwq->refcnt++;
1026}
1027
1028/**
1029 * put_pwq - put a pool_workqueue reference
1030 * @pwq: pool_workqueue to put
1031 *
1032 * Drop a reference of @pwq. If its refcnt reaches zero, schedule its
1033 * destruction. The caller should be holding the matching pool->lock.
1034 */
1035static void put_pwq(struct pool_workqueue *pwq)
1036{
1037 lockdep_assert_held(&pwq->pool->lock);
1038 if (likely(--pwq->refcnt))
1039 return;
1040 if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
1041 return;
1042 /*
1043 * @pwq can't be released under pool->lock, bounce to
1044 * pwq_unbound_release_workfn(). This never recurses on the same
1045 * pool->lock as this path is taken only for unbound workqueues and
1046 * the release work item is scheduled on a per-cpu workqueue. To
1047 * avoid lockdep warning, unbound pool->locks are given lockdep
1048 * subclass of 1 in get_unbound_pool().
1049 */
1050 schedule_work(&pwq->unbound_release_work);
1051}
1052
1053/**
1054 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
1055 * @pwq: pool_workqueue to put (can be %NULL)
1056 *
1057 * put_pwq() with locking. This function also allows %NULL @pwq.
1058 */
1059static void put_pwq_unlocked(struct pool_workqueue *pwq)
1060{
1061 if (pwq) {
1062 /*
1063 * As both pwqs and pools are sched-RCU protected, the
1064 * following lock operations are safe.
1065 */
1066 spin_lock_irq(&pwq->pool->lock);
1067 put_pwq(pwq);
1068 spin_unlock_irq(&pwq->pool->lock);
1069 }
1070}
1071
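
For orientation, the reference discipline these helpers establish pairs up as follows: insert_work() takes a reference for every queued item and pwq_dec_nr_in_flight() drops it once the item has been accounted for. A sketch of the pairing, not a drop-in:

	/* queueing side (insert_work): pin the pwq while the item is in flight */
	spin_lock_irq(&pwq->pool->lock);
	get_pwq(pwq);
	spin_unlock_irq(&pwq->pool->lock);

	/* ... a worker of pwq->pool executes the item ... */

	/* completion side (pwq_dec_nr_in_flight): drop the pin after accounting */
	spin_lock_irq(&pwq->pool->lock);
	put_pwq(pwq);
	spin_unlock_irq(&pwq->pool->lock);

put_pwq_unlocked() covers callers which are not already under pool->lock and, as its comment notes, tolerates a %NULL pwq.
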
964static void pwq_activate_delayed_work(struct work_struct *work) 1072static void pwq_activate_delayed_work(struct work_struct *work)
965{ 1073{
966 struct pool_workqueue *pwq = get_work_pwq(work); 1074 struct pool_workqueue *pwq = get_work_pwq(work);
@@ -992,9 +1100,9 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
992 */ 1100 */
993static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) 1101static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
994{ 1102{
995 /* ignore uncolored works */ 1103 /* uncolored work items don't participate in flushing or nr_active */
996 if (color == WORK_NO_COLOR) 1104 if (color == WORK_NO_COLOR)
997 return; 1105 goto out_put;
998 1106
999 pwq->nr_in_flight[color]--; 1107 pwq->nr_in_flight[color]--;
1000 1108
@@ -1007,11 +1115,11 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1007 1115
1008 /* is flush in progress and are we at the flushing tip? */ 1116 /* is flush in progress and are we at the flushing tip? */
1009 if (likely(pwq->flush_color != color)) 1117 if (likely(pwq->flush_color != color))
1010 return; 1118 goto out_put;
1011 1119
1012 /* are there still in-flight works? */ 1120 /* are there still in-flight works? */
1013 if (pwq->nr_in_flight[color]) 1121 if (pwq->nr_in_flight[color])
1014 return; 1122 goto out_put;
1015 1123
1016 /* this pwq is done, clear flush_color */ 1124 /* this pwq is done, clear flush_color */
1017 pwq->flush_color = -1; 1125 pwq->flush_color = -1;
@@ -1022,6 +1130,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1022 */ 1130 */
1023 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) 1131 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1024 complete(&pwq->wq->first_flusher->done); 1132 complete(&pwq->wq->first_flusher->done);
1133out_put:
1134 put_pwq(pwq);
1025} 1135}
1026 1136
1027/** 1137/**
@@ -1144,11 +1254,12 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1144 /* we own @work, set data and link */ 1254 /* we own @work, set data and link */
1145 set_work_pwq(work, pwq, extra_flags); 1255 set_work_pwq(work, pwq, extra_flags);
1146 list_add_tail(&work->entry, head); 1256 list_add_tail(&work->entry, head);
1257 get_pwq(pwq);
1147 1258
1148 /* 1259 /*
1149 * Ensure either worker_sched_deactivated() sees the above 1260 * Ensure either wq_worker_sleeping() sees the above
1150 * list_add_tail() or we see zero nr_running to avoid workers 1261 * list_add_tail() or we see zero nr_running to avoid workers lying
1151 * lying around lazily while there are works to be processed. 1262 * around lazily while there are works to be processed.
1152 */ 1263 */
1153 smp_mb(); 1264 smp_mb();
1154 1265
@@ -1172,10 +1283,11 @@ static bool is_chained_work(struct workqueue_struct *wq)
1172 return worker && worker->current_pwq->wq == wq; 1283 return worker && worker->current_pwq->wq == wq;
1173} 1284}
1174 1285
1175static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 1286static void __queue_work(int cpu, struct workqueue_struct *wq,
1176 struct work_struct *work) 1287 struct work_struct *work)
1177{ 1288{
1178 struct pool_workqueue *pwq; 1289 struct pool_workqueue *pwq;
1290 struct worker_pool *last_pool;
1179 struct list_head *worklist; 1291 struct list_head *worklist;
1180 unsigned int work_flags; 1292 unsigned int work_flags;
1181 unsigned int req_cpu = cpu; 1293 unsigned int req_cpu = cpu;
@@ -1191,48 +1303,62 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1191 debug_work_activate(work); 1303 debug_work_activate(work);
1192 1304
1193 /* if dying, only works from the same workqueue are allowed */ 1305 /* if dying, only works from the same workqueue are allowed */
1194 if (unlikely(wq->flags & WQ_DRAINING) && 1306 if (unlikely(wq->flags & __WQ_DRAINING) &&
1195 WARN_ON_ONCE(!is_chained_work(wq))) 1307 WARN_ON_ONCE(!is_chained_work(wq)))
1196 return; 1308 return;
1309retry:
1310 if (req_cpu == WORK_CPU_UNBOUND)
1311 cpu = raw_smp_processor_id();
1197 1312
1198 /* determine the pwq to use */ 1313 /* pwq which will be used unless @work is executing elsewhere */
1199 if (!(wq->flags & WQ_UNBOUND)) { 1314 if (!(wq->flags & WQ_UNBOUND))
1200 struct worker_pool *last_pool; 1315 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
1201 1316 else
1202 if (cpu == WORK_CPU_UNBOUND) 1317 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
1203 cpu = raw_smp_processor_id();
1204
1205 /*
1206 * It's multi cpu. If @work was previously on a different
1207 * cpu, it might still be running there, in which case the
1208 * work needs to be queued on that cpu to guarantee
1209 * non-reentrancy.
1210 */
1211 pwq = get_pwq(cpu, wq);
1212 last_pool = get_work_pool(work);
1213 1318
1214 if (last_pool && last_pool != pwq->pool) { 1319 /*
1215 struct worker *worker; 1320 * If @work was previously on a different pool, it might still be
1321 * running there, in which case the work needs to be queued on that
1322 * pool to guarantee non-reentrancy.
1323 */
1324 last_pool = get_work_pool(work);
1325 if (last_pool && last_pool != pwq->pool) {
1326 struct worker *worker;
1216 1327
1217 spin_lock(&last_pool->lock); 1328 spin_lock(&last_pool->lock);
1218 1329
1219 worker = find_worker_executing_work(last_pool, work); 1330 worker = find_worker_executing_work(last_pool, work);
1220 1331
1221 if (worker && worker->current_pwq->wq == wq) { 1332 if (worker && worker->current_pwq->wq == wq) {
1222 pwq = get_pwq(last_pool->cpu, wq); 1333 pwq = worker->current_pwq;
1223 } else {
1224 /* meh... not running there, queue here */
1225 spin_unlock(&last_pool->lock);
1226 spin_lock(&pwq->pool->lock);
1227 }
1228 } else { 1334 } else {
1335 /* meh... not running there, queue here */
1336 spin_unlock(&last_pool->lock);
1229 spin_lock(&pwq->pool->lock); 1337 spin_lock(&pwq->pool->lock);
1230 } 1338 }
1231 } else { 1339 } else {
1232 pwq = get_pwq(WORK_CPU_UNBOUND, wq);
1233 spin_lock(&pwq->pool->lock); 1340 spin_lock(&pwq->pool->lock);
1234 } 1341 }
1235 1342
1343 /*
1344 * pwq is determined and locked. For unbound pools, we could have
1345 * raced with pwq release and it could already be dead. If its
1346 * refcnt is zero, repeat pwq selection. Note that pwqs never die
1347 * without another pwq replacing it in the numa_pwq_tbl or while
1348 * work items are executing on it, so the retrying is guaranteed to
1349 * make forward-progress.
1350 */
1351 if (unlikely(!pwq->refcnt)) {
1352 if (wq->flags & WQ_UNBOUND) {
1353 spin_unlock(&pwq->pool->lock);
1354 cpu_relax();
1355 goto retry;
1356 }
1357 /* oops */
1358 WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
1359 wq->name, cpu);
1360 }
1361
1236 /* pwq determined, queue */ 1362 /* pwq determined, queue */
1237 trace_workqueue_queue_work(req_cpu, pwq, work); 1363 trace_workqueue_queue_work(req_cpu, pwq, work);
1238 1364
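
Collecting the fragments of the hunk above, the pwq selection boils down to the following condensed sketch (the last_pool non-reentrancy branch and the per-cpu warning are omitted):

retry:
	if (req_cpu == WORK_CPU_UNBOUND)
		cpu = raw_smp_processor_id();

	pwq = (wq->flags & WQ_UNBOUND) ?
		unbound_pwq_by_node(wq, cpu_to_node(cpu)) :
		per_cpu_ptr(wq->cpu_pwqs, cpu);

	spin_lock(&pwq->pool->lock);
	if (unlikely(!pwq->refcnt) && (wq->flags & WQ_UNBOUND)) {
		/* raced with pwq release; a replacement will appear in numa_pwq_tbl */
		spin_unlock(&pwq->pool->lock);
		cpu_relax();
		goto retry;
	}
	/* pwq is locked and pinned by its refcnt; safe to queue */

Per-cpu pwqs are not expected to ever reach zero refcnt here, which is why that case only warns instead of retrying.
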
@@ -1285,23 +1411,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
1285 local_irq_restore(flags); 1411 local_irq_restore(flags);
1286 return ret; 1412 return ret;
1287} 1413}
1288EXPORT_SYMBOL_GPL(queue_work_on); 1414EXPORT_SYMBOL(queue_work_on);
1289
1290/**
1291 * queue_work - queue work on a workqueue
1292 * @wq: workqueue to use
1293 * @work: work to queue
1294 *
1295 * Returns %false if @work was already on a queue, %true otherwise.
1296 *
1297 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1298 * it can be processed by another CPU.
1299 */
1300bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
1301{
1302 return queue_work_on(WORK_CPU_UNBOUND, wq, work);
1303}
1304EXPORT_SYMBOL_GPL(queue_work);
1305 1415
1306void delayed_work_timer_fn(unsigned long __data) 1416void delayed_work_timer_fn(unsigned long __data)
1307{ 1417{
@@ -1375,22 +1485,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1375 local_irq_restore(flags); 1485 local_irq_restore(flags);
1376 return ret; 1486 return ret;
1377} 1487}
1378EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1488EXPORT_SYMBOL(queue_delayed_work_on);
1379
1380/**
1381 * queue_delayed_work - queue work on a workqueue after delay
1382 * @wq: workqueue to use
1383 * @dwork: delayable work to queue
1384 * @delay: number of jiffies to wait before queueing
1385 *
1386 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
1387 */
1388bool queue_delayed_work(struct workqueue_struct *wq,
1389 struct delayed_work *dwork, unsigned long delay)
1390{
1391 return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1392}
1393EXPORT_SYMBOL_GPL(queue_delayed_work);
1394 1489
1395/** 1490/**
1396 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU 1491 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
@@ -1431,21 +1526,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1431EXPORT_SYMBOL_GPL(mod_delayed_work_on); 1526EXPORT_SYMBOL_GPL(mod_delayed_work_on);
1432 1527
1433/** 1528/**
1434 * mod_delayed_work - modify delay of or queue a delayed work
1435 * @wq: workqueue to use
1436 * @dwork: work to queue
1437 * @delay: number of jiffies to wait before queueing
1438 *
1439 * mod_delayed_work_on() on local CPU.
1440 */
1441bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
1442 unsigned long delay)
1443{
1444 return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1445}
1446EXPORT_SYMBOL_GPL(mod_delayed_work);
1447
1448/**
1449 * worker_enter_idle - enter idle state 1529 * worker_enter_idle - enter idle state
1450 * @worker: worker which is entering idle state 1530 * @worker: worker which is entering idle state
1451 * 1531 *
@@ -1459,9 +1539,10 @@ static void worker_enter_idle(struct worker *worker)
1459{ 1539{
1460 struct worker_pool *pool = worker->pool; 1540 struct worker_pool *pool = worker->pool;
1461 1541
1462 BUG_ON(worker->flags & WORKER_IDLE); 1542 if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
1463 BUG_ON(!list_empty(&worker->entry) && 1543 WARN_ON_ONCE(!list_empty(&worker->entry) &&
1464 (worker->hentry.next || worker->hentry.pprev)); 1544 (worker->hentry.next || worker->hentry.pprev)))
1545 return;
1465 1546
1466 /* can't use worker_set_flags(), also called from start_worker() */ 1547 /* can't use worker_set_flags(), also called from start_worker() */
1467 worker->flags |= WORKER_IDLE; 1548 worker->flags |= WORKER_IDLE;
@@ -1498,22 +1579,25 @@ static void worker_leave_idle(struct worker *worker)
1498{ 1579{
1499 struct worker_pool *pool = worker->pool; 1580 struct worker_pool *pool = worker->pool;
1500 1581
1501 BUG_ON(!(worker->flags & WORKER_IDLE)); 1582 if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
1583 return;
1502 worker_clr_flags(worker, WORKER_IDLE); 1584 worker_clr_flags(worker, WORKER_IDLE);
1503 pool->nr_idle--; 1585 pool->nr_idle--;
1504 list_del_init(&worker->entry); 1586 list_del_init(&worker->entry);
1505} 1587}
1506 1588
1507/** 1589/**
1508 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool 1590 * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
1509 * @worker: self 1591 * @pool: target worker_pool
1592 *
1593 * Bind %current to the cpu of @pool if it is associated and lock @pool.
1510 * 1594 *
1511 * Works which are scheduled while the cpu is online must at least be 1595 * Works which are scheduled while the cpu is online must at least be
1512 * scheduled to a worker which is bound to the cpu so that if they are 1596 * scheduled to a worker which is bound to the cpu so that if they are
1513 * flushed from cpu callbacks while cpu is going down, they are 1597 * flushed from cpu callbacks while cpu is going down, they are
1514 * guaranteed to execute on the cpu. 1598 * guaranteed to execute on the cpu.
1515 * 1599 *
1516 * This function is to be used by rogue workers and rescuers to bind 1600 * This function is to be used by unbound workers and rescuers to bind
1517 * themselves to the target cpu and may race with cpu going down or 1601 * themselves to the target cpu and may race with cpu going down or
1518 * coming online. kthread_bind() can't be used because it may put the 1602 * coming online. kthread_bind() can't be used because it may put the
1519 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used 1603 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
@@ -1534,12 +1618,9 @@ static void worker_leave_idle(struct worker *worker)
1534 * %true if the associated pool is online (@worker is successfully 1618 * %true if the associated pool is online (@worker is successfully
1535 * bound), %false if offline. 1619 * bound), %false if offline.
1536 */ 1620 */
1537static bool worker_maybe_bind_and_lock(struct worker *worker) 1621static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
1538__acquires(&pool->lock) 1622__acquires(&pool->lock)
1539{ 1623{
1540 struct worker_pool *pool = worker->pool;
1541 struct task_struct *task = worker->task;
1542
1543 while (true) { 1624 while (true) {
1544 /* 1625 /*
1545 * The following call may fail, succeed or succeed 1626 * The following call may fail, succeed or succeed
@@ -1548,14 +1629,13 @@ __acquires(&pool->lock)
1548 * against POOL_DISASSOCIATED. 1629 * against POOL_DISASSOCIATED.
1549 */ 1630 */
1550 if (!(pool->flags & POOL_DISASSOCIATED)) 1631 if (!(pool->flags & POOL_DISASSOCIATED))
1551 set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); 1632 set_cpus_allowed_ptr(current, pool->attrs->cpumask);
1552 1633
1553 spin_lock_irq(&pool->lock); 1634 spin_lock_irq(&pool->lock);
1554 if (pool->flags & POOL_DISASSOCIATED) 1635 if (pool->flags & POOL_DISASSOCIATED)
1555 return false; 1636 return false;
1556 if (task_cpu(task) == pool->cpu && 1637 if (task_cpu(current) == pool->cpu &&
1557 cpumask_equal(&current->cpus_allowed, 1638 cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
1558 get_cpu_mask(pool->cpu)))
1559 return true; 1639 return true;
1560 spin_unlock_irq(&pool->lock); 1640 spin_unlock_irq(&pool->lock);
1561 1641
@@ -1570,108 +1650,6 @@ __acquires(&pool->lock)
1570 } 1650 }
1571} 1651}
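
Stripped to its skeleton, the reworked helper is a bind-then-verify loop: attempt the migration without any lock, then confirm the result under pool->lock before claiming success (the retry back-off lives in the context lines elided between the two hunks):

	while (true) {
		if (!(pool->flags & POOL_DISASSOCIATED))
			set_cpus_allowed_ptr(current, pool->attrs->cpumask);

		spin_lock_irq(&pool->lock);
		if (pool->flags & POOL_DISASSOCIATED)
			return false;	/* pool has no CPU; caller stays unbound */
		if (task_cpu(current) == pool->cpu &&
		    cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
			return true;	/* bound, and pool->lock is held on return */
		spin_unlock_irq(&pool->lock);

		/* ... raced with CPU hotplug; back off and retry ... */
	}
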
1572 1652
1573/*
1574 * Rebind an idle @worker to its CPU. worker_thread() will test
1575 * list_empty(@worker->entry) before leaving idle and call this function.
1576 */
1577static void idle_worker_rebind(struct worker *worker)
1578{
1579 /* CPU may go down again inbetween, clear UNBOUND only on success */
1580 if (worker_maybe_bind_and_lock(worker))
1581 worker_clr_flags(worker, WORKER_UNBOUND);
1582
1583 /* rebind complete, become available again */
1584 list_add(&worker->entry, &worker->pool->idle_list);
1585 spin_unlock_irq(&worker->pool->lock);
1586}
1587
1588/*
1589 * Function for @worker->rebind.work used to rebind unbound busy workers to
1590 * the associated cpu which is coming back online. This is scheduled by
1591 * cpu up but can race with other cpu hotplug operations and may be
1592 * executed twice without intervening cpu down.
1593 */
1594static void busy_worker_rebind_fn(struct work_struct *work)
1595{
1596 struct worker *worker = container_of(work, struct worker, rebind_work);
1597
1598 if (worker_maybe_bind_and_lock(worker))
1599 worker_clr_flags(worker, WORKER_UNBOUND);
1600
1601 spin_unlock_irq(&worker->pool->lock);
1602}
1603
1604/**
1605 * rebind_workers - rebind all workers of a pool to the associated CPU
1606 * @pool: pool of interest
1607 *
1608 * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
1609 * is different for idle and busy ones.
1610 *
1611 * Idle ones will be removed from the idle_list and woken up. They will
1612 * add themselves back after completing rebind. This ensures that the
1613 * idle_list doesn't contain any unbound workers when re-bound busy workers
1614 * try to perform local wake-ups for concurrency management.
1615 *
1616 * Busy workers can rebind after they finish their current work items.
1617 * Queueing the rebind work item at the head of the scheduled list is
1618 * enough. Note that nr_running will be properly bumped as busy workers
1619 * rebind.
1620 *
1621 * On return, all non-manager workers are scheduled for rebind - see
1622 * manage_workers() for the manager special case. Any idle worker
1623 * including the manager will not appear on @idle_list until rebind is
1624 * complete, making local wake-ups safe.
1625 */
1626static void rebind_workers(struct worker_pool *pool)
1627{
1628 struct worker *worker, *n;
1629 int i;
1630
1631 lockdep_assert_held(&pool->assoc_mutex);
1632 lockdep_assert_held(&pool->lock);
1633
1634 /* dequeue and kick idle ones */
1635 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1636 /*
1637 * idle workers should be off @pool->idle_list until rebind
1638 * is complete to avoid receiving premature local wake-ups.
1639 */
1640 list_del_init(&worker->entry);
1641
1642 /*
1643 * worker_thread() will see the above dequeuing and call
1644 * idle_worker_rebind().
1645 */
1646 wake_up_process(worker->task);
1647 }
1648
1649 /* rebind busy workers */
1650 for_each_busy_worker(worker, i, pool) {
1651 struct work_struct *rebind_work = &worker->rebind_work;
1652 struct workqueue_struct *wq;
1653
1654 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1655 work_data_bits(rebind_work)))
1656 continue;
1657
1658 debug_work_activate(rebind_work);
1659
1660 /*
1661 * wq doesn't really matter but let's keep @worker->pool
1662 * and @pwq->pool consistent for sanity.
1663 */
1664 if (std_worker_pool_pri(worker->pool))
1665 wq = system_highpri_wq;
1666 else
1667 wq = system_wq;
1668
1669 insert_work(get_pwq(pool->cpu, wq), rebind_work,
1670 worker->scheduled.next,
1671 work_color_to_flags(WORK_NO_COLOR));
1672 }
1673}
1674
1675static struct worker *alloc_worker(void) 1653static struct worker *alloc_worker(void)
1676{ 1654{
1677 struct worker *worker; 1655 struct worker *worker;
@@ -1680,7 +1658,6 @@ static struct worker *alloc_worker(void)
1680 if (worker) { 1658 if (worker) {
1681 INIT_LIST_HEAD(&worker->entry); 1659 INIT_LIST_HEAD(&worker->entry);
1682 INIT_LIST_HEAD(&worker->scheduled); 1660 INIT_LIST_HEAD(&worker->scheduled);
1683 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1684 /* on creation a worker is in !idle && prep state */ 1661 /* on creation a worker is in !idle && prep state */
1685 worker->flags = WORKER_PREP; 1662 worker->flags = WORKER_PREP;
1686 } 1663 }
@@ -1703,18 +1680,25 @@ static struct worker *alloc_worker(void)
1703 */ 1680 */
1704static struct worker *create_worker(struct worker_pool *pool) 1681static struct worker *create_worker(struct worker_pool *pool)
1705{ 1682{
1706 const char *pri = std_worker_pool_pri(pool) ? "H" : "";
1707 struct worker *worker = NULL; 1683 struct worker *worker = NULL;
1708 int id = -1; 1684 int id = -1;
1685 char id_buf[16];
1686
1687 lockdep_assert_held(&pool->manager_mutex);
1709 1688
1689 /*
1690 * ID is needed to determine kthread name. Allocate ID first
1691 * without installing the pointer.
1692 */
1693 idr_preload(GFP_KERNEL);
1710 spin_lock_irq(&pool->lock); 1694 spin_lock_irq(&pool->lock);
1711 while (ida_get_new(&pool->worker_ida, &id)) { 1695
1712 spin_unlock_irq(&pool->lock); 1696 id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
1713 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) 1697
1714 goto fail;
1715 spin_lock_irq(&pool->lock);
1716 }
1717 spin_unlock_irq(&pool->lock); 1698 spin_unlock_irq(&pool->lock);
1699 idr_preload_end();
1700 if (id < 0)
1701 goto fail;
1718 1702
1719 worker = alloc_worker(); 1703 worker = alloc_worker();
1720 if (!worker) 1704 if (!worker)
@@ -1723,40 +1707,46 @@ static struct worker *create_worker(struct worker_pool *pool)
1723 worker->pool = pool; 1707 worker->pool = pool;
1724 worker->id = id; 1708 worker->id = id;
1725 1709
1726 if (pool->cpu != WORK_CPU_UNBOUND) 1710 if (pool->cpu >= 0)
1727 worker->task = kthread_create_on_node(worker_thread, 1711 snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
1728 worker, cpu_to_node(pool->cpu), 1712 pool->attrs->nice < 0 ? "H" : "");
1729 "kworker/%u:%d%s", pool->cpu, id, pri);
1730 else 1713 else
1731 worker->task = kthread_create(worker_thread, worker, 1714 snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
1732 "kworker/u:%d%s", id, pri); 1715
1716 worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
1717 "kworker/%s", id_buf);
1733 if (IS_ERR(worker->task)) 1718 if (IS_ERR(worker->task))
1734 goto fail; 1719 goto fail;
1735 1720
1736 if (std_worker_pool_pri(pool)) 1721 /*
1737 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); 1722 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1723 * online CPUs. It'll be re-applied when any of the CPUs come up.
1724 */
1725 set_user_nice(worker->task, pool->attrs->nice);
1726 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1727
1728 /* prevent userland from meddling with cpumask of workqueue workers */
1729 worker->task->flags |= PF_NO_SETAFFINITY;
1738 1730
1739 /* 1731 /*
1740 * Determine CPU binding of the new worker depending on 1732 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1741 * %POOL_DISASSOCIATED. The caller is responsible for ensuring the 1733 * remains stable across this function. See the comments above the
1742 * flag remains stable across this function. See the comments 1734 * flag definition for details.
1743 * above the flag definition for details.
1744 *
1745 * As an unbound worker may later become a regular one if CPU comes
1746 * online, make sure every worker has %PF_THREAD_BOUND set.
1747 */ 1735 */
1748 if (!(pool->flags & POOL_DISASSOCIATED)) { 1736 if (pool->flags & POOL_DISASSOCIATED)
1749 kthread_bind(worker->task, pool->cpu);
1750 } else {
1751 worker->task->flags |= PF_THREAD_BOUND;
1752 worker->flags |= WORKER_UNBOUND; 1737 worker->flags |= WORKER_UNBOUND;
1753 } 1738
1739 /* successful, commit the pointer to idr */
1740 spin_lock_irq(&pool->lock);
1741 idr_replace(&pool->worker_idr, worker, worker->id);
1742 spin_unlock_irq(&pool->lock);
1754 1743
1755 return worker; 1744 return worker;
1745
1756fail: 1746fail:
1757 if (id >= 0) { 1747 if (id >= 0) {
1758 spin_lock_irq(&pool->lock); 1748 spin_lock_irq(&pool->lock);
1759 ida_remove(&pool->worker_ida, id); 1749 idr_remove(&pool->worker_idr, id);
1760 spin_unlock_irq(&pool->lock); 1750 spin_unlock_irq(&pool->lock);
1761 } 1751 }
1762 kfree(worker); 1752 kfree(worker);
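
The two-step ID handling above is the usual idr reserve-then-publish pattern: reserve the ID with a NULL pointer so the kthread can be named and created without holding pool->lock, then install the fully initialised worker with idr_replace(). In outline (same calls as the hunk above, condensed):

	idr_preload(GFP_KERNEL);		/* may sleep; done before locking */
	spin_lock_irq(&pool->lock);
	id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
	spin_unlock_irq(&pool->lock);
	idr_preload_end();

	/* ... allocate the worker, create its kthread, set nice and cpumask ... */

	spin_lock_irq(&pool->lock);
	idr_replace(&pool->worker_idr, worker, id);	/* publish only when ready */
	spin_unlock_irq(&pool->lock);

As the comment in the hunk notes, the ID has to exist this early because it is baked into the kworker thread name.
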
@@ -1781,6 +1771,30 @@ static void start_worker(struct worker *worker)
1781} 1771}
1782 1772
1783/** 1773/**
1774 * create_and_start_worker - create and start a worker for a pool
1775 * @pool: the target pool
1776 *
1777 * Grab the managership of @pool and create and start a new worker for it.
1778 */
1779static int create_and_start_worker(struct worker_pool *pool)
1780{
1781 struct worker *worker;
1782
1783 mutex_lock(&pool->manager_mutex);
1784
1785 worker = create_worker(pool);
1786 if (worker) {
1787 spin_lock_irq(&pool->lock);
1788 start_worker(worker);
1789 spin_unlock_irq(&pool->lock);
1790 }
1791
1792 mutex_unlock(&pool->manager_mutex);
1793
1794 return worker ? 0 : -ENOMEM;
1795}
1796
1797/**
1784 * destroy_worker - destroy a workqueue worker 1798 * destroy_worker - destroy a workqueue worker
1785 * @worker: worker to be destroyed 1799 * @worker: worker to be destroyed
1786 * 1800 *
@@ -1792,11 +1806,14 @@ static void start_worker(struct worker *worker)
1792static void destroy_worker(struct worker *worker) 1806static void destroy_worker(struct worker *worker)
1793{ 1807{
1794 struct worker_pool *pool = worker->pool; 1808 struct worker_pool *pool = worker->pool;
1795 int id = worker->id; 1809
1810 lockdep_assert_held(&pool->manager_mutex);
1811 lockdep_assert_held(&pool->lock);
1796 1812
1797 /* sanity check frenzy */ 1813 /* sanity check frenzy */
1798 BUG_ON(worker->current_work); 1814 if (WARN_ON(worker->current_work) ||
1799 BUG_ON(!list_empty(&worker->scheduled)); 1815 WARN_ON(!list_empty(&worker->scheduled)))
1816 return;
1800 1817
1801 if (worker->flags & WORKER_STARTED) 1818 if (worker->flags & WORKER_STARTED)
1802 pool->nr_workers--; 1819 pool->nr_workers--;
@@ -1806,13 +1823,14 @@ static void destroy_worker(struct worker *worker)
1806 list_del_init(&worker->entry); 1823 list_del_init(&worker->entry);
1807 worker->flags |= WORKER_DIE; 1824 worker->flags |= WORKER_DIE;
1808 1825
1826 idr_remove(&pool->worker_idr, worker->id);
1827
1809 spin_unlock_irq(&pool->lock); 1828 spin_unlock_irq(&pool->lock);
1810 1829
1811 kthread_stop(worker->task); 1830 kthread_stop(worker->task);
1812 kfree(worker); 1831 kfree(worker);
1813 1832
1814 spin_lock_irq(&pool->lock); 1833 spin_lock_irq(&pool->lock);
1815 ida_remove(&pool->worker_ida, id);
1816} 1834}
1817 1835
1818static void idle_worker_timeout(unsigned long __pool) 1836static void idle_worker_timeout(unsigned long __pool)
@@ -1841,23 +1859,21 @@ static void idle_worker_timeout(unsigned long __pool)
1841 spin_unlock_irq(&pool->lock); 1859 spin_unlock_irq(&pool->lock);
1842} 1860}
1843 1861
1844static bool send_mayday(struct work_struct *work) 1862static void send_mayday(struct work_struct *work)
1845{ 1863{
1846 struct pool_workqueue *pwq = get_work_pwq(work); 1864 struct pool_workqueue *pwq = get_work_pwq(work);
1847 struct workqueue_struct *wq = pwq->wq; 1865 struct workqueue_struct *wq = pwq->wq;
1848 unsigned int cpu;
1849 1866
1850 if (!(wq->flags & WQ_RESCUER)) 1867 lockdep_assert_held(&wq_mayday_lock);
1851 return false; 1868
1869 if (!wq->rescuer)
1870 return;
1852 1871
1853 /* mayday mayday mayday */ 1872 /* mayday mayday mayday */
1854 cpu = pwq->pool->cpu; 1873 if (list_empty(&pwq->mayday_node)) {
1855 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1874 list_add_tail(&pwq->mayday_node, &wq->maydays);
1856 if (cpu == WORK_CPU_UNBOUND)
1857 cpu = 0;
1858 if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1859 wake_up_process(wq->rescuer->task); 1875 wake_up_process(wq->rescuer->task);
1860 return true; 1876 }
1861} 1877}
1862 1878
1863static void pool_mayday_timeout(unsigned long __pool) 1879static void pool_mayday_timeout(unsigned long __pool)
@@ -1865,7 +1881,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1865 struct worker_pool *pool = (void *)__pool; 1881 struct worker_pool *pool = (void *)__pool;
1866 struct work_struct *work; 1882 struct work_struct *work;
1867 1883
1868 spin_lock_irq(&pool->lock); 1884 spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */
1885 spin_lock(&pool->lock);
1869 1886
1870 if (need_to_create_worker(pool)) { 1887 if (need_to_create_worker(pool)) {
1871 /* 1888 /*
@@ -1878,7 +1895,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1878 send_mayday(work); 1895 send_mayday(work);
1879 } 1896 }
1880 1897
1881 spin_unlock_irq(&pool->lock); 1898 spin_unlock(&pool->lock);
1899 spin_unlock_irq(&wq_mayday_lock);
1882 1900
1883 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1901 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1884} 1902}
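
Note the lock nesting the list-based mayday path establishes: wq_mayday_lock is the outer lock (taken with IRQs disabled) and pool->lock nests inside it, which is why send_mayday() can simply assert wq_mayday_lock and append the pwq to wq->maydays at most once. Schematically, as pool_mayday_timeout() does above:

	spin_lock_irq(&wq_mayday_lock);		/* outer: protects wq->maydays */
	spin_lock(&pool->lock);			/* inner: protects the pool's worklist */
	/* send_mayday() runs here for each pending work item */
	spin_unlock(&pool->lock);
	spin_unlock_irq(&wq_mayday_lock);
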
@@ -1893,8 +1911,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1893 * sent to all rescuers with works scheduled on @pool to resolve 1911 * sent to all rescuers with works scheduled on @pool to resolve
1894 * possible allocation deadlock. 1912 * possible allocation deadlock.
1895 * 1913 *
1896 * On return, need_to_create_worker() is guaranteed to be false and 1914 * On return, need_to_create_worker() is guaranteed to be %false and
1897 * may_start_working() true. 1915 * may_start_working() %true.
1898 * 1916 *
1899 * LOCKING: 1917 * LOCKING:
1900 * spin_lock_irq(pool->lock) which may be released and regrabbed 1918 * spin_lock_irq(pool->lock) which may be released and regrabbed
@@ -1902,7 +1920,7 @@ static void pool_mayday_timeout(unsigned long __pool)
1902 * manager. 1920 * manager.
1903 * 1921 *
1904 * RETURNS: 1922 * RETURNS:
1905 * false if no action was taken and pool->lock stayed locked, true 1923 * %false if no action was taken and pool->lock stayed locked, %true
1906 * otherwise. 1924 * otherwise.
1907 */ 1925 */
1908static bool maybe_create_worker(struct worker_pool *pool) 1926static bool maybe_create_worker(struct worker_pool *pool)
@@ -1925,7 +1943,8 @@ restart:
1925 del_timer_sync(&pool->mayday_timer); 1943 del_timer_sync(&pool->mayday_timer);
1926 spin_lock_irq(&pool->lock); 1944 spin_lock_irq(&pool->lock);
1927 start_worker(worker); 1945 start_worker(worker);
1928 BUG_ON(need_to_create_worker(pool)); 1946 if (WARN_ON_ONCE(need_to_create_worker(pool)))
1947 goto restart;
1929 return true; 1948 return true;
1930 } 1949 }
1931 1950
@@ -1958,7 +1977,7 @@ restart:
1958 * multiple times. Called only from manager. 1977 * multiple times. Called only from manager.
1959 * 1978 *
1960 * RETURNS: 1979 * RETURNS:
1961 * false if no action was taken and pool->lock stayed locked, true 1980 * %false if no action was taken and pool->lock stayed locked, %true
1962 * otherwise. 1981 * otherwise.
1963 */ 1982 */
1964static bool maybe_destroy_workers(struct worker_pool *pool) 1983static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2009,42 +2028,38 @@ static bool manage_workers(struct worker *worker)
2009 struct worker_pool *pool = worker->pool; 2028 struct worker_pool *pool = worker->pool;
2010 bool ret = false; 2029 bool ret = false;
2011 2030
2012 if (pool->flags & POOL_MANAGING_WORKERS) 2031 /*
2032 * Managership is governed by two mutexes - manager_arb and
2033 * manager_mutex. manager_arb handles arbitration of manager role.
2034 * Anyone who successfully grabs manager_arb wins the arbitration
2035 * and becomes the manager. mutex_trylock() on pool->manager_arb
2036 * failure while holding pool->lock reliably indicates that someone
2037 * else is managing the pool and the worker which failed trylock
2038 * can proceed to executing work items. This means that anyone
2039 * grabbing manager_arb is responsible for actually performing
2040 * manager duties. If manager_arb is grabbed and released without
2041 * actual management, the pool may stall indefinitely.
2042 *
2043 * manager_mutex is used for exclusion of actual management
2044 * operations. The holder of manager_mutex can be sure that none
2045 * of management operations, including creation and destruction of
2046 * workers, won't take place until the mutex is released. Because
2047 * manager_mutex doesn't interfere with manager role arbitration,
2048 * it is guaranteed that the pool's management, while may be
2049 * delayed, won't be disturbed by someone else grabbing
2050 * manager_mutex.
2051 */
2052 if (!mutex_trylock(&pool->manager_arb))
2013 return ret; 2053 return ret;
2014 2054
2015 pool->flags |= POOL_MANAGING_WORKERS;
2016
2017 /* 2055 /*
2018 * To simplify both worker management and CPU hotplug, hold off 2056 * With manager arbitration won, manager_mutex would be free in
2019 * management while hotplug is in progress. CPU hotplug path can't 2057 * most cases. trylock first without dropping @pool->lock.
2020 * grab %POOL_MANAGING_WORKERS to achieve this because that can
2021 * lead to idle worker depletion (all become busy thinking someone
2022 * else is managing) which in turn can result in deadlock under
2023 * extreme circumstances. Use @pool->assoc_mutex to synchronize
2024 * manager against CPU hotplug.
2025 *
2026 * assoc_mutex would always be free unless CPU hotplug is in
2027 * progress. trylock first without dropping @pool->lock.
2028 */ 2058 */
2029 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { 2059 if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
2030 spin_unlock_irq(&pool->lock); 2060 spin_unlock_irq(&pool->lock);
2031 mutex_lock(&pool->assoc_mutex); 2061 mutex_lock(&pool->manager_mutex);
2032 /* 2062 spin_lock_irq(&pool->lock);
2033 * CPU hotplug could have happened while we were waiting
2034 * for assoc_mutex. Hotplug itself can't handle us
2035 * because manager isn't either on idle or busy list, and
2036 * @pool's state and ours could have deviated.
2037 *
2038 * As hotplug is now excluded via assoc_mutex, we can
2039 * simply try to bind. It will succeed or fail depending
2040 * on @pool's current state. Try it and adjust
2041 * %WORKER_UNBOUND accordingly.
2042 */
2043 if (worker_maybe_bind_and_lock(worker))
2044 worker->flags &= ~WORKER_UNBOUND;
2045 else
2046 worker->flags |= WORKER_UNBOUND;
2047
2048 ret = true; 2063 ret = true;
2049 } 2064 }
2050 2065
@@ -2057,8 +2072,8 @@ static bool manage_workers(struct worker *worker)
2057 ret |= maybe_destroy_workers(pool); 2072 ret |= maybe_destroy_workers(pool);
2058 ret |= maybe_create_worker(pool); 2073 ret |= maybe_create_worker(pool);
2059 2074
2060 pool->flags &= ~POOL_MANAGING_WORKERS; 2075 mutex_unlock(&pool->manager_mutex);
2061 mutex_unlock(&pool->assoc_mutex); 2076 mutex_unlock(&pool->manager_arb);
2062 return ret; 2077 return ret;
2063} 2078}
2064 2079
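
The long comment above describes a two-level scheme: manager_arb decides who the manager is, manager_mutex excludes concurrent management operations. Its control flow, reduced to a skeleton with the same locks as in the hunk and elided bodies marked:

	if (!mutex_trylock(&pool->manager_arb))
		return ret;			/* somebody else is the manager */

	if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
		/* excluded (e.g. by CPU hotplug); wait without holding pool->lock */
		spin_unlock_irq(&pool->lock);
		mutex_lock(&pool->manager_mutex);
		spin_lock_irq(&pool->lock);
		ret = true;
	}

	/* ... maybe_destroy_workers() / maybe_create_worker() ... */

	mutex_unlock(&pool->manager_mutex);
	mutex_unlock(&pool->manager_arb);
	return ret;
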
@@ -2184,6 +2199,7 @@ __acquires(&pool->lock)
2184 worker->current_work = NULL; 2199 worker->current_work = NULL;
2185 worker->current_func = NULL; 2200 worker->current_func = NULL;
2186 worker->current_pwq = NULL; 2201 worker->current_pwq = NULL;
2202 worker->desc_valid = false;
2187 pwq_dec_nr_in_flight(pwq, work_color); 2203 pwq_dec_nr_in_flight(pwq, work_color);
2188} 2204}
2189 2205
@@ -2212,11 +2228,11 @@ static void process_scheduled_works(struct worker *worker)
2212 * worker_thread - the worker thread function 2228 * worker_thread - the worker thread function
2213 * @__worker: self 2229 * @__worker: self
2214 * 2230 *
2215 * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools 2231 * The worker thread function. All workers belong to a worker_pool -
2216 * of these per each cpu. These workers process all works regardless of 2232 * either a per-cpu one or dynamic unbound one. These workers process all
2217 * their specific target workqueue. The only exception is works which 2233 * work items regardless of their specific target workqueue. The only
2218 * belong to workqueues with a rescuer which will be explained in 2234 * exception is work items which belong to workqueues with a rescuer which
2219 * rescuer_thread(). 2235 * will be explained in rescuer_thread().
2220 */ 2236 */
2221static int worker_thread(void *__worker) 2237static int worker_thread(void *__worker)
2222{ 2238{
@@ -2228,19 +2244,12 @@ static int worker_thread(void *__worker)
2228woke_up: 2244woke_up:
2229 spin_lock_irq(&pool->lock); 2245 spin_lock_irq(&pool->lock);
2230 2246
2231 /* we are off idle list if destruction or rebind is requested */ 2247 /* am I supposed to die? */
2232 if (unlikely(list_empty(&worker->entry))) { 2248 if (unlikely(worker->flags & WORKER_DIE)) {
2233 spin_unlock_irq(&pool->lock); 2249 spin_unlock_irq(&pool->lock);
2234 2250 WARN_ON_ONCE(!list_empty(&worker->entry));
2235 /* if DIE is set, destruction is requested */ 2251 worker->task->flags &= ~PF_WQ_WORKER;
2236 if (worker->flags & WORKER_DIE) { 2252 return 0;
2237 worker->task->flags &= ~PF_WQ_WORKER;
2238 return 0;
2239 }
2240
2241 /* otherwise, rebind */
2242 idle_worker_rebind(worker);
2243 goto woke_up;
2244 } 2253 }
2245 2254
2246 worker_leave_idle(worker); 2255 worker_leave_idle(worker);
@@ -2258,14 +2267,16 @@ recheck:
2258 * preparing to process a work or actually processing it. 2267 * preparing to process a work or actually processing it.
2259 * Make sure nobody diddled with it while I was sleeping. 2268 * Make sure nobody diddled with it while I was sleeping.
2260 */ 2269 */
2261 BUG_ON(!list_empty(&worker->scheduled)); 2270 WARN_ON_ONCE(!list_empty(&worker->scheduled));
2262 2271
2263 /* 2272 /*
2264 * When control reaches this point, we're guaranteed to have 2273 * Finish PREP stage. We're guaranteed to have at least one idle
2265 * at least one idle worker or that someone else has already 2274 * worker or that someone else has already assumed the manager
2266 * assumed the manager role. 2275 * role. This is where @worker starts participating in concurrency
2276 * management if applicable and concurrency management is restored
2277 * after being rebound. See rebind_workers() for details.
2267 */ 2278 */
2268 worker_clr_flags(worker, WORKER_PREP); 2279 worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
2269 2280
2270 do { 2281 do {
2271 struct work_struct *work = 2282 struct work_struct *work =
@@ -2307,7 +2318,7 @@ sleep:
2307 * @__rescuer: self 2318 * @__rescuer: self
2308 * 2319 *
2309 * Workqueue rescuer thread function. There's one rescuer for each 2320 * Workqueue rescuer thread function. There's one rescuer for each
2310 * workqueue which has WQ_RESCUER set. 2321 * workqueue which has WQ_MEM_RECLAIM set.
2311 * 2322 *
2312 * Regular work processing on a pool may block trying to create a new 2323 * Regular work processing on a pool may block trying to create a new
2313 * worker which uses GFP_KERNEL allocation which has slight chance of 2324 * worker which uses GFP_KERNEL allocation which has slight chance of
@@ -2326,8 +2337,6 @@ static int rescuer_thread(void *__rescuer)
2326 struct worker *rescuer = __rescuer; 2337 struct worker *rescuer = __rescuer;
2327 struct workqueue_struct *wq = rescuer->rescue_wq; 2338 struct workqueue_struct *wq = rescuer->rescue_wq;
2328 struct list_head *scheduled = &rescuer->scheduled; 2339 struct list_head *scheduled = &rescuer->scheduled;
2329 bool is_unbound = wq->flags & WQ_UNBOUND;
2330 unsigned int cpu;
2331 2340
2332 set_user_nice(current, RESCUER_NICE_LEVEL); 2341 set_user_nice(current, RESCUER_NICE_LEVEL);
2333 2342
@@ -2345,28 +2354,29 @@ repeat:
2345 return 0; 2354 return 0;
2346 } 2355 }
2347 2356
2348 /* 2357 /* see whether any pwq is asking for help */
2349 * See whether any cpu is asking for help. Unbounded 2358 spin_lock_irq(&wq_mayday_lock);
2350 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. 2359
2351 */ 2360 while (!list_empty(&wq->maydays)) {
2352 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2361 struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
2353 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2362 struct pool_workqueue, mayday_node);
2354 struct pool_workqueue *pwq = get_pwq(tcpu, wq);
2355 struct worker_pool *pool = pwq->pool; 2363 struct worker_pool *pool = pwq->pool;
2356 struct work_struct *work, *n; 2364 struct work_struct *work, *n;
2357 2365
2358 __set_current_state(TASK_RUNNING); 2366 __set_current_state(TASK_RUNNING);
2359 mayday_clear_cpu(cpu, wq->mayday_mask); 2367 list_del_init(&pwq->mayday_node);
2368
2369 spin_unlock_irq(&wq_mayday_lock);
2360 2370
2361 /* migrate to the target cpu if possible */ 2371 /* migrate to the target cpu if possible */
2372 worker_maybe_bind_and_lock(pool);
2362 rescuer->pool = pool; 2373 rescuer->pool = pool;
2363 worker_maybe_bind_and_lock(rescuer);
2364 2374
2365 /* 2375 /*
2366 * Slurp in all works issued via this workqueue and 2376 * Slurp in all works issued via this workqueue and
2367 * process'em. 2377 * process'em.
2368 */ 2378 */
2369 BUG_ON(!list_empty(&rescuer->scheduled)); 2379 WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
2370 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2380 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2371 if (get_work_pwq(work) == pwq) 2381 if (get_work_pwq(work) == pwq)
2372 move_linked_works(work, scheduled, &n); 2382 move_linked_works(work, scheduled, &n);
@@ -2381,9 +2391,13 @@ repeat:
2381 if (keep_working(pool)) 2391 if (keep_working(pool))
2382 wake_up_worker(pool); 2392 wake_up_worker(pool);
2383 2393
2384 spin_unlock_irq(&pool->lock); 2394 rescuer->pool = NULL;
2395 spin_unlock(&pool->lock);
2396 spin_lock(&wq_mayday_lock);
2385 } 2397 }
2386 2398
2399 spin_unlock_irq(&wq_mayday_lock);
2400
2387 /* rescuers should never participate in concurrency management */ 2401 /* rescuers should never participate in concurrency management */
2388 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); 2402 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2389 schedule(); 2403 schedule();
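/*
 * Illustrative sketch, not part of the diff above: a driver whose work items
 * sit in the memory-reclaim path creates its workqueue with WQ_MEM_RECLAIM so
 * that the rescuer thread shown here exists to service it when new workers
 * cannot be created under memory pressure.  All "foo" names are hypothetical.
 */
#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *foo_wb_wq;

static void foo_writeback_fn(struct work_struct *work)
{
	/* pushes dirty data out; may run while the system is reclaiming memory */
}
static DECLARE_WORK(foo_writeback_work, foo_writeback_fn);

static int foo_wb_init(void)
{
	foo_wb_wq = alloc_workqueue("foo_wb", WQ_MEM_RECLAIM, 0);
	if (!foo_wb_wq)
		return -ENOMEM;
	queue_work(foo_wb_wq, &foo_writeback_work);
	return 0;
}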
@@ -2487,7 +2501,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
2487 * advanced to @work_color. 2501 * advanced to @work_color.
2488 * 2502 *
2489 * CONTEXT: 2503 * CONTEXT:
2490 * mutex_lock(wq->flush_mutex). 2504 * mutex_lock(wq->mutex).
2491 * 2505 *
2492 * RETURNS: 2506 * RETURNS:
2493 * %true if @flush_color >= 0 and there's something to flush. %false 2507 * %true if @flush_color >= 0 and there's something to flush. %false
@@ -2497,21 +2511,20 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2497 int flush_color, int work_color) 2511 int flush_color, int work_color)
2498{ 2512{
2499 bool wait = false; 2513 bool wait = false;
2500 unsigned int cpu; 2514 struct pool_workqueue *pwq;
2501 2515
2502 if (flush_color >= 0) { 2516 if (flush_color >= 0) {
2503 BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); 2517 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
2504 atomic_set(&wq->nr_pwqs_to_flush, 1); 2518 atomic_set(&wq->nr_pwqs_to_flush, 1);
2505 } 2519 }
2506 2520
2507 for_each_pwq_cpu(cpu, wq) { 2521 for_each_pwq(pwq, wq) {
2508 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2509 struct worker_pool *pool = pwq->pool; 2522 struct worker_pool *pool = pwq->pool;
2510 2523
2511 spin_lock_irq(&pool->lock); 2524 spin_lock_irq(&pool->lock);
2512 2525
2513 if (flush_color >= 0) { 2526 if (flush_color >= 0) {
2514 BUG_ON(pwq->flush_color != -1); 2527 WARN_ON_ONCE(pwq->flush_color != -1);
2515 2528
2516 if (pwq->nr_in_flight[flush_color]) { 2529 if (pwq->nr_in_flight[flush_color]) {
2517 pwq->flush_color = flush_color; 2530 pwq->flush_color = flush_color;
@@ -2521,7 +2534,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2521 } 2534 }
2522 2535
2523 if (work_color >= 0) { 2536 if (work_color >= 0) {
2524 BUG_ON(work_color != work_next_color(pwq->work_color)); 2537 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
2525 pwq->work_color = work_color; 2538 pwq->work_color = work_color;
2526 } 2539 }
2527 2540
@@ -2538,11 +2551,8 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2538 * flush_workqueue - ensure that any scheduled work has run to completion. 2551 * flush_workqueue - ensure that any scheduled work has run to completion.
2539 * @wq: workqueue to flush 2552 * @wq: workqueue to flush
2540 * 2553 *
2541 * Forces execution of the workqueue and blocks until its completion. 2554 * This function sleeps until all work items which were queued on entry
2542 * This is typically used in driver shutdown handlers. 2555 * have finished execution, but it is not livelocked by new incoming ones.
2543 *
2544 * We sleep until all works which were queued on entry have been handled,
2545 * but we are not livelocked by new incoming ones.
2546 */ 2556 */
2547void flush_workqueue(struct workqueue_struct *wq) 2557void flush_workqueue(struct workqueue_struct *wq)
2548{ 2558{
@@ -2556,7 +2566,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2556 lock_map_acquire(&wq->lockdep_map); 2566 lock_map_acquire(&wq->lockdep_map);
2557 lock_map_release(&wq->lockdep_map); 2567 lock_map_release(&wq->lockdep_map);
2558 2568
2559 mutex_lock(&wq->flush_mutex); 2569 mutex_lock(&wq->mutex);
2560 2570
2561 /* 2571 /*
2562 * Start-to-wait phase 2572 * Start-to-wait phase
@@ -2569,13 +2579,13 @@ void flush_workqueue(struct workqueue_struct *wq)
2569 * becomes our flush_color and work_color is advanced 2579 * becomes our flush_color and work_color is advanced
2570 * by one. 2580 * by one.
2571 */ 2581 */
2572 BUG_ON(!list_empty(&wq->flusher_overflow)); 2582 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
2573 this_flusher.flush_color = wq->work_color; 2583 this_flusher.flush_color = wq->work_color;
2574 wq->work_color = next_color; 2584 wq->work_color = next_color;
2575 2585
2576 if (!wq->first_flusher) { 2586 if (!wq->first_flusher) {
2577 /* no flush in progress, become the first flusher */ 2587 /* no flush in progress, become the first flusher */
2578 BUG_ON(wq->flush_color != this_flusher.flush_color); 2588 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2579 2589
2580 wq->first_flusher = &this_flusher; 2590 wq->first_flusher = &this_flusher;
2581 2591
@@ -2588,7 +2598,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2588 } 2598 }
2589 } else { 2599 } else {
2590 /* wait in queue */ 2600 /* wait in queue */
2591 BUG_ON(wq->flush_color == this_flusher.flush_color); 2601 WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
2592 list_add_tail(&this_flusher.list, &wq->flusher_queue); 2602 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2593 flush_workqueue_prep_pwqs(wq, -1, wq->work_color); 2603 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2594 } 2604 }
@@ -2601,7 +2611,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2601 list_add_tail(&this_flusher.list, &wq->flusher_overflow); 2611 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2602 } 2612 }
2603 2613
2604 mutex_unlock(&wq->flush_mutex); 2614 mutex_unlock(&wq->mutex);
2605 2615
2606 wait_for_completion(&this_flusher.done); 2616 wait_for_completion(&this_flusher.done);
2607 2617
@@ -2614,7 +2624,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2614 if (wq->first_flusher != &this_flusher) 2624 if (wq->first_flusher != &this_flusher)
2615 return; 2625 return;
2616 2626
2617 mutex_lock(&wq->flush_mutex); 2627 mutex_lock(&wq->mutex);
2618 2628
2619 /* we might have raced, check again with mutex held */ 2629 /* we might have raced, check again with mutex held */
2620 if (wq->first_flusher != &this_flusher) 2630 if (wq->first_flusher != &this_flusher)
@@ -2622,8 +2632,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2622 2632
2623 wq->first_flusher = NULL; 2633 wq->first_flusher = NULL;
2624 2634
2625 BUG_ON(!list_empty(&this_flusher.list)); 2635 WARN_ON_ONCE(!list_empty(&this_flusher.list));
2626 BUG_ON(wq->flush_color != this_flusher.flush_color); 2636 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2627 2637
2628 while (true) { 2638 while (true) {
2629 struct wq_flusher *next, *tmp; 2639 struct wq_flusher *next, *tmp;
@@ -2636,8 +2646,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2636 complete(&next->done); 2646 complete(&next->done);
2637 } 2647 }
2638 2648
2639 BUG_ON(!list_empty(&wq->flusher_overflow) && 2649 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
2640 wq->flush_color != work_next_color(wq->work_color)); 2650 wq->flush_color != work_next_color(wq->work_color));
2641 2651
2642 /* this flush_color is finished, advance by one */ 2652 /* this flush_color is finished, advance by one */
2643 wq->flush_color = work_next_color(wq->flush_color); 2653 wq->flush_color = work_next_color(wq->flush_color);
@@ -2661,7 +2671,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2661 } 2671 }
2662 2672
2663 if (list_empty(&wq->flusher_queue)) { 2673 if (list_empty(&wq->flusher_queue)) {
2664 BUG_ON(wq->flush_color != wq->work_color); 2674 WARN_ON_ONCE(wq->flush_color != wq->work_color);
2665 break; 2675 break;
2666 } 2676 }
2667 2677
@@ -2669,8 +2679,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2669 * Need to flush more colors. Make the next flusher 2679 * Need to flush more colors. Make the next flusher
2670 * the new first flusher and arm pwqs. 2680 * the new first flusher and arm pwqs.
2671 */ 2681 */
2672 BUG_ON(wq->flush_color == wq->work_color); 2682 WARN_ON_ONCE(wq->flush_color == wq->work_color);
2673 BUG_ON(wq->flush_color != next->flush_color); 2683 WARN_ON_ONCE(wq->flush_color != next->flush_color);
2674 2684
2675 list_del_init(&next->list); 2685 list_del_init(&next->list);
2676 wq->first_flusher = next; 2686 wq->first_flusher = next;
@@ -2686,7 +2696,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2686 } 2696 }
2687 2697
2688out_unlock: 2698out_unlock:
2689 mutex_unlock(&wq->flush_mutex); 2699 mutex_unlock(&wq->mutex);
2690} 2700}
2691EXPORT_SYMBOL_GPL(flush_workqueue); 2701EXPORT_SYMBOL_GPL(flush_workqueue);
2692 2702
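/*
 * Illustrative sketch (hypothetical "foo" names, not from the patch): the
 * usual reason to call flush_workqueue() as documented above is to quiesce a
 * workqueue before tearing down state its work items reference.
 */
static void foo_quiesce(struct workqueue_struct *foo_wq)
{
	/*
	 * Every work item queued on foo_wq before this call has finished by
	 * the time it returns; items queued afterwards are not waited for,
	 * which is what keeps the flusher from being livelocked.
	 */
	flush_workqueue(foo_wq);
}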
@@ -2704,22 +2714,23 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
2704void drain_workqueue(struct workqueue_struct *wq) 2714void drain_workqueue(struct workqueue_struct *wq)
2705{ 2715{
2706 unsigned int flush_cnt = 0; 2716 unsigned int flush_cnt = 0;
2707 unsigned int cpu; 2717 struct pool_workqueue *pwq;
2708 2718
2709 /* 2719 /*
2710 * __queue_work() needs to test whether there are drainers, is much 2720 * __queue_work() needs to test whether there are drainers, is much
2711 * hotter than drain_workqueue() and already looks at @wq->flags. 2721 * hotter than drain_workqueue() and already looks at @wq->flags.
2712 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. 2722 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
2713 */ 2723 */
2714 spin_lock(&workqueue_lock); 2724 mutex_lock(&wq->mutex);
2715 if (!wq->nr_drainers++) 2725 if (!wq->nr_drainers++)
2716 wq->flags |= WQ_DRAINING; 2726 wq->flags |= __WQ_DRAINING;
2717 spin_unlock(&workqueue_lock); 2727 mutex_unlock(&wq->mutex);
2718reflush: 2728reflush:
2719 flush_workqueue(wq); 2729 flush_workqueue(wq);
2720 2730
2721 for_each_pwq_cpu(cpu, wq) { 2731 mutex_lock(&wq->mutex);
2722 struct pool_workqueue *pwq = get_pwq(cpu, wq); 2732
2733 for_each_pwq(pwq, wq) {
2723 bool drained; 2734 bool drained;
2724 2735
2725 spin_lock_irq(&pwq->pool->lock); 2736 spin_lock_irq(&pwq->pool->lock);
@@ -2731,15 +2742,16 @@ reflush:
2731 2742
2732 if (++flush_cnt == 10 || 2743 if (++flush_cnt == 10 ||
2733 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 2744 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2734 pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", 2745 pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
2735 wq->name, flush_cnt); 2746 wq->name, flush_cnt);
2747
2748 mutex_unlock(&wq->mutex);
2736 goto reflush; 2749 goto reflush;
2737 } 2750 }
2738 2751
2739 spin_lock(&workqueue_lock);
2740 if (!--wq->nr_drainers) 2752 if (!--wq->nr_drainers)
2741 wq->flags &= ~WQ_DRAINING; 2753 wq->flags &= ~__WQ_DRAINING;
2742 spin_unlock(&workqueue_lock); 2754 mutex_unlock(&wq->mutex);
2743} 2755}
2744EXPORT_SYMBOL_GPL(drain_workqueue); 2756EXPORT_SYMBOL_GPL(drain_workqueue);
2745 2757
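/*
 * Illustrative sketch (hypothetical names): drain_workqueue() differs from
 * flush_workqueue() in that it keeps reflushing until the workqueue is truly
 * empty, so it also waits out work items that requeue themselves.
 */
static void foo_drain(struct workqueue_struct *foo_wq)
{
	/*
	 * While __WQ_DRAINING is set, only chained queueing from a work item
	 * already running on foo_wq is allowed; queueing from anywhere else
	 * would trip the warning in __queue_work().
	 */
	drain_workqueue(foo_wq);
}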
@@ -2750,11 +2762,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2750 struct pool_workqueue *pwq; 2762 struct pool_workqueue *pwq;
2751 2763
2752 might_sleep(); 2764 might_sleep();
2765
2766 local_irq_disable();
2753 pool = get_work_pool(work); 2767 pool = get_work_pool(work);
2754 if (!pool) 2768 if (!pool) {
2769 local_irq_enable();
2755 return false; 2770 return false;
2771 }
2756 2772
2757 spin_lock_irq(&pool->lock); 2773 spin_lock(&pool->lock);
2758 /* see the comment in try_to_grab_pending() with the same code */ 2774 /* see the comment in try_to_grab_pending() with the same code */
2759 pwq = get_work_pwq(work); 2775 pwq = get_work_pwq(work);
2760 if (pwq) { 2776 if (pwq) {
@@ -2776,7 +2792,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2776 * flusher is not running on the same workqueue by verifying write 2792 * flusher is not running on the same workqueue by verifying write
2777 * access. 2793 * access.
2778 */ 2794 */
2779 if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) 2795 if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
2780 lock_map_acquire(&pwq->wq->lockdep_map); 2796 lock_map_acquire(&pwq->wq->lockdep_map);
2781 else 2797 else
2782 lock_map_acquire_read(&pwq->wq->lockdep_map); 2798 lock_map_acquire_read(&pwq->wq->lockdep_map);
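/*
 * Illustrative sketch of the dependency the lockdep annotation above records
 * (hypothetical names, an anti-pattern rather than something to copy): on a
 * workqueue with max_active == 1 or a rescuer, flushing one of its own work
 * items from another item on the same workqueue can wait forever.
 */
static struct workqueue_struct *foo_wq;		/* alloc_workqueue("foo", WQ_MEM_RECLAIM, 1) */
static struct work_struct foo_work_a, foo_work_b;

static void foo_work_a_fn(struct work_struct *work)
{
	/*
	 * foo_work_b may be queued behind foo_work_a on foo_wq; the
	 * write-mode lock_map_acquire() above lets lockdep flag this.
	 */
	flush_work(&foo_work_b);
}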
@@ -2933,66 +2949,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
2933EXPORT_SYMBOL(cancel_delayed_work_sync); 2949EXPORT_SYMBOL(cancel_delayed_work_sync);
2934 2950
2935/** 2951/**
2936 * schedule_work_on - put work task on a specific cpu
2937 * @cpu: cpu to put the work task on
2938 * @work: job to be done
2939 *
2940 * This puts a job on a specific cpu
2941 */
2942bool schedule_work_on(int cpu, struct work_struct *work)
2943{
2944 return queue_work_on(cpu, system_wq, work);
2945}
2946EXPORT_SYMBOL(schedule_work_on);
2947
2948/**
2949 * schedule_work - put work task in global workqueue
2950 * @work: job to be done
2951 *
2952 * Returns %false if @work was already on the kernel-global workqueue and
2953 * %true otherwise.
2954 *
2955 * This puts a job in the kernel-global workqueue if it was not already
2956 * queued and leaves it in the same position on the kernel-global
2957 * workqueue otherwise.
2958 */
2959bool schedule_work(struct work_struct *work)
2960{
2961 return queue_work(system_wq, work);
2962}
2963EXPORT_SYMBOL(schedule_work);
2964
2965/**
2966 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2967 * @cpu: cpu to use
2968 * @dwork: job to be done
2969 * @delay: number of jiffies to wait
2970 *
2971 * After waiting for a given time this puts a job in the kernel-global
2972 * workqueue on the specified CPU.
2973 */
2974bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
2975 unsigned long delay)
2976{
2977 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
2978}
2979EXPORT_SYMBOL(schedule_delayed_work_on);
2980
2981/**
2982 * schedule_delayed_work - put work task in global workqueue after delay
2983 * @dwork: job to be done
2984 * @delay: number of jiffies to wait or 0 for immediate execution
2985 *
2986 * After waiting for a given time this puts a job in the kernel-global
2987 * workqueue.
2988 */
2989bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
2990{
2991 return queue_delayed_work(system_wq, dwork, delay);
2992}
2993EXPORT_SYMBOL(schedule_delayed_work);
2994
2995/**
2996 * schedule_on_each_cpu - execute a function synchronously on each online CPU 2952 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2997 * @func: the function to call 2953 * @func: the function to call
2998 * 2954 *
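/*
 * Illustrative sketch (hypothetical work items): the wrappers removed above
 * were one-line veneers over the system workqueue, so each call has a direct
 * queue_work()-family equivalent on system_wq.
 */
#include <linux/workqueue.h>

static void foo_fn(struct work_struct *work) { }
static DECLARE_WORK(foo_work, foo_fn);
static DECLARE_DELAYED_WORK(foo_dwork, foo_fn);

static void foo_kick(void)
{
	queue_work(system_wq, &foo_work);			/* schedule_work()               */
	queue_work_on(1, system_wq, &foo_work);			/* schedule_work_on(1, ...)      */
	queue_delayed_work(system_wq, &foo_dwork, HZ);		/* schedule_delayed_work(..., HZ) */
	queue_delayed_work_on(1, system_wq, &foo_dwork, HZ);	/* schedule_delayed_work_on()    */
}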
@@ -3085,51 +3041,1025 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
3085} 3041}
3086EXPORT_SYMBOL_GPL(execute_in_process_context); 3042EXPORT_SYMBOL_GPL(execute_in_process_context);
3087 3043
3088int keventd_up(void) 3044#ifdef CONFIG_SYSFS
3045/*
 3046 * Workqueues with the WQ_SYSFS flag set are visible to userland via
3047 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
3048 * following attributes.
3049 *
3050 * per_cpu RO bool : whether the workqueue is per-cpu or unbound
3051 * max_active RW int : maximum number of in-flight work items
3052 *
3053 * Unbound workqueues have the following extra attributes.
3054 *
 3055 * pool_ids RO int : the associated pool IDs, one per NUMA node
3056 * nice RW int : nice value of the workers
3057 * cpumask RW mask : bitmask of allowed CPUs for the workers
3058 */
3059struct wq_device {
3060 struct workqueue_struct *wq;
3061 struct device dev;
3062};
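/*
 * Illustrative sketch (hypothetical "foo" name): passing WQ_SYSFS at creation
 * time is what makes the attributes documented above appear under
 * /sys/bus/workqueue/devices/<name>/; unbound workqueues additionally get the
 * pool_ids, nice and cpumask files defined further below.
 */
static struct workqueue_struct *foo_wq;

static int foo_create_wq(void)
{
	foo_wq = alloc_workqueue("foo", WQ_UNBOUND | WQ_SYSFS, 0);
	if (!foo_wq)
		return -ENOMEM;
	/*
	 * Userland can now retune it, e.g.:
	 *   echo -5 > /sys/bus/workqueue/devices/foo/nice
	 *   echo f  > /sys/bus/workqueue/devices/foo/cpumask   (hex mask, CPUs 0-3)
	 */
	return 0;
}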
3063
3064static struct workqueue_struct *dev_to_wq(struct device *dev)
3065{
3066 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3067
3068 return wq_dev->wq;
3069}
3070
3071static ssize_t wq_per_cpu_show(struct device *dev,
3072 struct device_attribute *attr, char *buf)
3073{
3074 struct workqueue_struct *wq = dev_to_wq(dev);
3075
3076 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
3077}
3078
3079static ssize_t wq_max_active_show(struct device *dev,
3080 struct device_attribute *attr, char *buf)
3089{ 3081{
3090 return system_wq != NULL; 3082 struct workqueue_struct *wq = dev_to_wq(dev);
3083
3084 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
3091} 3085}
3092 3086
3093static int alloc_pwqs(struct workqueue_struct *wq) 3087static ssize_t wq_max_active_store(struct device *dev,
3088 struct device_attribute *attr,
3089 const char *buf, size_t count)
3094{ 3090{
3091 struct workqueue_struct *wq = dev_to_wq(dev);
3092 int val;
3093
3094 if (sscanf(buf, "%d", &val) != 1 || val <= 0)
3095 return -EINVAL;
3096
3097 workqueue_set_max_active(wq, val);
3098 return count;
3099}
3100
3101static struct device_attribute wq_sysfs_attrs[] = {
3102 __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL),
3103 __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store),
3104 __ATTR_NULL,
3105};
3106
3107static ssize_t wq_pool_ids_show(struct device *dev,
3108 struct device_attribute *attr, char *buf)
3109{
3110 struct workqueue_struct *wq = dev_to_wq(dev);
3111 const char *delim = "";
3112 int node, written = 0;
3113
3114 rcu_read_lock_sched();
3115 for_each_node(node) {
3116 written += scnprintf(buf + written, PAGE_SIZE - written,
3117 "%s%d:%d", delim, node,
3118 unbound_pwq_by_node(wq, node)->pool->id);
3119 delim = " ";
3120 }
3121 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3122 rcu_read_unlock_sched();
3123
3124 return written;
3125}
3126
3127static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
3128 char *buf)
3129{
3130 struct workqueue_struct *wq = dev_to_wq(dev);
3131 int written;
3132
3133 mutex_lock(&wq->mutex);
3134 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
3135 mutex_unlock(&wq->mutex);
3136
3137 return written;
3138}
3139
3140/* prepare workqueue_attrs for sysfs store operations */
3141static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
3142{
3143 struct workqueue_attrs *attrs;
3144
3145 attrs = alloc_workqueue_attrs(GFP_KERNEL);
3146 if (!attrs)
3147 return NULL;
3148
3149 mutex_lock(&wq->mutex);
3150 copy_workqueue_attrs(attrs, wq->unbound_attrs);
3151 mutex_unlock(&wq->mutex);
3152 return attrs;
3153}
3154
3155static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
3156 const char *buf, size_t count)
3157{
3158 struct workqueue_struct *wq = dev_to_wq(dev);
3159 struct workqueue_attrs *attrs;
3160 int ret;
3161
3162 attrs = wq_sysfs_prep_attrs(wq);
3163 if (!attrs)
3164 return -ENOMEM;
3165
3166 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
3167 attrs->nice >= -20 && attrs->nice <= 19)
3168 ret = apply_workqueue_attrs(wq, attrs);
3169 else
3170 ret = -EINVAL;
3171
3172 free_workqueue_attrs(attrs);
3173 return ret ?: count;
3174}
3175
3176static ssize_t wq_cpumask_show(struct device *dev,
3177 struct device_attribute *attr, char *buf)
3178{
3179 struct workqueue_struct *wq = dev_to_wq(dev);
3180 int written;
3181
3182 mutex_lock(&wq->mutex);
3183 written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask);
3184 mutex_unlock(&wq->mutex);
3185
3186 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3187 return written;
3188}
3189
3190static ssize_t wq_cpumask_store(struct device *dev,
3191 struct device_attribute *attr,
3192 const char *buf, size_t count)
3193{
3194 struct workqueue_struct *wq = dev_to_wq(dev);
3195 struct workqueue_attrs *attrs;
3196 int ret;
3197
3198 attrs = wq_sysfs_prep_attrs(wq);
3199 if (!attrs)
3200 return -ENOMEM;
3201
3202 ret = cpumask_parse(buf, attrs->cpumask);
3203 if (!ret)
3204 ret = apply_workqueue_attrs(wq, attrs);
3205
3206 free_workqueue_attrs(attrs);
3207 return ret ?: count;
3208}
3209
3210static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
3211 char *buf)
3212{
3213 struct workqueue_struct *wq = dev_to_wq(dev);
3214 int written;
3215
3216 mutex_lock(&wq->mutex);
3217 written = scnprintf(buf, PAGE_SIZE, "%d\n",
3218 !wq->unbound_attrs->no_numa);
3219 mutex_unlock(&wq->mutex);
3220
3221 return written;
3222}
3223
3224static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
3225 const char *buf, size_t count)
3226{
3227 struct workqueue_struct *wq = dev_to_wq(dev);
3228 struct workqueue_attrs *attrs;
3229 int v, ret;
3230
3231 attrs = wq_sysfs_prep_attrs(wq);
3232 if (!attrs)
3233 return -ENOMEM;
3234
3235 ret = -EINVAL;
3236 if (sscanf(buf, "%d", &v) == 1) {
3237 attrs->no_numa = !v;
3238 ret = apply_workqueue_attrs(wq, attrs);
3239 }
3240
3241 free_workqueue_attrs(attrs);
3242 return ret ?: count;
3243}
3244
3245static struct device_attribute wq_sysfs_unbound_attrs[] = {
3246 __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
3247 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
3248 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
3249 __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
3250 __ATTR_NULL,
3251};
3252
3253static struct bus_type wq_subsys = {
3254 .name = "workqueue",
3255 .dev_attrs = wq_sysfs_attrs,
3256};
3257
3258static int __init wq_sysfs_init(void)
3259{
3260 return subsys_virtual_register(&wq_subsys, NULL);
3261}
3262core_initcall(wq_sysfs_init);
3263
3264static void wq_device_release(struct device *dev)
3265{
3266 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3267
3268 kfree(wq_dev);
3269}
3270
3271/**
3272 * workqueue_sysfs_register - make a workqueue visible in sysfs
3273 * @wq: the workqueue to register
3274 *
3275 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
3276 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
3277 * which is the preferred method.
3278 *
3279 * Workqueue user should use this function directly iff it wants to apply
3280 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
3281 * apply_workqueue_attrs() may race against userland updating the
3282 * attributes.
3283 *
3284 * Returns 0 on success, -errno on failure.
3285 */
3286int workqueue_sysfs_register(struct workqueue_struct *wq)
3287{
3288 struct wq_device *wq_dev;
3289 int ret;
3290
3095 /* 3291 /*
3096 * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. 3292 * Adjusting max_active or creating new pwqs by applyting
3097 * Make sure that the alignment isn't lower than that of 3293 * attributes breaks ordering guarantee. Disallow exposing ordered
3098 * unsigned long long. 3294 * workqueues.
3099 */ 3295 */
3100 const size_t size = sizeof(struct pool_workqueue); 3296 if (WARN_ON(wq->flags & __WQ_ORDERED))
3101 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 3297 return -EINVAL;
3102 __alignof__(unsigned long long));
3103 3298
3104 if (!(wq->flags & WQ_UNBOUND)) 3299 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
3105 wq->pool_wq.pcpu = __alloc_percpu(size, align); 3300 if (!wq_dev)
3106 else { 3301 return -ENOMEM;
3107 void *ptr; 3302
3303 wq_dev->wq = wq;
3304 wq_dev->dev.bus = &wq_subsys;
3305 wq_dev->dev.init_name = wq->name;
3306 wq_dev->dev.release = wq_device_release;
3307
3308 /*
3309 * unbound_attrs are created separately. Suppress uevent until
3310 * everything is ready.
3311 */
3312 dev_set_uevent_suppress(&wq_dev->dev, true);
3313
3314 ret = device_register(&wq_dev->dev);
3315 if (ret) {
3316 kfree(wq_dev);
3317 wq->wq_dev = NULL;
3318 return ret;
3319 }
3320
3321 if (wq->flags & WQ_UNBOUND) {
3322 struct device_attribute *attr;
3323
3324 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
3325 ret = device_create_file(&wq_dev->dev, attr);
3326 if (ret) {
3327 device_unregister(&wq_dev->dev);
3328 wq->wq_dev = NULL;
3329 return ret;
3330 }
3331 }
3332 }
3333
3334 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
3335 return 0;
3336}
3337
3338/**
3339 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
3340 * @wq: the workqueue to unregister
3341 *
3342 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
3343 */
3344static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
3345{
3346 struct wq_device *wq_dev = wq->wq_dev;
3347
3348 if (!wq->wq_dev)
3349 return;
3350
3351 wq->wq_dev = NULL;
3352 device_unregister(&wq_dev->dev);
3353}
3354#else /* CONFIG_SYSFS */
3355static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
3356#endif /* CONFIG_SYSFS */
3357
3358/**
3359 * free_workqueue_attrs - free a workqueue_attrs
3360 * @attrs: workqueue_attrs to free
3361 *
3362 * Undo alloc_workqueue_attrs().
3363 */
3364void free_workqueue_attrs(struct workqueue_attrs *attrs)
3365{
3366 if (attrs) {
3367 free_cpumask_var(attrs->cpumask);
3368 kfree(attrs);
3369 }
3370}
3371
3372/**
3373 * alloc_workqueue_attrs - allocate a workqueue_attrs
3374 * @gfp_mask: allocation mask to use
3375 *
3376 * Allocate a new workqueue_attrs, initialize with default settings and
3377 * return it. Returns NULL on failure.
3378 */
3379struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
3380{
3381 struct workqueue_attrs *attrs;
3382
3383 attrs = kzalloc(sizeof(*attrs), gfp_mask);
3384 if (!attrs)
3385 goto fail;
3386 if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
3387 goto fail;
3388
3389 cpumask_copy(attrs->cpumask, cpu_possible_mask);
3390 return attrs;
3391fail:
3392 free_workqueue_attrs(attrs);
3393 return NULL;
3394}
3395
3396static void copy_workqueue_attrs(struct workqueue_attrs *to,
3397 const struct workqueue_attrs *from)
3398{
3399 to->nice = from->nice;
3400 cpumask_copy(to->cpumask, from->cpumask);
3401}
3402
3403/* hash value of the content of @attr */
3404static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
3405{
3406 u32 hash = 0;
3407
3408 hash = jhash_1word(attrs->nice, hash);
3409 hash = jhash(cpumask_bits(attrs->cpumask),
3410 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
3411 return hash;
3412}
3413
3414/* content equality test */
3415static bool wqattrs_equal(const struct workqueue_attrs *a,
3416 const struct workqueue_attrs *b)
3417{
3418 if (a->nice != b->nice)
3419 return false;
3420 if (!cpumask_equal(a->cpumask, b->cpumask))
3421 return false;
3422 return true;
3423}
3424
3425/**
3426 * init_worker_pool - initialize a newly zalloc'd worker_pool
3427 * @pool: worker_pool to initialize
3428 *
 3429 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
3430 * Returns 0 on success, -errno on failure. Even on failure, all fields
3431 * inside @pool proper are initialized and put_unbound_pool() can be called
3432 * on @pool safely to release it.
3433 */
3434static int init_worker_pool(struct worker_pool *pool)
3435{
3436 spin_lock_init(&pool->lock);
3437 pool->id = -1;
3438 pool->cpu = -1;
3439 pool->node = NUMA_NO_NODE;
3440 pool->flags |= POOL_DISASSOCIATED;
3441 INIT_LIST_HEAD(&pool->worklist);
3442 INIT_LIST_HEAD(&pool->idle_list);
3443 hash_init(pool->busy_hash);
3444
3445 init_timer_deferrable(&pool->idle_timer);
3446 pool->idle_timer.function = idle_worker_timeout;
3447 pool->idle_timer.data = (unsigned long)pool;
3448
3449 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3450 (unsigned long)pool);
3451
3452 mutex_init(&pool->manager_arb);
3453 mutex_init(&pool->manager_mutex);
3454 idr_init(&pool->worker_idr);
3455
3456 INIT_HLIST_NODE(&pool->hash_node);
3457 pool->refcnt = 1;
3458
3459 /* shouldn't fail above this point */
3460 pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
3461 if (!pool->attrs)
3462 return -ENOMEM;
3463 return 0;
3464}
3465
3466static void rcu_free_pool(struct rcu_head *rcu)
3467{
3468 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
3469
3470 idr_destroy(&pool->worker_idr);
3471 free_workqueue_attrs(pool->attrs);
3472 kfree(pool);
3473}
3474
3475/**
3476 * put_unbound_pool - put a worker_pool
3477 * @pool: worker_pool to put
3478 *
 3479 * Put @pool. If its refcnt reaches zero, it gets destroyed in a sched-RCU
3480 * safe manner. get_unbound_pool() calls this function on its failure path
3481 * and this function should be able to release pools which went through,
3482 * successfully or not, init_worker_pool().
3483 *
3484 * Should be called with wq_pool_mutex held.
3485 */
3486static void put_unbound_pool(struct worker_pool *pool)
3487{
3488 struct worker *worker;
3489
3490 lockdep_assert_held(&wq_pool_mutex);
3491
3492 if (--pool->refcnt)
3493 return;
3494
3495 /* sanity checks */
3496 if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
3497 WARN_ON(!list_empty(&pool->worklist)))
3498 return;
3499
3500 /* release id and unhash */
3501 if (pool->id >= 0)
3502 idr_remove(&worker_pool_idr, pool->id);
3503 hash_del(&pool->hash_node);
3504
3505 /*
3506 * Become the manager and destroy all workers. Grabbing
3507 * manager_arb prevents @pool's workers from blocking on
3508 * manager_mutex.
3509 */
3510 mutex_lock(&pool->manager_arb);
3511 mutex_lock(&pool->manager_mutex);
3512 spin_lock_irq(&pool->lock);
3513
3514 while ((worker = first_worker(pool)))
3515 destroy_worker(worker);
3516 WARN_ON(pool->nr_workers || pool->nr_idle);
3517
3518 spin_unlock_irq(&pool->lock);
3519 mutex_unlock(&pool->manager_mutex);
3520 mutex_unlock(&pool->manager_arb);
3521
3522 /* shut down the timers */
3523 del_timer_sync(&pool->idle_timer);
3524 del_timer_sync(&pool->mayday_timer);
3525
3526 /* sched-RCU protected to allow dereferences from get_work_pool() */
3527 call_rcu_sched(&pool->rcu, rcu_free_pool);
3528}
3529
3530/**
3531 * get_unbound_pool - get a worker_pool with the specified attributes
3532 * @attrs: the attributes of the worker_pool to get
3533 *
3534 * Obtain a worker_pool which has the same attributes as @attrs, bump the
3535 * reference count and return it. If there already is a matching
3536 * worker_pool, it will be used; otherwise, this function attempts to
3537 * create a new one. On failure, returns NULL.
3538 *
3539 * Should be called with wq_pool_mutex held.
3540 */
3541static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3542{
3543 u32 hash = wqattrs_hash(attrs);
3544 struct worker_pool *pool;
3545 int node;
3546
3547 lockdep_assert_held(&wq_pool_mutex);
3548
3549 /* do we already have a matching pool? */
3550 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
3551 if (wqattrs_equal(pool->attrs, attrs)) {
3552 pool->refcnt++;
3553 goto out_unlock;
3554 }
3555 }
3556
3557 /* nope, create a new one */
3558 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
3559 if (!pool || init_worker_pool(pool) < 0)
3560 goto fail;
3561
3562 if (workqueue_freezing)
3563 pool->flags |= POOL_FREEZING;
3564
3565 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3566 copy_workqueue_attrs(pool->attrs, attrs);
3567
3568 /* if cpumask is contained inside a NUMA node, we belong to that node */
3569 if (wq_numa_enabled) {
3570 for_each_node(node) {
3571 if (cpumask_subset(pool->attrs->cpumask,
3572 wq_numa_possible_cpumask[node])) {
3573 pool->node = node;
3574 break;
3575 }
3576 }
3577 }
3578
3579 if (worker_pool_assign_id(pool) < 0)
3580 goto fail;
3581
3582 /* create and start the initial worker */
3583 if (create_and_start_worker(pool) < 0)
3584 goto fail;
3585
3586 /* install */
3587 hash_add(unbound_pool_hash, &pool->hash_node, hash);
3588out_unlock:
3589 return pool;
3590fail:
3591 if (pool)
3592 put_unbound_pool(pool);
3593 return NULL;
3594}
3595
3596static void rcu_free_pwq(struct rcu_head *rcu)
3597{
3598 kmem_cache_free(pwq_cache,
3599 container_of(rcu, struct pool_workqueue, rcu));
3600}
3601
3602/*
3603 * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
3604 * and needs to be destroyed.
3605 */
3606static void pwq_unbound_release_workfn(struct work_struct *work)
3607{
3608 struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
3609 unbound_release_work);
3610 struct workqueue_struct *wq = pwq->wq;
3611 struct worker_pool *pool = pwq->pool;
3612 bool is_last;
3613
3614 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
3615 return;
3616
3617 /*
3618 * Unlink @pwq. Synchronization against wq->mutex isn't strictly
3619 * necessary on release but do it anyway. It's easier to verify
3620 * and consistent with the linking path.
3621 */
3622 mutex_lock(&wq->mutex);
3623 list_del_rcu(&pwq->pwqs_node);
3624 is_last = list_empty(&wq->pwqs);
3625 mutex_unlock(&wq->mutex);
3626
3627 mutex_lock(&wq_pool_mutex);
3628 put_unbound_pool(pool);
3629 mutex_unlock(&wq_pool_mutex);
3630
3631 call_rcu_sched(&pwq->rcu, rcu_free_pwq);
3632
3633 /*
3634 * If we're the last pwq going away, @wq is already dead and no one
3635 * is gonna access it anymore. Free it.
3636 */
3637 if (is_last) {
3638 free_workqueue_attrs(wq->unbound_attrs);
3639 kfree(wq);
3640 }
3641}
3642
3643/**
3644 * pwq_adjust_max_active - update a pwq's max_active to the current setting
3645 * @pwq: target pool_workqueue
3646 *
3647 * If @pwq isn't freezing, set @pwq->max_active to the associated
3648 * workqueue's saved_max_active and activate delayed work items
3649 * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
3650 */
3651static void pwq_adjust_max_active(struct pool_workqueue *pwq)
3652{
3653 struct workqueue_struct *wq = pwq->wq;
3654 bool freezable = wq->flags & WQ_FREEZABLE;
3655
3656 /* for @wq->saved_max_active */
3657 lockdep_assert_held(&wq->mutex);
3658
3659 /* fast exit for non-freezable wqs */
3660 if (!freezable && pwq->max_active == wq->saved_max_active)
3661 return;
3662
3663 spin_lock_irq(&pwq->pool->lock);
3664
3665 if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) {
3666 pwq->max_active = wq->saved_max_active;
3667
3668 while (!list_empty(&pwq->delayed_works) &&
3669 pwq->nr_active < pwq->max_active)
3670 pwq_activate_first_delayed(pwq);
3108 3671
3109 /* 3672 /*
3110 * Allocate enough room to align pwq and put an extra 3673 * Need to kick a worker after thawed or an unbound wq's
3111 * pointer at the end pointing back to the originally 3674 * max_active is bumped. It's a slow path. Do it always.
3112 * allocated pointer which will be used for free.
3113 */ 3675 */
3114 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); 3676 wake_up_worker(pwq->pool);
3115 if (ptr) { 3677 } else {
3116 wq->pool_wq.single = PTR_ALIGN(ptr, align); 3678 pwq->max_active = 0;
3117 *(void **)(wq->pool_wq.single + 1) = ptr; 3679 }
3680
3681 spin_unlock_irq(&pwq->pool->lock);
3682}
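/*
 * Illustrative sketch (hypothetical name): pwq_adjust_max_active() above is
 * the mechanism behind WQ_FREEZABLE - while its pool is freezing, a freezable
 * pwq's max_active is forced to 0 so no new work items start, and it is
 * restored from saved_max_active on thaw.  A driver that must not touch
 * storage across suspend opts in at creation time:
 */
static struct workqueue_struct *foo_io_wq;

static int foo_io_init(void)
{
	foo_io_wq = alloc_workqueue("foo_io", WQ_FREEZABLE | WQ_MEM_RECLAIM, 0);
	return foo_io_wq ? 0 : -ENOMEM;
}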
3683
3684/* initialize newly alloced @pwq which is associated with @wq and @pool */
3685static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
3686 struct worker_pool *pool)
3687{
3688 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3689
3690 memset(pwq, 0, sizeof(*pwq));
3691
3692 pwq->pool = pool;
3693 pwq->wq = wq;
3694 pwq->flush_color = -1;
3695 pwq->refcnt = 1;
3696 INIT_LIST_HEAD(&pwq->delayed_works);
3697 INIT_LIST_HEAD(&pwq->pwqs_node);
3698 INIT_LIST_HEAD(&pwq->mayday_node);
3699 INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
3700}
3701
3702/* sync @pwq with the current state of its associated wq and link it */
3703static void link_pwq(struct pool_workqueue *pwq)
3704{
3705 struct workqueue_struct *wq = pwq->wq;
3706
3707 lockdep_assert_held(&wq->mutex);
3708
3709 /* may be called multiple times, ignore if already linked */
3710 if (!list_empty(&pwq->pwqs_node))
3711 return;
3712
3713 /*
3714 * Set the matching work_color. This is synchronized with
3715 * wq->mutex to avoid confusing flush_workqueue().
3716 */
3717 pwq->work_color = wq->work_color;
3718
3719 /* sync max_active to the current setting */
3720 pwq_adjust_max_active(pwq);
3721
3722 /* link in @pwq */
3723 list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
3724}
3725
3726/* obtain a pool matching @attr and create a pwq associating the pool and @wq */
3727static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
3728 const struct workqueue_attrs *attrs)
3729{
3730 struct worker_pool *pool;
3731 struct pool_workqueue *pwq;
3732
3733 lockdep_assert_held(&wq_pool_mutex);
3734
3735 pool = get_unbound_pool(attrs);
3736 if (!pool)
3737 return NULL;
3738
3739 pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
3740 if (!pwq) {
3741 put_unbound_pool(pool);
3742 return NULL;
3743 }
3744
3745 init_pwq(pwq, wq, pool);
3746 return pwq;
3747}
3748
3749/* undo alloc_unbound_pwq(), used only in the error path */
3750static void free_unbound_pwq(struct pool_workqueue *pwq)
3751{
3752 lockdep_assert_held(&wq_pool_mutex);
3753
3754 if (pwq) {
3755 put_unbound_pool(pwq->pool);
3756 kmem_cache_free(pwq_cache, pwq);
3757 }
3758}
3759
3760/**
3761 * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
3762 * @attrs: the wq_attrs of interest
3763 * @node: the target NUMA node
3764 * @cpu_going_down: if >= 0, the CPU to consider as offline
3765 * @cpumask: outarg, the resulting cpumask
3766 *
3767 * Calculate the cpumask a workqueue with @attrs should use on @node. If
3768 * @cpu_going_down is >= 0, that cpu is considered offline during
3769 * calculation. The result is stored in @cpumask. This function returns
3770 * %true if the resulting @cpumask is different from @attrs->cpumask,
3771 * %false if equal.
3772 *
3773 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
3774 * enabled and @node has online CPUs requested by @attrs, the returned
3775 * cpumask is the intersection of the possible CPUs of @node and
3776 * @attrs->cpumask.
3777 *
3778 * The caller is responsible for ensuring that the cpumask of @node stays
3779 * stable.
3780 */
3781static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
3782 int cpu_going_down, cpumask_t *cpumask)
3783{
3784 if (!wq_numa_enabled || attrs->no_numa)
3785 goto use_dfl;
3786
3787 /* does @node have any online CPUs @attrs wants? */
3788 cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
3789 if (cpu_going_down >= 0)
3790 cpumask_clear_cpu(cpu_going_down, cpumask);
3791
3792 if (cpumask_empty(cpumask))
3793 goto use_dfl;
3794
3795 /* yeap, return possible CPUs in @node that @attrs wants */
3796 cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
3797 return !cpumask_equal(cpumask, attrs->cpumask);
3798
3799use_dfl:
3800 cpumask_copy(cpumask, attrs->cpumask);
3801 return false;
3802}
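/*
 * Worked example for wq_calc_node_cpumask() on a hypothetical topology:
 * node 1 spans possible CPUs 4-7 and @attrs->cpumask is { 0, 2, 5, 6 }.
 *
 *   online CPUs of node 1 wanted by @attrs           -> { 5, 6 } (not empty)
 *   @cpumask = @attrs->cpumask & node-1 possible     -> { 5, 6 }
 *   returns %true since { 5, 6 } != { 0, 2, 5, 6 }, so the caller installs a
 *   node-local pwq for node 1 instead of the default pwq.
 *
 * Had @attrs->cpumask been { 0, 1 }, the intersection with node 1 would be
 * empty, @cpumask would fall back to @attrs->cpumask and %false is returned.
 */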
3803
3804/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
3805static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
3806 int node,
3807 struct pool_workqueue *pwq)
3808{
3809 struct pool_workqueue *old_pwq;
3810
3811 lockdep_assert_held(&wq->mutex);
3812
3813 /* link_pwq() can handle duplicate calls */
3814 link_pwq(pwq);
3815
3816 old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
3817 rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
3818 return old_pwq;
3819}
3820
3821/**
3822 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
3823 * @wq: the target workqueue
3824 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
3825 *
3826 * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
3827 * machines, this function maps a separate pwq to each NUMA node with
 3828 * possible CPUs in @attrs->cpumask so that work items are affine to the
3829 * NUMA node it was issued on. Older pwqs are released as in-flight work
3830 * items finish. Note that a work item which repeatedly requeues itself
3831 * back-to-back will stay on its current pwq.
3832 *
3833 * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on
3834 * failure.
3835 */
3836int apply_workqueue_attrs(struct workqueue_struct *wq,
3837 const struct workqueue_attrs *attrs)
3838{
3839 struct workqueue_attrs *new_attrs, *tmp_attrs;
3840 struct pool_workqueue **pwq_tbl, *dfl_pwq;
3841 int node, ret;
3842
3843 /* only unbound workqueues can change attributes */
3844 if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
3845 return -EINVAL;
3846
3847 /* creating multiple pwqs breaks ordering guarantee */
3848 if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
3849 return -EINVAL;
3850
3851 pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
3852 new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3853 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3854 if (!pwq_tbl || !new_attrs || !tmp_attrs)
3855 goto enomem;
3856
3857 /* make a copy of @attrs and sanitize it */
3858 copy_workqueue_attrs(new_attrs, attrs);
3859 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
3860
3861 /*
3862 * We may create multiple pwqs with differing cpumasks. Make a
3863 * copy of @new_attrs which will be modified and used to obtain
3864 * pools.
3865 */
3866 copy_workqueue_attrs(tmp_attrs, new_attrs);
3867
3868 /*
3869 * CPUs should stay stable across pwq creations and installations.
3870 * Pin CPUs, determine the target cpumask for each node and create
3871 * pwqs accordingly.
3872 */
3873 get_online_cpus();
3874
3875 mutex_lock(&wq_pool_mutex);
3876
3877 /*
3878 * If something goes wrong during CPU up/down, we'll fall back to
3879 * the default pwq covering whole @attrs->cpumask. Always create
3880 * it even if we don't use it immediately.
3881 */
3882 dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
3883 if (!dfl_pwq)
3884 goto enomem_pwq;
3885
3886 for_each_node(node) {
3887 if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
3888 pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
3889 if (!pwq_tbl[node])
3890 goto enomem_pwq;
3891 } else {
3892 dfl_pwq->refcnt++;
3893 pwq_tbl[node] = dfl_pwq;
3118 } 3894 }
3119 } 3895 }
3120 3896
3121 /* just in case, make sure it's actually aligned */ 3897 mutex_unlock(&wq_pool_mutex);
3122 BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); 3898
3123 return wq->pool_wq.v ? 0 : -ENOMEM; 3899 /* all pwqs have been created successfully, let's install'em */
3900 mutex_lock(&wq->mutex);
3901
3902 copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
3903
3904 /* save the previous pwq and install the new one */
3905 for_each_node(node)
3906 pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
3907
3908 /* @dfl_pwq might not have been used, ensure it's linked */
3909 link_pwq(dfl_pwq);
3910 swap(wq->dfl_pwq, dfl_pwq);
3911
3912 mutex_unlock(&wq->mutex);
3913
3914 /* put the old pwqs */
3915 for_each_node(node)
3916 put_pwq_unlocked(pwq_tbl[node]);
3917 put_pwq_unlocked(dfl_pwq);
3918
3919 put_online_cpus();
3920 ret = 0;
3921 /* fall through */
3922out_free:
3923 free_workqueue_attrs(tmp_attrs);
3924 free_workqueue_attrs(new_attrs);
3925 kfree(pwq_tbl);
3926 return ret;
3927
3928enomem_pwq:
3929 free_unbound_pwq(dfl_pwq);
3930 for_each_node(node)
3931 if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
3932 free_unbound_pwq(pwq_tbl[node]);
3933 mutex_unlock(&wq_pool_mutex);
3934 put_online_cpus();
3935enomem:
3936 ret = -ENOMEM;
3937 goto out_free;
3124} 3938}
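/*
 * Illustrative sketch (hypothetical caller): using the attrs API introduced
 * above to confine the workers of an existing WQ_UNBOUND workqueue to CPUs
 * 0-1 at nice -10.  apply_workqueue_attrs() re-creates the per-node pwqs; the
 * old ones are released as their in-flight work items finish.
 */
#include <linux/workqueue.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>

static int foo_tune_wq(struct workqueue_struct *foo_wq)
{
	struct workqueue_attrs *attrs;
	int ret;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		return -ENOMEM;

	attrs->nice = -10;
	cpumask_clear(attrs->cpumask);
	cpumask_set_cpu(0, attrs->cpumask);
	cpumask_set_cpu(1, attrs->cpumask);

	ret = apply_workqueue_attrs(foo_wq, attrs);
	free_workqueue_attrs(attrs);
	return ret;
}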
3125 3939
3126static void free_pwqs(struct workqueue_struct *wq) 3940/**
3941 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
3942 * @wq: the target workqueue
3943 * @cpu: the CPU coming up or going down
3944 * @online: whether @cpu is coming up or going down
3945 *
3946 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
3947 * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
3948 * @wq accordingly.
3949 *
3950 * If NUMA affinity can't be adjusted due to memory allocation failure, it
3951 * falls back to @wq->dfl_pwq which may not be optimal but is always
3952 * correct.
3953 *
3954 * Note that when the last allowed CPU of a NUMA node goes offline for a
3955 * workqueue with a cpumask spanning multiple nodes, the workers which were
3956 * already executing the work items for the workqueue will lose their CPU
3957 * affinity and may execute on any CPU. This is similar to how per-cpu
3958 * workqueues behave on CPU_DOWN. If a workqueue user wants strict
3959 * affinity, it's the user's responsibility to flush the work item from
3960 * CPU_DOWN_PREPARE.
3961 */
3962static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
3963 bool online)
3127{ 3964{
3128 if (!(wq->flags & WQ_UNBOUND)) 3965 int node = cpu_to_node(cpu);
3129 free_percpu(wq->pool_wq.pcpu); 3966 int cpu_off = online ? -1 : cpu;
3130 else if (wq->pool_wq.single) { 3967 struct pool_workqueue *old_pwq = NULL, *pwq;
3131 /* the pointer to free is stored right after the pwq */ 3968 struct workqueue_attrs *target_attrs;
3132 kfree(*(void **)(wq->pool_wq.single + 1)); 3969 cpumask_t *cpumask;
3970
3971 lockdep_assert_held(&wq_pool_mutex);
3972
3973 if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
3974 return;
3975
3976 /*
3977 * We don't wanna alloc/free wq_attrs for each wq for each CPU.
3978 * Let's use a preallocated one. The following buf is protected by
3979 * CPU hotplug exclusion.
3980 */
3981 target_attrs = wq_update_unbound_numa_attrs_buf;
3982 cpumask = target_attrs->cpumask;
3983
3984 mutex_lock(&wq->mutex);
3985 if (wq->unbound_attrs->no_numa)
3986 goto out_unlock;
3987
3988 copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
3989 pwq = unbound_pwq_by_node(wq, node);
3990
3991 /*
3992 * Let's determine what needs to be done. If the target cpumask is
3993 * different from wq's, we need to compare it to @pwq's and create
3994 * a new one if they don't match. If the target cpumask equals
3995 * wq's, the default pwq should be used. If @pwq is already the
3996 * default one, nothing to do; otherwise, install the default one.
3997 */
3998 if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
3999 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
4000 goto out_unlock;
4001 } else {
4002 if (pwq == wq->dfl_pwq)
4003 goto out_unlock;
4004 else
4005 goto use_dfl_pwq;
4006 }
4007
4008 mutex_unlock(&wq->mutex);
4009
4010 /* create a new pwq */
4011 pwq = alloc_unbound_pwq(wq, target_attrs);
4012 if (!pwq) {
4013 pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
4014 wq->name);
4015 goto out_unlock;
4016 }
4017
4018 /*
4019 * Install the new pwq. As this function is called only from CPU
4020 * hotplug callbacks and applying a new attrs is wrapped with
4021 * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
 4022 * in between.
4023 */
4024 mutex_lock(&wq->mutex);
4025 old_pwq = numa_pwq_tbl_install(wq, node, pwq);
4026 goto out_unlock;
4027
4028use_dfl_pwq:
4029 spin_lock_irq(&wq->dfl_pwq->pool->lock);
4030 get_pwq(wq->dfl_pwq);
4031 spin_unlock_irq(&wq->dfl_pwq->pool->lock);
4032 old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
4033out_unlock:
4034 mutex_unlock(&wq->mutex);
4035 put_pwq_unlocked(old_pwq);
4036}
4037
4038static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4039{
4040 bool highpri = wq->flags & WQ_HIGHPRI;
4041 int cpu;
4042
4043 if (!(wq->flags & WQ_UNBOUND)) {
4044 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
4045 if (!wq->cpu_pwqs)
4046 return -ENOMEM;
4047
4048 for_each_possible_cpu(cpu) {
4049 struct pool_workqueue *pwq =
4050 per_cpu_ptr(wq->cpu_pwqs, cpu);
4051 struct worker_pool *cpu_pools =
4052 per_cpu(cpu_worker_pools, cpu);
4053
4054 init_pwq(pwq, wq, &cpu_pools[highpri]);
4055
4056 mutex_lock(&wq->mutex);
4057 link_pwq(pwq);
4058 mutex_unlock(&wq->mutex);
4059 }
4060 return 0;
4061 } else {
4062 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
3133 } 4063 }
3134} 4064}
3135 4065
@@ -3151,30 +4081,28 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3151 struct lock_class_key *key, 4081 struct lock_class_key *key,
3152 const char *lock_name, ...) 4082 const char *lock_name, ...)
3153{ 4083{
3154 va_list args, args1; 4084 size_t tbl_size = 0;
4085 va_list args;
3155 struct workqueue_struct *wq; 4086 struct workqueue_struct *wq;
3156 unsigned int cpu; 4087 struct pool_workqueue *pwq;
3157 size_t namelen;
3158 4088
3159 /* determine namelen, allocate wq and format name */ 4089 /* allocate wq and format name */
3160 va_start(args, lock_name); 4090 if (flags & WQ_UNBOUND)
3161 va_copy(args1, args); 4091 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
3162 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
3163 4092
3164 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); 4093 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
3165 if (!wq) 4094 if (!wq)
3166 goto err; 4095 return NULL;
3167 4096
3168 vsnprintf(wq->name, namelen, fmt, args1); 4097 if (flags & WQ_UNBOUND) {
3169 va_end(args); 4098 wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3170 va_end(args1); 4099 if (!wq->unbound_attrs)
4100 goto err_free_wq;
4101 }
3171 4102
3172 /* 4103 va_start(args, lock_name);
3173 * Workqueues which may be used during memory reclaim should 4104 vsnprintf(wq->name, sizeof(wq->name), fmt, args);
3174 * have a rescuer to guarantee forward progress. 4105 va_end(args);
3175 */
3176 if (flags & WQ_MEM_RECLAIM)
3177 flags |= WQ_RESCUER;
3178 4106
3179 max_active = max_active ?: WQ_DFL_ACTIVE; 4107 max_active = max_active ?: WQ_DFL_ACTIVE;
3180 max_active = wq_clamp_max_active(max_active, flags, wq->name); 4108 max_active = wq_clamp_max_active(max_active, flags, wq->name);
@@ -3182,71 +4110,70 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3182 /* init wq */ 4110 /* init wq */
3183 wq->flags = flags; 4111 wq->flags = flags;
3184 wq->saved_max_active = max_active; 4112 wq->saved_max_active = max_active;
3185 mutex_init(&wq->flush_mutex); 4113 mutex_init(&wq->mutex);
3186 atomic_set(&wq->nr_pwqs_to_flush, 0); 4114 atomic_set(&wq->nr_pwqs_to_flush, 0);
4115 INIT_LIST_HEAD(&wq->pwqs);
3187 INIT_LIST_HEAD(&wq->flusher_queue); 4116 INIT_LIST_HEAD(&wq->flusher_queue);
3188 INIT_LIST_HEAD(&wq->flusher_overflow); 4117 INIT_LIST_HEAD(&wq->flusher_overflow);
4118 INIT_LIST_HEAD(&wq->maydays);
3189 4119
3190 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 4120 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3191 INIT_LIST_HEAD(&wq->list); 4121 INIT_LIST_HEAD(&wq->list);
3192 4122
3193 if (alloc_pwqs(wq) < 0) 4123 if (alloc_and_link_pwqs(wq) < 0)
3194 goto err; 4124 goto err_free_wq;
3195
3196 for_each_pwq_cpu(cpu, wq) {
3197 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3198
3199 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3200 pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
3201 pwq->wq = wq;
3202 pwq->flush_color = -1;
3203 pwq->max_active = max_active;
3204 INIT_LIST_HEAD(&pwq->delayed_works);
3205 }
3206 4125
3207 if (flags & WQ_RESCUER) { 4126 /*
4127 * Workqueues which may be used during memory reclaim should
4128 * have a rescuer to guarantee forward progress.
4129 */
4130 if (flags & WQ_MEM_RECLAIM) {
3208 struct worker *rescuer; 4131 struct worker *rescuer;
3209 4132
3210 if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) 4133 rescuer = alloc_worker();
3211 goto err;
3212
3213 wq->rescuer = rescuer = alloc_worker();
3214 if (!rescuer) 4134 if (!rescuer)
3215 goto err; 4135 goto err_destroy;
3216 4136
3217 rescuer->rescue_wq = wq; 4137 rescuer->rescue_wq = wq;
3218 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", 4138 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3219 wq->name); 4139 wq->name);
3220 if (IS_ERR(rescuer->task)) 4140 if (IS_ERR(rescuer->task)) {
3221 goto err; 4141 kfree(rescuer);
4142 goto err_destroy;
4143 }
3222 4144
3223 rescuer->task->flags |= PF_THREAD_BOUND; 4145 wq->rescuer = rescuer;
4146 rescuer->task->flags |= PF_NO_SETAFFINITY;
3224 wake_up_process(rescuer->task); 4147 wake_up_process(rescuer->task);
3225 } 4148 }
3226 4149
4150 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
4151 goto err_destroy;
4152
3227 /* 4153 /*
3228 * workqueue_lock protects global freeze state and workqueues 4154 * wq_pool_mutex protects global freeze state and workqueues list.
3229 * list. Grab it, set max_active accordingly and add the new 4155 * Grab it, adjust max_active and add the new @wq to workqueues
3230 * workqueue to workqueues list. 4156 * list.
3231 */ 4157 */
3232 spin_lock(&workqueue_lock); 4158 mutex_lock(&wq_pool_mutex);
3233 4159
3234 if (workqueue_freezing && wq->flags & WQ_FREEZABLE) 4160 mutex_lock(&wq->mutex);
3235 for_each_pwq_cpu(cpu, wq) 4161 for_each_pwq(pwq, wq)
3236 get_pwq(cpu, wq)->max_active = 0; 4162 pwq_adjust_max_active(pwq);
4163 mutex_unlock(&wq->mutex);
3237 4164
3238 list_add(&wq->list, &workqueues); 4165 list_add(&wq->list, &workqueues);
3239 4166
3240 spin_unlock(&workqueue_lock); 4167 mutex_unlock(&wq_pool_mutex);
3241 4168
3242 return wq; 4169 return wq;
3243err: 4170
3244 if (wq) { 4171err_free_wq:
3245 free_pwqs(wq); 4172 free_workqueue_attrs(wq->unbound_attrs);
3246 free_mayday_mask(wq->mayday_mask); 4173 kfree(wq);
3247 kfree(wq->rescuer); 4174 return NULL;
3248 kfree(wq); 4175err_destroy:
3249 } 4176 destroy_workqueue(wq);
3250 return NULL; 4177 return NULL;
3251} 4178}
3252EXPORT_SYMBOL_GPL(__alloc_workqueue_key); 4179EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
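/*
 * Illustrative sketch (hypothetical driver): callers reach
 * __alloc_workqueue_key() through the alloc_workqueue() macro; the format
 * arguments end up in the fixed-size wq->name buffer used above.
 */
static struct workqueue_struct *foo_alloc_device_wq(int devid)
{
	/* one reclaim-safe, high-priority workqueue per device instance */
	return alloc_workqueue("foo/%d", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0, devid);
}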
@@ -3259,60 +4186,78 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
3259 */ 4186 */
3260void destroy_workqueue(struct workqueue_struct *wq) 4187void destroy_workqueue(struct workqueue_struct *wq)
3261{ 4188{
3262 unsigned int cpu; 4189 struct pool_workqueue *pwq;
4190 int node;
3263 4191
3264 /* drain it before proceeding with destruction */ 4192 /* drain it before proceeding with destruction */
3265 drain_workqueue(wq); 4193 drain_workqueue(wq);
3266 4194
4195 /* sanity checks */
4196 mutex_lock(&wq->mutex);
4197 for_each_pwq(pwq, wq) {
4198 int i;
4199
4200 for (i = 0; i < WORK_NR_COLORS; i++) {
4201 if (WARN_ON(pwq->nr_in_flight[i])) {
4202 mutex_unlock(&wq->mutex);
4203 return;
4204 }
4205 }
4206
4207 if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) ||
4208 WARN_ON(pwq->nr_active) ||
4209 WARN_ON(!list_empty(&pwq->delayed_works))) {
4210 mutex_unlock(&wq->mutex);
4211 return;
4212 }
4213 }
4214 mutex_unlock(&wq->mutex);
4215
3267 /* 4216 /*
3268 * wq list is used to freeze wq, remove from list after 4217 * wq list is used to freeze wq, remove from list after
3269 * flushing is complete in case freeze races us. 4218 * flushing is complete in case freeze races us.
3270 */ 4219 */
3271 spin_lock(&workqueue_lock); 4220 mutex_lock(&wq_pool_mutex);
3272 list_del(&wq->list); 4221 list_del_init(&wq->list);
3273 spin_unlock(&workqueue_lock); 4222 mutex_unlock(&wq_pool_mutex);
3274 4223
3275 /* sanity check */ 4224 workqueue_sysfs_unregister(wq);
3276 for_each_pwq_cpu(cpu, wq) {
3277 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3278 int i;
3279 4225
3280 for (i = 0; i < WORK_NR_COLORS; i++) 4226 if (wq->rescuer) {
3281 BUG_ON(pwq->nr_in_flight[i]);
3282 BUG_ON(pwq->nr_active);
3283 BUG_ON(!list_empty(&pwq->delayed_works));
3284 }
3285
3286 if (wq->flags & WQ_RESCUER) {
3287 kthread_stop(wq->rescuer->task); 4227 kthread_stop(wq->rescuer->task);
3288 free_mayday_mask(wq->mayday_mask);
3289 kfree(wq->rescuer); 4228 kfree(wq->rescuer);
4229 wq->rescuer = NULL;
3290 } 4230 }
3291 4231
3292 free_pwqs(wq); 4232 if (!(wq->flags & WQ_UNBOUND)) {
3293 kfree(wq); 4233 /*
3294} 4234 * The base ref is never dropped on per-cpu pwqs. Directly
3295EXPORT_SYMBOL_GPL(destroy_workqueue); 4235 * free the pwqs and wq.
3296 4236 */
3297/** 4237 free_percpu(wq->cpu_pwqs);
3298 * pwq_set_max_active - adjust max_active of a pwq 4238 kfree(wq);
3299 * @pwq: target pool_workqueue 4239 } else {
3300 * @max_active: new max_active value. 4240 /*
3301 * 4241 * We're the sole accessor of @wq at this point. Directly
3302 * Set @pwq->max_active to @max_active and activate delayed works if 4242 * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
3303 * increased. 4243 * @wq will be freed when the last pwq is released.
3304 * 4244 */
3305 * CONTEXT: 4245 for_each_node(node) {
3306 * spin_lock_irq(pool->lock). 4246 pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
3307 */ 4247 RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
3308static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) 4248 put_pwq_unlocked(pwq);
3309{ 4249 }
3310 pwq->max_active = max_active;
3311 4250
3312 while (!list_empty(&pwq->delayed_works) && 4251 /*
3313 pwq->nr_active < pwq->max_active) 4252 * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
3314 pwq_activate_first_delayed(pwq); 4253 * put. Don't access it afterwards.
4254 */
4255 pwq = wq->dfl_pwq;
4256 wq->dfl_pwq = NULL;
4257 put_pwq_unlocked(pwq);
4258 }
3315} 4259}
4260EXPORT_SYMBOL_GPL(destroy_workqueue);
3316 4261
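Pairing with the allocation sketch above (same hypothetical names), teardown goes through destroy_workqueue(), which after this patch drains the queue and runs the sanity checks shown in the hunk above before dropping the pwq base references:

        static void __exit my_driver_exit(void)
        {
                /* drains queued and in-flight work, then releases pwqs and wq */
                destroy_workqueue(my_driver_wq);
        }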
3317/** 4262/**
3318 * workqueue_set_max_active - adjust max_active of a workqueue 4263 * workqueue_set_max_active - adjust max_active of a workqueue
@@ -3326,30 +4271,37 @@ static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
3326 */ 4271 */
3327void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) 4272void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3328{ 4273{
3329 unsigned int cpu; 4274 struct pool_workqueue *pwq;
4275
4276 /* disallow meddling with max_active for ordered workqueues */
4277 if (WARN_ON(wq->flags & __WQ_ORDERED))
4278 return;
3330 4279
3331 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); 4280 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
3332 4281
3333 spin_lock(&workqueue_lock); 4282 mutex_lock(&wq->mutex);
3334 4283
3335 wq->saved_max_active = max_active; 4284 wq->saved_max_active = max_active;
3336 4285
3337 for_each_pwq_cpu(cpu, wq) { 4286 for_each_pwq(pwq, wq)
3338 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4287 pwq_adjust_max_active(pwq);
3339 struct worker_pool *pool = pwq->pool;
3340
3341 spin_lock_irq(&pool->lock);
3342 4288
3343 if (!(wq->flags & WQ_FREEZABLE) || 4289 mutex_unlock(&wq->mutex);
3344 !(pool->flags & POOL_FREEZING)) 4290}
3345 pwq_set_max_active(pwq, max_active); 4291EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3346 4292
3347 spin_unlock_irq(&pool->lock); 4293/**
3348 } 4294 * current_is_workqueue_rescuer - is %current workqueue rescuer?
4295 *
4296 * Determine whether %current is a workqueue rescuer. Can be used from
4297 * work functions to determine whether it's being run off the rescuer task.
4298 */
4299bool current_is_workqueue_rescuer(void)
4300{
4301 struct worker *worker = current_wq_worker();
3349 4302
3350 spin_unlock(&workqueue_lock); 4303 return worker && worker->rescue_wq;
3351} 4304}
3352EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3353 4305
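A hedged usage sketch for the two helpers above: workqueue_set_max_active() now WARNs and bails on __WQ_ORDERED workqueues, and current_is_workqueue_rescuer() lets a work function tell that it is running off the rescuer. The names below are the same illustrative ones as before.

        /* throttle the hypothetical queue to one in-flight item per pwq */
        workqueue_set_max_active(my_driver_wq, 1);

        static void my_reclaim_work(struct work_struct *work)
        {
                if (current_is_workqueue_rescuer())
                        pr_debug("my_driver: running off the rescuer, memory is tight\n");
                /* ... */
        }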
3354/** 4306/**
3355 * workqueue_congested - test whether a workqueue is congested 4307 * workqueue_congested - test whether a workqueue is congested
@@ -3360,14 +4312,34 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3360 * no synchronization around this function and the test result is 4312 * no synchronization around this function and the test result is
3361 * unreliable and only useful as advisory hints or for debugging. 4313 * unreliable and only useful as advisory hints or for debugging.
3362 * 4314 *
4315 * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
4316 * Note that both per-cpu and unbound workqueues may be associated with
4317 * multiple pool_workqueues which have separate congested states. A
4318 * workqueue being congested on one CPU doesn't mean the workqueue is also
4319 * congested on other CPUs / NUMA nodes.
4320 *
3363 * RETURNS: 4321 * RETURNS:
3364 * %true if congested, %false otherwise. 4322 * %true if congested, %false otherwise.
3365 */ 4323 */
3366bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) 4324bool workqueue_congested(int cpu, struct workqueue_struct *wq)
3367{ 4325{
3368 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4326 struct pool_workqueue *pwq;
4327 bool ret;
4328
4329 rcu_read_lock_sched();
4330
4331 if (cpu == WORK_CPU_UNBOUND)
4332 cpu = smp_processor_id();
4333
4334 if (!(wq->flags & WQ_UNBOUND))
4335 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
4336 else
4337 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
3369 4338
3370 return !list_empty(&pwq->delayed_works); 4339 ret = !list_empty(&pwq->delayed_works);
4340 rcu_read_unlock_sched();
4341
4342 return ret;
3371} 4343}
3372EXPORT_SYMBOL_GPL(workqueue_congested); 4344EXPORT_SYMBOL_GPL(workqueue_congested);
3373 4345
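The kerneldoc above stresses that the result is unsynchronized and advisory; a caller might use it only as a hint, as in this hedged fragment (my_batch_threshold is hypothetical):

        /* back off when the pwq serving this CPU already has delayed work */
        if (workqueue_congested(WORK_CPU_UNBOUND, my_driver_wq))
                my_batch_threshold *= 2;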
@@ -3384,24 +4356,104 @@ EXPORT_SYMBOL_GPL(workqueue_congested);
3384 */ 4356 */
3385unsigned int work_busy(struct work_struct *work) 4357unsigned int work_busy(struct work_struct *work)
3386{ 4358{
3387 struct worker_pool *pool = get_work_pool(work); 4359 struct worker_pool *pool;
3388 unsigned long flags; 4360 unsigned long flags;
3389 unsigned int ret = 0; 4361 unsigned int ret = 0;
3390 4362
3391 if (work_pending(work)) 4363 if (work_pending(work))
3392 ret |= WORK_BUSY_PENDING; 4364 ret |= WORK_BUSY_PENDING;
3393 4365
4366 local_irq_save(flags);
4367 pool = get_work_pool(work);
3394 if (pool) { 4368 if (pool) {
3395 spin_lock_irqsave(&pool->lock, flags); 4369 spin_lock(&pool->lock);
3396 if (find_worker_executing_work(pool, work)) 4370 if (find_worker_executing_work(pool, work))
3397 ret |= WORK_BUSY_RUNNING; 4371 ret |= WORK_BUSY_RUNNING;
3398 spin_unlock_irqrestore(&pool->lock, flags); 4372 spin_unlock(&pool->lock);
3399 } 4373 }
4374 local_irq_restore(flags);
3400 4375
3401 return ret; 4376 return ret;
3402} 4377}
3403EXPORT_SYMBOL_GPL(work_busy); 4378EXPORT_SYMBOL_GPL(work_busy);
3404 4379
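For reference, a hedged sketch of consuming work_busy(); the WORK_BUSY_* bits come from include/linux/workqueue.h and the value is only a racy snapshot (my_work is hypothetical):

        unsigned int state = work_busy(&my_work);

        if (state & WORK_BUSY_PENDING)
                pr_debug("queued but not yet running\n");
        if (state & WORK_BUSY_RUNNING)
                pr_debug("currently executing on some worker\n");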
4380/**
4381 * set_worker_desc - set description for the current work item
4382 * @fmt: printf-style format string
4383 * @...: arguments for the format string
4384 *
4385 * This function can be called by a running work function to describe what
4386 * the work item is about. If the worker task gets dumped, this
4387 * information will be printed out together to help debugging. The
4388 * description can be at most WORKER_DESC_LEN including the trailing '\0'.
4389 */
4390void set_worker_desc(const char *fmt, ...)
4391{
4392 struct worker *worker = current_wq_worker();
4393 va_list args;
4394
4395 if (worker) {
4396 va_start(args, fmt);
4397 vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
4398 va_end(args);
4399 worker->desc_valid = true;
4400 }
4401}
4402
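A small sketch of a work function using the new helper; struct my_ctx and its fields are illustrative only:

        struct my_ctx {                         /* hypothetical */
                struct work_struct work;
                int unit;
        };

        static void my_unit_work(struct work_struct *work)
        {
                struct my_ctx *ctx = container_of(work, struct my_ctx, work);

                /* stored in worker->desc, printed by print_worker_info() on dumps */
                set_worker_desc("my_driver: unit %d", ctx->unit);
                /* ... actual work ... */
        }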
4403/**
4404 * print_worker_info - print out worker information and description
4405 * @log_lvl: the log level to use when printing
4406 * @task: target task
4407 *
4408 * If @task is a worker and currently executing a work item, print out the
4409 * name of the workqueue being serviced and worker description set with
4410 * set_worker_desc() by the currently executing work item.
4411 *
4412 * This function can be safely called on any task as long as the
4413 * task_struct itself is accessible. While safe, this function isn't
4414 * synchronized and may print out mismatched or garbled strings of limited length.
4415 */
4416void print_worker_info(const char *log_lvl, struct task_struct *task)
4417{
4418 work_func_t *fn = NULL;
4419 char name[WQ_NAME_LEN] = { };
4420 char desc[WORKER_DESC_LEN] = { };
4421 struct pool_workqueue *pwq = NULL;
4422 struct workqueue_struct *wq = NULL;
4423 bool desc_valid = false;
4424 struct worker *worker;
4425
4426 if (!(task->flags & PF_WQ_WORKER))
4427 return;
4428
4429 /*
4430 * This function is called without any synchronization and @task
4431 * could be in any state. Be careful with dereferences.
4432 */
4433 worker = probe_kthread_data(task);
4434
4435 /*
4436 * Carefully copy the associated workqueue's workfn and name. Keep
4437 * the original last '\0' in case the original contains garbage.
4438 */
4439 probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
4440 probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
4441 probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
4442 probe_kernel_read(name, wq->name, sizeof(name) - 1);
4443
4444 /* copy worker description */
4445 probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
4446 if (desc_valid)
4447 probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
4448
4449 if (fn || name[0] || desc[0]) {
4450 printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
4451 if (desc[0])
4452 pr_cont(" (%s)", desc);
4453 pr_cont("\n");
4454 }
4455}
4456
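A hedged note on usage: a task-dump path can simply pass the task in; the function bails out by itself for non-workers, and per the printk above the output has the shape "Workqueue: <wq name> <work fn> (<description>)".

        /* e.g. from a debugging/dump helper; tsk is the task being dumped */
        print_worker_info(KERN_INFO, tsk);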
3405/* 4457/*
3406 * CPU hotplug. 4458 * CPU hotplug.
3407 * 4459 *
@@ -3422,53 +4474,153 @@ static void wq_unbind_fn(struct work_struct *work)
3422 int cpu = smp_processor_id(); 4474 int cpu = smp_processor_id();
3423 struct worker_pool *pool; 4475 struct worker_pool *pool;
3424 struct worker *worker; 4476 struct worker *worker;
3425 int i; 4477 int wi;
3426 4478
3427 for_each_std_worker_pool(pool, cpu) { 4479 for_each_cpu_worker_pool(pool, cpu) {
3428 BUG_ON(cpu != smp_processor_id()); 4480 WARN_ON_ONCE(cpu != smp_processor_id());
3429 4481
3430 mutex_lock(&pool->assoc_mutex); 4482 mutex_lock(&pool->manager_mutex);
3431 spin_lock_irq(&pool->lock); 4483 spin_lock_irq(&pool->lock);
3432 4484
3433 /* 4485 /*
3434 * We've claimed all manager positions. Make all workers 4486 * We've blocked all manager operations. Make all workers
3435 * unbound and set DISASSOCIATED. Before this, all workers 4487 * unbound and set DISASSOCIATED. Before this, all workers
3436 * except for the ones which are still executing works from 4488 * except for the ones which are still executing works from
3437 * before the last CPU down must be on the cpu. After 4489 * before the last CPU down must be on the cpu. After
3438 * this, they may become diasporas. 4490 * this, they may become diasporas.
3439 */ 4491 */
3440 list_for_each_entry(worker, &pool->idle_list, entry) 4492 for_each_pool_worker(worker, wi, pool)
3441 worker->flags |= WORKER_UNBOUND;
3442
3443 for_each_busy_worker(worker, i, pool)
3444 worker->flags |= WORKER_UNBOUND; 4493 worker->flags |= WORKER_UNBOUND;
3445 4494
3446 pool->flags |= POOL_DISASSOCIATED; 4495 pool->flags |= POOL_DISASSOCIATED;
3447 4496
3448 spin_unlock_irq(&pool->lock); 4497 spin_unlock_irq(&pool->lock);
3449 mutex_unlock(&pool->assoc_mutex); 4498 mutex_unlock(&pool->manager_mutex);
4499
4500 /*
4501 * Call schedule() so that we cross rq->lock and thus can
4502 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
4503 * This is necessary as scheduler callbacks may be invoked
4504 * from other cpus.
4505 */
4506 schedule();
4507
4508 /*
4509 * Sched callbacks are disabled now. Zap nr_running.
4510 * After this, nr_running stays zero and need_more_worker()
4511 * and keep_working() are always true as long as the
4512 * worklist is not empty. This pool now behaves as an
4513 * unbound (in terms of concurrency management) pool which
4514 * is served by workers tied to the pool.
4515 */
4516 atomic_set(&pool->nr_running, 0);
4517
4518 /*
4519 * With concurrency management just turned off, a busy
4520 * worker blocking could lead to lengthy stalls. Kick off
4521 * unbound chain execution of currently pending work items.
4522 */
4523 spin_lock_irq(&pool->lock);
4524 wake_up_worker(pool);
4525 spin_unlock_irq(&pool->lock);
3450 } 4526 }
4527}
3451 4528
3452 /* 4529/**
3453 * Call schedule() so that we cross rq->lock and thus can guarantee 4530 * rebind_workers - rebind all workers of a pool to the associated CPU
3454 * sched callbacks see the %WORKER_UNBOUND flag. This is necessary 4531 * @pool: pool of interest
3455 * as scheduler callbacks may be invoked from other cpus. 4532 *
3456 */ 4533 * @pool->cpu is coming online. Rebind all workers to the CPU.
3457 schedule(); 4534 */
4535static void rebind_workers(struct worker_pool *pool)
4536{
4537 struct worker *worker;
4538 int wi;
4539
4540 lockdep_assert_held(&pool->manager_mutex);
3458 4541
3459 /* 4542 /*
3460 * Sched callbacks are disabled now. Zap nr_running. After this, 4543 * Restore CPU affinity of all workers. As all idle workers should
3461 * nr_running stays zero and need_more_worker() and keep_working() 4544 * be on the run-queue of the associated CPU before any local
3462 * are always true as long as the worklist is not empty. Pools on 4545 * wake-ups for concurrency management happen, restore CPU affinty
3463 * @cpu now behave as unbound (in terms of concurrency management) 4546 * of all workers first and then clear UNBOUND. As we're called
3464 * pools which are served by workers tied to the CPU. 4547 * from CPU_ONLINE, the following shouldn't fail.
3465 *
3466 * On return from this function, the current worker would trigger
3467 * unbound chain execution of pending work items if other workers
3468 * didn't already.
3469 */ 4548 */
3470 for_each_std_worker_pool(pool, cpu) 4549 for_each_pool_worker(worker, wi, pool)
3471 atomic_set(&pool->nr_running, 0); 4550 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4551 pool->attrs->cpumask) < 0);
4552
4553 spin_lock_irq(&pool->lock);
4554
4555 for_each_pool_worker(worker, wi, pool) {
4556 unsigned int worker_flags = worker->flags;
4557
4558 /*
4559 * A bound idle worker should actually be on the runqueue
4560 * of the associated CPU for local wake-ups targeting it to
4561 * work. Kick all idle workers so that they migrate to the
4562 * associated CPU. Doing this in the same loop as
4563 * replacing UNBOUND with REBOUND is safe as no worker will
4564 * be bound before @pool->lock is released.
4565 */
4566 if (worker_flags & WORKER_IDLE)
4567 wake_up_process(worker->task);
4568
4569 /*
4570 * We want to clear UNBOUND but can't directly call
4571 * worker_clr_flags() or adjust nr_running. Atomically
4572 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
4573 * @worker will clear REBOUND using worker_clr_flags() when
4574 * it initiates the next execution cycle thus restoring
4575 * concurrency management. Note that when or whether
4576 * @worker clears REBOUND doesn't affect correctness.
4577 *
4578 * ACCESS_ONCE() is necessary because @worker->flags may be
4579 * tested without holding any lock in
4580 * wq_worker_waking_up(). Without it, NOT_RUNNING test may
4581 * fail incorrectly leading to premature concurrency
4582 * management operations.
4583 */
4584 WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
4585 worker_flags |= WORKER_REBOUND;
4586 worker_flags &= ~WORKER_UNBOUND;
4587 ACCESS_ONCE(worker->flags) = worker_flags;
4588 }
4589
4590 spin_unlock_irq(&pool->lock);
4591}
4592
4593/**
4594 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
4595 * @pool: unbound pool of interest
4596 * @cpu: the CPU which is coming up
4597 *
4598 * An unbound pool may end up with a cpumask which doesn't have any online
4599 * CPUs. When a worker of such a pool gets scheduled, the scheduler resets
4600 * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
4601 * online CPU before, cpus_allowed of all its workers should be restored.
4602 */
4603static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4604{
4605 static cpumask_t cpumask;
4606 struct worker *worker;
4607 int wi;
4608
4609 lockdep_assert_held(&pool->manager_mutex);
4610
4611 /* is @cpu allowed for @pool? */
4612 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
4613 return;
4614
4615 /* is @cpu the only online CPU? */
4616 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
4617 if (cpumask_weight(&cpumask) != 1)
4618 return;
4619
4620 /* as we're called from CPU_ONLINE, the following shouldn't fail */
4621 for_each_pool_worker(worker, wi, pool)
4622 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4623 pool->attrs->cpumask) < 0);
3472} 4624}
3473 4625
3474/* 4626/*
@@ -3479,39 +4631,46 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3479 unsigned long action, 4631 unsigned long action,
3480 void *hcpu) 4632 void *hcpu)
3481{ 4633{
3482 unsigned int cpu = (unsigned long)hcpu; 4634 int cpu = (unsigned long)hcpu;
3483 struct worker_pool *pool; 4635 struct worker_pool *pool;
4636 struct workqueue_struct *wq;
4637 int pi;
3484 4638
3485 switch (action & ~CPU_TASKS_FROZEN) { 4639 switch (action & ~CPU_TASKS_FROZEN) {
3486 case CPU_UP_PREPARE: 4640 case CPU_UP_PREPARE:
3487 for_each_std_worker_pool(pool, cpu) { 4641 for_each_cpu_worker_pool(pool, cpu) {
3488 struct worker *worker;
3489
3490 if (pool->nr_workers) 4642 if (pool->nr_workers)
3491 continue; 4643 continue;
3492 4644 if (create_and_start_worker(pool) < 0)
3493 worker = create_worker(pool);
3494 if (!worker)
3495 return NOTIFY_BAD; 4645 return NOTIFY_BAD;
3496
3497 spin_lock_irq(&pool->lock);
3498 start_worker(worker);
3499 spin_unlock_irq(&pool->lock);
3500 } 4646 }
3501 break; 4647 break;
3502 4648
3503 case CPU_DOWN_FAILED: 4649 case CPU_DOWN_FAILED:
3504 case CPU_ONLINE: 4650 case CPU_ONLINE:
3505 for_each_std_worker_pool(pool, cpu) { 4651 mutex_lock(&wq_pool_mutex);
3506 mutex_lock(&pool->assoc_mutex);
3507 spin_lock_irq(&pool->lock);
3508 4652
3509 pool->flags &= ~POOL_DISASSOCIATED; 4653 for_each_pool(pool, pi) {
3510 rebind_workers(pool); 4654 mutex_lock(&pool->manager_mutex);
3511 4655
3512 spin_unlock_irq(&pool->lock); 4656 if (pool->cpu == cpu) {
3513 mutex_unlock(&pool->assoc_mutex); 4657 spin_lock_irq(&pool->lock);
4658 pool->flags &= ~POOL_DISASSOCIATED;
4659 spin_unlock_irq(&pool->lock);
4660
4661 rebind_workers(pool);
4662 } else if (pool->cpu < 0) {
4663 restore_unbound_workers_cpumask(pool, cpu);
4664 }
4665
4666 mutex_unlock(&pool->manager_mutex);
3514 } 4667 }
4668
4669 /* update NUMA affinity of unbound workqueues */
4670 list_for_each_entry(wq, &workqueues, list)
4671 wq_update_unbound_numa(wq, cpu, true);
4672
4673 mutex_unlock(&wq_pool_mutex);
3515 break; 4674 break;
3516 } 4675 }
3517 return NOTIFY_OK; 4676 return NOTIFY_OK;
@@ -3525,14 +4684,23 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3525 unsigned long action, 4684 unsigned long action,
3526 void *hcpu) 4685 void *hcpu)
3527{ 4686{
3528 unsigned int cpu = (unsigned long)hcpu; 4687 int cpu = (unsigned long)hcpu;
3529 struct work_struct unbind_work; 4688 struct work_struct unbind_work;
4689 struct workqueue_struct *wq;
3530 4690
3531 switch (action & ~CPU_TASKS_FROZEN) { 4691 switch (action & ~CPU_TASKS_FROZEN) {
3532 case CPU_DOWN_PREPARE: 4692 case CPU_DOWN_PREPARE:
3533 /* unbinding should happen on the local CPU */ 4693 /* unbinding per-cpu workers should happen on the local CPU */
3534 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); 4694 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
3535 queue_work_on(cpu, system_highpri_wq, &unbind_work); 4695 queue_work_on(cpu, system_highpri_wq, &unbind_work);
4696
4697 /* update NUMA affinity of unbound workqueues */
4698 mutex_lock(&wq_pool_mutex);
4699 list_for_each_entry(wq, &workqueues, list)
4700 wq_update_unbound_numa(wq, cpu, false);
4701 mutex_unlock(&wq_pool_mutex);
4702
4703 /* wait for per-cpu unbinding to finish */
3536 flush_work(&unbind_work); 4704 flush_work(&unbind_work);
3537 break; 4705 break;
3538 } 4706 }
@@ -3565,7 +4733,7 @@ static void work_for_cpu_fn(struct work_struct *work)
3565 * It is up to the caller to ensure that the cpu doesn't go offline. 4733 * It is up to the caller to ensure that the cpu doesn't go offline.
3566 * The caller must not hold any locks which would prevent @fn from completing. 4734 * The caller must not hold any locks which would prevent @fn from completing.
3567 */ 4735 */
3568long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) 4736long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
3569{ 4737{
3570 struct work_for_cpu wfc = { .fn = fn, .arg = arg }; 4738 struct work_for_cpu wfc = { .fn = fn, .arg = arg };
3571 4739
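A hedged usage sketch for work_on_cpu(); per the kerneldoc above, the caller must keep the CPU online, e.g. with get_online_cpus(). my_probe_fn, my_do_probe, target_cpu and dev are illustrative:

        static long my_probe_fn(void *arg)
        {
                /* executed on the requested CPU by a workqueue worker */
                return my_do_probe(arg);
        }

                long ret;

                get_online_cpus();
                ret = work_on_cpu(target_cpu, my_probe_fn, dev);
                put_online_cpus();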
@@ -3583,44 +4751,40 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3583 * freeze_workqueues_begin - begin freezing workqueues 4751 * freeze_workqueues_begin - begin freezing workqueues
3584 * 4752 *
3585 * Start freezing workqueues. After this function returns, all freezable 4753 * Start freezing workqueues. After this function returns, all freezable
3586 * workqueues will queue new works to their frozen_works list instead of 4754 * workqueues will queue new works to their delayed_works list instead of
3587 * pool->worklist. 4755 * pool->worklist.
3588 * 4756 *
3589 * CONTEXT: 4757 * CONTEXT:
3590 * Grabs and releases workqueue_lock and pool->lock's. 4758 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
3591 */ 4759 */
3592void freeze_workqueues_begin(void) 4760void freeze_workqueues_begin(void)
3593{ 4761{
3594 unsigned int cpu; 4762 struct worker_pool *pool;
4763 struct workqueue_struct *wq;
4764 struct pool_workqueue *pwq;
4765 int pi;
3595 4766
3596 spin_lock(&workqueue_lock); 4767 mutex_lock(&wq_pool_mutex);
3597 4768
3598 BUG_ON(workqueue_freezing); 4769 WARN_ON_ONCE(workqueue_freezing);
3599 workqueue_freezing = true; 4770 workqueue_freezing = true;
3600 4771
3601 for_each_wq_cpu(cpu) { 4772 /* set FREEZING */
3602 struct worker_pool *pool; 4773 for_each_pool(pool, pi) {
3603 struct workqueue_struct *wq; 4774 spin_lock_irq(&pool->lock);
3604 4775 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3605 for_each_std_worker_pool(pool, cpu) { 4776 pool->flags |= POOL_FREEZING;
3606 spin_lock_irq(&pool->lock); 4777 spin_unlock_irq(&pool->lock);
3607 4778 }
3608 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3609 pool->flags |= POOL_FREEZING;
3610
3611 list_for_each_entry(wq, &workqueues, list) {
3612 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3613
3614 if (pwq && pwq->pool == pool &&
3615 (wq->flags & WQ_FREEZABLE))
3616 pwq->max_active = 0;
3617 }
3618 4779
3619 spin_unlock_irq(&pool->lock); 4780 list_for_each_entry(wq, &workqueues, list) {
3620 } 4781 mutex_lock(&wq->mutex);
4782 for_each_pwq(pwq, wq)
4783 pwq_adjust_max_active(pwq);
4784 mutex_unlock(&wq->mutex);
3621 } 4785 }
3622 4786
3623 spin_unlock(&workqueue_lock); 4787 mutex_unlock(&wq_pool_mutex);
3624} 4788}
3625 4789
3626/** 4790/**
@@ -3630,7 +4794,7 @@ void freeze_workqueues_begin(void)
3630 * between freeze_workqueues_begin() and thaw_workqueues(). 4794 * between freeze_workqueues_begin() and thaw_workqueues().
3631 * 4795 *
3632 * CONTEXT: 4796 * CONTEXT:
3633 * Grabs and releases workqueue_lock. 4797 * Grabs and releases wq_pool_mutex.
3634 * 4798 *
3635 * RETURNS: 4799 * RETURNS:
3636 * %true if some freezable workqueues are still busy. %false if freezing 4800 * %true if some freezable workqueues are still busy. %false if freezing
@@ -3638,34 +4802,34 @@ void freeze_workqueues_begin(void)
3638 */ 4802 */
3639bool freeze_workqueues_busy(void) 4803bool freeze_workqueues_busy(void)
3640{ 4804{
3641 unsigned int cpu;
3642 bool busy = false; 4805 bool busy = false;
4806 struct workqueue_struct *wq;
4807 struct pool_workqueue *pwq;
3643 4808
3644 spin_lock(&workqueue_lock); 4809 mutex_lock(&wq_pool_mutex);
3645 4810
3646 BUG_ON(!workqueue_freezing); 4811 WARN_ON_ONCE(!workqueue_freezing);
3647 4812
3648 for_each_wq_cpu(cpu) { 4813 list_for_each_entry(wq, &workqueues, list) {
3649 struct workqueue_struct *wq; 4814 if (!(wq->flags & WQ_FREEZABLE))
4815 continue;
3650 /* 4816 /*
3651 * nr_active is monotonically decreasing. It's safe 4817 * nr_active is monotonically decreasing. It's safe
3652 * to peek without lock. 4818 * to peek without lock.
3653 */ 4819 */
3654 list_for_each_entry(wq, &workqueues, list) { 4820 rcu_read_lock_sched();
3655 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4821 for_each_pwq(pwq, wq) {
3656 4822 WARN_ON_ONCE(pwq->nr_active < 0);
3657 if (!pwq || !(wq->flags & WQ_FREEZABLE))
3658 continue;
3659
3660 BUG_ON(pwq->nr_active < 0);
3661 if (pwq->nr_active) { 4823 if (pwq->nr_active) {
3662 busy = true; 4824 busy = true;
4825 rcu_read_unlock_sched();
3663 goto out_unlock; 4826 goto out_unlock;
3664 } 4827 }
3665 } 4828 }
4829 rcu_read_unlock_sched();
3666 } 4830 }
3667out_unlock: 4831out_unlock:
3668 spin_unlock(&workqueue_lock); 4832 mutex_unlock(&wq_pool_mutex);
3669 return busy; 4833 return busy;
3670} 4834}
3671 4835
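For context, the freezer side (kernel/power) drives these entry points roughly as below; this is a simplified, hedged outline rather than the exact code in kernel/power:

        freeze_workqueues_begin();              /* freezable wqs stop releasing work */

        while (freeze_workqueues_busy()) {
                /* wait for in-flight freezable work items to drain */
                msleep(10);                     /* illustrative back-off */
        }

        /* ... hibernation image written / system suspended ... */

        thaw_workqueues();                      /* restore max_active and kick workers */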
@@ -3676,104 +4840,142 @@ out_unlock:
3676 * frozen works are transferred to their respective pool worklists. 4840 * frozen works are transferred to their respective pool worklists.
3677 * 4841 *
3678 * CONTEXT: 4842 * CONTEXT:
3679 * Grabs and releases workqueue_lock and pool->lock's. 4843 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
3680 */ 4844 */
3681void thaw_workqueues(void) 4845void thaw_workqueues(void)
3682{ 4846{
3683 unsigned int cpu; 4847 struct workqueue_struct *wq;
4848 struct pool_workqueue *pwq;
4849 struct worker_pool *pool;
4850 int pi;
3684 4851
3685 spin_lock(&workqueue_lock); 4852 mutex_lock(&wq_pool_mutex);
3686 4853
3687 if (!workqueue_freezing) 4854 if (!workqueue_freezing)
3688 goto out_unlock; 4855 goto out_unlock;
3689 4856
3690 for_each_wq_cpu(cpu) { 4857 /* clear FREEZING */
3691 struct worker_pool *pool; 4858 for_each_pool(pool, pi) {
3692 struct workqueue_struct *wq; 4859 spin_lock_irq(&pool->lock);
4860 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
4861 pool->flags &= ~POOL_FREEZING;
4862 spin_unlock_irq(&pool->lock);
4863 }
3693 4864
3694 for_each_std_worker_pool(pool, cpu) { 4865 /* restore max_active and repopulate worklist */
3695 spin_lock_irq(&pool->lock); 4866 list_for_each_entry(wq, &workqueues, list) {
4867 mutex_lock(&wq->mutex);
4868 for_each_pwq(pwq, wq)
4869 pwq_adjust_max_active(pwq);
4870 mutex_unlock(&wq->mutex);
4871 }
3696 4872
3697 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); 4873 workqueue_freezing = false;
3698 pool->flags &= ~POOL_FREEZING; 4874out_unlock:
4875 mutex_unlock(&wq_pool_mutex);
4876}
4877#endif /* CONFIG_FREEZER */
3699 4878
3700 list_for_each_entry(wq, &workqueues, list) { 4879static void __init wq_numa_init(void)
3701 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4880{
4881 cpumask_var_t *tbl;
4882 int node, cpu;
3702 4883
3703 if (!pwq || pwq->pool != pool || 4884 /* determine NUMA pwq table len - highest node id + 1 */
3704 !(wq->flags & WQ_FREEZABLE)) 4885 for_each_node(node)
3705 continue; 4886 wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
3706 4887
3707 /* restore max_active and repopulate worklist */ 4888 if (num_possible_nodes() <= 1)
3708 pwq_set_max_active(pwq, wq->saved_max_active); 4889 return;
3709 }
3710 4890
3711 wake_up_worker(pool); 4891 if (wq_disable_numa) {
4892 pr_info("workqueue: NUMA affinity support disabled\n");
4893 return;
4894 }
3712 4895
3713 spin_unlock_irq(&pool->lock); 4896 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
4897 BUG_ON(!wq_update_unbound_numa_attrs_buf);
4898
4899 /*
4900 * We want masks of possible CPUs of each node which aren't readily
4901 * available. Build one from cpu_to_node() which should have been
4902 * fully initialized by now.
4903 */
4904 tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
4905 BUG_ON(!tbl);
4906
4907 for_each_node(node)
4908 BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
4909 node_online(node) ? node : NUMA_NO_NODE));
4910
4911 for_each_possible_cpu(cpu) {
4912 node = cpu_to_node(cpu);
4913 if (WARN_ON(node == NUMA_NO_NODE)) {
4914 pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
4915 /* happens iff arch is bonkers, let's just proceed */
4916 return;
3714 } 4917 }
4918 cpumask_set_cpu(cpu, tbl[node]);
3715 } 4919 }
3716 4920
3717 workqueue_freezing = false; 4921 wq_numa_possible_cpumask = tbl;
3718out_unlock: 4922 wq_numa_enabled = true;
3719 spin_unlock(&workqueue_lock);
3720} 4923}
3721#endif /* CONFIG_FREEZER */
3722 4924
3723static int __init init_workqueues(void) 4925static int __init init_workqueues(void)
3724{ 4926{
3725 unsigned int cpu; 4927 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
4928 int i, cpu;
3726 4929
3727 /* make sure we have enough bits for OFFQ pool ID */ 4930 /* make sure we have enough bits for OFFQ pool ID */
3728 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < 4931 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
3729 WORK_CPU_END * NR_STD_WORKER_POOLS); 4932 WORK_CPU_END * NR_STD_WORKER_POOLS);
3730 4933
4934 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
4935
4936 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
4937
3731 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); 4938 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3732 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); 4939 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3733 4940
4941 wq_numa_init();
4942
3734 /* initialize CPU pools */ 4943 /* initialize CPU pools */
3735 for_each_wq_cpu(cpu) { 4944 for_each_possible_cpu(cpu) {
3736 struct worker_pool *pool; 4945 struct worker_pool *pool;
3737 4946
3738 for_each_std_worker_pool(pool, cpu) { 4947 i = 0;
3739 spin_lock_init(&pool->lock); 4948 for_each_cpu_worker_pool(pool, cpu) {
4949 BUG_ON(init_worker_pool(pool));
3740 pool->cpu = cpu; 4950 pool->cpu = cpu;
3741 pool->flags |= POOL_DISASSOCIATED; 4951 cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
3742 INIT_LIST_HEAD(&pool->worklist); 4952 pool->attrs->nice = std_nice[i++];
3743 INIT_LIST_HEAD(&pool->idle_list); 4953 pool->node = cpu_to_node(cpu);
3744 hash_init(pool->busy_hash);
3745
3746 init_timer_deferrable(&pool->idle_timer);
3747 pool->idle_timer.function = idle_worker_timeout;
3748 pool->idle_timer.data = (unsigned long)pool;
3749
3750 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3751 (unsigned long)pool);
3752
3753 mutex_init(&pool->assoc_mutex);
3754 ida_init(&pool->worker_ida);
3755 4954
3756 /* alloc pool ID */ 4955 /* alloc pool ID */
4956 mutex_lock(&wq_pool_mutex);
3757 BUG_ON(worker_pool_assign_id(pool)); 4957 BUG_ON(worker_pool_assign_id(pool));
4958 mutex_unlock(&wq_pool_mutex);
3758 } 4959 }
3759 } 4960 }
3760 4961
3761 /* create the initial worker */ 4962 /* create the initial worker */
3762 for_each_online_wq_cpu(cpu) { 4963 for_each_online_cpu(cpu) {
3763 struct worker_pool *pool; 4964 struct worker_pool *pool;
3764 4965
3765 for_each_std_worker_pool(pool, cpu) { 4966 for_each_cpu_worker_pool(pool, cpu) {
3766 struct worker *worker; 4967 pool->flags &= ~POOL_DISASSOCIATED;
4968 BUG_ON(create_and_start_worker(pool) < 0);
4969 }
4970 }
3767 4971
3768 if (cpu != WORK_CPU_UNBOUND) 4972 /* create default unbound wq attrs */
3769 pool->flags &= ~POOL_DISASSOCIATED; 4973 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
4974 struct workqueue_attrs *attrs;
3770 4975
3771 worker = create_worker(pool); 4976 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
3772 BUG_ON(!worker); 4977 attrs->nice = std_nice[i];
3773 spin_lock_irq(&pool->lock); 4978 unbound_std_wq_attrs[i] = attrs;
3774 start_worker(worker);
3775 spin_unlock_irq(&pool->lock);
3776 }
3777 } 4979 }
3778 4980
3779 system_wq = alloc_workqueue("events", 0, 0); 4981 system_wq = alloc_workqueue("events", 0, 0);
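Once init_workqueues() has created system_wq and the other system workqueues, ordinary users queue work as usual; a minimal hedged sketch with hypothetical names:

        static void my_event_fn(struct work_struct *work)
        {
                /* ... */
        }
        static DECLARE_WORK(my_event_work, my_event_fn);

                /* equivalent to schedule_work(&my_event_work) */
                queue_work(system_wq, &my_event_work);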
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 07650264ec15..ad83c96b2ece 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -29,16 +29,24 @@ struct worker {
29 struct work_struct *current_work; /* L: work being processed */ 29 struct work_struct *current_work; /* L: work being processed */
30 work_func_t current_func; /* L: current_work's fn */ 30 work_func_t current_func; /* L: current_work's fn */
31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */ 31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */
32 bool desc_valid; /* ->desc is valid */
32 struct list_head scheduled; /* L: scheduled works */ 33 struct list_head scheduled; /* L: scheduled works */
34
35 /* 64 bytes boundary on 64bit, 32 on 32bit */
36
33 struct task_struct *task; /* I: worker task */ 37 struct task_struct *task; /* I: worker task */
34 struct worker_pool *pool; /* I: the associated pool */ 38 struct worker_pool *pool; /* I: the associated pool */
35 /* 64 bytes boundary on 64bit, 32 on 32bit */ 39 /* L: for rescuers */
40
36 unsigned long last_active; /* L: last active timestamp */ 41 unsigned long last_active; /* L: last active timestamp */
37 unsigned int flags; /* X: flags */ 42 unsigned int flags; /* X: flags */
38 int id; /* I: worker id */ 43 int id; /* I: worker id */
39 44
40 /* for rebinding worker to CPU */ 45 /*
41 struct work_struct rebind_work; /* L: for busy worker */ 46 * Opaque string set with set_worker_desc(). Printed out with task
47 * dump for debugging - WARN, BUG, panic or sysrq.
48 */
49 char desc[WORKER_DESC_LEN];
42 50
43 /* used only by rescuers to point to the target workqueue */ 51 /* used only by rescuers to point to the target workqueue */
44 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ 52 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
@@ -58,8 +66,7 @@ static inline struct worker *current_wq_worker(void)
58 * Scheduler hooks for concurrency managed workqueue. Only to be used from 66 * Scheduler hooks for concurrency managed workqueue. Only to be used from
59 * sched.c and workqueue.c. 67 * sched.c and workqueue.c.
60 */ 68 */
61void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); 69void wq_worker_waking_up(struct task_struct *task, int cpu);
62struct task_struct *wq_worker_sleeping(struct task_struct *task, 70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
63 unsigned int cpu);
64 71
65#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ 72#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */