aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-09-07 02:19:51 -0400
committerIngo Molnar <mingo@elte.hu>2009-09-07 02:19:51 -0400
commita1922ed661ab2c1637d0b10cde933bd9cd33d965 (patch)
tree0f1777542b385ebefd30b3586d830fd8ed6fda5b /kernel
parent75e33751ca8bbb72dd6f1a74d2810ddc8cbe4bdf (diff)
parentd28daf923ac5e4a0d7cecebae56f3e339189366b (diff)
Merge branch 'tracing/core' into tracing/hw-breakpoints
Conflicts: arch/Kconfig kernel/trace/trace.h Merge reason: resolve the conflicts, plus adopt to the new ring-buffer APIs. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/acct.c6
-rw-r--r--kernel/audit.c146
-rw-r--r--kernel/audit.h43
-rw-r--r--kernel/audit_tree.c66
-rw-r--r--kernel/audit_watch.c543
-rw-r--r--kernel/auditfilter.c518
-rw-r--r--kernel/auditsc.c33
-rw-r--r--kernel/cgroup.c165
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/exit.c294
-rw-r--r--kernel/fork.c51
-rw-r--r--kernel/freezer.c7
-rw-r--r--kernel/futex.c74
-rw-r--r--kernel/futex_compat.c6
-rw-r--r--kernel/gcov/Kconfig48
-rw-r--r--kernel/gcov/Makefile3
-rw-r--r--kernel/gcov/base.c148
-rw-r--r--kernel/gcov/fs.c673
-rw-r--r--kernel/gcov/gcc_3_4.c447
-rw-r--r--kernel/gcov/gcov.h128
-rw-r--r--kernel/hrtimer.c112
-rw-r--r--kernel/irq/internals.h3
-rw-r--r--kernel/irq/manage.c84
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/numa_migrate.c4
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kmod.c5
-rw-r--r--kernel/kprobes.c38
-rw-r--r--kernel/kthread.c90
-rw-r--r--kernel/lockdep_proc.c3
-rw-r--r--kernel/module.c46
-rw-r--r--kernel/nsproxy.c19
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/perf_counter.c1396
-rw-r--r--kernel/pid.c17
-rw-r--r--kernel/pid_namespace.c24
-rw-r--r--kernel/posix-cpu-timers.c7
-rw-r--r--kernel/posix-timers.c7
-rw-r--r--kernel/power/user.c1
-rw-r--r--kernel/profile.c5
-rw-r--r--kernel/ptrace.c163
-rw-r--r--kernel/rcutree.c3
-rw-r--r--kernel/res_counter.c12
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/rtmutex.c4
-rw-r--r--kernel/sched.c80
-rw-r--r--kernel/sched_cpupri.c17
-rw-r--r--kernel/sched_debug.c6
-rw-r--r--kernel/sched_fair.c48
-rw-r--r--kernel/sched_rt.c18
-rw-r--r--kernel/signal.c37
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c65
-rw-r--r--kernel/sysctl.c24
-rw-r--r--kernel/time/clockevents.c27
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/tick-broadcast.c7
-rw-r--r--kernel/time/tick-sched.c12
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/time/timer_stats.c16
-rw-r--r--kernel/timer.c4
-rw-r--r--kernel/trace/Kconfig27
-rw-r--r--kernel/trace/blktrace.c25
-rw-r--r--kernel/trace/ftrace.c215
-rw-r--r--kernel/trace/kmemtrace.c149
-rw-r--r--kernel/trace/ring_buffer.c1437
-rw-r--r--kernel/trace/ring_buffer_benchmark.c45
-rw-r--r--kernel/trace/trace.c736
-rw-r--r--kernel/trace/trace.h87
-rw-r--r--kernel/trace/trace_boot.c16
-rw-r--r--kernel/trace/trace_event_profile.c2
-rw-r--r--kernel/trace/trace_event_types.h3
-rw-r--r--kernel/trace/trace_events.c178
-rw-r--r--kernel/trace/trace_events_filter.c292
-rw-r--r--kernel/trace/trace_export.c28
-rw-r--r--kernel/trace/trace_functions.c17
-rw-r--r--kernel/trace/trace_functions_graph.c213
-rw-r--r--kernel/trace/trace_irqsoff.c3
-rw-r--r--kernel/trace/trace_mmiotrace.c10
-rw-r--r--kernel/trace/trace_output.c3
-rw-r--r--kernel/trace/trace_power.c22
-rw-r--r--kernel/trace/trace_printk.c28
-rw-r--r--kernel/trace/trace_sched_switch.c59
-rw-r--r--kernel/trace/trace_sched_wakeup.c7
-rw-r--r--kernel/trace/trace_selftest.c1
-rw-r--r--kernel/trace/trace_stack.c54
-rw-r--r--kernel/trace/trace_stat.c53
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_syscalls.c471
-rw-r--r--kernel/trace/trace_workqueue.c32
-rw-r--r--kernel/tracepoint.c50
-rw-r--r--kernel/utsname.c13
-rw-r--r--kernel/wait.c5
94 files changed, 6975 insertions, 3141 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f88decb1b445..52508612a08f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,8 +69,9 @@ obj-$(CONFIG_IKCONFIG) += configs.o
69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
74obj-$(CONFIG_GCOV_KERNEL) += gcov/
74obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
75obj-$(CONFIG_KPROBES) += kprobes.o 76obj-$(CONFIG_KPROBES) += kprobes.o
76obj-$(CONFIG_KGDB) += kgdb.o 77obj-$(CONFIG_KGDB) += kgdb.o
@@ -95,6 +96,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
95obj-$(CONFIG_FUNCTION_TRACER) += trace/ 96obj-$(CONFIG_FUNCTION_TRACER) += trace/
96obj-$(CONFIG_TRACING) += trace/ 97obj-$(CONFIG_TRACING) += trace/
97obj-$(CONFIG_X86_DS) += trace/ 98obj-$(CONFIG_X86_DS) += trace/
99obj-$(CONFIG_RING_BUFFER) += trace/
98obj-$(CONFIG_SMP) += sched_cpupri.o 100obj-$(CONFIG_SMP) += sched_cpupri.o
99obj-$(CONFIG_SLOW_WORK) += slow-work.o 101obj-$(CONFIG_SLOW_WORK) += slow-work.o
100obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 102obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa31564162..9f3391090b3e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
215static int acct_on(char *name) 215static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt;
218 int error; 219 int error;
219 struct pid_namespace *ns; 220 struct pid_namespace *ns;
220 struct bsd_acct_struct *acct = NULL; 221 struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
256 acct = NULL; 257 acct = NULL;
257 } 258 }
258 259
259 mnt_pin(file->f_path.mnt); 260 mnt = file->f_path.mnt;
261 mnt_pin(mnt);
260 acct_file_reopen(ns->bacct, file, ns); 262 acct_file_reopen(ns->bacct, file, ns);
261 spin_unlock(&acct_lock); 263 spin_unlock(&acct_lock);
262 264
263 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ 265 mntput(mnt); /* it's pinned, now give up active reference */
264 kfree(acct); 266 kfree(acct);
265 267
266 return 0; 268 return 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba9..defc2e6f1e3b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
115/* The netlink socket. */ 115/* The netlink socket. */
116static struct sock *audit_sock; 116static struct sock *audit_sock;
117 117
118/* Inotify handle. */
119struct inotify_handle *audit_ih;
120
121/* Hash for inode-based rules */ 118/* Hash for inode-based rules */
122struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 119struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
123 120
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
136static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 133static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
137 134
138/* Serialize requests from userspace. */ 135/* Serialize requests from userspace. */
139static DEFINE_MUTEX(audit_cmd_mutex); 136DEFINE_MUTEX(audit_cmd_mutex);
140 137
141/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 138/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
142 * audit records. Since printk uses a 1024 byte buffer, this buffer 139 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
375 kfree_skb(skb); 372 kfree_skb(skb);
376} 373}
377 374
375/*
376 * For one reason or another this nlh isn't getting delivered to the userspace
377 * audit daemon, just send it to printk.
378 */
379static void audit_printk_skb(struct sk_buff *skb)
380{
381 struct nlmsghdr *nlh = nlmsg_hdr(skb);
382 char *data = NLMSG_DATA(nlh);
383
384 if (nlh->nlmsg_type != AUDIT_EOE) {
385 if (printk_ratelimit())
386 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
387 else
388 audit_log_lost("printk limit exceeded\n");
389 }
390
391 audit_hold_skb(skb);
392}
393
378static void kauditd_send_skb(struct sk_buff *skb) 394static void kauditd_send_skb(struct sk_buff *skb)
379{ 395{
380 int err; 396 int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
427 if (skb) { 443 if (skb) {
428 if (audit_pid) 444 if (audit_pid)
429 kauditd_send_skb(skb); 445 kauditd_send_skb(skb);
430 else { 446 else
431 if (printk_ratelimit()) 447 audit_printk_skb(skb);
432 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
433 else
434 audit_log_lost("printk limit exceeded\n");
435
436 audit_hold_skb(skb);
437 }
438 } else { 448 } else {
439 DECLARE_WAITQUEUE(wait, current); 449 DECLARE_WAITQUEUE(wait, current);
440 set_current_state(TASK_INTERRUPTIBLE); 450 set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
495 return 0; 505 return 0;
496} 506}
497 507
498#ifdef CONFIG_AUDIT_TREE
499static int prune_tree_thread(void *unused)
500{
501 mutex_lock(&audit_cmd_mutex);
502 audit_prune_trees();
503 mutex_unlock(&audit_cmd_mutex);
504 return 0;
505}
506
507void audit_schedule_prune(void)
508{
509 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
510}
511#endif
512
513struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
514 int multi, void *payload, int size) 509 int multi, void *payload, int size)
515{ 510{
516 struct sk_buff *skb; 511 struct sk_buff *skb;
517 struct nlmsghdr *nlh; 512 struct nlmsghdr *nlh;
518 int len = NLMSG_SPACE(size);
519 void *data; 513 void *data;
520 int flags = multi ? NLM_F_MULTI : 0; 514 int flags = multi ? NLM_F_MULTI : 0;
521 int t = done ? NLMSG_DONE : type; 515 int t = done ? NLMSG_DONE : type;
522 516
523 skb = alloc_skb(len, GFP_KERNEL); 517 skb = nlmsg_new(size, GFP_KERNEL);
524 if (!skb) 518 if (!skb)
525 return NULL; 519 return NULL;
526 520
527 nlh = NLMSG_PUT(skb, pid, seq, t, size); 521 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
528 nlh->nlmsg_flags = flags; 522 data = NLMSG_DATA(nlh);
529 data = NLMSG_DATA(nlh);
530 memcpy(data, payload, size); 523 memcpy(data, payload, size);
531 return skb; 524 return skb;
532 525
533nlmsg_failure: /* Used by NLMSG_PUT */ 526nlmsg_failure: /* Used by NLMSG_NEW */
534 if (skb) 527 if (skb)
535 kfree_skb(skb); 528 kfree_skb(skb);
536 return NULL; 529 return NULL;
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
926} 919}
927 920
928/* 921/*
929 * Get message from skb (based on rtnetlink_rcv_skb). Each message is 922 * Get message from skb. Each message is processed by audit_receive_msg.
930 * processed by audit_receive_msg. Malformed skbs with wrong length are 923 * Malformed skbs with wrong length are discarded silently.
931 * discarded silently.
932 */ 924 */
933static void audit_receive_skb(struct sk_buff *skb) 925static void audit_receive_skb(struct sk_buff *skb)
934{ 926{
935 int err; 927 struct nlmsghdr *nlh;
936 struct nlmsghdr *nlh; 928 /*
937 u32 rlen; 929 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
930 * if the nlmsg_len was not aligned
931 */
932 int len;
933 int err;
938 934
939 while (skb->len >= NLMSG_SPACE(0)) { 935 nlh = nlmsg_hdr(skb);
940 nlh = nlmsg_hdr(skb); 936 len = skb->len;
941 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) 937
942 return; 938 while (NLMSG_OK(nlh, len)) {
943 rlen = NLMSG_ALIGN(nlh->nlmsg_len); 939 err = audit_receive_msg(skb, nlh);
944 if (rlen > skb->len) 940 /* if err or if this message says it wants a response */
945 rlen = skb->len; 941 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
946 if ((err = audit_receive_msg(skb, nlh))) {
947 netlink_ack(skb, nlh, err); 942 netlink_ack(skb, nlh, err);
948 } else if (nlh->nlmsg_flags & NLM_F_ACK) 943
949 netlink_ack(skb, nlh, 0); 944 nlh = NLMSG_NEXT(nlh, len);
950 skb_pull(skb, rlen);
951 } 945 }
952} 946}
953 947
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
959 mutex_unlock(&audit_cmd_mutex); 953 mutex_unlock(&audit_cmd_mutex);
960} 954}
961 955
962#ifdef CONFIG_AUDITSYSCALL
963static const struct inotify_operations audit_inotify_ops = {
964 .handle_event = audit_handle_ievent,
965 .destroy_watch = audit_free_parent,
966};
967#endif
968
969/* Initialize audit support at boot time. */ 956/* Initialize audit support at boot time. */
970static int __init audit_init(void) 957static int __init audit_init(void)
971{ 958{
@@ -991,12 +978,6 @@ static int __init audit_init(void)
991 978
992 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 979 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
993 980
994#ifdef CONFIG_AUDITSYSCALL
995 audit_ih = inotify_init(&audit_inotify_ops);
996 if (IS_ERR(audit_ih))
997 audit_panic("cannot initialize inotify handle");
998#endif
999
1000 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) 981 for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
1001 INIT_LIST_HEAD(&audit_inode_hash[i]); 982 INIT_LIST_HEAD(&audit_inode_hash[i]);
1002 983
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1070 goto err; 1051 goto err;
1071 } 1052 }
1072 1053
1073 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
1074 if (!ab->skb)
1075 goto err;
1076
1077 ab->ctx = ctx; 1054 ab->ctx = ctx;
1078 ab->gfp_mask = gfp_mask; 1055 ab->gfp_mask = gfp_mask;
1079 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); 1056
1080 nlh->nlmsg_type = type; 1057 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1081 nlh->nlmsg_flags = 0; 1058 if (!ab->skb)
1082 nlh->nlmsg_pid = 0; 1059 goto nlmsg_failure;
1083 nlh->nlmsg_seq = 0; 1060
1061 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
1062
1084 return ab; 1063 return ab;
1064
1065nlmsg_failure: /* Used by NLMSG_NEW */
1066 kfree_skb(ab->skb);
1067 ab->skb = NULL;
1085err: 1068err:
1086 audit_buffer_free(ab); 1069 audit_buffer_free(ab);
1087 return NULL; 1070 return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1452 kfree(pathname); 1435 kfree(pathname);
1453} 1436}
1454 1437
1438void audit_log_key(struct audit_buffer *ab, char *key)
1439{
1440 audit_log_format(ab, " key=");
1441 if (key)
1442 audit_log_untrustedstring(ab, key);
1443 else
1444 audit_log_format(ab, "(null)");
1445}
1446
1455/** 1447/**
1456 * audit_log_end - end one audit record 1448 * audit_log_end - end one audit record
1457 * @ab: the audit_buffer 1449 * @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
1475 skb_queue_tail(&audit_skb_queue, ab->skb); 1467 skb_queue_tail(&audit_skb_queue, ab->skb);
1476 wake_up_interruptible(&kauditd_wait); 1468 wake_up_interruptible(&kauditd_wait);
1477 } else { 1469 } else {
1478 if (nlh->nlmsg_type != AUDIT_EOE) { 1470 audit_printk_skb(ab->skb);
1479 if (printk_ratelimit()) {
1480 printk(KERN_NOTICE "type=%d %s\n",
1481 nlh->nlmsg_type,
1482 ab->skb->data + NLMSG_SPACE(0));
1483 } else
1484 audit_log_lost("printk limit exceeded\n");
1485 }
1486 audit_hold_skb(ab->skb);
1487 } 1471 }
1488 ab->skb = NULL; 1472 ab->skb = NULL;
1489 } 1473 }
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661b..208687be4f30 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
53}; 53};
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_parent; 56struct audit_watch;
57
58struct audit_watch {
59 atomic_t count; /* reference count */
60 char *path; /* insertion path */
61 dev_t dev; /* associated superblock device */
62 unsigned long ino; /* associated inode number */
63 struct audit_parent *parent; /* associated parent */
64 struct list_head wlist; /* entry in parent->watches list */
65 struct list_head rules; /* associated rules */
66};
67
68struct audit_tree; 57struct audit_tree;
69struct audit_chunk; 58struct audit_chunk;
70 59
@@ -108,19 +97,28 @@ struct audit_netlink_list {
108 97
109int audit_send_list(void *); 98int audit_send_list(void *);
110 99
111struct inotify_watch;
112/* Inotify handle */
113extern struct inotify_handle *audit_ih;
114
115extern void audit_free_parent(struct inotify_watch *);
116extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
117 const char *, struct inode *);
118extern int selinux_audit_rule_update(void); 100extern int selinux_audit_rule_update(void);
119 101
120extern struct mutex audit_filter_mutex; 102extern struct mutex audit_filter_mutex;
121extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
122extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
123 105
106/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch);
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule);
113extern void audit_remove_watch(struct audit_watch *watch);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch);
118
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
120 struct audit_watch *watch);
121
124#ifdef CONFIG_AUDIT_TREE 122#ifdef CONFIG_AUDIT_TREE
125extern struct audit_chunk *audit_tree_lookup(const struct inode *); 123extern struct audit_chunk *audit_tree_lookup(const struct inode *);
126extern void audit_put_chunk(struct audit_chunk *); 124extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
130extern int audit_remove_tree_rule(struct audit_krule *); 128extern int audit_remove_tree_rule(struct audit_krule *);
131extern void audit_trim_trees(void); 129extern void audit_trim_trees(void);
132extern int audit_tag_tree(char *old, char *new); 130extern int audit_tag_tree(char *old, char *new);
133extern void audit_schedule_prune(void);
134extern void audit_prune_trees(void);
135extern const char *audit_tree_path(struct audit_tree *); 131extern const char *audit_tree_path(struct audit_tree *);
136extern void audit_put_tree(struct audit_tree *); 132extern void audit_put_tree(struct audit_tree *);
133extern void audit_kill_trees(struct list_head *);
137#else 134#else
138#define audit_remove_tree_rule(rule) BUG() 135#define audit_remove_tree_rule(rule) BUG()
139#define audit_add_tree_rule(rule) -EINVAL 136#define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
142#define audit_put_tree(tree) (void)0 139#define audit_put_tree(tree) (void)0
143#define audit_tag_tree(old, new) -EINVAL 140#define audit_tag_tree(old, new) -EINVAL
144#define audit_tree_path(rule) "" /* never called */ 141#define audit_tree_path(rule) "" /* never called */
142#define audit_kill_trees(list) BUG()
145#endif 143#endif
146 144
147extern char *audit_unpack_string(void **, size_t *, size_t); 145extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
160 return 0; 158 return 0;
161} 159}
162extern void audit_filter_inodes(struct task_struct *, struct audit_context *); 160extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
161extern struct list_head *audit_killed_trees(void);
163#else 162#else
164#define audit_signal_info(s,t) AUDIT_DISABLED 163#define audit_signal_info(s,t) AUDIT_DISABLED
165#define audit_filter_inodes(t,c) AUDIT_DISABLED 164#define audit_filter_inodes(t,c) AUDIT_DISABLED
166#endif 165#endif
166
167extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1f6396d76687..2451dc6f3282 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h>
5 6
6struct audit_tree; 7struct audit_tree;
7struct audit_chunk; 8struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
441 if (rule->tree) { 442 if (rule->tree) {
442 /* not a half-baked one */ 443 /* not a half-baked one */
443 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 444 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
444 audit_log_format(ab, "op=remove rule dir="); 445 audit_log_format(ab, "op=");
446 audit_log_string(ab, "remove rule");
447 audit_log_format(ab, " dir=");
445 audit_log_untrustedstring(ab, rule->tree->pathname); 448 audit_log_untrustedstring(ab, rule->tree->pathname);
446 if (rule->filterkey) { 449 audit_log_key(ab, rule->filterkey);
447 audit_log_format(ab, " key=");
448 audit_log_untrustedstring(ab, rule->filterkey);
449 } else
450 audit_log_format(ab, " key=(null)");
451 audit_log_format(ab, " list=%d res=1", rule->listnr); 450 audit_log_format(ab, " list=%d res=1", rule->listnr);
452 audit_log_end(ab); 451 audit_log_end(ab);
453 rule->tree = NULL; 452 rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
519 } 518 }
520} 519}
521 520
521static void audit_schedule_prune(void);
522
522/* called with audit_filter_mutex */ 523/* called with audit_filter_mutex */
523int audit_remove_tree_rule(struct audit_krule *rule) 524int audit_remove_tree_rule(struct audit_krule *rule)
524{ 525{
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
824 825
825/* 826/*
826 * That gets run when evict_chunk() ends up needing to kill audit_tree. 827 * That gets run when evict_chunk() ends up needing to kill audit_tree.
827 * Runs from a separate thread, with audit_cmd_mutex held. 828 * Runs from a separate thread.
828 */ 829 */
829void audit_prune_trees(void) 830static int prune_tree_thread(void *unused)
830{ 831{
832 mutex_lock(&audit_cmd_mutex);
831 mutex_lock(&audit_filter_mutex); 833 mutex_lock(&audit_filter_mutex);
832 834
833 while (!list_empty(&prune_list)) { 835 while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
844 } 846 }
845 847
846 mutex_unlock(&audit_filter_mutex); 848 mutex_unlock(&audit_filter_mutex);
849 mutex_unlock(&audit_cmd_mutex);
850 return 0;
851}
852
853static void audit_schedule_prune(void)
854{
855 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
856}
857
858/*
859 * ... and that one is done if evict_chunk() decides to delay until the end
860 * of syscall. Runs synchronously.
861 */
862void audit_kill_trees(struct list_head *list)
863{
864 mutex_lock(&audit_cmd_mutex);
865 mutex_lock(&audit_filter_mutex);
866
867 while (!list_empty(list)) {
868 struct audit_tree *victim;
869
870 victim = list_entry(list->next, struct audit_tree, list);
871 kill_rules(victim);
872 list_del_init(&victim->list);
873
874 mutex_unlock(&audit_filter_mutex);
875
876 prune_one(victim);
877
878 mutex_lock(&audit_filter_mutex);
879 }
880
881 mutex_unlock(&audit_filter_mutex);
882 mutex_unlock(&audit_cmd_mutex);
847} 883}
848 884
849/* 885/*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
854static void evict_chunk(struct audit_chunk *chunk) 890static void evict_chunk(struct audit_chunk *chunk)
855{ 891{
856 struct audit_tree *owner; 892 struct audit_tree *owner;
893 struct list_head *postponed = audit_killed_trees();
894 int need_prune = 0;
857 int n; 895 int n;
858 896
859 if (chunk->dead) 897 if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
869 owner->root = NULL; 907 owner->root = NULL;
870 list_del_init(&owner->same_root); 908 list_del_init(&owner->same_root);
871 spin_unlock(&hash_lock); 909 spin_unlock(&hash_lock);
872 kill_rules(owner); 910 if (!postponed) {
873 list_move(&owner->list, &prune_list); 911 kill_rules(owner);
874 audit_schedule_prune(); 912 list_move(&owner->list, &prune_list);
913 need_prune = 1;
914 } else {
915 list_move(&owner->list, postponed);
916 }
875 spin_lock(&hash_lock); 917 spin_lock(&hash_lock);
876 } 918 }
877 list_del_rcu(&chunk->hash); 919 list_del_rcu(&chunk->hash);
878 for (n = 0; n < chunk->count; n++) 920 for (n = 0; n < chunk->count; n++)
879 list_del_init(&chunk->owners[n].list); 921 list_del_init(&chunk->owners[n].list);
880 spin_unlock(&hash_lock); 922 spin_unlock(&hash_lock);
923 if (need_prune)
924 audit_schedule_prune();
881 mutex_unlock(&audit_filter_mutex); 925 mutex_unlock(&audit_filter_mutex);
882} 926}
883 927
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 000000000000..0e96dbc60ea9
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
1/* audit_watch.c -- watching inodes
2 *
3 * Copyright 2003-2009 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/mutex.h>
26#include <linux/fs.h>
27#include <linux/namei.h>
28#include <linux/netlink.h>
29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h>
32#include "audit.h"
33
34/*
35 * Reference counting:
36 *
37 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
38 * event. Each audit_watch holds a reference to its associated parent.
39 *
40 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
41 * audit_remove_watch(). Additionally, an audit_watch may exist
42 * temporarily to assist in searching existing filter data. Each
43 * audit_krule holds a reference to its associated watch.
44 */
45
46struct audit_watch {
47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */
50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */
53 struct list_head rules; /* associated rules */
54};
55
56struct audit_parent {
57 struct list_head ilist; /* entry in inotify registration list */
58 struct list_head watches; /* associated watches */
59 struct inotify_watch wdata; /* inotify watch data */
60 unsigned flags; /* status flags */
61};
62
63/* Inotify handle. */
64struct inotify_handle *audit_ih;
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Inotify events we care about. */
78#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
79
80static void audit_free_parent(struct inotify_watch *i_watch)
81{
82 struct audit_parent *parent;
83
84 parent = container_of(i_watch, struct audit_parent, wdata);
85 WARN_ON(!list_empty(&parent->watches));
86 kfree(parent);
87}
88
89void audit_get_watch(struct audit_watch *watch)
90{
91 atomic_inc(&watch->count);
92}
93
94void audit_put_watch(struct audit_watch *watch)
95{
96 if (atomic_dec_and_test(&watch->count)) {
97 WARN_ON(watch->parent);
98 WARN_ON(!list_empty(&watch->rules));
99 kfree(watch->path);
100 kfree(watch);
101 }
102}
103
104void audit_remove_watch(struct audit_watch *watch)
105{
106 list_del(&watch->wlist);
107 put_inotify_watch(&watch->parent->wdata);
108 watch->parent = NULL;
109 audit_put_watch(watch); /* match initial get */
110}
111
112char *audit_watch_path(struct audit_watch *watch)
113{
114 return watch->path;
115}
116
117struct list_head *audit_watch_rules(struct audit_watch *watch)
118{
119 return &watch->rules;
120}
121
122unsigned long audit_watch_inode(struct audit_watch *watch)
123{
124 return watch->ino;
125}
126
127dev_t audit_watch_dev(struct audit_watch *watch)
128{
129 return watch->dev;
130}
131
132/* Initialize a parent watch entry. */
133static struct audit_parent *audit_init_parent(struct nameidata *ndp)
134{
135 struct audit_parent *parent;
136 s32 wd;
137
138 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
139 if (unlikely(!parent))
140 return ERR_PTR(-ENOMEM);
141
142 INIT_LIST_HEAD(&parent->watches);
143 parent->flags = 0;
144
145 inotify_init_watch(&parent->wdata);
146 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
147 get_inotify_watch(&parent->wdata);
148 wd = inotify_add_watch(audit_ih, &parent->wdata,
149 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
150 if (wd < 0) {
151 audit_free_parent(&parent->wdata);
152 return ERR_PTR(wd);
153 }
154
155 return parent;
156}
157
158/* Initialize a watch entry. */
159static struct audit_watch *audit_init_watch(char *path)
160{
161 struct audit_watch *watch;
162
163 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
164 if (unlikely(!watch))
165 return ERR_PTR(-ENOMEM);
166
167 INIT_LIST_HEAD(&watch->rules);
168 atomic_set(&watch->count, 1);
169 watch->path = path;
170 watch->dev = (dev_t)-1;
171 watch->ino = (unsigned long)-1;
172
173 return watch;
174}
175
176/* Translate a watch string to kernel respresentation. */
177int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
178{
179 struct audit_watch *watch;
180
181 if (!audit_ih)
182 return -EOPNOTSUPP;
183
184 if (path[0] != '/' || path[len-1] == '/' ||
185 krule->listnr != AUDIT_FILTER_EXIT ||
186 op != Audit_equal ||
187 krule->inode_f || krule->watch || krule->tree)
188 return -EINVAL;
189
190 watch = audit_init_watch(path);
191 if (IS_ERR(watch))
192 return PTR_ERR(watch);
193
194 audit_get_watch(watch);
195 krule->watch = watch;
196
197 return 0;
198}
199
200/* Duplicate the given audit watch. The new watch's rules list is initialized
201 * to an empty list and wlist is undefined. */
202static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
203{
204 char *path;
205 struct audit_watch *new;
206
207 path = kstrdup(old->path, GFP_KERNEL);
208 if (unlikely(!path))
209 return ERR_PTR(-ENOMEM);
210
211 new = audit_init_watch(path);
212 if (IS_ERR(new)) {
213 kfree(path);
214 goto out;
215 }
216
217 new->dev = old->dev;
218 new->ino = old->ino;
219 get_inotify_watch(&old->parent->wdata);
220 new->parent = old->parent;
221
222out:
223 return new;
224}
225
226static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
227{
228 if (audit_enabled) {
229 struct audit_buffer *ab;
230 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
231 audit_log_format(ab, "auid=%u ses=%u op=",
232 audit_get_loginuid(current),
233 audit_get_sessionid(current));
234 audit_log_string(ab, op);
235 audit_log_format(ab, " path=");
236 audit_log_untrustedstring(ab, w->path);
237 audit_log_key(ab, r->filterkey);
238 audit_log_format(ab, " list=%d res=1", r->listnr);
239 audit_log_end(ab);
240 }
241}
242
243/* Update inode info in audit rules based on filesystem event. */
244static void audit_update_watch(struct audit_parent *parent,
245 const char *dname, dev_t dev,
246 unsigned long ino, unsigned invalidating)
247{
248 struct audit_watch *owatch, *nwatch, *nextw;
249 struct audit_krule *r, *nextr;
250 struct audit_entry *oentry, *nentry;
251
252 mutex_lock(&audit_filter_mutex);
253 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
254 if (audit_compare_dname_path(dname, owatch->path, NULL))
255 continue;
256
257 /* If the update involves invalidating rules, do the inode-based
258 * filtering now, so we don't omit records. */
259 if (invalidating && current->audit_context)
260 audit_filter_inodes(current, current->audit_context);
261
262 nwatch = audit_dupe_watch(owatch);
263 if (IS_ERR(nwatch)) {
264 mutex_unlock(&audit_filter_mutex);
265 audit_panic("error updating watch, skipping");
266 return;
267 }
268 nwatch->dev = dev;
269 nwatch->ino = ino;
270
271 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
272
273 oentry = container_of(r, struct audit_entry, rule);
274 list_del(&oentry->rule.rlist);
275 list_del_rcu(&oentry->list);
276
277 nentry = audit_dupe_rule(&oentry->rule, nwatch);
278 if (IS_ERR(nentry)) {
279 list_del(&oentry->rule.list);
280 audit_panic("error updating watch, removing");
281 } else {
282 int h = audit_hash_ino((u32)ino);
283 list_add(&nentry->rule.rlist, &nwatch->rules);
284 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
285 list_replace(&oentry->rule.list,
286 &nentry->rule.list);
287 }
288
289 audit_watch_log_rule_change(r, owatch, "updated rules");
290
291 call_rcu(&oentry->rcu, audit_free_rule_rcu);
292 }
293
294 audit_remove_watch(owatch);
295 goto add_watch_to_parent; /* event applies to a single watch */
296 }
297 mutex_unlock(&audit_filter_mutex);
298 return;
299
300add_watch_to_parent:
301 list_add(&nwatch->wlist, &parent->watches);
302 mutex_unlock(&audit_filter_mutex);
303 return;
304}
305
306/* Remove all watches & rules associated with a parent that is going away. */
307static void audit_remove_parent_watches(struct audit_parent *parent)
308{
309 struct audit_watch *w, *nextw;
310 struct audit_krule *r, *nextr;
311 struct audit_entry *e;
312
313 mutex_lock(&audit_filter_mutex);
314 parent->flags |= AUDIT_PARENT_INVALID;
315 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
316 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
317 e = container_of(r, struct audit_entry, rule);
318 audit_watch_log_rule_change(r, w, "remove rule");
319 list_del(&r->rlist);
320 list_del(&r->list);
321 list_del_rcu(&e->list);
322 call_rcu(&e->rcu, audit_free_rule_rcu);
323 }
324 audit_remove_watch(w);
325 }
326 mutex_unlock(&audit_filter_mutex);
327}
328
329/* Unregister inotify watches for parents on in_list.
330 * Generates an IN_IGNORED event. */
331void audit_inotify_unregister(struct list_head *in_list)
332{
333 struct audit_parent *p, *n;
334
335 list_for_each_entry_safe(p, n, in_list, ilist) {
336 list_del(&p->ilist);
337 inotify_rm_watch(audit_ih, &p->wdata);
338 /* the unpin matching the pin in audit_do_del_rule() */
339 unpin_inotify_watch(&p->wdata);
340 }
341}
342
343/* Get path information necessary for adding watches. */
344static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
345{
346 struct nameidata *ndparent, *ndwatch;
347 int err;
348
349 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
350 if (unlikely(!ndparent))
351 return -ENOMEM;
352
353 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
354 if (unlikely(!ndwatch)) {
355 kfree(ndparent);
356 return -ENOMEM;
357 }
358
359 err = path_lookup(path, LOOKUP_PARENT, ndparent);
360 if (err) {
361 kfree(ndparent);
362 kfree(ndwatch);
363 return err;
364 }
365
366 err = path_lookup(path, 0, ndwatch);
367 if (err) {
368 kfree(ndwatch);
369 ndwatch = NULL;
370 }
371
372 *ndp = ndparent;
373 *ndw = ndwatch;
374
375 return 0;
376}
377
378/* Release resources used for watch path information. */
379static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
380{
381 if (ndp) {
382 path_put(&ndp->path);
383 kfree(ndp);
384 }
385 if (ndw) {
386 path_put(&ndw->path);
387 kfree(ndw);
388 }
389}
390
391/* Associate the given rule with an existing parent inotify_watch.
392 * Caller must hold audit_filter_mutex. */
393static void audit_add_to_parent(struct audit_krule *krule,
394 struct audit_parent *parent)
395{
396 struct audit_watch *w, *watch = krule->watch;
397 int watch_found = 0;
398
399 list_for_each_entry(w, &parent->watches, wlist) {
400 if (strcmp(watch->path, w->path))
401 continue;
402
403 watch_found = 1;
404
405 /* put krule's and initial refs to temporary watch */
406 audit_put_watch(watch);
407 audit_put_watch(watch);
408
409 audit_get_watch(w);
410 krule->watch = watch = w;
411 break;
412 }
413
414 if (!watch_found) {
415 get_inotify_watch(&parent->wdata);
416 watch->parent = parent;
417
418 list_add(&watch->wlist, &parent->watches);
419 }
420 list_add(&krule->rlist, &watch->rules);
421}
422
423/* Find a matching watch entry, or add this one.
424 * Caller must hold audit_filter_mutex. */
425int audit_add_watch(struct audit_krule *krule)
426{
427 struct audit_watch *watch = krule->watch;
428 struct inotify_watch *i_watch;
429 struct audit_parent *parent;
430 struct nameidata *ndp = NULL, *ndw = NULL;
431 int ret = 0;
432
433 mutex_unlock(&audit_filter_mutex);
434
435 /* Avoid calling path_lookup under audit_filter_mutex. */
436 ret = audit_get_nd(watch->path, &ndp, &ndw);
437 if (ret) {
438 /* caller expects mutex locked */
439 mutex_lock(&audit_filter_mutex);
440 goto error;
441 }
442
443 /* update watch filter fields */
444 if (ndw) {
445 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
446 watch->ino = ndw->path.dentry->d_inode->i_ino;
447 }
448
449 /* The audit_filter_mutex must not be held during inotify calls because
450 * we hold it during inotify event callback processing. If an existing
451 * inotify watch is found, inotify_find_watch() grabs a reference before
452 * returning.
453 */
454 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
455 &i_watch) < 0) {
456 parent = audit_init_parent(ndp);
457 if (IS_ERR(parent)) {
458 /* caller expects mutex locked */
459 mutex_lock(&audit_filter_mutex);
460 ret = PTR_ERR(parent);
461 goto error;
462 }
463 } else
464 parent = container_of(i_watch, struct audit_parent, wdata);
465
466 mutex_lock(&audit_filter_mutex);
467
468 /* parent was moved before we took audit_filter_mutex */
469 if (parent->flags & AUDIT_PARENT_INVALID)
470 ret = -ENOENT;
471 else
472 audit_add_to_parent(krule, parent);
473
474 /* match get in audit_init_parent or inotify_find_watch */
475 put_inotify_watch(&parent->wdata);
476
477error:
478 audit_put_nd(ndp, ndw); /* NULL args OK */
479 return ret;
480
481}
482
483void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
484{
485 struct audit_watch *watch = krule->watch;
486 struct audit_parent *parent = watch->parent;
487
488 list_del(&krule->rlist);
489
490 if (list_empty(&watch->rules)) {
491 audit_remove_watch(watch);
492
493 if (list_empty(&parent->watches)) {
494 /* Put parent on the inotify un-registration
495 * list. Grab a reference before releasing
496 * audit_filter_mutex, to be released in
497 * audit_inotify_unregister().
498 * If filesystem is going away, just leave
499 * the sucker alone, eviction will take
500 * care of it. */
501 if (pin_inotify_watch(&parent->wdata))
502 list_add(&parent->ilist, list);
503 }
504 }
505}
506
507/* Update watch data in audit rules based on inotify events. */
508static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
509 u32 cookie, const char *dname, struct inode *inode)
510{
511 struct audit_parent *parent;
512
513 parent = container_of(i_watch, struct audit_parent, wdata);
514
515 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
516 audit_update_watch(parent, dname, inode->i_sb->s_dev,
517 inode->i_ino, 0);
518 else if (mask & (IN_DELETE|IN_MOVED_FROM))
519 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
520 /* inotify automatically removes the watch and sends IN_IGNORED */
521 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
522 audit_remove_parent_watches(parent);
523 /* inotify does not remove the watch, so remove it manually */
524 else if(mask & IN_MOVE_SELF) {
525 audit_remove_parent_watches(parent);
526 inotify_remove_watch_locked(audit_ih, i_watch);
527 } else if (mask & IN_IGNORED)
528 put_inotify_watch(i_watch);
529}
530
531static const struct inotify_operations audit_inotify_ops = {
532 .handle_event = audit_handle_ievent,
533 .destroy_watch = audit_free_parent,
534};
535
536static int __init audit_watch_init(void)
537{
538 audit_ih = inotify_init(&audit_inotify_ops);
539 if (IS_ERR(audit_ih))
540 audit_panic("cannot initialize inotify handle");
541 return 0;
542}
543subsys_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a02..a70604047f3c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h> 30#include <linux/security.h>
32#include "audit.h" 31#include "audit.h"
33 32
@@ -44,36 +43,6 @@
44 * be written directly provided audit_filter_mutex is held. 43 * be written directly provided audit_filter_mutex is held.
45 */ 44 */
46 45
47/*
48 * Reference counting:
49 *
50 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
51 * event. Each audit_watch holds a reference to its associated parent.
52 *
53 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
54 * audit_remove_watch(). Additionally, an audit_watch may exist
55 * temporarily to assist in searching existing filter data. Each
56 * audit_krule holds a reference to its associated watch.
57 */
58
59struct audit_parent {
60 struct list_head ilist; /* entry in inotify registration list */
61 struct list_head watches; /* associated watches */
62 struct inotify_watch wdata; /* inotify watch data */
63 unsigned flags; /* status flags */
64};
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Audit filter lists, defined in <linux/audit.h> */ 46/* Audit filter lists, defined in <linux/audit.h> */
78struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { 47struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
79 LIST_HEAD_INIT(audit_filter_list[0]), 48 LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
97 66
98DEFINE_MUTEX(audit_filter_mutex); 67DEFINE_MUTEX(audit_filter_mutex);
99 68
100/* Inotify events we care about. */
101#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
102
103void audit_free_parent(struct inotify_watch *i_watch)
104{
105 struct audit_parent *parent;
106
107 parent = container_of(i_watch, struct audit_parent, wdata);
108 WARN_ON(!list_empty(&parent->watches));
109 kfree(parent);
110}
111
112static inline void audit_get_watch(struct audit_watch *watch)
113{
114 atomic_inc(&watch->count);
115}
116
117static void audit_put_watch(struct audit_watch *watch)
118{
119 if (atomic_dec_and_test(&watch->count)) {
120 WARN_ON(watch->parent);
121 WARN_ON(!list_empty(&watch->rules));
122 kfree(watch->path);
123 kfree(watch);
124 }
125}
126
127static void audit_remove_watch(struct audit_watch *watch)
128{
129 list_del(&watch->wlist);
130 put_inotify_watch(&watch->parent->wdata);
131 watch->parent = NULL;
132 audit_put_watch(watch); /* match initial get */
133}
134
135static inline void audit_free_rule(struct audit_entry *e) 69static inline void audit_free_rule(struct audit_entry *e)
136{ 70{
137 int i; 71 int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
156 audit_free_rule(e); 90 audit_free_rule(e);
157} 91}
158 92
159/* Initialize a parent watch entry. */
160static struct audit_parent *audit_init_parent(struct nameidata *ndp)
161{
162 struct audit_parent *parent;
163 s32 wd;
164
165 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
166 if (unlikely(!parent))
167 return ERR_PTR(-ENOMEM);
168
169 INIT_LIST_HEAD(&parent->watches);
170 parent->flags = 0;
171
172 inotify_init_watch(&parent->wdata);
173 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
174 get_inotify_watch(&parent->wdata);
175 wd = inotify_add_watch(audit_ih, &parent->wdata,
176 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
177 if (wd < 0) {
178 audit_free_parent(&parent->wdata);
179 return ERR_PTR(wd);
180 }
181
182 return parent;
183}
184
185/* Initialize a watch entry. */
186static struct audit_watch *audit_init_watch(char *path)
187{
188 struct audit_watch *watch;
189
190 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
191 if (unlikely(!watch))
192 return ERR_PTR(-ENOMEM);
193
194 INIT_LIST_HEAD(&watch->rules);
195 atomic_set(&watch->count, 1);
196 watch->path = path;
197 watch->dev = (dev_t)-1;
198 watch->ino = (unsigned long)-1;
199
200 return watch;
201}
202
203/* Initialize an audit filterlist entry. */ 93/* Initialize an audit filterlist entry. */
204static inline struct audit_entry *audit_init_entry(u32 field_count) 94static inline struct audit_entry *audit_init_entry(u32 field_count)
205{ 95{
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
260 return 0; 150 return 0;
261} 151}
262 152
263/* Translate a watch string to kernel respresentation. */
264static int audit_to_watch(struct audit_krule *krule, char *path, int len,
265 u32 op)
266{
267 struct audit_watch *watch;
268
269 if (!audit_ih)
270 return -EOPNOTSUPP;
271
272 if (path[0] != '/' || path[len-1] == '/' ||
273 krule->listnr != AUDIT_FILTER_EXIT ||
274 op != Audit_equal ||
275 krule->inode_f || krule->watch || krule->tree)
276 return -EINVAL;
277
278 watch = audit_init_watch(path);
279 if (IS_ERR(watch))
280 return PTR_ERR(watch);
281
282 audit_get_watch(watch);
283 krule->watch = watch;
284
285 return 0;
286}
287
288static __u32 *classes[AUDIT_SYSCALL_CLASSES]; 153static __u32 *classes[AUDIT_SYSCALL_CLASSES];
289 154
290int __init audit_register_class(int class, unsigned *list) 155int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
766 break; 631 break;
767 case AUDIT_WATCH: 632 case AUDIT_WATCH:
768 data->buflen += data->values[i] = 633 data->buflen += data->values[i] =
769 audit_pack_string(&bufp, krule->watch->path); 634 audit_pack_string(&bufp,
635 audit_watch_path(krule->watch));
770 break; 636 break;
771 case AUDIT_DIR: 637 case AUDIT_DIR:
772 data->buflen += data->values[i] = 638 data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
818 return 1; 684 return 1;
819 break; 685 break;
820 case AUDIT_WATCH: 686 case AUDIT_WATCH:
821 if (strcmp(a->watch->path, b->watch->path)) 687 if (strcmp(audit_watch_path(a->watch),
688 audit_watch_path(b->watch)))
822 return 1; 689 return 1;
823 break; 690 break;
824 case AUDIT_DIR: 691 case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
844 return 0; 711 return 0;
845} 712}
846 713
847/* Duplicate the given audit watch. The new watch's rules list is initialized
848 * to an empty list and wlist is undefined. */
849static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
850{
851 char *path;
852 struct audit_watch *new;
853
854 path = kstrdup(old->path, GFP_KERNEL);
855 if (unlikely(!path))
856 return ERR_PTR(-ENOMEM);
857
858 new = audit_init_watch(path);
859 if (IS_ERR(new)) {
860 kfree(path);
861 goto out;
862 }
863
864 new->dev = old->dev;
865 new->ino = old->ino;
866 get_inotify_watch(&old->parent->wdata);
867 new->parent = old->parent;
868
869out:
870 return new;
871}
872
873/* Duplicate LSM field information. The lsm_rule is opaque, so must be 714/* Duplicate LSM field information. The lsm_rule is opaque, so must be
874 * re-initialized. */ 715 * re-initialized. */
875static inline int audit_dupe_lsm_field(struct audit_field *df, 716static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
904 * rule with the new rule in the filterlist, then free the old rule. 745 * rule with the new rule in the filterlist, then free the old rule.
905 * The rlist element is undefined; list manipulations are handled apart from 746 * The rlist element is undefined; list manipulations are handled apart from
906 * the initial copy. */ 747 * the initial copy. */
907static struct audit_entry *audit_dupe_rule(struct audit_krule *old, 748struct audit_entry *audit_dupe_rule(struct audit_krule *old,
908 struct audit_watch *watch) 749 struct audit_watch *watch)
909{ 750{
910 u32 fcount = old->field_count; 751 u32 fcount = old->field_count;
911 struct audit_entry *entry; 752 struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
977 return entry; 818 return entry;
978} 819}
979 820
980/* Update inode info in audit rules based on filesystem event. */
981static void audit_update_watch(struct audit_parent *parent,
982 const char *dname, dev_t dev,
983 unsigned long ino, unsigned invalidating)
984{
985 struct audit_watch *owatch, *nwatch, *nextw;
986 struct audit_krule *r, *nextr;
987 struct audit_entry *oentry, *nentry;
988
989 mutex_lock(&audit_filter_mutex);
990 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
991 if (audit_compare_dname_path(dname, owatch->path, NULL))
992 continue;
993
994 /* If the update involves invalidating rules, do the inode-based
995 * filtering now, so we don't omit records. */
996 if (invalidating && current->audit_context)
997 audit_filter_inodes(current, current->audit_context);
998
999 nwatch = audit_dupe_watch(owatch);
1000 if (IS_ERR(nwatch)) {
1001 mutex_unlock(&audit_filter_mutex);
1002 audit_panic("error updating watch, skipping");
1003 return;
1004 }
1005 nwatch->dev = dev;
1006 nwatch->ino = ino;
1007
1008 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
1009
1010 oentry = container_of(r, struct audit_entry, rule);
1011 list_del(&oentry->rule.rlist);
1012 list_del_rcu(&oentry->list);
1013
1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1017 audit_panic("error updating watch, removing");
1018 } else {
1019 int h = audit_hash_ino((u32)ino);
1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1024 }
1025
1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
1027 }
1028
1029 if (audit_enabled) {
1030 struct audit_buffer *ab;
1031 ab = audit_log_start(NULL, GFP_NOFS,
1032 AUDIT_CONFIG_CHANGE);
1033 audit_log_format(ab, "auid=%u ses=%u",
1034 audit_get_loginuid(current),
1035 audit_get_sessionid(current));
1036 audit_log_format(ab,
1037 " op=updated rules specifying path=");
1038 audit_log_untrustedstring(ab, owatch->path);
1039 audit_log_format(ab, " with dev=%u ino=%lu\n",
1040 dev, ino);
1041 audit_log_format(ab, " list=%d res=1", r->listnr);
1042 audit_log_end(ab);
1043 }
1044 audit_remove_watch(owatch);
1045 goto add_watch_to_parent; /* event applies to a single watch */
1046 }
1047 mutex_unlock(&audit_filter_mutex);
1048 return;
1049
1050add_watch_to_parent:
1051 list_add(&nwatch->wlist, &parent->watches);
1052 mutex_unlock(&audit_filter_mutex);
1053 return;
1054}
1055
1056/* Remove all watches & rules associated with a parent that is going away. */
1057static void audit_remove_parent_watches(struct audit_parent *parent)
1058{
1059 struct audit_watch *w, *nextw;
1060 struct audit_krule *r, *nextr;
1061 struct audit_entry *e;
1062
1063 mutex_lock(&audit_filter_mutex);
1064 parent->flags |= AUDIT_PARENT_INVALID;
1065 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
1066 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
1067 e = container_of(r, struct audit_entry, rule);
1068 if (audit_enabled) {
1069 struct audit_buffer *ab;
1070 ab = audit_log_start(NULL, GFP_NOFS,
1071 AUDIT_CONFIG_CHANGE);
1072 audit_log_format(ab, "auid=%u ses=%u",
1073 audit_get_loginuid(current),
1074 audit_get_sessionid(current));
1075 audit_log_format(ab, " op=remove rule path=");
1076 audit_log_untrustedstring(ab, w->path);
1077 if (r->filterkey) {
1078 audit_log_format(ab, " key=");
1079 audit_log_untrustedstring(ab,
1080 r->filterkey);
1081 } else
1082 audit_log_format(ab, " key=(null)");
1083 audit_log_format(ab, " list=%d res=1",
1084 r->listnr);
1085 audit_log_end(ab);
1086 }
1087 list_del(&r->rlist);
1088 list_del(&r->list);
1089 list_del_rcu(&e->list);
1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1091 }
1092 audit_remove_watch(w);
1093 }
1094 mutex_unlock(&audit_filter_mutex);
1095}
1096
1097/* Unregister inotify watches for parents on in_list.
1098 * Generates an IN_IGNORED event. */
1099static void audit_inotify_unregister(struct list_head *in_list)
1100{
1101 struct audit_parent *p, *n;
1102
1103 list_for_each_entry_safe(p, n, in_list, ilist) {
1104 list_del(&p->ilist);
1105 inotify_rm_watch(audit_ih, &p->wdata);
1106 /* the unpin matching the pin in audit_do_del_rule() */
1107 unpin_inotify_watch(&p->wdata);
1108 }
1109}
1110
1111/* Find an existing audit rule. 821/* Find an existing audit rule.
1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 822 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1113static struct audit_entry *audit_find_rule(struct audit_entry *entry, 823static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
1145 return found; 855 return found;
1146} 856}
1147 857
1148/* Get path information necessary for adding watches. */
1149static int audit_get_nd(char *path, struct nameidata **ndp,
1150 struct nameidata **ndw)
1151{
1152 struct nameidata *ndparent, *ndwatch;
1153 int err;
1154
1155 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
1156 if (unlikely(!ndparent))
1157 return -ENOMEM;
1158
1159 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
1160 if (unlikely(!ndwatch)) {
1161 kfree(ndparent);
1162 return -ENOMEM;
1163 }
1164
1165 err = path_lookup(path, LOOKUP_PARENT, ndparent);
1166 if (err) {
1167 kfree(ndparent);
1168 kfree(ndwatch);
1169 return err;
1170 }
1171
1172 err = path_lookup(path, 0, ndwatch);
1173 if (err) {
1174 kfree(ndwatch);
1175 ndwatch = NULL;
1176 }
1177
1178 *ndp = ndparent;
1179 *ndw = ndwatch;
1180
1181 return 0;
1182}
1183
1184/* Release resources used for watch path information. */
1185static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
1186{
1187 if (ndp) {
1188 path_put(&ndp->path);
1189 kfree(ndp);
1190 }
1191 if (ndw) {
1192 path_put(&ndw->path);
1193 kfree(ndw);
1194 }
1195}
1196
1197/* Associate the given rule with an existing parent inotify_watch.
1198 * Caller must hold audit_filter_mutex. */
1199static void audit_add_to_parent(struct audit_krule *krule,
1200 struct audit_parent *parent)
1201{
1202 struct audit_watch *w, *watch = krule->watch;
1203 int watch_found = 0;
1204
1205 list_for_each_entry(w, &parent->watches, wlist) {
1206 if (strcmp(watch->path, w->path))
1207 continue;
1208
1209 watch_found = 1;
1210
1211 /* put krule's and initial refs to temporary watch */
1212 audit_put_watch(watch);
1213 audit_put_watch(watch);
1214
1215 audit_get_watch(w);
1216 krule->watch = watch = w;
1217 break;
1218 }
1219
1220 if (!watch_found) {
1221 get_inotify_watch(&parent->wdata);
1222 watch->parent = parent;
1223
1224 list_add(&watch->wlist, &parent->watches);
1225 }
1226 list_add(&krule->rlist, &watch->rules);
1227}
1228
1229/* Find a matching watch entry, or add this one.
1230 * Caller must hold audit_filter_mutex. */
1231static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1232 struct nameidata *ndw)
1233{
1234 struct audit_watch *watch = krule->watch;
1235 struct inotify_watch *i_watch;
1236 struct audit_parent *parent;
1237 int ret = 0;
1238
1239 /* update watch filter fields */
1240 if (ndw) {
1241 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
1242 watch->ino = ndw->path.dentry->d_inode->i_ino;
1243 }
1244
1245 /* The audit_filter_mutex must not be held during inotify calls because
1246 * we hold it during inotify event callback processing. If an existing
1247 * inotify watch is found, inotify_find_watch() grabs a reference before
1248 * returning.
1249 */
1250 mutex_unlock(&audit_filter_mutex);
1251
1252 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
1253 &i_watch) < 0) {
1254 parent = audit_init_parent(ndp);
1255 if (IS_ERR(parent)) {
1256 /* caller expects mutex locked */
1257 mutex_lock(&audit_filter_mutex);
1258 return PTR_ERR(parent);
1259 }
1260 } else
1261 parent = container_of(i_watch, struct audit_parent, wdata);
1262
1263 mutex_lock(&audit_filter_mutex);
1264
1265 /* parent was moved before we took audit_filter_mutex */
1266 if (parent->flags & AUDIT_PARENT_INVALID)
1267 ret = -ENOENT;
1268 else
1269 audit_add_to_parent(krule, parent);
1270
1271 /* match get in audit_init_parent or inotify_find_watch */
1272 put_inotify_watch(&parent->wdata);
1273 return ret;
1274}
1275
1276static u64 prio_low = ~0ULL/2; 858static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1; 859static u64 prio_high = ~0ULL/2 - 1;
1278 860
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
1282 struct audit_entry *e; 864 struct audit_entry *e;
1283 struct audit_watch *watch = entry->rule.watch; 865 struct audit_watch *watch = entry->rule.watch;
1284 struct audit_tree *tree = entry->rule.tree; 866 struct audit_tree *tree = entry->rule.tree;
1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list; 867 struct list_head *list;
1287 int h, err; 868 int h, err;
1288#ifdef CONFIG_AUDITSYSCALL 869#ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
1296 877
1297 mutex_lock(&audit_filter_mutex); 878 mutex_lock(&audit_filter_mutex);
1298 e = audit_find_rule(entry, &list); 879 e = audit_find_rule(entry, &list);
1299 mutex_unlock(&audit_filter_mutex);
1300 if (e) { 880 if (e) {
881 mutex_unlock(&audit_filter_mutex);
1301 err = -EEXIST; 882 err = -EEXIST;
1302 /* normally audit_add_tree_rule() will free it on failure */ 883 /* normally audit_add_tree_rule() will free it on failure */
1303 if (tree) 884 if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
1305 goto error; 886 goto error;
1306 } 887 }
1307 888
1308 /* Avoid calling path_lookup under audit_filter_mutex. */
1309 if (watch) {
1310 err = audit_get_nd(watch->path, &ndp, &ndw);
1311 if (err)
1312 goto error;
1313 }
1314
1315 mutex_lock(&audit_filter_mutex);
1316 if (watch) { 889 if (watch) {
1317 /* audit_filter_mutex is dropped and re-taken during this call */ 890 /* audit_filter_mutex is dropped and re-taken during this call */
1318 err = audit_add_watch(&entry->rule, ndp, ndw); 891 err = audit_add_watch(&entry->rule);
1319 if (err) { 892 if (err) {
1320 mutex_unlock(&audit_filter_mutex); 893 mutex_unlock(&audit_filter_mutex);
1321 goto error; 894 goto error;
1322 } 895 }
1323 h = audit_hash_ino((u32)watch->ino); 896 /* entry->rule.watch may have changed during audit_add_watch() */
897 watch = entry->rule.watch;
898 h = audit_hash_ino((u32)audit_watch_inode(watch));
1324 list = &audit_inode_hash[h]; 899 list = &audit_inode_hash[h];
1325 } 900 }
1326 if (tree) { 901 if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
1358#endif 933#endif
1359 mutex_unlock(&audit_filter_mutex); 934 mutex_unlock(&audit_filter_mutex);
1360 935
1361 audit_put_nd(ndp, ndw); /* NULL args OK */
1362 return 0; 936 return 0;
1363 937
1364error: 938error:
1365 audit_put_nd(ndp, ndw); /* NULL args OK */
1366 if (watch) 939 if (watch)
1367 audit_put_watch(watch); /* tmp watch, matches initial get */ 940 audit_put_watch(watch); /* tmp watch, matches initial get */
1368 return err; 941 return err;
@@ -1372,7 +945,7 @@ error:
1372static inline int audit_del_rule(struct audit_entry *entry) 945static inline int audit_del_rule(struct audit_entry *entry)
1373{ 946{
1374 struct audit_entry *e; 947 struct audit_entry *e;
1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 948 struct audit_watch *watch = entry->rule.watch;
1376 struct audit_tree *tree = entry->rule.tree; 949 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list; 950 struct list_head *list;
1378 LIST_HEAD(inotify_list); 951 LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1394 goto out; 967 goto out;
1395 } 968 }
1396 969
1397 watch = e->rule.watch; 970 if (e->rule.watch)
1398 if (watch) { 971 audit_remove_watch_rule(&e->rule, &inotify_list);
1399 struct audit_parent *parent = watch->parent;
1400
1401 list_del(&e->rule.rlist);
1402
1403 if (list_empty(&watch->rules)) {
1404 audit_remove_watch(watch);
1405
1406 if (list_empty(&parent->watches)) {
1407 /* Put parent on the inotify un-registration
1408 * list. Grab a reference before releasing
1409 * audit_filter_mutex, to be released in
1410 * audit_inotify_unregister().
1411 * If filesystem is going away, just leave
1412 * the sucker alone, eviction will take
1413 * care of it.
1414 */
1415 if (pin_inotify_watch(&parent->wdata))
1416 list_add(&parent->ilist, &inotify_list);
1417 }
1418 }
1419 }
1420 972
1421 if (e->rule.tree) 973 if (e->rule.tree)
1422 audit_remove_tree_rule(&e->rule); 974 audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1438 audit_inotify_unregister(&inotify_list); 990 audit_inotify_unregister(&inotify_list);
1439 991
1440out: 992out:
1441 if (tmp_watch) 993 if (watch)
1442 audit_put_watch(tmp_watch); /* match initial get */ 994 audit_put_watch(watch); /* match initial get */
1443 if (tree) 995 if (tree)
1444 audit_put_tree(tree); /* that's the temporary one */ 996 audit_put_tree(tree); /* that's the temporary one */
1445 997
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1527 security_release_secctx(ctx, len); 1079 security_release_secctx(ctx, len);
1528 } 1080 }
1529 } 1081 }
1530 audit_log_format(ab, " op=%s rule key=", action); 1082 audit_log_format(ab, " op=");
1531 if (rule->filterkey) 1083 audit_log_string(ab, action);
1532 audit_log_untrustedstring(ab, rule->filterkey); 1084 audit_log_key(ab, rule->filterkey);
1533 else
1534 audit_log_format(ab, "(null)");
1535 audit_log_format(ab, " list=%d res=%d", rule->listnr, res); 1085 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1536 audit_log_end(ab); 1086 audit_log_end(ab);
1537} 1087}
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1595 return PTR_ERR(entry); 1145 return PTR_ERR(entry);
1596 1146
1597 err = audit_add_rule(entry); 1147 err = audit_add_rule(entry);
1598 audit_log_rule_change(loginuid, sessionid, sid, "add", 1148 audit_log_rule_change(loginuid, sessionid, sid, "add rule",
1599 &entry->rule, !err); 1149 &entry->rule, !err);
1600 1150
1601 if (err) 1151 if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1611 return PTR_ERR(entry); 1161 return PTR_ERR(entry);
1612 1162
1613 err = audit_del_rule(entry); 1163 err = audit_del_rule(entry);
1614 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1164 audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
1615 &entry->rule, !err); 1165 &entry->rule, !err);
1616 1166
1617 audit_free_rule(entry); 1167 audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
1793 list_del(&r->list); 1343 list_del(&r->list);
1794 } else { 1344 } else {
1795 if (watch) { 1345 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules); 1346 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1797 list_del(&r->rlist); 1347 list_del(&r->rlist);
1798 } else if (tree) 1348 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist); 1349 list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
1829 1379
1830 return err; 1380 return err;
1831} 1381}
1832
1833/* Update watch data in audit rules based on inotify events. */
1834void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1835 u32 cookie, const char *dname, struct inode *inode)
1836{
1837 struct audit_parent *parent;
1838
1839 parent = container_of(i_watch, struct audit_parent, wdata);
1840
1841 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1842 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1843 inode->i_ino, 0);
1844 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1845 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1846 /* inotify automatically removes the watch and sends IN_IGNORED */
1847 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1848 audit_remove_parent_watches(parent);
1849 /* inotify does not remove the watch, so remove it manually */
1850 else if(mask & IN_MOVE_SELF) {
1851 audit_remove_parent_watches(parent);
1852 inotify_remove_watch_locked(audit_ih, i_watch);
1853 } else if (mask & IN_IGNORED)
1854 put_inotify_watch(i_watch);
1855}
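
The auditfilter.c hunks above move the watch bookkeeping behind kernel/audit_watch.c accessors, but the underlying scheme is unchanged: audit pins an inotify watch on the parent directory of the configured path and refreshes the child's dev/ino from directory events (see the removed audit_handle_ievent() above). A rough userspace sketch of that parent-directory pattern, using only the standard inotify(7) API (the default path below is purely illustrative):

/*
 * Userspace sketch of the "watch the parent, track the child by name"
 * pattern used above: put one inotify watch on the parent directory
 * and react to events that mention the child's basename.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <libgen.h>
#include <sys/inotify.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/tmp/watched-file"; /* illustrative */
	char *dirc = strdup(path), *basec = strdup(path);
	char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));
	const char *dir, *base;
	ssize_t len;
	int fd, wd;

	if (!dirc || !basec)
		return 1;
	dir = dirname(dirc);
	base = basename(basec);

	fd = inotify_init1(IN_CLOEXEC);
	if (fd < 0) { perror("inotify_init1"); return 1; }

	/* Watch the parent directory, as audit_init_parent() does. */
	wd = inotify_add_watch(fd, dir, IN_CREATE | IN_MOVED_TO |
					IN_DELETE | IN_MOVED_FROM |
					IN_DELETE_SELF | IN_MOVE_SELF);
	if (wd < 0) { perror("inotify_add_watch"); return 1; }

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		char *p = buf;

		while (p < buf + len) {
			const struct inotify_event *ev = (const void *)p;

			/* Only events naming our child update the "watch". */
			if (ev->len && strcmp(ev->name, base) == 0) {
				if (ev->mask & (IN_CREATE | IN_MOVED_TO))
					printf("%s appeared: refresh dev/ino\n", base);
				if (ev->mask & (IN_DELETE | IN_MOVED_FROM))
					printf("%s vanished: invalidate dev/ino\n", base);
			}
			/* The parent itself going away invalidates everything. */
			if (ev->mask & (IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT))
				printf("parent gone: drop all watches\n");

			p += sizeof(*ev) + ev->len;
		}
	}
	free(dirc);
	free(basec);
	return 0;
}
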
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f414..68d3c6a0ecd6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count; 201 int tree_count;
202 struct list_head killed_trees;
202 203
203 int type; 204 int type;
204 union { 205 union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
548 } 549 }
549 break; 550 break;
550 case AUDIT_WATCH: 551 case AUDIT_WATCH:
551 if (name && rule->watch->ino != (unsigned long)-1) 552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
552 result = (name->dev == rule->watch->dev && 553 result = (name->dev == audit_watch_dev(rule->watch) &&
553 name->ino == rule->watch->ino); 554 name->ino == audit_watch_inode(rule->watch));
554 break; 555 break;
555 case AUDIT_DIR: 556 case AUDIT_DIR:
556 if (ctx) 557 if (ctx)
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
853 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) 854 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
854 return NULL; 855 return NULL;
855 audit_zero_context(context, state); 856 audit_zero_context(context, state);
857 INIT_LIST_HEAD(&context->killed_trees);
856 return context; 858 return context;
857} 859}
858 860
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1024{ 1026{
1025 char arg_num_len_buf[12]; 1027 char arg_num_len_buf[12];
1026 const char __user *tmp_p = p; 1028 const char __user *tmp_p = p;
1027 /* how many digits are in arg_num? 3 is the length of " a=" */ 1029 /* how many digits are in arg_num? 5 is the length of ' a=""' */
1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1030 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
1029 size_t len, len_left, to_send; 1031 size_t len, len_left, to_send;
1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1032 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
1031 unsigned int i, has_cntl = 0, too_long = 0; 1033 unsigned int i, has_cntl = 0, too_long = 0;
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1137 if (has_cntl) 1139 if (has_cntl)
1138 audit_log_n_hex(*ab, buf, to_send); 1140 audit_log_n_hex(*ab, buf, to_send);
1139 else 1141 else
1140 audit_log_format(*ab, "\"%s\"", buf); 1142 audit_log_string(*ab, buf);
1141 1143
1142 p += to_send; 1144 p += to_send;
1143 len_left -= to_send; 1145 len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1372 1374
1373 1375
1374 audit_log_task_info(ab, tsk); 1376 audit_log_task_info(ab, tsk);
1375 if (context->filterkey) { 1377 audit_log_key(ab, context->filterkey);
1376 audit_log_format(ab, " key=");
1377 audit_log_untrustedstring(ab, context->filterkey);
1378 } else
1379 audit_log_format(ab, " key=(null)");
1380 audit_log_end(ab); 1378 audit_log_end(ab);
1381 1379
1382 for (aux = context->aux; aux; aux = aux->next) { 1380 for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
1549 /* that can happen only if we are called from do_exit() */ 1547 /* that can happen only if we are called from do_exit() */
1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1548 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1551 audit_log_exit(context, tsk); 1549 audit_log_exit(context, tsk);
1550 if (!list_empty(&context->killed_trees))
1551 audit_kill_trees(&context->killed_trees);
1552 1552
1553 audit_free_context(context); 1553 audit_free_context(context);
1554} 1554}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
1692 context->in_syscall = 0; 1692 context->in_syscall = 0;
1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1694 1694
1695 if (!list_empty(&context->killed_trees))
1696 audit_kill_trees(&context->killed_trees);
1697
1695 if (context->previous) { 1698 if (context->previous) {
1696 struct audit_context *new_context = context->previous; 1699 struct audit_context *new_context = context->previous;
1697 context->previous = NULL; 1700 context->previous = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
2525 audit_log_format(ab, " sig=%ld", signr); 2528 audit_log_format(ab, " sig=%ld", signr);
2526 audit_log_end(ab); 2529 audit_log_end(ab);
2527} 2530}
2531
2532struct list_head *audit_killed_trees(void)
2533{
2534 struct audit_context *ctx = current->audit_context;
2535 if (likely(!ctx || !ctx->in_syscall))
2536 return NULL;
2537 return &ctx->killed_trees;
2538}
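
One small detail worth checking in the auditsc.c hunk above: the per-argument overhead constant in audit_log_single_execve_arg() moves from 3 to 5 because every execve argument is now framed as a quoted aN="..." field. A quick userspace check of that arithmetic (plain C, no kernel interfaces assumed):

/*
 * Sanity-check of the per-argument framing cost: each quoted argument
 * costs strlen(" aN=\"\"") bytes, i.e. 5 plus the digits of N, which
 * is why the constant moved from 3 to 5.
 */
#include <stdio.h>

static int framing_len(unsigned int arg_num)
{
	char digits[12];

	/* 5 == strlen(" a=\"\""): space, 'a', '=', and the two quotes */
	return snprintf(digits, sizeof(digits), "%u", arg_num) + 5;
}

int main(void)
{
	unsigned int n;

	for (n = 0; n <= 1000; n += 111) {
		char frame[32];
		int actual = snprintf(frame, sizeof(frame), " a%u=\"\"", n);

		printf("a%-4u framing=%d computed=%d\n", n, actual, framing_len(n));
	}
	return 0;
}
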
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3fb789f6df94..b6eadfe30e7b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h> 49#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h>
50 51
51#include <asm/atomic.h> 52#include <asm/atomic.h>
52 53
@@ -734,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
734 * reference to css->refcnt. In general, this refcnt is expected to go down 735 * reference to css->refcnt. In general, this refcnt is expected to go down
735 * to zero soon. 736 * to zero soon.
736 * 737 *
737 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; 738 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
738 */ 739 */
739DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 740DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
740 741
741static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) 742static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
742{ 743{
743 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 744 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
744 wake_up_all(&cgroup_rmdir_waitq); 745 wake_up_all(&cgroup_rmdir_waitq);
745} 746}
746 747
748void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
749{
750 css_get(css);
751}
752
753void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
754{
755 cgroup_wakeup_rmdir_waiter(css->cgroup);
756 css_put(css);
757}
758
759
747static int rebind_subsystems(struct cgroupfs_root *root, 760static int rebind_subsystems(struct cgroupfs_root *root,
748 unsigned long final_bits) 761 unsigned long final_bits)
749{ 762{
@@ -843,6 +856,11 @@ static int parse_cgroupfs_options(char *data,
843 struct cgroup_sb_opts *opts) 856 struct cgroup_sb_opts *opts)
844{ 857{
845 char *token, *o = data ?: "all"; 858 char *token, *o = data ?: "all";
859 unsigned long mask = (unsigned long)-1;
860
861#ifdef CONFIG_CPUSETS
862 mask = ~(1UL << cpuset_subsys_id);
863#endif
846 864
847 opts->subsys_bits = 0; 865 opts->subsys_bits = 0;
848 opts->flags = 0; 866 opts->flags = 0;
@@ -887,6 +905,15 @@ static int parse_cgroupfs_options(char *data,
887 } 905 }
888 } 906 }
889 907
908 /*
909 * Option noprefix was introduced just for backward compatibility
910 * with the old cpuset, so we allow noprefix only if mounting just
911 * the cpuset subsystem.
912 */
913 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
914 (opts->subsys_bits & mask))
915 return -EINVAL;
916
890 /* We can't have an empty hierarchy */ 917 /* We can't have an empty hierarchy */
891 if (!opts->subsys_bits) 918 if (!opts->subsys_bits)
892 return -EINVAL; 919 return -EINVAL;
@@ -946,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
946 INIT_LIST_HEAD(&cgrp->children); 973 INIT_LIST_HEAD(&cgrp->children);
947 INIT_LIST_HEAD(&cgrp->css_sets); 974 INIT_LIST_HEAD(&cgrp->css_sets);
948 INIT_LIST_HEAD(&cgrp->release_list); 975 INIT_LIST_HEAD(&cgrp->release_list);
976 INIT_LIST_HEAD(&cgrp->pids_list);
949 init_rwsem(&cgrp->pids_mutex); 977 init_rwsem(&cgrp->pids_mutex);
950} 978}
951static void init_cgroup_root(struct cgroupfs_root *root) 979static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1343,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1343 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1371 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1344 * is no longer empty. 1372 * is no longer empty.
1345 */ 1373 */
1346 cgroup_wakeup_rmdir_waiters(cgrp); 1374 cgroup_wakeup_rmdir_waiter(cgrp);
1347 return 0; 1375 return 0;
1348} 1376}
1349 1377
@@ -2187,12 +2215,30 @@ err:
2187 return ret; 2215 return ret;
2188} 2216}
2189 2217
2218/*
2219 * Cache pids for all threads in the same pid namespace that are
2220 * opening the same "tasks" file.
2221 */
2222struct cgroup_pids {
2223 /* The node in cgrp->pids_list */
2224 struct list_head list;
2225 /* The cgroup those pids belong to */
2226 struct cgroup *cgrp;
 2227 /* The namespace those pids belong to */
2228 struct pid_namespace *ns;
2229 /* Array of process ids in the cgroup */
2230 pid_t *tasks_pids;
 2231 /* How many files are using this tasks_pids array */
2232 int use_count;
2233 /* Length of the current tasks_pids array */
2234 int length;
2235};
2236
2190static int cmppid(const void *a, const void *b) 2237static int cmppid(const void *a, const void *b)
2191{ 2238{
2192 return *(pid_t *)a - *(pid_t *)b; 2239 return *(pid_t *)a - *(pid_t *)b;
2193} 2240}
2194 2241
2195
2196/* 2242/*
2197 * seq_file methods for the "tasks" file. The seq_file position is the 2243 * seq_file methods for the "tasks" file. The seq_file position is the
2198 * next pid to display; the seq_file iterator is a pointer to the pid 2244 * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2207,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2207 * after a seek to the start). Use a binary-search to find the 2253 * after a seek to the start). Use a binary-search to find the
2208 * next pid to display, if any 2254 * next pid to display, if any
2209 */ 2255 */
2210 struct cgroup *cgrp = s->private; 2256 struct cgroup_pids *cp = s->private;
2257 struct cgroup *cgrp = cp->cgrp;
2211 int index = 0, pid = *pos; 2258 int index = 0, pid = *pos;
2212 int *iter; 2259 int *iter;
2213 2260
2214 down_read(&cgrp->pids_mutex); 2261 down_read(&cgrp->pids_mutex);
2215 if (pid) { 2262 if (pid) {
2216 int end = cgrp->pids_length; 2263 int end = cp->length;
2217 2264
2218 while (index < end) { 2265 while (index < end) {
2219 int mid = (index + end) / 2; 2266 int mid = (index + end) / 2;
2220 if (cgrp->tasks_pids[mid] == pid) { 2267 if (cp->tasks_pids[mid] == pid) {
2221 index = mid; 2268 index = mid;
2222 break; 2269 break;
2223 } else if (cgrp->tasks_pids[mid] <= pid) 2270 } else if (cp->tasks_pids[mid] <= pid)
2224 index = mid + 1; 2271 index = mid + 1;
2225 else 2272 else
2226 end = mid; 2273 end = mid;
2227 } 2274 }
2228 } 2275 }
2229 /* If we're off the end of the array, we're done */ 2276 /* If we're off the end of the array, we're done */
2230 if (index >= cgrp->pids_length) 2277 if (index >= cp->length)
2231 return NULL; 2278 return NULL;
2232 /* Update the abstract position to be the actual pid that we found */ 2279 /* Update the abstract position to be the actual pid that we found */
2233 iter = cgrp->tasks_pids + index; 2280 iter = cp->tasks_pids + index;
2234 *pos = *iter; 2281 *pos = *iter;
2235 return iter; 2282 return iter;
2236} 2283}
2237 2284
2238static void cgroup_tasks_stop(struct seq_file *s, void *v) 2285static void cgroup_tasks_stop(struct seq_file *s, void *v)
2239{ 2286{
2240 struct cgroup *cgrp = s->private; 2287 struct cgroup_pids *cp = s->private;
2288 struct cgroup *cgrp = cp->cgrp;
2241 up_read(&cgrp->pids_mutex); 2289 up_read(&cgrp->pids_mutex);
2242} 2290}
2243 2291
2244static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2292static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2245{ 2293{
2246 struct cgroup *cgrp = s->private; 2294 struct cgroup_pids *cp = s->private;
2247 int *p = v; 2295 int *p = v;
2248 int *end = cgrp->tasks_pids + cgrp->pids_length; 2296 int *end = cp->tasks_pids + cp->length;
2249 2297
2250 /* 2298 /*
2251 * Advance to the next pid in the array. If this goes off the 2299 * Advance to the next pid in the array. If this goes off the
@@ -2272,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
2272 .show = cgroup_tasks_show, 2320 .show = cgroup_tasks_show,
2273}; 2321};
2274 2322
2275static void release_cgroup_pid_array(struct cgroup *cgrp) 2323static void release_cgroup_pid_array(struct cgroup_pids *cp)
2276{ 2324{
2325 struct cgroup *cgrp = cp->cgrp;
2326
2277 down_write(&cgrp->pids_mutex); 2327 down_write(&cgrp->pids_mutex);
2278 BUG_ON(!cgrp->pids_use_count); 2328 BUG_ON(!cp->use_count);
2279 if (!--cgrp->pids_use_count) { 2329 if (!--cp->use_count) {
2280 kfree(cgrp->tasks_pids); 2330 list_del(&cp->list);
2281 cgrp->tasks_pids = NULL; 2331 put_pid_ns(cp->ns);
2282 cgrp->pids_length = 0; 2332 kfree(cp->tasks_pids);
2333 kfree(cp);
2283 } 2334 }
2284 up_write(&cgrp->pids_mutex); 2335 up_write(&cgrp->pids_mutex);
2285} 2336}
2286 2337
2287static int cgroup_tasks_release(struct inode *inode, struct file *file) 2338static int cgroup_tasks_release(struct inode *inode, struct file *file)
2288{ 2339{
2289 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2340 struct seq_file *seq;
2341 struct cgroup_pids *cp;
2290 2342
2291 if (!(file->f_mode & FMODE_READ)) 2343 if (!(file->f_mode & FMODE_READ))
2292 return 0; 2344 return 0;
2293 2345
2294 release_cgroup_pid_array(cgrp); 2346 seq = file->private_data;
2347 cp = seq->private;
2348
2349 release_cgroup_pid_array(cp);
2295 return seq_release(inode, file); 2350 return seq_release(inode, file);
2296} 2351}
2297 2352
@@ -2310,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {
2310static int cgroup_tasks_open(struct inode *unused, struct file *file) 2365static int cgroup_tasks_open(struct inode *unused, struct file *file)
2311{ 2366{
2312 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2367 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2368 struct pid_namespace *ns = current->nsproxy->pid_ns;
2369 struct cgroup_pids *cp;
2313 pid_t *pidarray; 2370 pid_t *pidarray;
2314 int npids; 2371 int npids;
2315 int retval; 2372 int retval;
@@ -2336,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2336 * array if necessary 2393 * array if necessary
2337 */ 2394 */
2338 down_write(&cgrp->pids_mutex); 2395 down_write(&cgrp->pids_mutex);
2339 kfree(cgrp->tasks_pids); 2396
2340 cgrp->tasks_pids = pidarray; 2397 list_for_each_entry(cp, &cgrp->pids_list, list) {
2341 cgrp->pids_length = npids; 2398 if (ns == cp->ns)
2342 cgrp->pids_use_count++; 2399 goto found;
2400 }
2401
2402 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2403 if (!cp) {
2404 up_write(&cgrp->pids_mutex);
2405 kfree(pidarray);
2406 return -ENOMEM;
2407 }
2408 cp->cgrp = cgrp;
2409 cp->ns = ns;
2410 get_pid_ns(ns);
2411 list_add(&cp->list, &cgrp->pids_list);
2412found:
2413 kfree(cp->tasks_pids);
2414 cp->tasks_pids = pidarray;
2415 cp->length = npids;
2416 cp->use_count++;
2343 up_write(&cgrp->pids_mutex); 2417 up_write(&cgrp->pids_mutex);
2344 2418
2345 file->f_op = &cgroup_tasks_operations; 2419 file->f_op = &cgroup_tasks_operations;
2346 2420
2347 retval = seq_open(file, &cgroup_tasks_seq_operations); 2421 retval = seq_open(file, &cgroup_tasks_seq_operations);
2348 if (retval) { 2422 if (retval) {
2349 release_cgroup_pid_array(cgrp); 2423 release_cgroup_pid_array(cp);
2350 return retval; 2424 return retval;
2351 } 2425 }
2352 ((struct seq_file *)file->private_data)->private = cgrp; 2426 ((struct seq_file *)file->private_data)->private = cp;
2353 return 0; 2427 return 0;
2354} 2428}
2355 2429
@@ -2682,33 +2756,42 @@ again:
2682 mutex_unlock(&cgroup_mutex); 2756 mutex_unlock(&cgroup_mutex);
2683 2757
2684 /* 2758 /*
2759 * In general, subsystem has no css->refcnt after pre_destroy(). But
2760 * in racy cases, subsystem may have to get css->refcnt after
 2761 * pre_destroy() and it makes rmdir return -EBUSY. This sometimes
 2762 * makes rmdir return -EBUSY too often. To avoid that, we use a waitqueue
2763 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
2764 * and subsystem's reference count handling. Please see css_get/put
2765 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
2766 */
2767 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2768
2769 /*
2685 * Call pre_destroy handlers of subsys. Notify subsystems 2770 * Call pre_destroy handlers of subsys. Notify subsystems
2686 * that rmdir() request comes. 2771 * that rmdir() request comes.
2687 */ 2772 */
2688 ret = cgroup_call_pre_destroy(cgrp); 2773 ret = cgroup_call_pre_destroy(cgrp);
2689 if (ret) 2774 if (ret) {
2775 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2690 return ret; 2776 return ret;
2777 }
2691 2778
2692 mutex_lock(&cgroup_mutex); 2779 mutex_lock(&cgroup_mutex);
2693 parent = cgrp->parent; 2780 parent = cgrp->parent;
2694 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 2781 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2782 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2695 mutex_unlock(&cgroup_mutex); 2783 mutex_unlock(&cgroup_mutex);
2696 return -EBUSY; 2784 return -EBUSY;
2697 } 2785 }
2698 /*
2699 * css_put/get is provided for subsys to grab refcnt to css. In typical
2700 * case, subsystem has no reference after pre_destroy(). But, under
2701 * hierarchy management, some *temporal* refcnt can be hold.
2702 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2703 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2704 * is called when css_put() is called and refcnt goes down to 0.
2705 */
2706 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2707 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 2786 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2708
2709 if (!cgroup_clear_css_refs(cgrp)) { 2787 if (!cgroup_clear_css_refs(cgrp)) {
2710 mutex_unlock(&cgroup_mutex); 2788 mutex_unlock(&cgroup_mutex);
2711 schedule(); 2789 /*
2790 * Because someone may call cgroup_wakeup_rmdir_waiter() before
2791 * prepare_to_wait(), we need to check this flag.
2792 */
2793 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
2794 schedule();
2712 finish_wait(&cgroup_rmdir_waitq, &wait); 2795 finish_wait(&cgroup_rmdir_waitq, &wait);
2713 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 2796 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2714 if (signal_pending(current)) 2797 if (signal_pending(current))
@@ -3280,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
3280 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3363 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3281 check_for_release(cgrp); 3364 check_for_release(cgrp);
3282 } 3365 }
3283 cgroup_wakeup_rmdir_waiters(cgrp); 3366 cgroup_wakeup_rmdir_waiter(cgrp);
3284 } 3367 }
3285 rcu_read_unlock(); 3368 rcu_read_unlock();
3286} 3369}
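
The cgroup.c changes above replace the single per-cgroup pid array with a small refcounted cache keyed by pid namespace (struct cgroup_pids), so concurrent readers of the same "tasks" file in the same namespace share one snapshot. A minimal userspace sketch of that lookup/refcount shape, with a pthread mutex standing in for the per-cgroup pids_mutex and a placeholder key where the kernel uses a struct pid_namespace pointer:

/*
 * Userspace sketch of the cgroup_pids pattern: one refcounted pid
 * snapshot per key, looked up under a lock, freed by the last user.
 */
#include <pthread.h>
#include <stdlib.h>

struct pid_cache {
	struct pid_cache *next;		/* cgrp->pids_list analogue */
	const void *key;		/* the pid namespace in the kernel */
	int *pids;			/* tasks_pids */
	int length;
	int use_count;
};

static struct pid_cache *cache_head;
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

/* "open": reuse the entry for this key or install a fresh snapshot. */
static struct pid_cache *pid_cache_get(const void *key, int *snapshot, int len)
{
	struct pid_cache *cp;

	pthread_mutex_lock(&cache_lock);
	for (cp = cache_head; cp; cp = cp->next)
		if (cp->key == key)
			goto found;

	cp = calloc(1, sizeof(*cp));
	if (!cp) {
		pthread_mutex_unlock(&cache_lock);
		free(snapshot);
		return NULL;
	}
	cp->key = key;
	cp->next = cache_head;
	cache_head = cp;
found:
	free(cp->pids);			/* replace any older snapshot */
	cp->pids = snapshot;
	cp->length = len;
	cp->use_count++;
	pthread_mutex_unlock(&cache_lock);
	return cp;
}

/* "release": drop one reference; the last user unlinks and frees. */
static void pid_cache_put(struct pid_cache *cp)
{
	pthread_mutex_lock(&cache_lock);
	if (--cp->use_count == 0) {
		struct pid_cache **pp;

		for (pp = &cache_head; *pp; pp = &(*pp)->next)
			if (*pp == cp) {
				*pp = cp->next;
				break;
			}
		free(cp->pids);
		free(cp);
	}
	pthread_mutex_unlock(&cache_lock);
}

int main(void)
{
	int ns_key = 0;			/* stand-in for a namespace pointer */
	int *snap = malloc(3 * sizeof(*snap));
	struct pid_cache *cp;

	if (!snap)
		return 1;
	snap[0] = 1; snap[1] = 2; snap[2] = 3;

	cp = pid_cache_get(&ns_key, snap, 3);	/* like cgroup_tasks_open() */
	if (cp)
		pid_cache_put(cp);		/* like cgroup_tasks_release() */
	return 0;
}

In the kernel version the namespace is additionally pinned with get_pid_ns()/put_pid_ns(); only the lookup-plus-use_count lifetime rule is kept in this sketch.
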
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8d..8ce10043e4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
34 * an ongoing cpu hotplug operation. 34 * an ongoing cpu hotplug operation.
35 */ 35 */
36 int refcount; 36 int refcount;
37} cpu_hotplug; 37} cpu_hotplug = {
38 38 .active_writer = NULL,
39void __init cpu_hotplug_init(void) 39 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
40{ 40 .refcount = 0,
41 cpu_hotplug.active_writer = NULL; 41};
42 mutex_init(&cpu_hotplug.lock);
43 cpu_hotplug.refcount = 0;
44}
45 42
46#ifdef CONFIG_HOTPLUG_CPU 43#ifdef CONFIG_HOTPLUG_CPU
47 44
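
The cpu.c hunk simply trades a run-time cpu_hotplug_init() for a static initializer, so the lock is valid before any initcall runs. The same idiom in userspace C, with PTHREAD_MUTEX_INITIALIZER playing the role of __MUTEX_INITIALIZER (names below are illustrative):

/*
 * Initialize a lock-bearing struct statically so it is usable before
 * any init code runs, instead of relying on an init function.
 */
#include <pthread.h>
#include <stdio.h>

static struct {
	pthread_t *active_writer;	/* NULL: no writer, as in the kernel struct */
	pthread_mutex_t lock;
	int refcount;
} hotplug = {
	.active_writer = NULL,
	.lock = PTHREAD_MUTEX_INITIALIZER,	/* usable at load time */
	.refcount = 0,
};

int main(void)
{
	/* No init call needed; safe even from early constructors. */
	pthread_mutex_lock(&hotplug.lock);
	hotplug.refcount++;
	pthread_mutex_unlock(&hotplug.lock);

	printf("refcount=%d\n", hotplug.refcount);
	return 0;
}
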
diff --git a/kernel/exit.c b/kernel/exit.c
index b6c90b5ef509..869dc221733e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h> 15#include <linux/iocontext.h>
17#include <linux/key.h> 16#include <linux/key.h>
18#include <linux/security.h> 17#include <linux/security.h>
@@ -375,9 +374,8 @@ static void set_special_pids(struct pid *pid)
375} 374}
376 375
377/* 376/*
378 * Let kernel threads use this to say that they 377 * Let kernel threads use this to say that they allow a certain signal.
379 * allow a certain signal (since daemonize() will 378 * Must not be used if kthread was cloned with CLONE_SIGHAND.
380 * have disabled all of them by default).
381 */ 379 */
382int allow_signal(int sig) 380int allow_signal(int sig)
383{ 381{
@@ -385,14 +383,14 @@ int allow_signal(int sig)
385 return -EINVAL; 383 return -EINVAL;
386 384
387 spin_lock_irq(&current->sighand->siglock); 385 spin_lock_irq(&current->sighand->siglock);
386 /* This is only needed for daemonize()'ed kthreads */
388 sigdelset(&current->blocked, sig); 387 sigdelset(&current->blocked, sig);
389 if (!current->mm) { 388 /*
390 /* Kernel threads handle their own signals. 389 * Kernel threads handle their own signals. Let the signal code
391 Let the signal code know it'll be handled, so 390 * know it'll be handled, so that they don't get converted to
392 that they don't get converted to SIGKILL or 391 * SIGKILL or just silently dropped.
393 just silently dropped */ 392 */
394 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; 393 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
395 }
396 recalc_sigpending(); 394 recalc_sigpending();
397 spin_unlock_irq(&current->sighand->siglock); 395 spin_unlock_irq(&current->sighand->siglock);
398 return 0; 396 return 0;
@@ -591,7 +589,7 @@ retry:
591 /* 589 /*
592 * Search in the siblings 590 * Search in the siblings
593 */ 591 */
594 list_for_each_entry(c, &p->parent->children, sibling) { 592 list_for_each_entry(c, &p->real_parent->children, sibling) {
595 if (c->mm == mm) 593 if (c->mm == mm)
596 goto assign_new_owner; 594 goto assign_new_owner;
597 } 595 }
@@ -758,7 +756,7 @@ static void reparent_thread(struct task_struct *father, struct task_struct *p,
758 p->exit_signal = SIGCHLD; 756 p->exit_signal = SIGCHLD;
759 757
760 /* If it has exited notify the new parent about this child's death. */ 758 /* If it has exited notify the new parent about this child's death. */
761 if (!p->ptrace && 759 if (!task_ptrace(p) &&
762 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 760 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
763 do_notify_parent(p, p->exit_signal); 761 do_notify_parent(p, p->exit_signal);
764 if (task_detached(p)) { 762 if (task_detached(p)) {
@@ -783,7 +781,7 @@ static void forget_original_parent(struct task_struct *father)
783 list_for_each_entry_safe(p, n, &father->children, sibling) { 781 list_for_each_entry_safe(p, n, &father->children, sibling) {
784 p->real_parent = reaper; 782 p->real_parent = reaper;
785 if (p->parent == father) { 783 if (p->parent == father) {
786 BUG_ON(p->ptrace); 784 BUG_ON(task_ptrace(p));
787 p->parent = p->real_parent; 785 p->parent = p->real_parent;
788 } 786 }
789 reparent_thread(father, p, &dead_children); 787 reparent_thread(father, p, &dead_children);
@@ -1081,6 +1079,18 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
1081 return 0; 1079 return 0;
1082} 1080}
1083 1081
1082struct wait_opts {
1083 enum pid_type wo_type;
1084 int wo_flags;
1085 struct pid *wo_pid;
1086
1087 struct siginfo __user *wo_info;
1088 int __user *wo_stat;
1089 struct rusage __user *wo_rusage;
1090
1091 int notask_error;
1092};
1093
1084static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1094static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1085{ 1095{
1086 struct pid *pid = NULL; 1096 struct pid *pid = NULL;
@@ -1091,13 +1101,12 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1091 return pid; 1101 return pid;
1092} 1102}
1093 1103
1094static int eligible_child(enum pid_type type, struct pid *pid, int options, 1104static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1095 struct task_struct *p)
1096{ 1105{
1097 int err; 1106 int err;
1098 1107
1099 if (type < PIDTYPE_MAX) { 1108 if (wo->wo_type < PIDTYPE_MAX) {
1100 if (task_pid_type(p, type) != pid) 1109 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1101 return 0; 1110 return 0;
1102 } 1111 }
1103 1112
@@ -1106,8 +1115,8 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1106 * set; otherwise, wait for non-clone children *only*. (Note: 1115 * set; otherwise, wait for non-clone children *only*. (Note:
1107 * A "clone" child here is one that reports to its parent 1116 * A "clone" child here is one that reports to its parent
1108 * using a signal other than SIGCHLD.) */ 1117 * using a signal other than SIGCHLD.) */
1109 if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) 1118 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1110 && !(options & __WALL)) 1119 && !(wo->wo_flags & __WALL))
1111 return 0; 1120 return 0;
1112 1121
1113 err = security_task_wait(p); 1122 err = security_task_wait(p);
@@ -1117,14 +1126,15 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1117 return 1; 1126 return 1;
1118} 1127}
1119 1128
1120static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, 1129static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1121 int why, int status, 1130 pid_t pid, uid_t uid, int why, int status)
1122 struct siginfo __user *infop,
1123 struct rusage __user *rusagep)
1124{ 1131{
1125 int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; 1132 struct siginfo __user *infop;
1133 int retval = wo->wo_rusage
1134 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1126 1135
1127 put_task_struct(p); 1136 put_task_struct(p);
1137 infop = wo->wo_info;
1128 if (!retval) 1138 if (!retval)
1129 retval = put_user(SIGCHLD, &infop->si_signo); 1139 retval = put_user(SIGCHLD, &infop->si_signo);
1130 if (!retval) 1140 if (!retval)
@@ -1148,19 +1158,18 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
1148 * the lock and this task is uninteresting. If we return nonzero, we have 1158 * the lock and this task is uninteresting. If we return nonzero, we have
1149 * released the lock and the system call should return. 1159 * released the lock and the system call should return.
1150 */ 1160 */
1151static int wait_task_zombie(struct task_struct *p, int options, 1161static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1152 struct siginfo __user *infop,
1153 int __user *stat_addr, struct rusage __user *ru)
1154{ 1162{
1155 unsigned long state; 1163 unsigned long state;
1156 int retval, status, traced; 1164 int retval, status, traced;
1157 pid_t pid = task_pid_vnr(p); 1165 pid_t pid = task_pid_vnr(p);
1158 uid_t uid = __task_cred(p)->uid; 1166 uid_t uid = __task_cred(p)->uid;
1167 struct siginfo __user *infop;
1159 1168
1160 if (!likely(options & WEXITED)) 1169 if (!likely(wo->wo_flags & WEXITED))
1161 return 0; 1170 return 0;
1162 1171
1163 if (unlikely(options & WNOWAIT)) { 1172 if (unlikely(wo->wo_flags & WNOWAIT)) {
1164 int exit_code = p->exit_code; 1173 int exit_code = p->exit_code;
1165 int why, status; 1174 int why, status;
1166 1175
@@ -1173,8 +1182,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1173 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; 1182 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1174 status = exit_code & 0x7f; 1183 status = exit_code & 0x7f;
1175 } 1184 }
1176 return wait_noreap_copyout(p, pid, uid, why, 1185 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1177 status, infop, ru);
1178 } 1186 }
1179 1187
1180 /* 1188 /*
@@ -1188,11 +1196,13 @@ static int wait_task_zombie(struct task_struct *p, int options,
1188 } 1196 }
1189 1197
1190 traced = ptrace_reparented(p); 1198 traced = ptrace_reparented(p);
1191 1199 /*
1192 if (likely(!traced)) { 1200 * It can be ptraced but not reparented, check
1201 * !task_detached() to filter out sub-threads.
1202 */
1203 if (likely(!traced) && likely(!task_detached(p))) {
1193 struct signal_struct *psig; 1204 struct signal_struct *psig;
1194 struct signal_struct *sig; 1205 struct signal_struct *sig;
1195 struct task_cputime cputime;
1196 1206
1197 /* 1207 /*
1198 * The resource counters for the group leader are in its 1208 * The resource counters for the group leader are in its
@@ -1205,26 +1215,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
1205 * p->signal fields, because they are only touched by 1215 * p->signal fields, because they are only touched by
1206 * __exit_signal, which runs with tasklist_lock 1216 * __exit_signal, which runs with tasklist_lock
1207 * write-locked anyway, and so is excluded here. We do 1217 * write-locked anyway, and so is excluded here. We do
1208 * need to protect the access to p->parent->signal fields, 1218 * need to protect the access to parent->signal fields,
1209 * as other threads in the parent group can be right 1219 * as other threads in the parent group can be right
1210 * here reaping other children at the same time. 1220 * here reaping other children at the same time.
1211 *
1212 * We use thread_group_cputime() to get times for the thread
1213 * group, which consolidates times for all threads in the
1214 * group including the group leader.
1215 */ 1221 */
1216 thread_group_cputime(p, &cputime); 1222 spin_lock_irq(&p->real_parent->sighand->siglock);
1217 spin_lock_irq(&p->parent->sighand->siglock); 1223 psig = p->real_parent->signal;
1218 psig = p->parent->signal;
1219 sig = p->signal; 1224 sig = p->signal;
1220 psig->cutime = 1225 psig->cutime =
1221 cputime_add(psig->cutime, 1226 cputime_add(psig->cutime,
1222 cputime_add(cputime.utime, 1227 cputime_add(p->utime,
1223 sig->cutime)); 1228 cputime_add(sig->utime,
1229 sig->cutime)));
1224 psig->cstime = 1230 psig->cstime =
1225 cputime_add(psig->cstime, 1231 cputime_add(psig->cstime,
1226 cputime_add(cputime.stime, 1232 cputime_add(p->stime,
1227 sig->cstime)); 1233 cputime_add(sig->stime,
1234 sig->cstime)));
1228 psig->cgtime = 1235 psig->cgtime =
1229 cputime_add(psig->cgtime, 1236 cputime_add(psig->cgtime,
1230 cputime_add(p->gtime, 1237 cputime_add(p->gtime,
@@ -1246,7 +1253,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1246 sig->oublock + sig->coublock; 1253 sig->oublock + sig->coublock;
1247 task_io_accounting_add(&psig->ioac, &p->ioac); 1254 task_io_accounting_add(&psig->ioac, &p->ioac);
1248 task_io_accounting_add(&psig->ioac, &sig->ioac); 1255 task_io_accounting_add(&psig->ioac, &sig->ioac);
1249 spin_unlock_irq(&p->parent->sighand->siglock); 1256 spin_unlock_irq(&p->real_parent->sighand->siglock);
1250 } 1257 }
1251 1258
1252 /* 1259 /*
@@ -1255,11 +1262,14 @@ static int wait_task_zombie(struct task_struct *p, int options,
1255 */ 1262 */
1256 read_unlock(&tasklist_lock); 1263 read_unlock(&tasklist_lock);
1257 1264
1258 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1265 retval = wo->wo_rusage
1266 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1259 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1267 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1260 ? p->signal->group_exit_code : p->exit_code; 1268 ? p->signal->group_exit_code : p->exit_code;
1261 if (!retval && stat_addr) 1269 if (!retval && wo->wo_stat)
1262 retval = put_user(status, stat_addr); 1270 retval = put_user(status, wo->wo_stat);
1271
1272 infop = wo->wo_info;
1263 if (!retval && infop) 1273 if (!retval && infop)
1264 retval = put_user(SIGCHLD, &infop->si_signo); 1274 retval = put_user(SIGCHLD, &infop->si_signo);
1265 if (!retval && infop) 1275 if (!retval && infop)
@@ -1327,15 +1337,18 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1327 * the lock and this task is uninteresting. If we return nonzero, we have 1337 * the lock and this task is uninteresting. If we return nonzero, we have
1328 * released the lock and the system call should return. 1338 * released the lock and the system call should return.
1329 */ 1339 */
1330static int wait_task_stopped(int ptrace, struct task_struct *p, 1340static int wait_task_stopped(struct wait_opts *wo,
1331 int options, struct siginfo __user *infop, 1341 int ptrace, struct task_struct *p)
1332 int __user *stat_addr, struct rusage __user *ru)
1333{ 1342{
1343 struct siginfo __user *infop;
1334 int retval, exit_code, *p_code, why; 1344 int retval, exit_code, *p_code, why;
1335 uid_t uid = 0; /* unneeded, required by compiler */ 1345 uid_t uid = 0; /* unneeded, required by compiler */
1336 pid_t pid; 1346 pid_t pid;
1337 1347
1338 if (!(options & WUNTRACED)) 1348 /*
1349 * Traditionally we see ptrace'd stopped tasks regardless of options.
1350 */
1351 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1339 return 0; 1352 return 0;
1340 1353
1341 exit_code = 0; 1354 exit_code = 0;
@@ -1349,7 +1362,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1349 if (!exit_code) 1362 if (!exit_code)
1350 goto unlock_sig; 1363 goto unlock_sig;
1351 1364
1352 if (!unlikely(options & WNOWAIT)) 1365 if (!unlikely(wo->wo_flags & WNOWAIT))
1353 *p_code = 0; 1366 *p_code = 0;
1354 1367
1355 /* don't need the RCU readlock here as we're holding a spinlock */ 1368 /* don't need the RCU readlock here as we're holding a spinlock */
@@ -1371,14 +1384,15 @@ unlock_sig:
1371 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1384 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1372 read_unlock(&tasklist_lock); 1385 read_unlock(&tasklist_lock);
1373 1386
1374 if (unlikely(options & WNOWAIT)) 1387 if (unlikely(wo->wo_flags & WNOWAIT))
1375 return wait_noreap_copyout(p, pid, uid, 1388 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1376 why, exit_code,
1377 infop, ru);
1378 1389
1379 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1390 retval = wo->wo_rusage
1380 if (!retval && stat_addr) 1391 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1381 retval = put_user((exit_code << 8) | 0x7f, stat_addr); 1392 if (!retval && wo->wo_stat)
1393 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1394
1395 infop = wo->wo_info;
1382 if (!retval && infop) 1396 if (!retval && infop)
1383 retval = put_user(SIGCHLD, &infop->si_signo); 1397 retval = put_user(SIGCHLD, &infop->si_signo);
1384 if (!retval && infop) 1398 if (!retval && infop)
@@ -1405,15 +1419,13 @@ unlock_sig:
1405 * the lock and this task is uninteresting. If we return nonzero, we have 1419 * the lock and this task is uninteresting. If we return nonzero, we have
1406 * released the lock and the system call should return. 1420 * released the lock and the system call should return.
1407 */ 1421 */
1408static int wait_task_continued(struct task_struct *p, int options, 1422static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1409 struct siginfo __user *infop,
1410 int __user *stat_addr, struct rusage __user *ru)
1411{ 1423{
1412 int retval; 1424 int retval;
1413 pid_t pid; 1425 pid_t pid;
1414 uid_t uid; 1426 uid_t uid;
1415 1427
1416 if (!unlikely(options & WCONTINUED)) 1428 if (!unlikely(wo->wo_flags & WCONTINUED))
1417 return 0; 1429 return 0;
1418 1430
1419 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1431 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
@@ -1425,7 +1437,7 @@ static int wait_task_continued(struct task_struct *p, int options,
1425 spin_unlock_irq(&p->sighand->siglock); 1437 spin_unlock_irq(&p->sighand->siglock);
1426 return 0; 1438 return 0;
1427 } 1439 }
1428 if (!unlikely(options & WNOWAIT)) 1440 if (!unlikely(wo->wo_flags & WNOWAIT))
1429 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1441 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1430 uid = __task_cred(p)->uid; 1442 uid = __task_cred(p)->uid;
1431 spin_unlock_irq(&p->sighand->siglock); 1443 spin_unlock_irq(&p->sighand->siglock);
@@ -1434,17 +1446,17 @@ static int wait_task_continued(struct task_struct *p, int options,
1434 get_task_struct(p); 1446 get_task_struct(p);
1435 read_unlock(&tasklist_lock); 1447 read_unlock(&tasklist_lock);
1436 1448
1437 if (!infop) { 1449 if (!wo->wo_info) {
1438 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1450 retval = wo->wo_rusage
1451 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1439 put_task_struct(p); 1452 put_task_struct(p);
1440 if (!retval && stat_addr) 1453 if (!retval && wo->wo_stat)
1441 retval = put_user(0xffff, stat_addr); 1454 retval = put_user(0xffff, wo->wo_stat);
1442 if (!retval) 1455 if (!retval)
1443 retval = pid; 1456 retval = pid;
1444 } else { 1457 } else {
1445 retval = wait_noreap_copyout(p, pid, uid, 1458 retval = wait_noreap_copyout(wo, p, pid, uid,
1446 CLD_CONTINUED, SIGCONT, 1459 CLD_CONTINUED, SIGCONT);
1447 infop, ru);
1448 BUG_ON(retval == 0); 1460 BUG_ON(retval == 0);
1449 } 1461 }
1450 1462
@@ -1454,19 +1466,16 @@ static int wait_task_continued(struct task_struct *p, int options,
1454/* 1466/*
1455 * Consider @p for a wait by @parent. 1467 * Consider @p for a wait by @parent.
1456 * 1468 *
1457 * -ECHILD should be in *@notask_error before the first call. 1469 * -ECHILD should be in ->notask_error before the first call.
1458 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1470 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1459 * Returns zero if the search for a child should continue; 1471 * Returns zero if the search for a child should continue;
1460 * then *@notask_error is 0 if @p is an eligible child, 1472 * then ->notask_error is 0 if @p is an eligible child,
1461 * or another error from security_task_wait(), or still -ECHILD. 1473 * or another error from security_task_wait(), or still -ECHILD.
1462 */ 1474 */
1463static int wait_consider_task(struct task_struct *parent, int ptrace, 1475static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
1464 struct task_struct *p, int *notask_error, 1476 int ptrace, struct task_struct *p)
1465 enum pid_type type, struct pid *pid, int options,
1466 struct siginfo __user *infop,
1467 int __user *stat_addr, struct rusage __user *ru)
1468{ 1477{
1469 int ret = eligible_child(type, pid, options, p); 1478 int ret = eligible_child(wo, p);
1470 if (!ret) 1479 if (!ret)
1471 return ret; 1480 return ret;
1472 1481
@@ -1478,17 +1487,17 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1478 * to look for security policy problems, rather 1487 * to look for security policy problems, rather
1479 * than for mysterious wait bugs. 1488 * than for mysterious wait bugs.
1480 */ 1489 */
1481 if (*notask_error) 1490 if (wo->notask_error)
1482 *notask_error = ret; 1491 wo->notask_error = ret;
1483 return 0; 1492 return 0;
1484 } 1493 }
1485 1494
1486 if (likely(!ptrace) && unlikely(p->ptrace)) { 1495 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1487 /* 1496 /*
1488 * This child is hidden by ptrace. 1497 * This child is hidden by ptrace.
1489 * We aren't allowed to see it now, but eventually we will. 1498 * We aren't allowed to see it now, but eventually we will.
1490 */ 1499 */
1491 *notask_error = 0; 1500 wo->notask_error = 0;
1492 return 0; 1501 return 0;
1493 } 1502 }
1494 1503
@@ -1499,34 +1508,30 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1499 * We don't reap group leaders with subthreads. 1508 * We don't reap group leaders with subthreads.
1500 */ 1509 */
1501 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1510 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
1502 return wait_task_zombie(p, options, infop, stat_addr, ru); 1511 return wait_task_zombie(wo, p);
1503 1512
1504 /* 1513 /*
1505 * It's stopped or running now, so it might 1514 * It's stopped or running now, so it might
1506 * later continue, exit, or stop again. 1515 * later continue, exit, or stop again.
1507 */ 1516 */
1508 *notask_error = 0; 1517 wo->notask_error = 0;
1509 1518
1510 if (task_stopped_code(p, ptrace)) 1519 if (task_stopped_code(p, ptrace))
1511 return wait_task_stopped(ptrace, p, options, 1520 return wait_task_stopped(wo, ptrace, p);
1512 infop, stat_addr, ru);
1513 1521
1514 return wait_task_continued(p, options, infop, stat_addr, ru); 1522 return wait_task_continued(wo, p);
1515} 1523}
1516 1524
1517/* 1525/*
1518 * Do the work of do_wait() for one thread in the group, @tsk. 1526 * Do the work of do_wait() for one thread in the group, @tsk.
1519 * 1527 *
1520 * -ECHILD should be in *@notask_error before the first call. 1528 * -ECHILD should be in ->notask_error before the first call.
1521 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1529 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1522 * Returns zero if the search for a child should continue; then 1530 * Returns zero if the search for a child should continue; then
1523 * *@notask_error is 0 if there were any eligible children, 1531 * ->notask_error is 0 if there were any eligible children,
1524 * or another error from security_task_wait(), or still -ECHILD. 1532 * or another error from security_task_wait(), or still -ECHILD.
1525 */ 1533 */
1526static int do_wait_thread(struct task_struct *tsk, int *notask_error, 1534static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1527 enum pid_type type, struct pid *pid, int options,
1528 struct siginfo __user *infop, int __user *stat_addr,
1529 struct rusage __user *ru)
1530{ 1535{
1531 struct task_struct *p; 1536 struct task_struct *p;
1532 1537
@@ -1535,9 +1540,7 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1535 * Do not consider detached threads. 1540 * Do not consider detached threads.
1536 */ 1541 */
1537 if (!task_detached(p)) { 1542 if (!task_detached(p)) {
1538 int ret = wait_consider_task(tsk, 0, p, notask_error, 1543 int ret = wait_consider_task(wo, tsk, 0, p);
1539 type, pid, options,
1540 infop, stat_addr, ru);
1541 if (ret) 1544 if (ret)
1542 return ret; 1545 return ret;
1543 } 1546 }
@@ -1546,22 +1549,12 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1546 return 0; 1549 return 0;
1547} 1550}
1548 1551
1549static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, 1552static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1550 enum pid_type type, struct pid *pid, int options,
1551 struct siginfo __user *infop, int __user *stat_addr,
1552 struct rusage __user *ru)
1553{ 1553{
1554 struct task_struct *p; 1554 struct task_struct *p;
1555 1555
1556 /*
1557 * Traditionally we see ptrace'd stopped tasks regardless of options.
1558 */
1559 options |= WUNTRACED;
1560
1561 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1556 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1562 int ret = wait_consider_task(tsk, 1, p, notask_error, 1557 int ret = wait_consider_task(wo, tsk, 1, p);
1563 type, pid, options,
1564 infop, stat_addr, ru);
1565 if (ret) 1558 if (ret)
1566 return ret; 1559 return ret;
1567 } 1560 }
@@ -1569,65 +1562,59 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
1569 return 0; 1562 return 0;
1570} 1563}
1571 1564
1572static long do_wait(enum pid_type type, struct pid *pid, int options, 1565static long do_wait(struct wait_opts *wo)
1573 struct siginfo __user *infop, int __user *stat_addr,
1574 struct rusage __user *ru)
1575{ 1566{
1576 DECLARE_WAITQUEUE(wait, current); 1567 DECLARE_WAITQUEUE(wait, current);
1577 struct task_struct *tsk; 1568 struct task_struct *tsk;
1578 int retval; 1569 int retval;
1579 1570
1580 trace_sched_process_wait(pid); 1571 trace_sched_process_wait(wo->wo_pid);
1581 1572
1582 add_wait_queue(&current->signal->wait_chldexit,&wait); 1573 add_wait_queue(&current->signal->wait_chldexit,&wait);
1583repeat: 1574repeat:
1584 /* 1575 /*
1585 * If there is nothing that can match our criteria just get out. 1576 * If there is nothing that can match our criteria just get out.
1586 * We will clear @retval to zero if we see any child that might later 1577 * We will clear ->notask_error to zero if we see any child that
1587 * match our criteria, even if we are not able to reap it yet. 1578 * might later match our criteria, even if we are not able to reap
1579 * it yet.
1588 */ 1580 */
1589 retval = -ECHILD; 1581 wo->notask_error = -ECHILD;
1590 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) 1582 if ((wo->wo_type < PIDTYPE_MAX) &&
1591 goto end; 1583 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1584 goto notask;
1592 1585
1593 current->state = TASK_INTERRUPTIBLE; 1586 set_current_state(TASK_INTERRUPTIBLE);
1594 read_lock(&tasklist_lock); 1587 read_lock(&tasklist_lock);
1595 tsk = current; 1588 tsk = current;
1596 do { 1589 do {
1597 int tsk_result = do_wait_thread(tsk, &retval, 1590 retval = do_wait_thread(wo, tsk);
1598 type, pid, options, 1591 if (retval)
1599 infop, stat_addr, ru); 1592 goto end;
1600 if (!tsk_result) 1593
1601 tsk_result = ptrace_do_wait(tsk, &retval, 1594 retval = ptrace_do_wait(wo, tsk);
1602 type, pid, options, 1595 if (retval)
1603 infop, stat_addr, ru);
1604 if (tsk_result) {
1605 /*
1606 * tasklist_lock is unlocked and we have a final result.
1607 */
1608 retval = tsk_result;
1609 goto end; 1596 goto end;
1610 }
1611 1597
1612 if (options & __WNOTHREAD) 1598 if (wo->wo_flags & __WNOTHREAD)
1613 break; 1599 break;
1614 tsk = next_thread(tsk); 1600 } while_each_thread(current, tsk);
1615 BUG_ON(tsk->signal != current->signal);
1616 } while (tsk != current);
1617 read_unlock(&tasklist_lock); 1601 read_unlock(&tasklist_lock);
1618 1602
1619 if (!retval && !(options & WNOHANG)) { 1603notask:
1604 retval = wo->notask_error;
1605 if (!retval && !(wo->wo_flags & WNOHANG)) {
1620 retval = -ERESTARTSYS; 1606 retval = -ERESTARTSYS;
1621 if (!signal_pending(current)) { 1607 if (!signal_pending(current)) {
1622 schedule(); 1608 schedule();
1623 goto repeat; 1609 goto repeat;
1624 } 1610 }
1625 } 1611 }
1626
1627end: 1612end:
1628 current->state = TASK_RUNNING; 1613 __set_current_state(TASK_RUNNING);
1629 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1614 remove_wait_queue(&current->signal->wait_chldexit,&wait);
1630 if (infop) { 1615 if (wo->wo_info) {
1616 struct siginfo __user *infop = wo->wo_info;
1617
1631 if (retval > 0) 1618 if (retval > 0)
1632 retval = 0; 1619 retval = 0;
1633 else { 1620 else {
@@ -1656,6 +1643,7 @@ end:
1656SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1643SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1657 infop, int, options, struct rusage __user *, ru) 1644 infop, int, options, struct rusage __user *, ru)
1658{ 1645{
1646 struct wait_opts wo;
1659 struct pid *pid = NULL; 1647 struct pid *pid = NULL;
1660 enum pid_type type; 1648 enum pid_type type;
1661 long ret; 1649 long ret;
@@ -1685,7 +1673,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1685 1673
1686 if (type < PIDTYPE_MAX) 1674 if (type < PIDTYPE_MAX)
1687 pid = find_get_pid(upid); 1675 pid = find_get_pid(upid);
1688 ret = do_wait(type, pid, options, infop, NULL, ru); 1676
1677 wo.wo_type = type;
1678 wo.wo_pid = pid;
1679 wo.wo_flags = options;
1680 wo.wo_info = infop;
1681 wo.wo_stat = NULL;
1682 wo.wo_rusage = ru;
1683 ret = do_wait(&wo);
1689 put_pid(pid); 1684 put_pid(pid);
1690 1685
1691 /* avoid REGPARM breakage on x86: */ 1686 /* avoid REGPARM breakage on x86: */
@@ -1696,6 +1691,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1696SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1691SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1697 int, options, struct rusage __user *, ru) 1692 int, options, struct rusage __user *, ru)
1698{ 1693{
1694 struct wait_opts wo;
1699 struct pid *pid = NULL; 1695 struct pid *pid = NULL;
1700 enum pid_type type; 1696 enum pid_type type;
1701 long ret; 1697 long ret;
@@ -1717,7 +1713,13 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1717 pid = find_get_pid(upid); 1713 pid = find_get_pid(upid);
1718 } 1714 }
1719 1715
1720 ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); 1716 wo.wo_type = type;
1717 wo.wo_pid = pid;
1718 wo.wo_flags = options | WEXITED;
1719 wo.wo_info = NULL;
1720 wo.wo_stat = stat_addr;
1721 wo.wo_rusage = ru;
1722 ret = do_wait(&wo);
1721 put_pid(pid); 1723 put_pid(pid);
1722 1724
1723 /* avoid REGPARM breakage on x86: */ 1725 /* avoid REGPARM breakage on x86: */
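
Editor's note: the exit.c hunks above replace do_wait()'s six-argument prototype with a single struct wait_opts that both waitid and wait4 fill in, and move the "no matching task" bookkeeping into wo->notask_error. Below is a minimal userspace sketch of the same argument-struct pattern; the names (wait_args, do_wait_sketch) are invented stand-ins for the kernel's wait_opts and do_wait(), not the real API.

#include <stdio.h>

/* Hypothetical stand-in for the kernel's struct wait_opts. */
struct wait_args {
	int type;               /* mirrors wo_type  */
	int pid;                /* mirrors wo_pid   */
	unsigned int flags;     /* mirrors wo_flags */
	int *stat;              /* mirrors wo_stat  */
};

/* The callee reads everything from the struct, so adding a field
 * (as the kernel did with notask_error) touches no prototypes. */
static int do_wait_sketch(const struct wait_args *wa)
{
	printf("type=%d pid=%d flags=%#x\n", wa->type, wa->pid, wa->flags);
	if (wa->stat)
		*wa->stat = 0;
	return 0;
}

int main(void)
{
	int status;
	struct wait_args wa = {
		.type = 1, .pid = 42, .flags = 0x4, .stat = &status,
	};

	return do_wait_sketch(&wa);
}
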
diff --git a/kernel/fork.c b/kernel/fork.c
index be022c200da6..e6c04d462ab2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/mnt_namespace.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/mempolicy.h> 21#include <linux/mempolicy.h>
23#include <linux/sem.h> 22#include <linux/sem.h>
@@ -568,18 +567,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
568 * the value intact in a core dump, and to save the unnecessary 567 * the value intact in a core dump, and to save the unnecessary
569 * trouble otherwise. Userland only wants this done for a sys_exit. 568 * trouble otherwise. Userland only wants this done for a sys_exit.
570 */ 569 */
571 if (tsk->clear_child_tid 570 if (tsk->clear_child_tid) {
572 && !(tsk->flags & PF_SIGNALED) 571 if (!(tsk->flags & PF_SIGNALED) &&
573 && atomic_read(&mm->mm_users) > 1) { 572 atomic_read(&mm->mm_users) > 1) {
574 u32 __user * tidptr = tsk->clear_child_tid; 573 /*
574 * We don't check the error code - if userspace has
575 * not set up a proper pointer then tough luck.
576 */
577 put_user(0, tsk->clear_child_tid);
578 sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
579 1, NULL, NULL, 0);
580 }
575 tsk->clear_child_tid = NULL; 581 tsk->clear_child_tid = NULL;
576
577 /*
578 * We don't check the error code - if userspace has
579 * not set up a proper pointer then tough luck.
580 */
581 put_user(0, tidptr);
582 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
583 } 582 }
584} 583}
585 584
@@ -816,11 +815,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
816{ 815{
817 struct signal_struct *sig; 816 struct signal_struct *sig;
818 817
819 if (clone_flags & CLONE_THREAD) { 818 if (clone_flags & CLONE_THREAD)
820 atomic_inc(&current->signal->count);
821 atomic_inc(&current->signal->live);
822 return 0; 819 return 0;
823 }
824 820
825 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 821 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
826 tsk->signal = sig; 822 tsk->signal = sig;
@@ -878,16 +874,6 @@ void __cleanup_signal(struct signal_struct *sig)
878 kmem_cache_free(signal_cachep, sig); 874 kmem_cache_free(signal_cachep, sig);
879} 875}
880 876
881static void cleanup_signal(struct task_struct *tsk)
882{
883 struct signal_struct *sig = tsk->signal;
884
885 atomic_dec(&sig->live);
886
887 if (atomic_dec_and_test(&sig->count))
888 __cleanup_signal(sig);
889}
890
891static void copy_flags(unsigned long clone_flags, struct task_struct *p) 877static void copy_flags(unsigned long clone_flags, struct task_struct *p)
892{ 878{
893 unsigned long new_flags = p->flags; 879 unsigned long new_flags = p->flags;
@@ -1029,7 +1015,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1029 p->vfork_done = NULL; 1015 p->vfork_done = NULL;
1030 spin_lock_init(&p->alloc_lock); 1016 spin_lock_init(&p->alloc_lock);
1031 1017
1032 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1033 init_sigpending(&p->pending); 1018 init_sigpending(&p->pending);
1034 1019
1035 p->utime = cputime_zero; 1020 p->utime = cputime_zero;
@@ -1241,6 +1226,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1241 } 1226 }
1242 1227
1243 if (clone_flags & CLONE_THREAD) { 1228 if (clone_flags & CLONE_THREAD) {
1229 atomic_inc(&current->signal->count);
1230 atomic_inc(&current->signal->live);
1244 p->group_leader = current->group_leader; 1231 p->group_leader = current->group_leader;
1245 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1232 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1246 } 1233 }
@@ -1270,6 +1257,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1270 write_unlock_irq(&tasklist_lock); 1257 write_unlock_irq(&tasklist_lock);
1271 proc_fork_connector(p); 1258 proc_fork_connector(p);
1272 cgroup_post_fork(p); 1259 cgroup_post_fork(p);
1260 perf_counter_fork(p);
1273 return p; 1261 return p;
1274 1262
1275bad_fork_free_pid: 1263bad_fork_free_pid:
@@ -1283,7 +1271,8 @@ bad_fork_cleanup_mm:
1283 if (p->mm) 1271 if (p->mm)
1284 mmput(p->mm); 1272 mmput(p->mm);
1285bad_fork_cleanup_signal: 1273bad_fork_cleanup_signal:
1286 cleanup_signal(p); 1274 if (!(clone_flags & CLONE_THREAD))
1275 __cleanup_signal(p->signal);
1287bad_fork_cleanup_sighand: 1276bad_fork_cleanup_sighand:
1288 __cleanup_sighand(p->sighand); 1277 __cleanup_sighand(p->sighand);
1289bad_fork_cleanup_fs: 1278bad_fork_cleanup_fs:
@@ -1409,12 +1398,6 @@ long do_fork(unsigned long clone_flags,
1409 if (clone_flags & CLONE_VFORK) { 1398 if (clone_flags & CLONE_VFORK) {
1410 p->vfork_done = &vfork; 1399 p->vfork_done = &vfork;
1411 init_completion(&vfork); 1400 init_completion(&vfork);
1412 } else if (!(clone_flags & CLONE_VM)) {
1413 /*
1414 * vfork will do an exec which will call
1415 * set_task_comm()
1416 */
1417 perf_counter_fork(p);
1418 } 1401 }
1419 1402
1420 audit_finish_fork(p); 1403 audit_finish_fork(p);
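
Editor's note: the mm_release() rework above keeps the long-standing CLONE_CHILD_CLEARTID contract: when a child that registered a tid address exits normally (and other users of the mm remain), the kernel stores 0 to that address and issues a FUTEX_WAKE on it, which is how pthread_join() learns a thread is gone. A hedged userspace sketch of that contract, assuming a Linux host with the raw clone(2) and futex(2) interfaces:

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sched.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

static pid_t child_tid = 1;		/* cleared to 0 by the kernel at child exit */
static char child_stack[64 * 1024];

static int child_fn(void *arg)
{
	(void)arg;
	return 0;			/* normal exit triggers the clear + wake */
}

int main(void)
{
	/* CLONE_VM so parent and child share the tid word; the last clone()
	 * argument (ctid) is exactly what mm_release() clears and wakes. */
	pid_t pid = clone(child_fn, child_stack + sizeof(child_stack),
			  CLONE_VM | CLONE_CHILD_CLEARTID | SIGCHLD,
			  NULL, NULL, NULL, &child_tid);
	pid_t seen;

	if (pid < 0) {
		perror("clone");
		return 1;
	}

	/* Block until the kernel zeroes child_tid and wakes us. */
	while ((seen = __atomic_load_n(&child_tid, __ATOMIC_SEQ_CST)) != 0)
		syscall(SYS_futex, &child_tid, FUTEX_WAIT, seen, NULL, NULL, 0);

	printf("child %d exited, tid word cleared by the kernel\n", pid);
	waitpid(pid, NULL, 0);
	return 0;
}
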
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf7083..bd1d42b17cb2 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
44 recalc_sigpending(); /* We sent fake signal, clean it up */ 44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock); 45 spin_unlock_irq(&current->sighand->siglock);
46 46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
47 for (;;) { 50 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE); 51 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current)) 52 if (!frozen(current))
50 break; 53 break;
51 schedule(); 54 schedule();
52 } 55 }
56
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
53 pr_debug("%s left refrigerator\n", current->comm); 60 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save); 61 __set_current_state(save);
55} 62}
diff --git a/kernel/futex.c b/kernel/futex.c
index 80b5ce716596..e18cfbdc7190 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -247,6 +247,7 @@ again:
247 if (err < 0) 247 if (err < 0)
248 return err; 248 return err;
249 249
250 page = compound_head(page);
250 lock_page(page); 251 lock_page(page);
251 if (!page->mapping) { 252 if (!page->mapping) {
252 unlock_page(page); 253 unlock_page(page);
@@ -284,6 +285,25 @@ void put_futex_key(int fshared, union futex_key *key)
284 drop_futex_key_refs(key); 285 drop_futex_key_refs(key);
285} 286}
286 287
288/*
289 * fault_in_user_writeable - fault in user address and verify RW access
290 * @uaddr: pointer to faulting user space address
291 *
292 * Slow path to fixup the fault we just took in the atomic write
293 * access to @uaddr.
294 *
295 * We have no generic implementation of a non destructive write to the
296 * user address. We know that we faulted in the atomic pagefault
297 * disabled section so we can as well avoid the #PF overhead by
298 * calling get_user_pages() right away.
299 */
300static int fault_in_user_writeable(u32 __user *uaddr)
301{
302 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
303 1, 1, 0, NULL, NULL);
304 return ret < 0 ? ret : 0;
305}
306
287/** 307/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex 308 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in 309 * @hb: the hash bucket the futex_q's reside in
@@ -896,7 +916,6 @@ retry:
896retry_private: 916retry_private:
897 op_ret = futex_atomic_op_inuser(op, uaddr2); 917 op_ret = futex_atomic_op_inuser(op, uaddr2);
898 if (unlikely(op_ret < 0)) { 918 if (unlikely(op_ret < 0)) {
899 u32 dummy;
900 919
901 double_unlock_hb(hb1, hb2); 920 double_unlock_hb(hb1, hb2);
902 921
@@ -914,7 +933,7 @@ retry_private:
914 goto out_put_keys; 933 goto out_put_keys;
915 } 934 }
916 935
917 ret = get_user(dummy, uaddr2); 936 ret = fault_in_user_writeable(uaddr2);
918 if (ret) 937 if (ret)
919 goto out_put_keys; 938 goto out_put_keys;
920 939
@@ -991,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
991 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1010 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
992 * q: the futex_q 1011 * q: the futex_q
993 * key: the key of the requeue target futex 1012 * key: the key of the requeue target futex
1013 * hb: the hash_bucket of the requeue target futex
994 * 1014 *
995 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1015 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
996 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1016 * target futex if it is uncontended or via a lock steal. Set the futex_q key
997 * to the requeue target futex so the waiter can detect the wakeup on the right 1017 * to the requeue target futex so the waiter can detect the wakeup on the right
998 * futex, but remove it from the hb and NULL the rt_waiter so it can detect 1018 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
999 * atomic lock acquisition. Must be called with the q->lock_ptr held. 1019 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1020 * to protect access to the pi_state to fixup the owner later. Must be called
1021 * with both q->lock_ptr and hb->lock held.
1000 */ 1022 */
1001static inline 1023static inline
1002void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) 1024void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1025 struct futex_hash_bucket *hb)
1003{ 1026{
1004 drop_futex_key_refs(&q->key); 1027 drop_futex_key_refs(&q->key);
1005 get_futex_key_refs(key); 1028 get_futex_key_refs(key);
@@ -1011,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
1011 WARN_ON(!q->rt_waiter); 1034 WARN_ON(!q->rt_waiter);
1012 q->rt_waiter = NULL; 1035 q->rt_waiter = NULL;
1013 1036
1037 q->lock_ptr = &hb->lock;
1038#ifdef CONFIG_DEBUG_PI_LIST
1039 q->list.plist.lock = &hb->lock;
1040#endif
1041
1014 wake_up_state(q->task, TASK_NORMAL); 1042 wake_up_state(q->task, TASK_NORMAL);
1015} 1043}
1016 1044
@@ -1069,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1069 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1097 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1070 set_waiters); 1098 set_waiters);
1071 if (ret == 1) 1099 if (ret == 1)
1072 requeue_pi_wake_futex(top_waiter, key2); 1100 requeue_pi_wake_futex(top_waiter, key2, hb2);
1073 1101
1074 return ret; 1102 return ret;
1075} 1103}
@@ -1204,7 +1232,7 @@ retry_private:
1204 double_unlock_hb(hb1, hb2); 1232 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2); 1233 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1); 1234 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2); 1235 ret = fault_in_user_writeable(uaddr2);
1208 if (!ret) 1236 if (!ret)
1209 goto retry; 1237 goto retry;
1210 goto out; 1238 goto out;
@@ -1228,8 +1256,15 @@ retry_private:
1228 if (!match_futex(&this->key, &key1)) 1256 if (!match_futex(&this->key, &key1))
1229 continue; 1257 continue;
1230 1258
1231 WARN_ON(!requeue_pi && this->rt_waiter); 1259 /*
1232 WARN_ON(requeue_pi && !this->rt_waiter); 1260 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1261 * be paired with each other and no other futex ops.
1262 */
1263 if ((requeue_pi && !this->rt_waiter) ||
1264 (!requeue_pi && this->rt_waiter)) {
1265 ret = -EINVAL;
1266 break;
1267 }
1233 1268
1234 /* 1269 /*
1235 * Wake nr_wake waiters. For requeue_pi, if we acquired the 1270 * Wake nr_wake waiters. For requeue_pi, if we acquired the
@@ -1254,7 +1289,7 @@ retry_private:
1254 this->task, 1); 1289 this->task, 1);
1255 if (ret == 1) { 1290 if (ret == 1) {
1256 /* We got the lock. */ 1291 /* We got the lock. */
1257 requeue_pi_wake_futex(this, &key2); 1292 requeue_pi_wake_futex(this, &key2, hb2);
1258 continue; 1293 continue;
1259 } else if (ret) { 1294 } else if (ret) {
1260 /* -EDEADLK */ 1295 /* -EDEADLK */
@@ -1482,7 +1517,7 @@ retry:
1482handle_fault: 1517handle_fault:
1483 spin_unlock(q->lock_ptr); 1518 spin_unlock(q->lock_ptr);
1484 1519
1485 ret = get_user(uval, uaddr); 1520 ret = fault_in_user_writeable(uaddr);
1486 1521
1487 spin_lock(q->lock_ptr); 1522 spin_lock(q->lock_ptr);
1488 1523
@@ -1807,7 +1842,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1807{ 1842{
1808 struct hrtimer_sleeper timeout, *to = NULL; 1843 struct hrtimer_sleeper timeout, *to = NULL;
1809 struct futex_hash_bucket *hb; 1844 struct futex_hash_bucket *hb;
1810 u32 uval;
1811 struct futex_q q; 1845 struct futex_q q;
1812 int res, ret; 1846 int res, ret;
1813 1847
@@ -1909,16 +1943,9 @@ out:
1909 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1943 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1910 1944
1911uaddr_faulted: 1945uaddr_faulted:
1912 /*
1913 * We have to r/w *(int __user *)uaddr, and we have to modify it
1914 * atomically. Therefore, if we continue to fault after get_user()
1915 * below, we need to handle the fault ourselves, while still holding
1916 * the mmap_sem. This can occur if the uaddr is under contention as
1917 * we have to drop the mmap_sem in order to call get_user().
1918 */
1919 queue_unlock(&q, hb); 1946 queue_unlock(&q, hb);
1920 1947
1921 ret = get_user(uval, uaddr); 1948 ret = fault_in_user_writeable(uaddr);
1922 if (ret) 1949 if (ret)
1923 goto out_put_key; 1950 goto out_put_key;
1924 1951
@@ -2013,17 +2040,10 @@ out:
2013 return ret; 2040 return ret;
2014 2041
2015pi_faulted: 2042pi_faulted:
2016 /*
2017 * We have to r/w *(int __user *)uaddr, and we have to modify it
2018 * atomically. Therefore, if we continue to fault after get_user()
2019 * below, we need to handle the fault ourselves, while still holding
2020 * the mmap_sem. This can occur if the uaddr is under contention as
2021 * we have to drop the mmap_sem in order to call get_user().
2022 */
2023 spin_unlock(&hb->lock); 2043 spin_unlock(&hb->lock);
2024 put_futex_key(fshared, &key); 2044 put_futex_key(fshared, &key);
2025 2045
2026 ret = get_user(uval, uaddr); 2046 ret = fault_in_user_writeable(uaddr);
2027 if (!ret) 2047 if (!ret)
2028 goto retry; 2048 goto retry;
2029 2049
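
Editor's note: several hunks above replace get_user() fixups with the new fault_in_user_writeable() helper because these paths must be able to write the futex word, not merely read it; a readable-but-unwritable word previously let the old fixup succeed and the atomic write fault again. A hedged userspace illustration of the requirement, assuming FUTEX_WAKE_OP behaves as documented (the encoded op is applied to the second futex word): on a kernel with this change the read-only case should fail promptly with EFAULT rather than retrying.

#define _GNU_SOURCE
#include <errno.h>
#include <linux/futex.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static long wake_op(int *uaddr, int *uaddr2)
{
	/* FUTEX_WAKE_OP stores 0 into *uaddr2, then wakes waiters. */
	return syscall(SYS_futex, uaddr, FUTEX_WAKE_OP, 1, NULL, uaddr2,
		       FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_EQ, 0));
}

int main(void)
{
	int *rw = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int *ro = mmap(NULL, 4096, PROT_READ,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	long ret;

	if (rw == MAP_FAILED || ro == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	ret = wake_op(rw, rw);
	printf("writable word:  ret=%ld\n", ret);

	ret = wake_op(rw, ro);
	if (ret < 0)
		printf("read-only word: %s\n", strerror(errno));
	else
		printf("read-only word: ret=%ld\n", ret);
	return 0;
}
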
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee29..235716556bf1 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
180 int cmd = op & FUTEX_CMD_MASK; 180 int cmd = op & FUTEX_CMD_MASK;
181 181
182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
183 cmd == FUTEX_WAIT_BITSET)) { 183 cmd == FUTEX_WAIT_BITSET ||
184 cmd == FUTEX_WAIT_REQUEUE_PI)) {
184 if (get_compat_timespec(&ts, utime)) 185 if (get_compat_timespec(&ts, utime))
185 return -EFAULT; 186 return -EFAULT;
186 if (!timespec_valid(&ts)) 187 if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
191 t = ktime_add_safe(ktime_get(), t); 192 t = ktime_add_safe(ktime_get(), t);
192 tp = &t; 193 tp = &t;
193 } 194 }
194 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) 195 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
196 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
195 val2 = (int) (unsigned long) utime; 197 val2 = (int) (unsigned long) utime;
196 198
197 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 199 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
new file mode 100644
index 000000000000..22e9dcfaa3d3
--- /dev/null
+++ b/kernel/gcov/Kconfig
@@ -0,0 +1,48 @@
1menu "GCOV-based kernel profiling"
2
3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS && CONSTRUCTORS
6 default n
7 ---help---
8 This option enables gcov-based code profiling (e.g. for code coverage
9 measurements).
10
11 If unsure, say N.
12
13 Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
14 for the entire kernel. To enable profiling for specific files or
15 directories, add a line similar to the following to the respective
16 Makefile:
17
18 For a single file (e.g. main.o):
19 GCOV_PROFILE_main.o := y
20
21 For all files in one directory:
22 GCOV_PROFILE := y
23
24 To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
25 is specified, use:
26
27 GCOV_PROFILE_main.o := n
28 and:
29 GCOV_PROFILE := n
30
31 Note that the debugfs filesystem has to be mounted to access
32 profiling data.
33
34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL
37 depends on S390 || X86
38 default n
39 ---help---
40 This options activates profiling for the entire kernel.
41
42 If unsure, say N.
43
44 Note that a kernel compiled with profiling flags will be significantly
45 larger and run slower. Also be sure to exclude files from profiling
46 which are not linked to the kernel image to prevent linker errors.
47
48endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
new file mode 100644
index 000000000000..3f761001d517
--- /dev/null
+++ b/kernel/gcov/Makefile
@@ -0,0 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
new file mode 100644
index 000000000000..9b22d03cc581
--- /dev/null
+++ b/kernel/gcov/base.c
@@ -0,0 +1,148 @@
1/*
2 * This code maintains a list of active profiling data structures.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 */
15
16#define pr_fmt(fmt) "gcov: " fmt
17
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/mutex.h>
21#include "gcov.h"
22
23static struct gcov_info *gcov_info_head;
24static int gcov_events_enabled;
25static DEFINE_MUTEX(gcov_lock);
26
27/*
28 * __gcov_init is called by gcc-generated constructor code for each object
29 * file compiled with -fprofile-arcs.
30 */
31void __gcov_init(struct gcov_info *info)
32{
33 static unsigned int gcov_version;
34
35 mutex_lock(&gcov_lock);
36 if (gcov_version == 0) {
37 gcov_version = info->version;
38 /*
39 * Printing gcc's version magic may prove useful for debugging
40 * incompatibility reports.
41 */
42 pr_info("version magic: 0x%x\n", gcov_version);
43 }
44 /*
45 * Add new profiling data structure to list and inform event
46 * listener.
47 */
48 info->next = gcov_info_head;
49 gcov_info_head = info;
50 if (gcov_events_enabled)
51 gcov_event(GCOV_ADD, info);
52 mutex_unlock(&gcov_lock);
53}
54EXPORT_SYMBOL(__gcov_init);
55
56/*
57 * These functions may be referenced by gcc-generated profiling code but serve
58 * no function for kernel profiling.
59 */
60void __gcov_flush(void)
61{
62 /* Unused. */
63}
64EXPORT_SYMBOL(__gcov_flush);
65
66void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
67{
68 /* Unused. */
69}
70EXPORT_SYMBOL(__gcov_merge_add);
71
72void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
73{
74 /* Unused. */
75}
76EXPORT_SYMBOL(__gcov_merge_single);
77
78void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
79{
80 /* Unused. */
81}
82EXPORT_SYMBOL(__gcov_merge_delta);
83
84/**
85 * gcov_enable_events - enable event reporting through gcov_event()
86 *
87 * Turn on reporting of profiling data load/unload-events through the
88 * gcov_event() callback. Also replay all previous events once. This function
89 * is needed because some events are potentially generated too early for the
90 * callback implementation to handle them initially.
91 */
92void gcov_enable_events(void)
93{
94 struct gcov_info *info;
95
96 mutex_lock(&gcov_lock);
97 gcov_events_enabled = 1;
98 /* Perform event callback for previously registered entries. */
99 for (info = gcov_info_head; info; info = info->next)
100 gcov_event(GCOV_ADD, info);
101 mutex_unlock(&gcov_lock);
102}
103
104#ifdef CONFIG_MODULES
105static inline int within(void *addr, void *start, unsigned long size)
106{
107 return ((addr >= start) && (addr < start + size));
108}
109
110/* Update list and generate events when modules are unloaded. */
111static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
112 void *data)
113{
114 struct module *mod = data;
115 struct gcov_info *info;
116 struct gcov_info *prev;
117
118 if (event != MODULE_STATE_GOING)
119 return NOTIFY_OK;
120 mutex_lock(&gcov_lock);
121 prev = NULL;
122 /* Remove entries located in module from linked list. */
123 for (info = gcov_info_head; info; info = info->next) {
124 if (within(info, mod->module_core, mod->core_size)) {
125 if (prev)
126 prev->next = info->next;
127 else
128 gcov_info_head = info->next;
129 if (gcov_events_enabled)
130 gcov_event(GCOV_REMOVE, info);
131 } else
132 prev = info;
133 }
134 mutex_unlock(&gcov_lock);
135
136 return NOTIFY_OK;
137}
138
139static struct notifier_block gcov_nb = {
140 .notifier_call = gcov_module_notifier,
141};
142
143static int __init gcov_init(void)
144{
145 return register_module_notifier(&gcov_nb);
146}
147device_initcall(gcov_init);
148#endif /* CONFIG_MODULES */
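
Editor's note: base.c relies on gcc emitting one constructor per object built with -fprofile-arcs, each handing its struct gcov_info to __gcov_init() for insertion into a global list (hence the CONSTRUCTORS dependency in the Kconfig above). A hedged userspace analogue of that registration pattern, with invented names (fake_info, fake_gcov_init) standing in for the gcc-defined structures:

#include <stdio.h>

/* Invented stand-in for struct gcov_info. */
struct fake_info {
	const char *filename;
	struct fake_info *next;
};

static struct fake_info *info_head;

/* Mirrors __gcov_init(): prepend the new data set to a global list. */
static void fake_gcov_init(struct fake_info *info)
{
	info->next = info_head;
	info_head = info;
}

static struct fake_info this_file = { .filename = "demo.c" };

/* gcc's -fprofile-arcs generates a constructor much like this one
 * for every instrumented object file. */
__attribute__((constructor))
static void register_this_file(void)
{
	fake_gcov_init(&this_file);
}

int main(void)
{
	struct fake_info *i;

	for (i = info_head; i; i = i->next)
		printf("registered: %s\n", i->filename);
	return 0;
}
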
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
new file mode 100644
index 000000000000..ef3c3f88a7a3
--- /dev/null
+++ b/kernel/gcov/fs.c
@@ -0,0 +1,673 @@
1/*
2 * This code exports profiling data as debugfs files to userspace.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 * Yi CDL Yang
15 */
16
17#define pr_fmt(fmt) "gcov: " fmt
18
19#include <linux/init.h>
20#include <linux/module.h>
21#include <linux/debugfs.h>
22#include <linux/fs.h>
23#include <linux/list.h>
24#include <linux/string.h>
25#include <linux/slab.h>
26#include <linux/mutex.h>
27#include <linux/seq_file.h>
28#include "gcov.h"
29
30/**
31 * struct gcov_node - represents a debugfs entry
32 * @list: list head for child node list
33 * @children: child nodes
34 * @all: list head for list of all nodes
35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory
37 * @ghost: when an object file containing profiling data is unloaded we keep a
38 * copy of the profiling data here to allow collecting coverage data
39 * for cleanup code. Such a node is called a "ghost".
40 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links
42 * @name: data file basename
43 *
44 * struct gcov_node represents an entity within the gcov/ subdirectory
45 * of debugfs. There are directory and data file nodes. The latter represent
46 * the actual synthesized data file plus any associated symbolic links which
47 * are needed by the gcov tool to work correctly.
48 */
49struct gcov_node {
50 struct list_head list;
51 struct list_head children;
52 struct list_head all;
53 struct gcov_node *parent;
54 struct gcov_info *info;
55 struct gcov_info *ghost;
56 struct dentry *dentry;
57 struct dentry **links;
58 char name[0];
59};
60
61static const char objtree[] = OBJTREE;
62static const char srctree[] = SRCTREE;
63static struct gcov_node root_node;
64static struct dentry *reset_dentry;
65static LIST_HEAD(all_head);
66static DEFINE_MUTEX(node_lock);
67
68/* If non-zero, keep copies of profiling data for unloaded modules. */
69static int gcov_persist = 1;
70
71static int __init gcov_persist_setup(char *str)
72{
73 unsigned long val;
74
75 if (strict_strtoul(str, 0, &val)) {
76 pr_warning("invalid gcov_persist parameter '%s'\n", str);
77 return 0;
78 }
79 gcov_persist = val;
80 pr_info("setting gcov_persist to %d\n", gcov_persist);
81
82 return 1;
83}
84__setup("gcov_persist=", gcov_persist_setup);
85
86/*
87 * seq_file.start() implementation for gcov data files. Note that the
88 * gcov_iterator interface is designed to be more restrictive than seq_file
89 * (no start from arbitrary position, etc.), to simplify the iterator
90 * implementation.
91 */
92static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
93{
94 loff_t i;
95
96 gcov_iter_start(seq->private);
97 for (i = 0; i < *pos; i++) {
98 if (gcov_iter_next(seq->private))
99 return NULL;
100 }
101 return seq->private;
102}
103
104/* seq_file.next() implementation for gcov data files. */
105static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
106{
107 struct gcov_iterator *iter = data;
108
109 if (gcov_iter_next(iter))
110 return NULL;
111 (*pos)++;
112
113 return iter;
114}
115
116/* seq_file.show() implementation for gcov data files. */
117static int gcov_seq_show(struct seq_file *seq, void *data)
118{
119 struct gcov_iterator *iter = data;
120
121 if (gcov_iter_write(iter, seq))
122 return -EINVAL;
123 return 0;
124}
125
126static void gcov_seq_stop(struct seq_file *seq, void *data)
127{
128 /* Unused. */
129}
130
131static const struct seq_operations gcov_seq_ops = {
132 .start = gcov_seq_start,
133 .next = gcov_seq_next,
134 .show = gcov_seq_show,
135 .stop = gcov_seq_stop,
136};
137
138/*
139 * Return the profiling data set for a given node. This can either be the
140 * original profiling data structure or a duplicate (also called "ghost")
141 * in case the associated object file has been unloaded.
142 */
143static struct gcov_info *get_node_info(struct gcov_node *node)
144{
145 if (node->info)
146 return node->info;
147
148 return node->ghost;
149}
150
151/*
152 * open() implementation for gcov data files. Create a copy of the profiling
153 * data set and initialize the iterator and seq_file interface.
154 */
155static int gcov_seq_open(struct inode *inode, struct file *file)
156{
157 struct gcov_node *node = inode->i_private;
158 struct gcov_iterator *iter;
159 struct seq_file *seq;
160 struct gcov_info *info;
161 int rc = -ENOMEM;
162
163 mutex_lock(&node_lock);
164 /*
165 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access.
167 */
168 info = gcov_info_dup(get_node_info(node));
169 if (!info)
170 goto out_unlock;
171 iter = gcov_iter_new(info);
172 if (!iter)
173 goto err_free_info;
174 rc = seq_open(file, &gcov_seq_ops);
175 if (rc)
176 goto err_free_iter_info;
177 seq = file->private_data;
178 seq->private = iter;
179out_unlock:
180 mutex_unlock(&node_lock);
181 return rc;
182
183err_free_iter_info:
184 gcov_iter_free(iter);
185err_free_info:
186 gcov_info_free(info);
187 goto out_unlock;
188}
189
190/*
191 * release() implementation for gcov data files. Release resources allocated
192 * by open().
193 */
194static int gcov_seq_release(struct inode *inode, struct file *file)
195{
196 struct gcov_iterator *iter;
197 struct gcov_info *info;
198 struct seq_file *seq;
199
200 seq = file->private_data;
201 iter = seq->private;
202 info = gcov_iter_get_info(iter);
203 gcov_iter_free(iter);
204 gcov_info_free(info);
205 seq_release(inode, file);
206
207 return 0;
208}
209
210/*
211 * Find a node by the associated data file name. Needs to be called with
212 * node_lock held.
213 */
214static struct gcov_node *get_node_by_name(const char *name)
215{
216 struct gcov_node *node;
217 struct gcov_info *info;
218
219 list_for_each_entry(node, &all_head, all) {
220 info = get_node_info(node);
221 if (info && (strcmp(info->filename, name) == 0))
222 return node;
223 }
224
225 return NULL;
226}
227
228static void remove_node(struct gcov_node *node);
229
230/*
231 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is
233 * a "ghost" node), remove the debug fs node as well.
234 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos)
237{
238 struct seq_file *seq;
239 struct gcov_info *info;
240 struct gcov_node *node;
241
242 seq = file->private_data;
243 info = gcov_iter_get_info(seq->private);
244 mutex_lock(&node_lock);
245 node = get_node_by_name(info->filename);
246 if (node) {
247 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost)
249 remove_node(node);
250 else
251 gcov_info_reset(node->info);
252 }
253 /* Reset counts for open file. */
254 gcov_info_reset(info);
255 mutex_unlock(&node_lock);
256
257 return len;
258}
259
260/*
261 * Given a string <path> representing a file path of format:
262 * path/to/file.gcda
263 * construct and return a new string:
264 * <dir/>path/to/file.<ext>
265 */
266static char *link_target(const char *dir, const char *path, const char *ext)
267{
268 char *target;
269 char *old_ext;
270 char *copy;
271
272 copy = kstrdup(path, GFP_KERNEL);
273 if (!copy)
274 return NULL;
275 old_ext = strrchr(copy, '.');
276 if (old_ext)
277 *old_ext = '\0';
278 if (dir)
279 target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
280 else
281 target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
282 kfree(copy);
283
284 return target;
285}
286
287/*
288 * Construct a string representing the symbolic link target for the given
289 * gcov data file name and link type. Depending on the link type and the
290 * location of the data file, the link target can either point to a
291 * subdirectory of srctree, objtree or in an external location.
292 */
293static char *get_link_target(const char *filename, const struct gcov_link *ext)
294{
295 const char *rel;
296 char *result;
297
298 if (strncmp(filename, objtree, strlen(objtree)) == 0) {
299 rel = filename + strlen(objtree) + 1;
300 if (ext->dir == SRC_TREE)
301 result = link_target(srctree, rel, ext->ext);
302 else
303 result = link_target(objtree, rel, ext->ext);
304 } else {
305 /* External compilation. */
306 result = link_target(NULL, filename, ext->ext);
307 }
308
309 return result;
310}
311
312#define SKEW_PREFIX ".tmp_"
313
314/*
315 * For a filename .tmp_filename.ext return filename.ext. Needed to compensate
316 * for filename skewing caused by the mod-versioning mechanism.
317 */
318static const char *deskew(const char *basename)
319{
320 if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
321 return basename + sizeof(SKEW_PREFIX) - 1;
322 return basename;
323}
324
325/*
326 * Create links to additional files (usually .c and .gcno files) which the
327 * gcov tool expects to find in the same directory as the gcov data file.
328 */
329static void add_links(struct gcov_node *node, struct dentry *parent)
330{
331 char *basename;
332 char *target;
333 int num;
334 int i;
335
336 for (num = 0; gcov_link[num].ext; num++)
337 /* Nothing. */;
338 node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
339 if (!node->links)
340 return;
341 for (i = 0; i < num; i++) {
342 target = get_link_target(get_node_info(node)->filename,
343 &gcov_link[i]);
344 if (!target)
345 goto out_err;
346 basename = strrchr(target, '/');
347 if (!basename)
348 goto out_err;
349 basename++;
350 node->links[i] = debugfs_create_symlink(deskew(basename),
351 parent, target);
352 if (!node->links[i])
353 goto out_err;
354 kfree(target);
355 }
356
357 return;
358out_err:
359 kfree(target);
360 while (i-- > 0)
361 debugfs_remove(node->links[i]);
362 kfree(node->links);
363 node->links = NULL;
364}
365
366static const struct file_operations gcov_data_fops = {
367 .open = gcov_seq_open,
368 .release = gcov_seq_release,
369 .read = seq_read,
370 .llseek = seq_lseek,
371 .write = gcov_seq_write,
372};
373
374/* Basic initialization of a new node. */
375static void init_node(struct gcov_node *node, struct gcov_info *info,
376 const char *name, struct gcov_node *parent)
377{
378 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all);
381 node->info = info;
382 node->parent = parent;
383 if (name)
384 strcpy(node->name, name);
385}
386
387/*
388 * Create a new node and associated debugfs entry. Needs to be called with
389 * node_lock held.
390 */
391static struct gcov_node *new_node(struct gcov_node *parent,
392 struct gcov_info *info, const char *name)
393{
394 struct gcov_node *node;
395
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) {
398 pr_warning("out of memory\n");
399 return NULL;
400 }
401 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */
403 if (info) {
404 node->dentry = debugfs_create_file(deskew(node->name), 0600,
405 parent->dentry, node, &gcov_data_fops);
406 } else
407 node->dentry = debugfs_create_dir(node->name, parent->dentry);
408 if (!node->dentry) {
409 pr_warning("could not create file\n");
410 kfree(node);
411 return NULL;
412 }
413 if (info)
414 add_links(node, parent->dentry);
415 list_add(&node->list, &parent->children);
416 list_add(&node->all, &all_head);
417
418 return node;
419}
420
421/* Remove symbolic links associated with node. */
422static void remove_links(struct gcov_node *node)
423{
424 int i;
425
426 if (!node->links)
427 return;
428 for (i = 0; gcov_link[i].ext; i++)
429 debugfs_remove(node->links[i]);
430 kfree(node->links);
431 node->links = NULL;
432}
433
434/*
435 * Remove node from all lists and debugfs and release associated resources.
436 * Needs to be called with node_lock held.
437 */
438static void release_node(struct gcov_node *node)
439{
440 list_del(&node->list);
441 list_del(&node->all);
442 debugfs_remove(node->dentry);
443 remove_links(node);
444 if (node->ghost)
445 gcov_info_free(node->ghost);
446 kfree(node);
447}
448
449/* Release node and empty parents. Needs to be called with node_lock held. */
450static void remove_node(struct gcov_node *node)
451{
452 struct gcov_node *parent;
453
454 while ((node != &root_node) && list_empty(&node->children)) {
455 parent = node->parent;
456 release_node(node);
457 node = parent;
458 }
459}
460
461/*
462 * Find child node with given basename. Needs to be called with node_lock
463 * held.
464 */
465static struct gcov_node *get_child_by_name(struct gcov_node *parent,
466 const char *name)
467{
468 struct gcov_node *node;
469
470 list_for_each_entry(node, &parent->children, list) {
471 if (strcmp(node->name, name) == 0)
472 return node;
473 }
474
475 return NULL;
476}
477
478/*
479 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes.
481 */
482static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos)
484{
485 struct gcov_node *node;
486
487 mutex_lock(&node_lock);
488restart:
489 list_for_each_entry(node, &all_head, all) {
490 if (node->info)
491 gcov_info_reset(node->info);
492 else if (list_empty(&node->children)) {
493 remove_node(node);
494 /* Several nodes may have gone - restart loop. */
495 goto restart;
496 }
497 }
498 mutex_unlock(&node_lock);
499
500 return len;
501}
502
503/* read() implementation for reset file. Unused. */
504static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
505 loff_t *pos)
506{
507 /* Allow read operation so that a recursive copy won't fail. */
508 return 0;
509}
510
511static const struct file_operations gcov_reset_fops = {
512 .write = reset_write,
513 .read = reset_read,
514};
515
516/*
517 * Create a node for a given profiling data set and add it to all lists and
518 * debugfs. Needs to be called with node_lock held.
519 */
520static void add_node(struct gcov_info *info)
521{
522 char *filename;
523 char *curr;
524 char *next;
525 struct gcov_node *parent;
526 struct gcov_node *node;
527
528 filename = kstrdup(info->filename, GFP_KERNEL);
529 if (!filename)
530 return;
531 parent = &root_node;
532 /* Create directory nodes along the path. */
533 for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
534 if (curr == next)
535 continue;
536 *next = 0;
537 if (strcmp(curr, ".") == 0)
538 continue;
539 if (strcmp(curr, "..") == 0) {
540 if (!parent->parent)
541 goto err_remove;
542 parent = parent->parent;
543 continue;
544 }
545 node = get_child_by_name(parent, curr);
546 if (!node) {
547 node = new_node(parent, NULL, curr);
548 if (!node)
549 goto err_remove;
550 }
551 parent = node;
552 }
553 /* Create file node. */
554 node = new_node(parent, info, curr);
555 if (!node)
556 goto err_remove;
557out:
558 kfree(filename);
559 return;
560
561err_remove:
562 remove_node(parent);
563 goto out;
564}
565
566/*
567 * The profiling data set associated with this node is being unloaded. Store a
568 * copy of the profiling data and turn this node into a "ghost".
569 */
570static int ghost_node(struct gcov_node *node)
571{
572 node->ghost = gcov_info_dup(node->info);
573 if (!node->ghost) {
574 pr_warning("could not save data for '%s' (out of memory)\n",
575 node->info->filename);
576 return -ENOMEM;
577 }
578 node->info = NULL;
579
580 return 0;
581}
582
583/*
584 * Profiling data for this node has been loaded again. Add profiling data
585 * from previous instantiation and turn this node into a regular node.
586 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info)
588{
589 if (gcov_info_is_compatible(node->ghost, info))
590 gcov_info_add(info, node->ghost);
591 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n",
593 info->filename);
594 }
595 gcov_info_free(node->ghost);
596 node->ghost = NULL;
597 node->info = info;
598}
599
600/*
601 * Callback to create/remove profiling files when code compiled with
602 * -fprofile-arcs is loaded/unloaded.
603 */
604void gcov_event(enum gcov_action action, struct gcov_info *info)
605{
606 struct gcov_node *node;
607
608 mutex_lock(&node_lock);
609 node = get_node_by_name(info->filename);
610 switch (action) {
611 case GCOV_ADD:
612 /* Add new node or revive ghost. */
613 if (!node) {
614 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break;
624 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */
626 if (!node) {
627 pr_warning("could not remove '%s' (not found)\n",
628 info->filename);
629 break;
630 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break;
637 }
638 mutex_unlock(&node_lock);
639}
640
641/* Create debugfs entries. */
642static __init int gcov_fs_init(void)
643{
644 int rc = -EIO;
645
646 init_node(&root_node, NULL, NULL, NULL);
647 /*
648 * /sys/kernel/debug/gcov will be parent for the reset control file
649 * and all profiling files.
650 */
651 root_node.dentry = debugfs_create_dir("gcov", NULL);
652 if (!root_node.dentry)
653 goto err_remove;
654 /*
655 * Create reset file which resets all profiling counts when written
656 * to.
657 */
658 reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
659 NULL, &gcov_reset_fops);
660 if (!reset_dentry)
661 goto err_remove;
662 /* Replay previous events to get our fs hierarchy up-to-date. */
663 gcov_enable_events();
664 return 0;
665
666err_remove:
667 pr_err("init failed\n");
668 if (root_node.dentry)
669 debugfs_remove(root_node.dentry);
670
671 return rc;
672}
673device_initcall(gcov_fs_init);
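
Editor's note: fs.c synthesizes one seq_file per .gcda path under /sys/kernel/debug/gcov/ plus a global "reset" control file. A hedged userspace sketch of the intended consumption: copy a data file next to its sources so the regular gcov tool can read it, then write to "reset" to zero all counters. The .gcda path below is illustrative only; the real layout mirrors the kernel objtree.

#include <stdio.h>

static int copy_file(const char *src, const char *dst)
{
	FILE *in = fopen(src, "rb");
	FILE *out = fopen(dst, "wb");
	char buf[4096];
	size_t n;

	if (!in || !out) {
		perror("fopen");
		if (in)
			fclose(in);
		if (out)
			fclose(out);
		return -1;
	}
	while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
		fwrite(buf, 1, n, out);
	fclose(in);
	fclose(out);
	return 0;
}

int main(void)
{
	/* Illustrative path; adjust to your build's objtree layout. */
	const char *gcda =
		"/sys/kernel/debug/gcov/usr/src/linux/kernel/gcov/base.gcda";
	FILE *reset;

	if (copy_file(gcda, "base.gcda"))
		return 1;

	/* Any write to the reset file zeroes all profiling counters. */
	reset = fopen("/sys/kernel/debug/gcov/reset", "w");
	if (!reset) {
		perror("reset");
		return 1;
	}
	fputs("1\n", reset);
	fclose(reset);
	return 0;
}
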
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
new file mode 100644
index 000000000000..ae5bb4260033
--- /dev/null
+++ b/kernel/gcov/gcc_3_4.c
@@ -0,0 +1,447 @@
1/*
2 * This code provides functions to handle gcc's profiling data format
3 * introduced with gcc 3.4. Future versions of gcc may change the gcov
4 * format (as happened before), so all format-specific information needs
5 * to be kept modular and easily exchangeable.
6 *
7 * This file is based on gcc-internal definitions. Functions and data
8 * structures are defined to be compatible with gcc counterparts.
9 * For a better understanding, refer to gcc source: gcc/gcov-io.h.
10 *
11 * Copyright IBM Corp. 2009
12 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 *
14 * Uses gcc-internal data definitions.
15 */
16
17#include <linux/errno.h>
18#include <linux/slab.h>
19#include <linux/string.h>
20#include <linux/seq_file.h>
21#include <linux/vmalloc.h>
22#include "gcov.h"
23
24/* Symbolic links to be created for each profiling data file. */
25const struct gcov_link gcov_link[] = {
26 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
27 { 0, NULL},
28};
29
30/*
31 * Determine whether a counter is active. Based on gcc magic. Doesn't change
32 * at run-time.
33 */
34static int counter_active(struct gcov_info *info, unsigned int type)
35{
36 return (1 << type) & info->ctr_mask;
37}
38
39/* Determine number of active counters. Based on gcc magic. */
40static unsigned int num_counter_active(struct gcov_info *info)
41{
42 unsigned int i;
43 unsigned int result = 0;
44
45 for (i = 0; i < GCOV_COUNTERS; i++) {
46 if (counter_active(info, i))
47 result++;
48 }
49 return result;
50}
51
52/**
53 * gcov_info_reset - reset profiling data to zero
54 * @info: profiling data set
55 */
56void gcov_info_reset(struct gcov_info *info)
57{
58 unsigned int active = num_counter_active(info);
59 unsigned int i;
60
61 for (i = 0; i < active; i++) {
62 memset(info->counts[i].values, 0,
63 info->counts[i].num * sizeof(gcov_type));
64 }
65}
66
67/**
68 * gcov_info_is_compatible - check if profiling data can be added
69 * @info1: first profiling data set
70 * @info2: second profiling data set
71 *
72 * Returns non-zero if profiling data can be added, zero otherwise.
73 */
74int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
75{
76 return (info1->stamp == info2->stamp);
77}
78
79/**
80 * gcov_info_add - add up profiling data
81 * @dest: profiling data set to which data is added
82 * @source: profiling data set which is added
83 *
84 * Adds profiling counts of @source to @dest.
85 */
86void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
87{
88 unsigned int i;
89 unsigned int j;
90
91 for (i = 0; i < num_counter_active(dest); i++) {
92 for (j = 0; j < dest->counts[i].num; j++) {
93 dest->counts[i].values[j] +=
94 source->counts[i].values[j];
95 }
96 }
97}
98
99/* Get size of function info entry. Based on gcc magic. */
100static size_t get_fn_size(struct gcov_info *info)
101{
102 size_t size;
103
104 size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
105 sizeof(unsigned int);
106 if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
107 size = ALIGN(size, __alignof__(struct gcov_fn_info));
108 return size;
109}
110
111/* Get address of function info entry. Based on gcc magic. */
112static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
113{
114 return (struct gcov_fn_info *)
115 ((char *) info->functions + fn * get_fn_size(info));
116}
117
118/**
119 * gcov_info_dup - duplicate profiling data set
120 * @info: profiling data set to duplicate
121 *
122 * Return newly allocated duplicate on success, %NULL on error.
123 */
124struct gcov_info *gcov_info_dup(struct gcov_info *info)
125{
126 struct gcov_info *dup;
127 unsigned int i;
128 unsigned int active;
129
130 /* Duplicate gcov_info. */
131 active = num_counter_active(info);
132 dup = kzalloc(sizeof(struct gcov_info) +
133 sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
134 if (!dup)
135 return NULL;
136 dup->version = info->version;
137 dup->stamp = info->stamp;
138 dup->n_functions = info->n_functions;
139 dup->ctr_mask = info->ctr_mask;
140 /* Duplicate filename. */
141 dup->filename = kstrdup(info->filename, GFP_KERNEL);
142 if (!dup->filename)
143 goto err_free;
144 /* Duplicate table of functions. */
145 dup->functions = kmemdup(info->functions, info->n_functions *
146 get_fn_size(info), GFP_KERNEL);
147 if (!dup->functions)
148 goto err_free;
149 /* Duplicate counter arrays. */
150 for (i = 0; i < active ; i++) {
151 struct gcov_ctr_info *ctr = &info->counts[i];
152 size_t size = ctr->num * sizeof(gcov_type);
153
154 dup->counts[i].num = ctr->num;
155 dup->counts[i].merge = ctr->merge;
156 dup->counts[i].values = vmalloc(size);
157 if (!dup->counts[i].values)
158 goto err_free;
159 memcpy(dup->counts[i].values, ctr->values, size);
160 }
161 return dup;
162
163err_free:
164 gcov_info_free(dup);
165 return NULL;
166}
167
168/**
169 * gcov_info_free - release memory for profiling data set duplicate
170 * @info: profiling data set duplicate to free
171 */
172void gcov_info_free(struct gcov_info *info)
173{
174 unsigned int active = num_counter_active(info);
175 unsigned int i;
176
177 for (i = 0; i < active ; i++)
178 vfree(info->counts[i].values);
179 kfree(info->functions);
180 kfree(info->filename);
181 kfree(info);
182}
183
184/**
185 * struct type_info - iterator helper array
186 * @ctr_type: counter type
187 * @offset: index of the first value of the current function for this type
188 *
189 * This array is needed to convert the in-memory data format into the in-file
190 * data format:
191 *
192 * In-memory:
193 * for each counter type
194 * for each function
195 * values
196 *
197 * In-file:
198 * for each function
199 * for each counter type
200 * values
201 *
202 * See gcc source gcc/gcov-io.h for more information on data organization.
203 */
204struct type_info {
205 int ctr_type;
206 unsigned int offset;
207};
208
209/**
210 * struct gcov_iterator - specifies current file position in logical records
211 * @info: associated profiling data
212 * @record: record type
213 * @function: function number
214 * @type: counter type
215 * @count: index into values array
216 * @num_types: number of counter types
217 * @type_info: helper array to get values-array offset for current function
218 */
219struct gcov_iterator {
220 struct gcov_info *info;
221
222 int record;
223 unsigned int function;
224 unsigned int type;
225 unsigned int count;
226
227 int num_types;
228 struct type_info type_info[0];
229};
230
231static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
232{
233 return get_fn_info(iter->info, iter->function);
234}
235
236static struct type_info *get_type(struct gcov_iterator *iter)
237{
238 return &iter->type_info[iter->type];
239}
240
241/**
242 * gcov_iter_new - allocate and initialize profiling data iterator
243 * @info: profiling data set to be iterated
244 *
245 * Return file iterator on success, %NULL otherwise.
246 */
247struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
248{
249 struct gcov_iterator *iter;
250
251 iter = kzalloc(sizeof(struct gcov_iterator) +
252 num_counter_active(info) * sizeof(struct type_info),
253 GFP_KERNEL);
254 if (iter)
255 iter->info = info;
256
257 return iter;
258}
259
260/**
261 * gcov_iter_free - release memory for iterator
262 * @iter: file iterator to free
263 */
264void gcov_iter_free(struct gcov_iterator *iter)
265{
266 kfree(iter);
267}
268
269/**
270 * gcov_iter_get_info - return profiling data set for given file iterator
271 * @iter: file iterator
272 */
273struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
274{
275 return iter->info;
276}
277
278/**
279 * gcov_iter_start - reset file iterator to starting position
280 * @iter: file iterator
281 */
282void gcov_iter_start(struct gcov_iterator *iter)
283{
284 int i;
285
286 iter->record = 0;
287 iter->function = 0;
288 iter->type = 0;
289 iter->count = 0;
290 iter->num_types = 0;
291 for (i = 0; i < GCOV_COUNTERS; i++) {
292 if (counter_active(iter->info, i)) {
293 iter->type_info[iter->num_types].ctr_type = i;
294 iter->type_info[iter->num_types++].offset = 0;
295 }
296 }
297}
298
299/* Mapping of logical record number to actual file content. */
300#define RECORD_FILE_MAGIC 0
301#define RECORD_GCOV_VERSION 1
302#define RECORD_TIME_STAMP 2
303#define RECORD_FUNCTION_TAG 3
304#define RECORD_FUNCTON_TAG_LEN 4
305#define RECORD_FUNCTION_IDENT 5
306#define RECORD_FUNCTION_CHECK 6
307#define RECORD_COUNT_TAG 7
308#define RECORD_COUNT_LEN 8
309#define RECORD_COUNT 9
310
311/**
312 * gcov_iter_next - advance file iterator to next logical record
313 * @iter: file iterator
314 *
315 * Return zero if new position is valid, non-zero if iterator has reached end.
316 */
317int gcov_iter_next(struct gcov_iterator *iter)
318{
319 switch (iter->record) {
320 case RECORD_FILE_MAGIC:
321 case RECORD_GCOV_VERSION:
322 case RECORD_FUNCTION_TAG:
323 case RECORD_FUNCTON_TAG_LEN:
324 case RECORD_FUNCTION_IDENT:
325 case RECORD_COUNT_TAG:
326 /* Advance to next record */
327 iter->record++;
328 break;
329 case RECORD_COUNT:
330 /* Advance to next count */
331 iter->count++;
332 /* fall through */
333 case RECORD_COUNT_LEN:
334 if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
335 iter->record = 9;
336 break;
337 }
338 /* Advance to next counter type */
339 get_type(iter)->offset += iter->count;
340 iter->count = 0;
341 iter->type++;
342 /* fall through */
343 case RECORD_FUNCTION_CHECK:
344 if (iter->type < iter->num_types) {
345 iter->record = 7;
346 break;
347 }
348 /* Advance to next function */
349 iter->type = 0;
350 iter->function++;
351 /* fall through */
352 case RECORD_TIME_STAMP:
353 if (iter->function < iter->info->n_functions)
354 iter->record = 3;
355 else
356 iter->record = -1;
357 break;
358 }
359 /* Check for EOF. */
360 if (iter->record == -1)
361 return -EINVAL;
362 else
363 return 0;
364}
365
366/**
367 * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
368 * @seq: seq_file handle
369 * @v: value to be stored
370 *
371 * Number format defined by gcc: numbers are recorded in the 32 bit
372 * unsigned binary form of the endianness of the machine generating the
373 * file.
374 */
375static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
376{
377 return seq_write(seq, &v, sizeof(v));
378}
379
380/**
381 * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
382 * @seq: seq_file handle
383 * @v: value to be stored
384 *
385 * Number format defined by gcc: numbers are recorded in the 32 bit
386 * unsigned binary form of the endianness of the machine generating the
387 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
388 * first.
389 */
390static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
391{
392 u32 data[2];
393
394 data[0] = (v & 0xffffffffUL);
395 data[1] = (v >> 32);
396 return seq_write(seq, data, sizeof(data));
397}
398
399/**
400 * gcov_iter_write - write data for current pos to seq_file
401 * @iter: file iterator
402 * @seq: seq_file handle
403 *
404 * Return zero on success, non-zero otherwise.
405 */
406int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
407{
408 int rc = -EINVAL;
409
410 switch (iter->record) {
411 case RECORD_FILE_MAGIC:
412 rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
413 break;
414 case RECORD_GCOV_VERSION:
415 rc = seq_write_gcov_u32(seq, iter->info->version);
416 break;
417 case RECORD_TIME_STAMP:
418 rc = seq_write_gcov_u32(seq, iter->info->stamp);
419 break;
420 case RECORD_FUNCTION_TAG:
421 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
422 break;
423 case RECORD_FUNCTON_TAG_LEN:
424 rc = seq_write_gcov_u32(seq, 2);
425 break;
426 case RECORD_FUNCTION_IDENT:
427 rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
428 break;
429 case RECORD_FUNCTION_CHECK:
430 rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
431 break;
432 case RECORD_COUNT_TAG:
433 rc = seq_write_gcov_u32(seq,
434 GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
435 break;
436 case RECORD_COUNT_LEN:
437 rc = seq_write_gcov_u32(seq,
438 get_func(iter)->n_ctrs[iter->type] * 2);
439 break;
440 case RECORD_COUNT:
441 rc = seq_write_gcov_u64(seq,
442 iter->info->counts[iter->type].
443 values[iter->count + get_type(iter)->offset]);
444 break;
445 }
446 return rc;
447}
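
Editor's note: seq_write_gcov_u64() above captures the one non-obvious part of the emitted format: 64-bit counter values are written as two native-endian 32-bit words, low half first. A small sketch of that encoding and the matching decode, handy when inspecting a synthesized .gcda file by hand (helper names here are invented):

#include <stdint.h>
#include <stdio.h>

/* Encode a 64-bit counter the way seq_write_gcov_u64() does:
 * two 32-bit words in native endianness, low half first. */
static void gcov_u64_to_words(uint32_t out[2], uint64_t v)
{
	out[0] = (uint32_t)(v & 0xffffffffULL);
	out[1] = (uint32_t)(v >> 32);
}

static uint64_t gcov_u64_from_words(const uint32_t in[2])
{
	return ((uint64_t)in[1] << 32) | in[0];
}

int main(void)
{
	uint32_t words[2];
	uint64_t count = 0x1122334455667788ULL;

	gcov_u64_to_words(words, count);
	printf("low=0x%08x high=0x%08x round-trip %s\n",
	       words[0], words[1],
	       gcov_u64_from_words(words) == count ? "ok" : "broken");
	return 0;
}
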
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
new file mode 100644
index 000000000000..060073ebf7a6
--- /dev/null
+++ b/kernel/gcov/gcov.h
@@ -0,0 +1,128 @@
1/*
2 * Profiling infrastructure declarations.
3 *
4 * This file is based on gcc-internal definitions. Data structures are
5 * defined to be compatible with gcc counterparts. For a better
6 * understanding, refer to gcc source: gcc/gcov-io.h.
7 *
8 * Copyright IBM Corp. 2009
9 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
10 *
11 * Uses gcc-internal data definitions.
12 */
13
14#ifndef GCOV_H
15#define GCOV_H GCOV_H
16
17#include <linux/types.h>
18
19/*
20 * Profiling data types used for gcc 3.4 and above - these are defined by
21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible.
23 */
24#define GCOV_COUNTERS 5
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
28#define GCOV_TAG_FOR_COUNTER(count) \
29 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
30
31#if BITS_PER_LONG >= 64
32typedef long gcov_type;
33#else
34typedef long long gcov_type;
35#endif
36
37/**
38 * struct gcov_fn_info - profiling meta data per function
39 * @ident: object file-unique function identifier
40 * @checksum: function checksum
41 * @n_ctrs: number of values per counter type belonging to this function
42 *
43 * This data is generated by gcc during compilation and doesn't change
44 * at run-time.
45 */
46struct gcov_fn_info {
47 unsigned int ident;
48 unsigned int checksum;
49 unsigned int n_ctrs[0];
50};
51
52/**
53 * struct gcov_ctr_info - profiling data per counter type
54 * @num: number of counter values for this type
55 * @values: array of counter values for this type
56 * @merge: merge function for counter values of this type (unused)
57 *
58 * This data is generated by gcc during compilation and doesn't change
59 * at run-time with the exception of the values array.
60 */
61struct gcov_ctr_info {
62 unsigned int num;
63 gcov_type *values;
64 void (*merge)(gcov_type *, unsigned int);
65};
66
67/**
68 * struct gcov_info - profiling data per object file
69 * @version: gcov version magic indicating the gcc version used for compilation
70 * @next: list head for a singly-linked list
71 * @stamp: time stamp
72 * @filename: name of the associated gcov data file
73 * @n_functions: number of instrumented functions
74 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active
76 * @counts: counter data per counter type
77 *
78 * This data is generated by gcc during compilation and doesn't change
79 * at run-time with the exception of the next pointer.
80 */
81struct gcov_info {
82 unsigned int version;
83 struct gcov_info *next;
84 unsigned int stamp;
85 const char *filename;
86 unsigned int n_functions;
87 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask;
89 struct gcov_ctr_info counts[0];
90};
91
92/* Base interface. */
93enum gcov_action {
94 GCOV_ADD,
95 GCOV_REMOVE,
96};
97
98void gcov_event(enum gcov_action action, struct gcov_info *info);
99void gcov_enable_events(void);
100
101/* Iterator control. */
102struct seq_file;
103struct gcov_iterator;
104
105struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
106void gcov_iter_free(struct gcov_iterator *iter);
107void gcov_iter_start(struct gcov_iterator *iter);
108int gcov_iter_next(struct gcov_iterator *iter);
109int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
110struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
111
112/* gcov_info control. */
113void gcov_info_reset(struct gcov_info *info);
114int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
115void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
116struct gcov_info *gcov_info_dup(struct gcov_info *info);
117void gcov_info_free(struct gcov_info *info);
118
119struct gcov_link {
120 enum {
121 OBJ_TREE,
122 SRC_TREE,
123 } dir;
124 const char *ext;
125};
126extern const struct gcov_link gcov_link[];
127
128#endif /* GCOV_H */
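
The record tags in a gcov data stream are derived from the counter type via GCOV_TAG_FOR_COUNTER() above. A standalone illustration of that arithmetic (plain userspace C; only the macros are copied from the header, the loop and output are made up for the example):

    #include <stdio.h>

    #define GCOV_COUNTERS           5
    #define GCOV_TAG_COUNTER_BASE   ((unsigned int) 0x01a10000)
    #define GCOV_TAG_FOR_COUNTER(count) \
            (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))

    int main(void)
    {
        unsigned int type;

        /* One record tag per active counter type in the data stream. */
        for (type = 0; type < GCOV_COUNTERS; type++)
            printf("counter type %u -> tag 0x%08x\n",
                   type, GCOV_TAG_FOR_COUNTER(type));
        return 0;
    }

Counter type 0 maps to tag 0x01a10000 and each further type moves the tag up by 0x20000, which is the spacing gcc uses for per-type counter records.
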
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b675a67c9ac3..49da79ab8486 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
191 } 191 }
192} 192}
193 193
194
195/*
196 * Get the preferred target CPU for NOHZ
197 */
198static int hrtimer_get_target(int this_cpu, int pinned)
199{
200#ifdef CONFIG_NO_HZ
201 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
202 int preferred_cpu = get_nohz_load_balancer();
203
204 if (preferred_cpu >= 0)
205 return preferred_cpu;
206 }
207#endif
208 return this_cpu;
209}
210
211/*
212 * With HIGHRES=y we do not migrate the timer when it is expiring
213 * before the next event on the target cpu because we cannot reprogram
214 * the target cpu hardware and we would cause it to fire late.
215 *
216 * Called with cpu_base->lock of target cpu held.
217 */
218static int
219hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
220{
221#ifdef CONFIG_HIGH_RES_TIMERS
222 ktime_t expires;
223
224 if (!new_base->cpu_base->hres_active)
225 return 0;
226
227 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
228 return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
229#else
230 return 0;
231#endif
232}
233
194/* 234/*
195 * Switch the timer base to the current CPU when possible. 235 * Switch the timer base to the current CPU when possible.
196 */ 236 */
@@ -200,16 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
200{ 240{
201 struct hrtimer_clock_base *new_base; 241 struct hrtimer_clock_base *new_base;
202 struct hrtimer_cpu_base *new_cpu_base; 242 struct hrtimer_cpu_base *new_cpu_base;
203 int cpu, preferred_cpu = -1; 243 int this_cpu = smp_processor_id();
204 244 int cpu = hrtimer_get_target(this_cpu, pinned);
205 cpu = smp_processor_id();
206#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
207 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
208 preferred_cpu = get_nohz_load_balancer();
209 if (preferred_cpu >= 0)
210 cpu = preferred_cpu;
211 }
212#endif
213 245
214again: 246again:
215 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 247 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -217,7 +249,7 @@ again:
217 249
218 if (base != new_base) { 250 if (base != new_base) {
219 /* 251 /*
220 * We are trying to schedule the timer on the local CPU. 252 * We are trying to move timer to new_base.
221 * However we can't change timer's base while it is running, 253 * However we can't change timer's base while it is running,
222 * so we keep it on the same CPU. No hassle vs. reprogramming 254 * so we keep it on the same CPU. No hassle vs. reprogramming
223 * the event source in the high resolution case. The softirq 255 * the event source in the high resolution case. The softirq
@@ -233,38 +265,12 @@ again:
233 spin_unlock(&base->cpu_base->lock); 265 spin_unlock(&base->cpu_base->lock);
234 spin_lock(&new_base->cpu_base->lock); 266 spin_lock(&new_base->cpu_base->lock);
235 267
236 /* Optimized away for NOHZ=n SMP=n */ 268 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
237 if (cpu == preferred_cpu) { 269 cpu = this_cpu;
238 /* Calculate clock monotonic expiry time */ 270 spin_unlock(&new_base->cpu_base->lock);
239#ifdef CONFIG_HIGH_RES_TIMERS 271 spin_lock(&base->cpu_base->lock);
240 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), 272 timer->base = base;
241 new_base->offset); 273 goto again;
242#else
243 ktime_t expires = hrtimer_get_expires(timer);
244#endif
245
246 /*
247 * Get the next event on target cpu from the
248 * clock events layer.
249 * This covers the highres=off nohz=on case as well.
250 */
251 ktime_t next = clockevents_get_next_event(cpu);
252
253 ktime_t delta = ktime_sub(expires, next);
254
255 /*
256 * We do not migrate the timer when it is expiring
257 * before the next event on the target cpu because
258 * we cannot reprogram the target cpu hardware and
259 * we would cause it to fire late.
260 */
261 if (delta.tv64 < 0) {
262 cpu = smp_processor_id();
263 spin_unlock(&new_base->cpu_base->lock);
264 spin_lock(&base->cpu_base->lock);
265 timer->base = base;
266 goto again;
267 }
268 } 274 }
269 timer->base = new_base; 275 timer->base = new_base;
270 } 276 }
@@ -380,6 +386,8 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
380 return res; 386 return res;
381} 387}
382 388
389EXPORT_SYMBOL_GPL(ktime_add_safe);
390
383#ifdef CONFIG_DEBUG_OBJECTS_TIMERS 391#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
384 392
385static struct debug_obj_descr hrtimer_debug_descr; 393static struct debug_obj_descr hrtimer_debug_descr;
@@ -1274,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1274 1282
1275 expires_next.tv64 = KTIME_MAX; 1283 expires_next.tv64 = KTIME_MAX;
1276 1284
1285 spin_lock(&cpu_base->lock);
1286 /*
1287 * We set expires_next to KTIME_MAX here with cpu_base->lock
1288 * held to prevent that a timer is enqueued in our queue via
1289 * the migration code. This does not affect enqueueing of
1290 * timers which run their callback and need to be requeued on
1291 * this CPU.
1292 */
1293 cpu_base->expires_next.tv64 = KTIME_MAX;
1294
1277 base = cpu_base->clock_base; 1295 base = cpu_base->clock_base;
1278 1296
1279 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1297 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1280 ktime_t basenow; 1298 ktime_t basenow;
1281 struct rb_node *node; 1299 struct rb_node *node;
1282 1300
1283 spin_lock(&cpu_base->lock);
1284
1285 basenow = ktime_add(now, base->offset); 1301 basenow = ktime_add(now, base->offset);
1286 1302
1287 while ((node = base->first)) { 1303 while ((node = base->first)) {
@@ -1314,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1314 1330
1315 __run_hrtimer(timer); 1331 __run_hrtimer(timer);
1316 } 1332 }
1317 spin_unlock(&cpu_base->lock);
1318 base++; 1333 base++;
1319 } 1334 }
1320 1335
1336 /*
1337 * Store the new expiry value so the migration code can verify
1338 * against it.
1339 */
1321 cpu_base->expires_next = expires_next; 1340 cpu_base->expires_next = expires_next;
1341 spin_unlock(&cpu_base->lock);
1322 1342
1323 /* Reprogramming necessary ? */ 1343 /* Reprogramming necessary ? */
1324 if (expires_next.tv64 != KTIME_MAX) { 1344 if (expires_next.tv64 != KTIME_MAX) {
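
Taken together, hrtimer_get_target() and hrtimer_check_target() above decide whether a newly armed timer may leave the current CPU. A simplified standalone model of that decision (userspace C; pick_target(), must_stay_local() and all values are illustrative stand-ins, and the timer-migration sysctl and locking are omitted):

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef int64_t ktime_ns;           /* simplified stand-in for ktime_t */

    struct cpu_model {
        int id;
        bool hres_active;
        ktime_ns expires_next;          /* next event already programmed there */
    };

    /* Mirrors hrtimer_get_target(): prefer the nohz load balancer CPU
     * only for a non-pinned timer armed from an idle CPU. */
    static int pick_target(int this_cpu, bool pinned, bool this_cpu_idle,
                           int preferred_cpu)
    {
        if (!pinned && this_cpu_idle && preferred_cpu >= 0)
            return preferred_cpu;
        return this_cpu;
    }

    /* Mirrors hrtimer_check_target(): refuse the move when the timer would
     * expire before the target's next event, since the target's hardware
     * cannot be reprogrammed from here. */
    static bool must_stay_local(ktime_ns expires, const struct cpu_model *target)
    {
        if (!target->hres_active)
            return false;
        return expires <= target->expires_next;
    }

    int main(void)
    {
        struct cpu_model target = { .id = 2, .hres_active = true,
                                    .expires_next = 1000000 };
        int this_cpu = 0;
        ktime_ns expires = 500000;      /* earlier than the target's next event */
        int cpu = pick_target(this_cpu, false, true, target.id);

        if (cpu != this_cpu && must_stay_local(expires, &target))
            cpu = this_cpu;             /* fall back, as switch_hrtimer_base() does */
        printf("timer stays on CPU %d\n", cpu);
        return 0;
    }
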
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 73468253143b..e70ed5592eb9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -42,8 +42,7 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void 45extern void irq_set_thread_affinity(struct irq_desc *desc);
46irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
47 46
48/* 47/*
49 * Debugging printout: 48 * Debugging printout:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index aaf5c9d05770..0ec9ed831737 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83void 83/**
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84 * irq_set_thread_affinity - Notify irq threads to adjust affinity
85 * @desc: irq descriptor which has affinity changed
86 *
87 * We just set IRQTF_AFFINITY and delegate the affinity setting
88 * to the interrupt thread itself. We can not call
89 * set_cpus_allowed_ptr() here as we hold desc->lock and this
90 * code can be called from hard interrupt context.
91 */
92void irq_set_thread_affinity(struct irq_desc *desc)
85{ 93{
86 struct irqaction *action = desc->action; 94 struct irqaction *action = desc->action;
87 95
88 while (action) { 96 while (action) {
89 if (action->thread) 97 if (action->thread)
90 set_cpus_allowed_ptr(action->thread, cpumask); 98 set_bit(IRQTF_AFFINITY, &action->thread_flags);
91 action = action->next; 99 action = action->next;
92 } 100 }
93} 101}
@@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
112 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
113 if (!desc->chip->set_affinity(irq, cpumask)) { 121 if (!desc->chip->set_affinity(irq, cpumask)) {
114 cpumask_copy(desc->affinity, cpumask); 122 cpumask_copy(desc->affinity, cpumask);
115 irq_set_thread_affinity(desc, cpumask); 123 irq_set_thread_affinity(desc);
116 } 124 }
117 } 125 }
118 else { 126 else {
@@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
122#else 130#else
123 if (!desc->chip->set_affinity(irq, cpumask)) { 131 if (!desc->chip->set_affinity(irq, cpumask)) {
124 cpumask_copy(desc->affinity, cpumask); 132 cpumask_copy(desc->affinity, cpumask);
125 irq_set_thread_affinity(desc, cpumask); 133 irq_set_thread_affinity(desc);
126 } 134 }
127#endif 135#endif
128 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
@@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq)
176 spin_lock_irqsave(&desc->lock, flags); 184 spin_lock_irqsave(&desc->lock, flags);
177 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
178 if (!ret) 186 if (!ret)
179 irq_set_thread_affinity(desc, desc->affinity); 187 irq_set_thread_affinity(desc);
180 spin_unlock_irqrestore(&desc->lock, flags); 188 spin_unlock_irqrestore(&desc->lock, flags);
181 189
182 return ret; 190 return ret;
@@ -443,6 +451,39 @@ static int irq_wait_for_interrupt(struct irqaction *action)
443 return -1; 451 return -1;
444} 452}
445 453
454#ifdef CONFIG_SMP
455/*
456 * Check whether we need to change the affinity of the interrupt thread.
457 */
458static void
459irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
460{
461 cpumask_var_t mask;
462
463 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
464 return;
465
466 /*
467 * In case we are out of memory we set IRQTF_AFFINITY again and
468 * try again next time
469 */
470 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
471 set_bit(IRQTF_AFFINITY, &action->thread_flags);
472 return;
473 }
474
475 spin_lock_irq(&desc->lock);
476 cpumask_copy(mask, desc->affinity);
477 spin_unlock_irq(&desc->lock);
478
479 set_cpus_allowed_ptr(current, mask);
480 free_cpumask_var(mask);
481}
482#else
483static inline void
484irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
485#endif
486
446/* 487/*
447 * Interrupt handler thread 488 * Interrupt handler thread
448 */ 489 */
@@ -458,6 +499,8 @@ static int irq_thread(void *data)
458 499
459 while (!irq_wait_for_interrupt(action)) { 500 while (!irq_wait_for_interrupt(action)) {
460 501
502 irq_thread_check_affinity(desc, action);
503
461 atomic_inc(&desc->threads_active); 504 atomic_inc(&desc->threads_active);
462 505
463 spin_lock_irq(&desc->lock); 506 spin_lock_irq(&desc->lock);
@@ -564,7 +607,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
564 */ 607 */
565 get_task_struct(t); 608 get_task_struct(t);
566 new->thread = t; 609 new->thread = t;
567 wake_up_process(t);
568 } 610 }
569 611
570 /* 612 /*
@@ -647,6 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
647 (int)(new->flags & IRQF_TRIGGER_MASK)); 689 (int)(new->flags & IRQF_TRIGGER_MASK));
648 } 690 }
649 691
692 new->irq = irq;
650 *old_ptr = new; 693 *old_ptr = new;
651 694
652 /* Reset broken irq detection when installing new handler */ 695 /* Reset broken irq detection when installing new handler */
@@ -664,7 +707,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
664 707
665 spin_unlock_irqrestore(&desc->lock, flags); 708 spin_unlock_irqrestore(&desc->lock, flags);
666 709
667 new->irq = irq; 710 /*
711 * Strictly no need to wake it up, but hung_task complains
712 * when no hard interrupt wakes the thread up.
713 */
714 if (new->thread)
715 wake_up_process(new->thread);
716
668 register_irq_proc(irq, desc); 717 register_irq_proc(irq, desc);
669 new->dir = NULL; 718 new->dir = NULL;
670 register_handler_proc(irq, new); 719 register_handler_proc(irq, new);
@@ -718,7 +767,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
718{ 767{
719 struct irq_desc *desc = irq_to_desc(irq); 768 struct irq_desc *desc = irq_to_desc(irq);
720 struct irqaction *action, **action_ptr; 769 struct irqaction *action, **action_ptr;
721 struct task_struct *irqthread;
722 unsigned long flags; 770 unsigned long flags;
723 771
724 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); 772 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -766,9 +814,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
766 desc->chip->disable(irq); 814 desc->chip->disable(irq);
767 } 815 }
768 816
769 irqthread = action->thread;
770 action->thread = NULL;
771
772 spin_unlock_irqrestore(&desc->lock, flags); 817 spin_unlock_irqrestore(&desc->lock, flags);
773 818
774 unregister_handler_proc(irq, action); 819 unregister_handler_proc(irq, action);
@@ -776,12 +821,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
776 /* Make sure it's not being used on another CPU: */ 821 /* Make sure it's not being used on another CPU: */
777 synchronize_irq(irq); 822 synchronize_irq(irq);
778 823
779 if (irqthread) {
780 if (!test_bit(IRQTF_DIED, &action->thread_flags))
781 kthread_stop(irqthread);
782 put_task_struct(irqthread);
783 }
784
785#ifdef CONFIG_DEBUG_SHIRQ 824#ifdef CONFIG_DEBUG_SHIRQ
786 /* 825 /*
787 * It's a shared IRQ -- the driver ought to be prepared for an IRQ 826 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -797,6 +836,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
797 local_irq_restore(flags); 836 local_irq_restore(flags);
798 } 837 }
799#endif 838#endif
839
840 if (action->thread) {
841 if (!test_bit(IRQTF_DIED, &action->thread_flags))
842 kthread_stop(action->thread);
843 put_task_struct(action->thread);
844 }
845
800 return action; 846 return action;
801} 847}
802 848
@@ -856,7 +902,7 @@ EXPORT_SYMBOL(free_irq);
856 * still called in hard interrupt context and has to check 902 * still called in hard interrupt context and has to check
857 * whether the interrupt originates from the device. If yes it 903 * whether the interrupt originates from the device. If yes it
858 * needs to disable the interrupt on the device and return 904 * needs to disable the interrupt on the device and return
859 * IRQ_THREAD_WAKE which will wake up the handler thread and run 905 * IRQ_WAKE_THREAD which will wake up the handler thread and run
860 * @thread_fn. This split handler design is necessary to support 906 * @thread_fn. This split handler design is necessary to support
861 * shared interrupts. 907 * shared interrupts.
862 * 908 *
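
The affinity handling above changes from pushing the cpumask into the thread out of hard-irq context to a flag handshake: the hard-irq path only sets IRQTF_AFFINITY, and the irq thread applies the mask itself the next time it runs, since set_cpus_allowed_ptr() cannot be called under desc->lock. A rough userspace model of that handshake (C11 atomics standing in for set_bit()/test_and_clear_bit(); all names and the mask value are illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    #define IRQTF_AFFINITY_BIT      1u

    static atomic_uint thread_flags;
    static unsigned long desired_mask = 0x3;    /* stand-in for desc->affinity */

    /* Hard-irq side: must not sleep, so it only records the request
     * (what irq_set_thread_affinity() does in the patch above). */
    static void notify_affinity_changed(void)
    {
        atomic_fetch_or(&thread_flags, IRQTF_AFFINITY_BIT);
    }

    /* Thread side: runs in process context, so it may apply the mask
     * (what irq_thread_check_affinity() does in the patch above). */
    static void thread_check_affinity(void)
    {
        unsigned int old = atomic_fetch_and(&thread_flags, ~IRQTF_AFFINITY_BIT);

        if (!(old & IRQTF_AFFINITY_BIT))
            return;                     /* nothing pending */

        /* The real code copies desc->affinity under desc->lock and then
         * calls set_cpus_allowed_ptr(current, mask). */
        printf("thread re-binds itself to mask 0x%lx\n", desired_mask);
    }

    int main(void)
    {
        notify_affinity_changed();      /* as if irq_set_affinity() just ran */
        thread_check_affinity();        /* next pass of the irq_thread() loop */
        thread_check_affinity();        /* no-op: the flag was already consumed */
        return 0;
    }
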
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index cfe767ca1545..fcb6c96f2627 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -45,7 +45,7 @@ void move_masked_irq(int irq)
45 < nr_cpu_ids)) 45 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
47 cpumask_copy(desc->affinity, desc->pending_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc, desc->pending_mask); 48 irq_set_thread_affinity(desc);
49 } 49 }
50 50
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 2f69bee57bf2..3fd30197da2e 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -107,8 +107,8 @@ out_unlock:
107 107
108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
109{ 109{
110 /* those all static, do move them */ 110 /* those static or target node is -1, do not move them */
111 if (desc->irq < NR_IRQS_LEGACY) 111 if (desc->irq < NR_IRQS_LEGACY || node == -1)
112 return desc; 112 return desc;
113 113
114 if (desc->node != node) 114 if (desc->node != node)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ae1c35201cc8..f336e2107f98 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1228 } while (*cur++ == ','); 1228 } while (*cur++ == ',');
1229 1229
1230 if (*crash_size > 0) { 1230 if (*crash_size > 0) {
1231 while (*cur != ' ' && *cur != '@') 1231 while (*cur && *cur != ' ' && *cur != '@')
1232 cur++; 1232 cur++;
1233 if (*cur == '@') { 1233 if (*cur == '@') {
1234 cur++; 1234 cur++;
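
The one-character change above makes the size scan stop at the end of the command line as well as at ' ' or '@', so a crashkernel= range without an offset no longer walks past the terminating NUL. A self-contained reproduction of the fixed loop (the helper name and the inputs are made up for the example):

    #include <stdio.h>

    /* The fixed scan from parse_crashkernel_mem(): stop at '\0' as well
     * as at ' ' or '@', so "64M" without an offset terminates cleanly. */
    static const char *skip_size(const char *cur)
    {
        while (*cur && *cur != ' ' && *cur != '@')
            cur++;
        return cur;
    }

    int main(void)
    {
        const char *s = "64M";
        const char *with_base = skip_size("64M@16M");
        const char *without_base = skip_size(s);

        printf("with base: stopped at '%c'\n", *with_base);     /* '@' */
        printf("without base: stopped %d chars in, at the NUL\n",
               (int)(without_base - s));
        return 0;
    }
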
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bfc..a92280870e30 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 27#include <linux/completion.h>
29#include <linux/file.h> 28#include <linux/file.h>
30#include <linux/fdtable.h> 29#include <linux/fdtable.h>
@@ -38,6 +37,8 @@
38#include <linux/suspend.h> 37#include <linux/suspend.h>
39#include <asm/uaccess.h> 38#include <asm/uaccess.h>
40 39
40#include <trace/events/module.h>
41
41extern int max_threads; 42extern int max_threads;
42 43
43static struct workqueue_struct *khelper_wq; 44static struct workqueue_struct *khelper_wq;
@@ -109,6 +110,8 @@ int __request_module(bool wait, const char *fmt, ...)
109 return -ENOMEM; 110 return -ENOMEM;
110 } 111 }
111 112
113 trace_module_request(module_name, wait, _RET_IP_);
114
112 ret = call_usermodehelper(modprobe_path, argv, envp, 115 ret = call_usermodehelper(modprobe_path, argv, envp,
113 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 116 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
114 atomic_dec(&kmod_concurrent); 117 atomic_dec(&kmod_concurrent);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c0fa54b276d9..ef177d653b2c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) 103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
104 104
105struct kprobe_insn_page { 105struct kprobe_insn_page {
106 struct hlist_node hlist; 106 struct list_head list;
107 kprobe_opcode_t *insns; /* Page of instruction slots */ 107 kprobe_opcode_t *insns; /* Page of instruction slots */
108 char slot_used[INSNS_PER_PAGE]; 108 char slot_used[INSNS_PER_PAGE];
109 int nused; 109 int nused;
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
117}; 117};
118 118
119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
120static struct hlist_head kprobe_insn_pages; 120static LIST_HEAD(kprobe_insn_pages);
121static int kprobe_garbage_slots; 121static int kprobe_garbage_slots;
122static int collect_garbage_slots(void); 122static int collect_garbage_slots(void);
123 123
@@ -152,10 +152,9 @@ loop_end:
152static kprobe_opcode_t __kprobes *__get_insn_slot(void) 152static kprobe_opcode_t __kprobes *__get_insn_slot(void)
153{ 153{
154 struct kprobe_insn_page *kip; 154 struct kprobe_insn_page *kip;
155 struct hlist_node *pos;
156 155
157 retry: 156 retry:
158 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 157 list_for_each_entry(kip, &kprobe_insn_pages, list) {
159 if (kip->nused < INSNS_PER_PAGE) { 158 if (kip->nused < INSNS_PER_PAGE) {
160 int i; 159 int i;
161 for (i = 0; i < INSNS_PER_PAGE; i++) { 160 for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
189 kfree(kip); 188 kfree(kip);
190 return NULL; 189 return NULL;
191 } 190 }
192 INIT_HLIST_NODE(&kip->hlist); 191 INIT_LIST_HEAD(&kip->list);
193 hlist_add_head(&kip->hlist, &kprobe_insn_pages); 192 list_add(&kip->list, &kprobe_insn_pages);
194 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); 193 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
195 kip->slot_used[0] = SLOT_USED; 194 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 195 kip->nused = 1;
@@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
219 * so as not to have to set it up again the 218 * so as not to have to set it up again the
220 * next time somebody inserts a probe. 219 * next time somebody inserts a probe.
221 */ 220 */
222 hlist_del(&kip->hlist); 221 if (!list_is_singular(&kprobe_insn_pages)) {
223 if (hlist_empty(&kprobe_insn_pages)) { 222 list_del(&kip->list);
224 INIT_HLIST_NODE(&kip->hlist);
225 hlist_add_head(&kip->hlist,
226 &kprobe_insn_pages);
227 } else {
228 module_free(NULL, kip->insns); 223 module_free(NULL, kip->insns);
229 kfree(kip); 224 kfree(kip);
230 } 225 }
@@ -235,18 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
235 230
236static int __kprobes collect_garbage_slots(void) 231static int __kprobes collect_garbage_slots(void)
237{ 232{
238 struct kprobe_insn_page *kip; 233 struct kprobe_insn_page *kip, *next;
239 struct hlist_node *pos, *next;
240 int safety;
241 234
 242 /* Ensure no-one is preempted on the garbages */ 235
243 mutex_unlock(&kprobe_insn_mutex); 236 if (check_safety())
244 safety = check_safety();
245 mutex_lock(&kprobe_insn_mutex);
246 if (safety != 0)
247 return -EAGAIN; 237 return -EAGAIN;
248 238
249 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 239 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
250 int i; 240 int i;
251 if (kip->ngarbage == 0) 241 if (kip->ngarbage == 0)
252 continue; 242 continue;
@@ -264,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
264void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 254void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
265{ 255{
266 struct kprobe_insn_page *kip; 256 struct kprobe_insn_page *kip;
267 struct hlist_node *pos;
268 257
269 mutex_lock(&kprobe_insn_mutex); 258 mutex_lock(&kprobe_insn_mutex);
270 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 259 list_for_each_entry(kip, &kprobe_insn_pages, list) {
271 if (kip->insns <= slot && 260 if (kip->insns <= slot &&
272 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 261 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
273 int i = (slot - kip->insns) / MAX_INSN_SIZE; 262 int i = (slot - kip->insns) / MAX_INSN_SIZE;
274 if (dirty) { 263 if (dirty) {
275 kip->slot_used[i] = SLOT_DIRTY; 264 kip->slot_used[i] = SLOT_DIRTY;
276 kip->ngarbage++; 265 kip->ngarbage++;
277 } else { 266 } else
278 collect_one_slot(kip, i); 267 collect_one_slot(kip, i);
279 }
280 break; 268 break;
281 } 269 }
282 } 270 }
@@ -698,7 +686,7 @@ int __kprobes register_kprobe(struct kprobe *p)
698 p->addr = addr; 686 p->addr = addr;
699 687
700 preempt_disable(); 688 preempt_disable();
701 if (!__kernel_text_address((unsigned long) p->addr) || 689 if (!kernel_text_address((unsigned long) p->addr) ||
702 in_kprobes_functions((unsigned long) p->addr)) { 690 in_kprobes_functions((unsigned long) p->addr)) {
703 preempt_enable(); 691 preempt_enable();
704 return -EINVAL; 692 return -EINVAL;
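
With the hlist gone, collect_one_slot() above now frees an empty instruction page unless it is the only page left, which stays cached for the next probe insertion (the !list_is_singular() check). A toy model of that recycling rule (userspace C; the page counter stands in for the real list bookkeeping):

    #include <stdio.h>

    /* Model of the new rule in collect_one_slot(): an empty instruction
     * page is freed unless it is the only page left, which is kept
     * cached for the next kprobe insertion. */
    static int nr_pages = 3;

    static void page_became_empty(void)
    {
        if (nr_pages > 1) {
            nr_pages--;
            printf("page freed, %d page(s) remain\n", nr_pages);
        } else {
            printf("last page kept cached\n");
        }
    }

    int main(void)
    {
        page_became_empty();        /* 3 -> 2 */
        page_became_empty();        /* 2 -> 1 */
        page_became_empty();        /* kept */
        return 0;
    }
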
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7fa441333529..eb8751aa0418 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,7 +27,6 @@ struct kthread_create_info
27 /* Information passed to kthread() from kthreadd. */ 27 /* Information passed to kthread() from kthreadd. */
28 int (*threadfn)(void *data); 28 int (*threadfn)(void *data);
29 void *data; 29 void *data;
30 struct completion started;
31 30
32 /* Result passed back to kthread_create() from kthreadd. */ 31 /* Result passed back to kthread_create() from kthreadd. */
33 struct task_struct *result; 32 struct task_struct *result;
@@ -36,17 +35,13 @@ struct kthread_create_info
36 struct list_head list; 35 struct list_head list;
37}; 36};
38 37
39struct kthread_stop_info 38struct kthread {
40{ 39 int should_stop;
41 struct task_struct *k; 40 struct completion exited;
42 int err;
43 struct completion done;
44}; 41};
45 42
46/* Thread stopping is done by setthing this var: lock serializes 43#define to_kthread(tsk) \
47 * multiple kthread_stop calls. */ 44 container_of((tsk)->vfork_done, struct kthread, exited)
48static DEFINE_MUTEX(kthread_stop_lock);
49static struct kthread_stop_info kthread_stop_info;
50 45
51/** 46/**
52 * kthread_should_stop - should this kthread return now? 47 * kthread_should_stop - should this kthread return now?
@@ -57,36 +52,35 @@ static struct kthread_stop_info kthread_stop_info;
57 */ 52 */
58int kthread_should_stop(void) 53int kthread_should_stop(void)
59{ 54{
60 return (kthread_stop_info.k == current); 55 return to_kthread(current)->should_stop;
61} 56}
62EXPORT_SYMBOL(kthread_should_stop); 57EXPORT_SYMBOL(kthread_should_stop);
63 58
64static int kthread(void *_create) 59static int kthread(void *_create)
65{ 60{
61 /* Copy data: it's on kthread's stack */
66 struct kthread_create_info *create = _create; 62 struct kthread_create_info *create = _create;
67 int (*threadfn)(void *data); 63 int (*threadfn)(void *data) = create->threadfn;
68 void *data; 64 void *data = create->data;
69 int ret = -EINTR; 65 struct kthread self;
66 int ret;
70 67
71 /* Copy data: it's on kthread's stack */ 68 self.should_stop = 0;
72 threadfn = create->threadfn; 69 init_completion(&self.exited);
73 data = create->data; 70 current->vfork_done = &self.exited;
74 71
75 /* OK, tell user we're spawned, wait for stop or wakeup */ 72 /* OK, tell user we're spawned, wait for stop or wakeup */
76 __set_current_state(TASK_UNINTERRUPTIBLE); 73 __set_current_state(TASK_UNINTERRUPTIBLE);
77 create->result = current; 74 create->result = current;
78 complete(&create->started); 75 complete(&create->done);
79 schedule(); 76 schedule();
80 77
81 if (!kthread_should_stop()) 78 ret = -EINTR;
79 if (!self.should_stop)
82 ret = threadfn(data); 80 ret = threadfn(data);
83 81
84 /* It might have exited on its own, w/o kthread_stop. Check. */ 82 /* we can't just return, we must preserve "self" on stack */
85 if (kthread_should_stop()) { 83 do_exit(ret);
86 kthread_stop_info.err = ret;
87 complete(&kthread_stop_info.done);
88 }
89 return 0;
90} 84}
91 85
92static void create_kthread(struct kthread_create_info *create) 86static void create_kthread(struct kthread_create_info *create)
@@ -95,11 +89,10 @@ static void create_kthread(struct kthread_create_info *create)
95 89
96 /* We want our own signal handler (we take no signals by default). */ 90 /* We want our own signal handler (we take no signals by default). */
97 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 91 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
98 if (pid < 0) 92 if (pid < 0) {
99 create->result = ERR_PTR(pid); 93 create->result = ERR_PTR(pid);
100 else 94 complete(&create->done);
101 wait_for_completion(&create->started); 95 }
102 complete(&create->done);
103} 96}
104 97
105/** 98/**
@@ -130,7 +123,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
130 123
131 create.threadfn = threadfn; 124 create.threadfn = threadfn;
132 create.data = data; 125 create.data = data;
133 init_completion(&create.started);
134 init_completion(&create.done); 126 init_completion(&create.done);
135 127
136 spin_lock(&kthread_create_lock); 128 spin_lock(&kthread_create_lock);
@@ -188,40 +180,34 @@ EXPORT_SYMBOL(kthread_bind);
188 * @k: thread created by kthread_create(). 180 * @k: thread created by kthread_create().
189 * 181 *
190 * Sets kthread_should_stop() for @k to return true, wakes it, and 182 * Sets kthread_should_stop() for @k to return true, wakes it, and
191 * waits for it to exit. Your threadfn() must not call do_exit() 183 * waits for it to exit. This can also be called after kthread_create()
192 * itself if you use this function! This can also be called after 184 * instead of calling wake_up_process(): the thread will exit without
193 * kthread_create() instead of calling wake_up_process(): the thread 185 * calling threadfn().
194 * will exit without calling threadfn(). 186 *
187 * If threadfn() may call do_exit() itself, the caller must ensure
188 * task_struct can't go away.
195 * 189 *
196 * Returns the result of threadfn(), or %-EINTR if wake_up_process() 190 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
197 * was never called. 191 * was never called.
198 */ 192 */
199int kthread_stop(struct task_struct *k) 193int kthread_stop(struct task_struct *k)
200{ 194{
195 struct kthread *kthread;
201 int ret; 196 int ret;
202 197
203 mutex_lock(&kthread_stop_lock);
204
205 /* It could exit after stop_info.k set, but before wake_up_process. */
206 get_task_struct(k);
207
208 trace_sched_kthread_stop(k); 198 trace_sched_kthread_stop(k);
199 get_task_struct(k);
209 200
210 /* Must init completion *before* thread sees kthread_stop_info.k */ 201 kthread = to_kthread(k);
211 init_completion(&kthread_stop_info.done); 202 barrier(); /* it might have exited */
212 smp_wmb(); 203 if (k->vfork_done != NULL) {
204 kthread->should_stop = 1;
205 wake_up_process(k);
206 wait_for_completion(&kthread->exited);
207 }
208 ret = k->exit_code;
213 209
214 /* Now set kthread_should_stop() to true, and wake it up. */
215 kthread_stop_info.k = k;
216 wake_up_process(k);
217 put_task_struct(k); 210 put_task_struct(k);
218
219 /* Once it dies, reset stop ptr, gather result and we're done. */
220 wait_for_completion(&kthread_stop_info.done);
221 kthread_stop_info.k = NULL;
222 ret = kthread_stop_info.err;
223 mutex_unlock(&kthread_stop_lock);
224
225 trace_sched_kthread_stop_ret(ret); 211 trace_sched_kthread_stop_ret(ret);
226 212
227 return ret; 213 return ret;
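
The rework above replaces the single global kthread_stop_info with a per-thread struct kthread on the thread's own stack, reached through task->vfork_done: kthread_stop() sets should_stop, wakes the thread and waits on the exited completion. A userspace analogy of that protocol (pthreads; the join stands in for wait_for_completion() and every name here is a stand-in, not kernel API):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    /* should_stop travels with the worker; the "exited" completion is
     * modeled by pthread_join() instead of wait_for_completion(). */
    struct kthread_model {
        atomic_int should_stop;
    };

    static void *worker(void *arg)
    {
        struct kthread_model *self = arg;

        while (!atomic_load(&self->should_stop)) {
            /* threadfn() work would happen here */
            usleep(1000);
        }
        return (void *)0;           /* becomes the kthread_stop() result */
    }

    int main(void)
    {
        struct kthread_model self = { 0 };
        pthread_t t;
        void *ret;

        pthread_create(&t, NULL, worker, &self);

        /* kthread_stop(): set the flag, wake the thread, wait for exit. */
        atomic_store(&self.should_stop, 1);
        pthread_join(t, &ret);
        printf("worker stopped, result %ld\n", (long)ret);
        return 0;
    }
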
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa2d2c4..e94caa666dba 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void)
758 &proc_lockdep_stats_operations); 758 &proc_lockdep_stats_operations);
759 759
760#ifdef CONFIG_LOCK_STAT 760#ifdef CONFIG_LOCK_STAT
761 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); 761 proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
762 &proc_lock_stat_operations);
762#endif 763#endif
763 764
764 return 0; 765 return 0;
diff --git a/kernel/module.c b/kernel/module.c
index 215aaab09e91..46580edff0cb 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,11 @@
55#include <linux/percpu.h> 55#include <linux/percpu.h>
56#include <linux/kmemleak.h> 56#include <linux/kmemleak.h>
57 57
58#define CREATE_TRACE_POINTS
59#include <trace/events/module.h>
60
61EXPORT_TRACEPOINT_SYMBOL(module_get);
62
58#if 0 63#if 0
59#define DEBUGP printk 64#define DEBUGP printk
60#else 65#else
@@ -909,16 +914,18 @@ void __symbol_put(const char *symbol)
909} 914}
910EXPORT_SYMBOL(__symbol_put); 915EXPORT_SYMBOL(__symbol_put);
911 916
917/* Note this assumes addr is a function, which it currently always is. */
912void symbol_put_addr(void *addr) 918void symbol_put_addr(void *addr)
913{ 919{
914 struct module *modaddr; 920 struct module *modaddr;
921 unsigned long a = (unsigned long)dereference_function_descriptor(addr);
915 922
916 if (core_kernel_text((unsigned long)addr)) 923 if (core_kernel_text(a))
917 return; 924 return;
918 925
919 /* module_text_address is safe here: we're supposed to have reference 926 /* module_text_address is safe here: we're supposed to have reference
920 * to module from symbol_get, so it can't go away. */ 927 * to module from symbol_get, so it can't go away. */
921 modaddr = __module_text_address((unsigned long)addr); 928 modaddr = __module_text_address(a);
922 BUG_ON(!modaddr); 929 BUG_ON(!modaddr);
923 module_put(modaddr); 930 module_put(modaddr);
924} 931}
@@ -940,6 +947,8 @@ void module_put(struct module *module)
940 if (module) { 947 if (module) {
941 unsigned int cpu = get_cpu(); 948 unsigned int cpu = get_cpu();
942 local_dec(__module_ref_addr(module, cpu)); 949 local_dec(__module_ref_addr(module, cpu));
950 trace_module_put(module, _RET_IP_,
951 local_read(__module_ref_addr(module, cpu)));
943 /* Maybe they're waiting for us to drop reference? */ 952 /* Maybe they're waiting for us to drop reference? */
944 if (unlikely(!module_is_live(module))) 953 if (unlikely(!module_is_live(module)))
945 wake_up_process(module->waiter); 954 wake_up_process(module->waiter);
@@ -1068,7 +1077,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1068{ 1077{
1069 const unsigned long *crc; 1078 const unsigned long *crc;
1070 1079
1071 if (!find_symbol("module_layout", NULL, &crc, true, false)) 1080 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1081 &crc, true, false))
1072 BUG(); 1082 BUG();
1073 return check_version(sechdrs, versindex, "module_layout", mod, crc); 1083 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1074} 1084}
@@ -1271,6 +1281,10 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1271 struct module_notes_attrs *notes_attrs; 1281 struct module_notes_attrs *notes_attrs;
1272 struct bin_attribute *nattr; 1282 struct bin_attribute *nattr;
1273 1283
1284 /* failed to create section attributes, so can't create notes */
1285 if (!mod->sect_attrs)
1286 return;
1287
1274 /* Count notes sections and allocate structures. */ 1288 /* Count notes sections and allocate structures. */
1275 notes = 0; 1289 notes = 0;
1276 for (i = 0; i < nsect; i++) 1290 for (i = 0; i < nsect; i++)
@@ -1490,6 +1504,8 @@ static int __unlink_module(void *_mod)
1490/* Free a module, remove from lists, etc (must hold module_mutex). */ 1504/* Free a module, remove from lists, etc (must hold module_mutex). */
1491static void free_module(struct module *mod) 1505static void free_module(struct module *mod)
1492{ 1506{
1507 trace_module_free(mod);
1508
1493 /* Delete from various lists */ 1509 /* Delete from various lists */
1494 stop_machine(__unlink_module, mod, NULL); 1510 stop_machine(__unlink_module, mod, NULL);
1495 remove_notes_attrs(mod); 1511 remove_notes_attrs(mod);
@@ -2216,6 +2232,10 @@ static noinline struct module *load_module(void __user *umod,
2216 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, 2232 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2217 "__kcrctab_unused_gpl"); 2233 "__kcrctab_unused_gpl");
2218#endif 2234#endif
2235#ifdef CONFIG_CONSTRUCTORS
2236 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2237 sizeof(*mod->ctors), &mod->num_ctors);
2238#endif
2219 2239
2220#ifdef CONFIG_MARKERS 2240#ifdef CONFIG_MARKERS
2221 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", 2241 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
@@ -2353,6 +2373,8 @@ static noinline struct module *load_module(void __user *umod,
2353 /* Get rid of temporary copy */ 2373 /* Get rid of temporary copy */
2354 vfree(hdr); 2374 vfree(hdr);
2355 2375
2376 trace_module_load(mod);
2377
2356 /* Done! */ 2378 /* Done! */
2357 return mod; 2379 return mod;
2358 2380
@@ -2389,6 +2411,17 @@ static noinline struct module *load_module(void __user *umod,
2389 goto free_hdr; 2411 goto free_hdr;
2390} 2412}
2391 2413
2414/* Call module constructors. */
2415static void do_mod_ctors(struct module *mod)
2416{
2417#ifdef CONFIG_CONSTRUCTORS
2418 unsigned long i;
2419
2420 for (i = 0; i < mod->num_ctors; i++)
2421 mod->ctors[i]();
2422#endif
2423}
2424
2392/* This is where the real work happens */ 2425/* This is where the real work happens */
2393SYSCALL_DEFINE3(init_module, void __user *, umod, 2426SYSCALL_DEFINE3(init_module, void __user *, umod,
2394 unsigned long, len, const char __user *, uargs) 2427 unsigned long, len, const char __user *, uargs)
@@ -2417,6 +2450,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2417 blocking_notifier_call_chain(&module_notify_list, 2450 blocking_notifier_call_chain(&module_notify_list,
2418 MODULE_STATE_COMING, mod); 2451 MODULE_STATE_COMING, mod);
2419 2452
2453 do_mod_ctors(mod);
2420 /* Start the module */ 2454 /* Start the module */
2421 if (mod->init != NULL) 2455 if (mod->init != NULL)
2422 ret = do_one_initcall(mod->init); 2456 ret = do_one_initcall(mod->init);
@@ -2435,9 +2469,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2435 return ret; 2469 return ret;
2436 } 2470 }
2437 if (ret > 0) { 2471 if (ret > 0) {
2438 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " 2472 printk(KERN_WARNING
2439 "it should follow 0/-E convention\n" 2473"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2440 KERN_WARNING "%s: loading module anyway...\n", 2474"%s: loading module anyway...\n",
2441 __func__, mod->name, ret, 2475 __func__, mod->name, ret,
2442 __func__); 2476 __func__);
2443 dump_stack(); 2477 dump_stack();
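
load_module() now also collects the module's .ctors section, and do_mod_ctors() runs each entry before mod->init(), which is what gcov-profiled modules rely on to register their counters. A standalone sketch of the same pattern (userspace C; the table, the constructor body and all names are invented for the illustration):

    #include <stdio.h>

    typedef void (*ctor_t)(void);

    static void register_profiling_data(void)
    {
        /* For a gcov-profiled object, .ctors holds the gcc-generated
         * calls that register the file's counters. */
        printf("constructor ran before module init\n");
    }

    /* Stand-in for the .ctors table gathered by section_objs() above. */
    static ctor_t mod_ctors[] = { register_profiling_data };

    /* Same shape as do_mod_ctors(): walk the table before mod->init(). */
    static void run_ctors(ctor_t *ctors, unsigned long num)
    {
        unsigned long i;

        for (i = 0; i < num; i++)
            ctors[i]();
    }

    static int module_init_fn(void)
    {
        printf("module init runs second\n");
        return 0;
    }

    int main(void)
    {
        run_ctors(mod_ctors, sizeof(mod_ctors) / sizeof(mod_ctors[0]));
        return module_init_fn();
    }
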
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 63598dca2d0c..09b4ff9711b2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,14 @@ static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
28 28
29/* 29static inline struct nsproxy *create_nsproxy(void)
30 * creates a copy of "orig" with refcount 1.
31 */
32static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
33{ 30{
34 struct nsproxy *ns; 31 struct nsproxy *nsproxy;
35 32
36 ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); 33 nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
37 if (ns) { 34 if (nsproxy)
38 memcpy(ns, orig, sizeof(struct nsproxy)); 35 atomic_set(&nsproxy->count, 1);
39 atomic_set(&ns->count, 1); 36 return nsproxy;
40 }
41 return ns;
42} 37}
43 38
44/* 39/*
@@ -52,7 +47,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
52 struct nsproxy *new_nsp; 47 struct nsproxy *new_nsp;
53 int err; 48 int err;
54 49
55 new_nsp = clone_nsproxy(tsk->nsproxy); 50 new_nsp = create_nsproxy();
56 if (!new_nsp) 51 if (!new_nsp)
57 return ERR_PTR(-ENOMEM); 52 return ERR_PTR(-ENOMEM);
58 53
diff --git a/kernel/panic.c b/kernel/panic.c
index 984b3ecbd72c..512ab73b0ca3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -301,6 +301,7 @@ int oops_may_print(void)
301 */ 301 */
302void oops_enter(void) 302void oops_enter(void)
303{ 303{
304 tracing_off();
304 /* can't trust the integrity of the kernel anymore: */ 305 /* can't trust the integrity of the kernel anymore: */
305 debug_locks_off(); 306 debug_locks_off();
306 do_oops_enter_exit(); 307 do_oops_enter_exit();
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 29b685f551aa..d7cbc579fc80 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;
42static atomic_t nr_counters __read_mostly; 42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly; 43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly; 44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
45 46
46/* 47/*
47 * perf counter paranoia level: 48 * perf counter paranoia level:
@@ -49,7 +50,7 @@ static atomic_t nr_comm_counters __read_mostly;
49 * 1 - disallow cpu counters to unpriv 50 * 1 - disallow cpu counters to unpriv
50 * 2 - disallow kernel profiling to unpriv 51 * 2 - disallow kernel profiling to unpriv
51 */ 52 */
52int sysctl_perf_counter_paranoid __read_mostly; 53int sysctl_perf_counter_paranoid __read_mostly = 1;
53 54
54static inline bool perf_paranoid_cpu(void) 55static inline bool perf_paranoid_cpu(void)
55{ 56{
@@ -87,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); } 88void __weak hw_perf_enable(void) { barrier(); }
88 89
89void __weak hw_perf_counter_setup(int cpu) { barrier(); } 90void __weak hw_perf_counter_setup(int cpu) { barrier(); }
91void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
90 92
91int __weak 93int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader, 94hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -124,7 +126,7 @@ void perf_enable(void)
124 126
125static void get_ctx(struct perf_counter_context *ctx) 127static void get_ctx(struct perf_counter_context *ctx)
126{ 128{
127 atomic_inc(&ctx->refcount); 129 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
128} 130}
129 131
130static void free_ctx(struct rcu_head *head) 132static void free_ctx(struct rcu_head *head)
@@ -146,6 +148,28 @@ static void put_ctx(struct perf_counter_context *ctx)
146 } 148 }
147} 149}
148 150
151static void unclone_ctx(struct perf_counter_context *ctx)
152{
153 if (ctx->parent_ctx) {
154 put_ctx(ctx->parent_ctx);
155 ctx->parent_ctx = NULL;
156 }
157}
158
159/*
160 * If we inherit counters we want to return the parent counter id
161 * to userspace.
162 */
163static u64 primary_counter_id(struct perf_counter *counter)
164{
165 u64 id = counter->id;
166
167 if (counter->parent)
168 id = counter->parent->id;
169
170 return id;
171}
172
149/* 173/*
150 * Get the perf_counter_context for a task and lock it. 174 * Get the perf_counter_context for a task and lock it.
 151 * This has to cope with the fact that until it is locked, 175
@@ -175,6 +199,11 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
175 spin_unlock_irqrestore(&ctx->lock, *flags); 199 spin_unlock_irqrestore(&ctx->lock, *flags);
176 goto retry; 200 goto retry;
177 } 201 }
202
203 if (!atomic_inc_not_zero(&ctx->refcount)) {
204 spin_unlock_irqrestore(&ctx->lock, *flags);
205 ctx = NULL;
206 }
178 } 207 }
179 rcu_read_unlock(); 208 rcu_read_unlock();
180 return ctx; 209 return ctx;
@@ -193,7 +222,6 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta
193 ctx = perf_lock_task_context(task, &flags); 222 ctx = perf_lock_task_context(task, &flags);
194 if (ctx) { 223 if (ctx) {
195 ++ctx->pin_count; 224 ++ctx->pin_count;
196 get_ctx(ctx);
197 spin_unlock_irqrestore(&ctx->lock, flags); 225 spin_unlock_irqrestore(&ctx->lock, flags);
198 } 226 }
199 return ctx; 227 return ctx;
@@ -232,6 +260,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
232 260
233 list_add_rcu(&counter->event_entry, &ctx->event_list); 261 list_add_rcu(&counter->event_entry, &ctx->event_list);
234 ctx->nr_counters++; 262 ctx->nr_counters++;
263 if (counter->attr.inherit_stat)
264 ctx->nr_stat++;
235} 265}
236 266
237/* 267/*
@@ -246,6 +276,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
246 if (list_empty(&counter->list_entry)) 276 if (list_empty(&counter->list_entry))
247 return; 277 return;
248 ctx->nr_counters--; 278 ctx->nr_counters--;
279 if (counter->attr.inherit_stat)
280 ctx->nr_stat--;
249 281
250 list_del_init(&counter->list_entry); 282 list_del_init(&counter->list_entry);
251 list_del_rcu(&counter->event_entry); 283 list_del_rcu(&counter->event_entry);
@@ -275,6 +307,10 @@ counter_sched_out(struct perf_counter *counter,
275 return; 307 return;
276 308
277 counter->state = PERF_COUNTER_STATE_INACTIVE; 309 counter->state = PERF_COUNTER_STATE_INACTIVE;
310 if (counter->pending_disable) {
311 counter->pending_disable = 0;
312 counter->state = PERF_COUNTER_STATE_OFF;
313 }
278 counter->tstamp_stopped = ctx->time; 314 counter->tstamp_stopped = ctx->time;
279 counter->pmu->disable(counter); 315 counter->pmu->disable(counter);
280 counter->oncpu = -1; 316 counter->oncpu = -1;
@@ -1002,6 +1038,81 @@ static int context_equiv(struct perf_counter_context *ctx1,
1002 && !ctx1->pin_count && !ctx2->pin_count; 1038 && !ctx1->pin_count && !ctx2->pin_count;
1003} 1039}
1004 1040
1041static void __perf_counter_read(void *counter);
1042
1043static void __perf_counter_sync_stat(struct perf_counter *counter,
1044 struct perf_counter *next_counter)
1045{
1046 u64 value;
1047
1048 if (!counter->attr.inherit_stat)
1049 return;
1050
1051 /*
1052 * Update the counter value, we cannot use perf_counter_read()
1053 * because we're in the middle of a context switch and have IRQs
1054 * disabled, which upsets smp_call_function_single(), however
1055 * we know the counter must be on the current CPU, therefore we
1056 * don't need to use it.
1057 */
1058 switch (counter->state) {
1059 case PERF_COUNTER_STATE_ACTIVE:
1060 __perf_counter_read(counter);
1061 break;
1062
1063 case PERF_COUNTER_STATE_INACTIVE:
1064 update_counter_times(counter);
1065 break;
1066
1067 default:
1068 break;
1069 }
1070
1071 /*
1072 * In order to keep per-task stats reliable we need to flip the counter
1073 * values when we flip the contexts.
1074 */
1075 value = atomic64_read(&next_counter->count);
1076 value = atomic64_xchg(&counter->count, value);
1077 atomic64_set(&next_counter->count, value);
1078
1079 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1080 swap(counter->total_time_running, next_counter->total_time_running);
1081
1082 /*
1083 * Since we swizzled the values, update the user visible data too.
1084 */
1085 perf_counter_update_userpage(counter);
1086 perf_counter_update_userpage(next_counter);
1087}
1088
1089#define list_next_entry(pos, member) \
1090 list_entry(pos->member.next, typeof(*pos), member)
1091
1092static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1093 struct perf_counter_context *next_ctx)
1094{
1095 struct perf_counter *counter, *next_counter;
1096
1097 if (!ctx->nr_stat)
1098 return;
1099
1100 counter = list_first_entry(&ctx->event_list,
1101 struct perf_counter, event_entry);
1102
1103 next_counter = list_first_entry(&next_ctx->event_list,
1104 struct perf_counter, event_entry);
1105
1106 while (&counter->event_entry != &ctx->event_list &&
1107 &next_counter->event_entry != &next_ctx->event_list) {
1108
1109 __perf_counter_sync_stat(counter, next_counter);
1110
1111 counter = list_next_entry(counter, event_entry);
1112 next_counter = list_next_entry(next_counter, event_entry);
1113 }
1114}
1115
1005/* 1116/*
1006 * Called from scheduler to remove the counters of the current task, 1117 * Called from scheduler to remove the counters of the current task,
1007 * with interrupts disabled. 1118 * with interrupts disabled.
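
perf_counter_sync_stat() above keeps inherit_stat counters honest across a context switch by swapping the accumulated values between the outgoing and incoming contexts. A minimal model of that swap (userspace C; the three-field struct and the numbers are illustrative only):

    #include <stdio.h>
    #include <stdint.h>

    struct counter_model {
        uint64_t count;
        uint64_t time_enabled;
        uint64_t time_running;
    };

    /* Same idea as __perf_counter_sync_stat(): when the task contexts are
     * flipped on a context switch, flip the accumulated values too so the
     * per-task totals stay with the right task. */
    static void sync_stat(struct counter_model *prev, struct counter_model *next)
    {
        struct counter_model tmp = *prev;

        *prev = *next;
        *next = tmp;
    }

    int main(void)
    {
        struct counter_model prev = { 100, 10, 8 };
        struct counter_model next = {  40,  5, 5 };

        sync_stat(&prev, &next);
        printf("prev now holds %llu, next now holds %llu\n",
               (unsigned long long)prev.count,
               (unsigned long long)next.count);
        return 0;
    }
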
@@ -1057,6 +1168,8 @@ void perf_counter_task_sched_out(struct task_struct *task,
1057 ctx->task = next; 1168 ctx->task = next;
1058 next_ctx->task = task; 1169 next_ctx->task = task;
1059 do_switch = 0; 1170 do_switch = 0;
1171
1172 perf_counter_sync_stat(ctx, next_ctx);
1060 } 1173 }
1061 spin_unlock(&next_ctx->lock); 1174 spin_unlock(&next_ctx->lock);
1062 spin_unlock(&ctx->lock); 1175 spin_unlock(&ctx->lock);
@@ -1203,7 +1316,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1203#define MAX_INTERRUPTS (~0ULL) 1316#define MAX_INTERRUPTS (~0ULL)
1204 1317
1205static void perf_log_throttle(struct perf_counter *counter, int enable); 1318static void perf_log_throttle(struct perf_counter *counter, int enable);
1206static void perf_log_period(struct perf_counter *counter, u64 period);
1207 1319
1208static void perf_adjust_period(struct perf_counter *counter, u64 events) 1320static void perf_adjust_period(struct perf_counter *counter, u64 events)
1209{ 1321{
@@ -1222,8 +1334,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events)
1222 if (!sample_period) 1334 if (!sample_period)
1223 sample_period = 1; 1335 sample_period = 1;
1224 1336
1225 perf_log_period(counter, sample_period);
1226
1227 hwc->sample_period = sample_period; 1337 hwc->sample_period = sample_period;
1228} 1338}
1229 1339
@@ -1283,7 +1393,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1283 if (!interrupts) { 1393 if (!interrupts) {
1284 perf_disable(); 1394 perf_disable();
1285 counter->pmu->disable(counter); 1395 counter->pmu->disable(counter);
1286 atomic_set(&hwc->period_left, 0); 1396 atomic64_set(&hwc->period_left, 0);
1287 counter->pmu->enable(counter); 1397 counter->pmu->enable(counter);
1288 perf_enable(); 1398 perf_enable();
1289 } 1399 }
@@ -1344,14 +1454,70 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
1344} 1454}
1345 1455
1346/* 1456/*
1457 * Enable all of a task's counters that have been marked enable-on-exec.
1458 * This expects task == current.
1459 */
1460static void perf_counter_enable_on_exec(struct task_struct *task)
1461{
1462 struct perf_counter_context *ctx;
1463 struct perf_counter *counter;
1464 unsigned long flags;
1465 int enabled = 0;
1466
1467 local_irq_save(flags);
1468 ctx = task->perf_counter_ctxp;
1469 if (!ctx || !ctx->nr_counters)
1470 goto out;
1471
1472 __perf_counter_task_sched_out(ctx);
1473
1474 spin_lock(&ctx->lock);
1475
1476 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1477 if (!counter->attr.enable_on_exec)
1478 continue;
1479 counter->attr.enable_on_exec = 0;
1480 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1481 continue;
1482 counter->state = PERF_COUNTER_STATE_INACTIVE;
1483 counter->tstamp_enabled =
1484 ctx->time - counter->total_time_enabled;
1485 enabled = 1;
1486 }
1487
1488 /*
1489 * Unclone this context if we enabled any counter.
1490 */
1491 if (enabled)
1492 unclone_ctx(ctx);
1493
1494 spin_unlock(&ctx->lock);
1495
1496 perf_counter_task_sched_in(task, smp_processor_id());
1497 out:
1498 local_irq_restore(flags);
1499}
1500
1501/*
1347 * Cross CPU call to read the hardware counter 1502 * Cross CPU call to read the hardware counter
1348 */ 1503 */
1349static void __read(void *info) 1504static void __perf_counter_read(void *info)
1350{ 1505{
1506 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1351 struct perf_counter *counter = info; 1507 struct perf_counter *counter = info;
1352 struct perf_counter_context *ctx = counter->ctx; 1508 struct perf_counter_context *ctx = counter->ctx;
1353 unsigned long flags; 1509 unsigned long flags;
1354 1510
1511 /*
1512 * If this is a task context, we need to check whether it is
1513 * the current task context of this cpu. If not it has been
1514 * scheduled out before the smp call arrived. In that case
1515 * counter->count would have been updated to a recent sample
1516 * when the counter was scheduled out.
1517 */
1518 if (ctx->task && cpuctx->task_ctx != ctx)
1519 return;
1520
1355 local_irq_save(flags); 1521 local_irq_save(flags);
1356 if (ctx->is_active) 1522 if (ctx->is_active)
1357 update_context_time(ctx); 1523 update_context_time(ctx);
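
The hunk above also adds perf_counter_enable_on_exec(): counters created with attr.enable_on_exec stay OFF until the task execs, at which point the flag is consumed and the counter is armed exactly once. A small model of that one-shot rule (userspace C; the enum and the sample counters are made up, only the ordering OFF < INACTIVE < ACTIVE mirrors the kernel's state values):

    #include <stdio.h>
    #include <stdbool.h>

    enum state { STATE_OFF, STATE_INACTIVE, STATE_ACTIVE };

    struct counter_model {
        bool enable_on_exec;
        enum state state;
    };

    /* One-shot arming at exec time, as in perf_counter_enable_on_exec():
     * consume the flag, and only counters still OFF get armed. */
    static int enable_on_exec(struct counter_model *c, int n)
    {
        int enabled = 0, i;

        for (i = 0; i < n; i++) {
            if (!c[i].enable_on_exec)
                continue;
            c[i].enable_on_exec = false;
            if (c[i].state >= STATE_INACTIVE)
                continue;           /* already armed or running */
            c[i].state = STATE_INACTIVE;
            enabled++;
        }
        return enabled;             /* non-zero means the context is uncloned */
    }

    int main(void)
    {
        struct counter_model ctrs[] = {
            { true,  STATE_OFF },       /* armed at exec */
            { false, STATE_OFF },       /* untouched */
            { true,  STATE_ACTIVE },    /* already counting */
        };

        printf("%d counter(s) enabled at exec\n", enable_on_exec(ctrs, 3));
        return 0;
    }
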
@@ -1368,7 +1534,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
1368 */ 1534 */
1369 if (counter->state == PERF_COUNTER_STATE_ACTIVE) { 1535 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1370 smp_call_function_single(counter->oncpu, 1536 smp_call_function_single(counter->oncpu,
1371 __read, counter, 1); 1537 __perf_counter_read, counter, 1);
1372 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 1538 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1373 update_counter_times(counter); 1539 update_counter_times(counter);
1374 } 1540 }
@@ -1394,7 +1560,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
1394 1560
1395static struct perf_counter_context *find_get_context(pid_t pid, int cpu) 1561static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1396{ 1562{
1397 struct perf_counter_context *parent_ctx;
1398 struct perf_counter_context *ctx; 1563 struct perf_counter_context *ctx;
1399 struct perf_cpu_context *cpuctx; 1564 struct perf_cpu_context *cpuctx;
1400 struct task_struct *task; 1565 struct task_struct *task;
@@ -1454,16 +1619,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1454 retry: 1619 retry:
1455 ctx = perf_lock_task_context(task, &flags); 1620 ctx = perf_lock_task_context(task, &flags);
1456 if (ctx) { 1621 if (ctx) {
1457 parent_ctx = ctx->parent_ctx; 1622 unclone_ctx(ctx);
1458 if (parent_ctx) {
1459 put_ctx(parent_ctx);
1460 ctx->parent_ctx = NULL; /* no longer a clone */
1461 }
1462 /*
1463 * Get an extra reference before dropping the lock so that
1464 * this context won't get freed if the task exits.
1465 */
1466 get_ctx(ctx);
1467 spin_unlock_irqrestore(&ctx->lock, flags); 1623 spin_unlock_irqrestore(&ctx->lock, flags);
1468 } 1624 }
1469 1625
@@ -1509,11 +1665,15 @@ static void free_counter(struct perf_counter *counter)
1509{ 1665{
1510 perf_pending_sync(counter); 1666 perf_pending_sync(counter);
1511 1667
1512 atomic_dec(&nr_counters); 1668 if (!counter->parent) {
1513 if (counter->attr.mmap) 1669 atomic_dec(&nr_counters);
1514 atomic_dec(&nr_mmap_counters); 1670 if (counter->attr.mmap)
1515 if (counter->attr.comm) 1671 atomic_dec(&nr_mmap_counters);
1516 atomic_dec(&nr_comm_counters); 1672 if (counter->attr.comm)
1673 atomic_dec(&nr_comm_counters);
1674 if (counter->attr.task)
1675 atomic_dec(&nr_task_counters);
1676 }
1517 1677
1518 if (counter->destroy) 1678 if (counter->destroy)
1519 counter->destroy(counter); 1679 counter->destroy(counter);
@@ -1547,14 +1707,133 @@ static int perf_release(struct inode *inode, struct file *file)
1547 return 0; 1707 return 0;
1548} 1708}
1549 1709
1710static int perf_counter_read_size(struct perf_counter *counter)
1711{
1712 int entry = sizeof(u64); /* value */
1713 int size = 0;
1714 int nr = 1;
1715
1716 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1717 size += sizeof(u64);
1718
1719 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1720 size += sizeof(u64);
1721
1722 if (counter->attr.read_format & PERF_FORMAT_ID)
1723 entry += sizeof(u64);
1724
1725 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1726 nr += counter->group_leader->nr_siblings;
1727 size += sizeof(u64);
1728 }
1729
1730 size += entry * nr;
1731
1732 return size;
1733}
1734
1735static u64 perf_counter_read_value(struct perf_counter *counter)
1736{
1737 struct perf_counter *child;
1738 u64 total = 0;
1739
1740 total += perf_counter_read(counter);
1741 list_for_each_entry(child, &counter->child_list, child_list)
1742 total += perf_counter_read(child);
1743
1744 return total;
1745}
1746
1747static int perf_counter_read_entry(struct perf_counter *counter,
1748 u64 read_format, char __user *buf)
1749{
1750 int n = 0, count = 0;
1751 u64 values[2];
1752
1753 values[n++] = perf_counter_read_value(counter);
1754 if (read_format & PERF_FORMAT_ID)
1755 values[n++] = primary_counter_id(counter);
1756
1757 count = n * sizeof(u64);
1758
1759 if (copy_to_user(buf, values, count))
1760 return -EFAULT;
1761
1762 return count;
1763}
1764
1765static int perf_counter_read_group(struct perf_counter *counter,
1766 u64 read_format, char __user *buf)
1767{
1768 struct perf_counter *leader = counter->group_leader, *sub;
1769 int n = 0, size = 0, err = -EFAULT;
1770 u64 values[3];
1771
1772 values[n++] = 1 + leader->nr_siblings;
1773 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1774 values[n++] = leader->total_time_enabled +
1775 atomic64_read(&leader->child_total_time_enabled);
1776 }
1777 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1778 values[n++] = leader->total_time_running +
1779 atomic64_read(&leader->child_total_time_running);
1780 }
1781
1782 size = n * sizeof(u64);
1783
1784 if (copy_to_user(buf, values, size))
1785 return -EFAULT;
1786
1787 err = perf_counter_read_entry(leader, read_format, buf + size);
1788 if (err < 0)
1789 return err;
1790
1791 size += err;
1792
1793 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1794 err = perf_counter_read_entry(sub, read_format,
1795 buf + size);
1796 if (err < 0)
1797 return err;
1798
1799 size += err;
1800 }
1801
1802 return size;
1803}
1804
1805static int perf_counter_read_one(struct perf_counter *counter,
1806 u64 read_format, char __user *buf)
1807{
1808 u64 values[4];
1809 int n = 0;
1810
1811 values[n++] = perf_counter_read_value(counter);
1812 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1813 values[n++] = counter->total_time_enabled +
1814 atomic64_read(&counter->child_total_time_enabled);
1815 }
1816 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1817 values[n++] = counter->total_time_running +
1818 atomic64_read(&counter->child_total_time_running);
1819 }
1820 if (read_format & PERF_FORMAT_ID)
1821 values[n++] = primary_counter_id(counter);
1822
1823 if (copy_to_user(buf, values, n * sizeof(u64)))
1824 return -EFAULT;
1825
1826 return n * sizeof(u64);
1827}
1828
1550/* 1829/*
1551 * Read the performance counter - simple non blocking version for now 1830 * Read the performance counter - simple non blocking version for now
1552 */ 1831 */
1553static ssize_t 1832static ssize_t
1554perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) 1833perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1555{ 1834{
1556 u64 values[3]; 1835 u64 read_format = counter->attr.read_format;
1557 int n; 1836 int ret;
1558 1837
1559 /* 1838 /*
1560 * Return end-of-file for a read on a counter that is in 1839 * Return end-of-file for a read on a counter that is in
@@ -1564,28 +1843,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1564 if (counter->state == PERF_COUNTER_STATE_ERROR) 1843 if (counter->state == PERF_COUNTER_STATE_ERROR)
1565 return 0; 1844 return 0;
1566 1845
1846 if (count < perf_counter_read_size(counter))
1847 return -ENOSPC;
1848
1567 WARN_ON_ONCE(counter->ctx->parent_ctx); 1849 WARN_ON_ONCE(counter->ctx->parent_ctx);
1568 mutex_lock(&counter->child_mutex); 1850 mutex_lock(&counter->child_mutex);
1569 values[0] = perf_counter_read(counter); 1851 if (read_format & PERF_FORMAT_GROUP)
1570 n = 1; 1852 ret = perf_counter_read_group(counter, read_format, buf);
1571 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1853 else
1572 values[n++] = counter->total_time_enabled + 1854 ret = perf_counter_read_one(counter, read_format, buf);
1573 atomic64_read(&counter->child_total_time_enabled);
1574 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1575 values[n++] = counter->total_time_running +
1576 atomic64_read(&counter->child_total_time_running);
1577 if (counter->attr.read_format & PERF_FORMAT_ID)
1578 values[n++] = counter->id;
1579 mutex_unlock(&counter->child_mutex); 1855 mutex_unlock(&counter->child_mutex);
1580 1856
1581 if (count < n * sizeof(u64)) 1857 return ret;
1582 return -EINVAL;
1583 count = n * sizeof(u64);
1584
1585 if (copy_to_user(buf, values, count))
1586 return -EFAULT;
1587
1588 return count;
1589} 1858}
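Decoding those bytes follows directly from the helpers above. A hedged userspace sketch (no error handling; 'fd' is assumed to be a counter file descriptor obtained earlier from the perf_counter_open syscall, and the buffer is assumed large enough for the group):

#include <stdio.h>
#include <unistd.h>
#include <linux/perf_counter.h>

static void dump_counter(int fd, __u64 read_format)
{
	__u64 buf[64];			/* enough for a small group */
	ssize_t n = read(fd, buf, sizeof(buf));
	int i = 0;

	if (n <= 0)
		return;

	if (read_format & PERF_FORMAT_GROUP) {
		__u64 nr = buf[i++];	/* number of value entries that follow */

		if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
			printf("enabled %llu\n", (unsigned long long)buf[i++]);
		if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
			printf("running %llu\n", (unsigned long long)buf[i++]);
		while (nr--) {
			printf("value %llu", (unsigned long long)buf[i++]);
			if (read_format & PERF_FORMAT_ID)
				printf(" id %llu", (unsigned long long)buf[i++]);
			printf("\n");
		}
	} else {
		printf("value %llu\n", (unsigned long long)buf[i++]);
		if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
			printf("enabled %llu\n", (unsigned long long)buf[i++]);
		if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
			printf("running %llu\n", (unsigned long long)buf[i++]);
		if (read_format & PERF_FORMAT_ID)
			printf("id %llu\n", (unsigned long long)buf[i++]);
	}
}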
1590 1859
1591static ssize_t 1860static ssize_t
@@ -1620,22 +1889,6 @@ static void perf_counter_reset(struct perf_counter *counter)
1620 perf_counter_update_userpage(counter); 1889 perf_counter_update_userpage(counter);
1621} 1890}
1622 1891
1623static void perf_counter_for_each_sibling(struct perf_counter *counter,
1624 void (*func)(struct perf_counter *))
1625{
1626 struct perf_counter_context *ctx = counter->ctx;
1627 struct perf_counter *sibling;
1628
1629 WARN_ON_ONCE(ctx->parent_ctx);
1630 mutex_lock(&ctx->mutex);
1631 counter = counter->group_leader;
1632
1633 func(counter);
1634 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1635 func(sibling);
1636 mutex_unlock(&ctx->mutex);
1637}
1638
1639/* 1892/*
1640 * Holding the top-level counter's child_mutex means that any 1893 * Holding the top-level counter's child_mutex means that any
1641 * descendant process that has inherited this counter will block 1894 * descendant process that has inherited this counter will block
@@ -1658,14 +1911,18 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
1658static void perf_counter_for_each(struct perf_counter *counter, 1911static void perf_counter_for_each(struct perf_counter *counter,
1659 void (*func)(struct perf_counter *)) 1912 void (*func)(struct perf_counter *))
1660{ 1913{
1661 struct perf_counter *child; 1914 struct perf_counter_context *ctx = counter->ctx;
1915 struct perf_counter *sibling;
1662 1916
1663 WARN_ON_ONCE(counter->ctx->parent_ctx); 1917 WARN_ON_ONCE(ctx->parent_ctx);
1664 mutex_lock(&counter->child_mutex); 1918 mutex_lock(&ctx->mutex);
1665 perf_counter_for_each_sibling(counter, func); 1919 counter = counter->group_leader;
1666 list_for_each_entry(child, &counter->child_list, child_list) 1920
1667 perf_counter_for_each_sibling(child, func); 1921 perf_counter_for_each_child(counter, func);
1668 mutex_unlock(&counter->child_mutex); 1922 func(counter);
1923 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
 1924 perf_counter_for_each_child(sibling, func);
1925 mutex_unlock(&ctx->mutex);
1669} 1926}
1670 1927
1671static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) 1928static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
@@ -1694,8 +1951,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1694 1951
1695 counter->attr.sample_freq = value; 1952 counter->attr.sample_freq = value;
1696 } else { 1953 } else {
1697 perf_log_period(counter, value);
1698
1699 counter->attr.sample_period = value; 1954 counter->attr.sample_period = value;
1700 counter->hw.sample_period = value; 1955 counter->hw.sample_period = value;
1701 } 1956 }
@@ -1764,6 +2019,18 @@ int perf_counter_task_disable(void)
1764 return 0; 2019 return 0;
1765} 2020}
1766 2021
2022#ifndef PERF_COUNTER_INDEX_OFFSET
2023# define PERF_COUNTER_INDEX_OFFSET 0
2024#endif
2025
2026static int perf_counter_index(struct perf_counter *counter)
2027{
2028 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2029 return 0;
2030
2031 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2032}
2033
1767/* 2034/*
1768 * Callers need to ensure there can be no nesting of this function, otherwise 2035 * Callers need to ensure there can be no nesting of this function, otherwise
1769 * the seqlock logic goes bad. We can not serialize this because the arch 2036 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -1788,11 +2055,17 @@ void perf_counter_update_userpage(struct perf_counter *counter)
1788 preempt_disable(); 2055 preempt_disable();
1789 ++userpg->lock; 2056 ++userpg->lock;
1790 barrier(); 2057 barrier();
1791 userpg->index = counter->hw.idx; 2058 userpg->index = perf_counter_index(counter);
1792 userpg->offset = atomic64_read(&counter->count); 2059 userpg->offset = atomic64_read(&counter->count);
1793 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 2060 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1794 userpg->offset -= atomic64_read(&counter->hw.prev_count); 2061 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1795 2062
2063 userpg->time_enabled = counter->total_time_enabled +
2064 atomic64_read(&counter->child_total_time_enabled);
2065
2066 userpg->time_running = counter->total_time_running +
2067 atomic64_read(&counter->child_total_time_running);
2068
1796 barrier(); 2069 barrier();
1797 ++userpg->lock; 2070 ++userpg->lock;
1798 preempt_enable(); 2071 preempt_enable();
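The lock/index/offset fields, plus the time_enabled/time_running totals now exported here, are meant to be sampled from userspace with a retry loop around the lock word. A consumer-side sketch (hypothetical; 'pc' is the struct perf_counter_mmap_page at offset 0 of the counter mapping, and the arch-specific hardware read selected by 'index' is left out):

#include <linux/perf_counter.h>

#define barrier()	__asm__ __volatile__("" ::: "memory")

static long long read_self_counter(volatile struct perf_counter_mmap_page *pc,
				   unsigned long long *enabled,
				   unsigned long long *running)
{
	long long count;
	unsigned int seq;

	do {
		seq = pc->lock;
		barrier();

		count    = pc->offset;
		*enabled = pc->time_enabled;
		*running = pc->time_running;
		/*
		 * When pc->index is non-zero the counter is live on the PMU
		 * and an arch-specific userspace read (for instance RDPMC
		 * with pc->index - 1 on x86) would be added to count here;
		 * omitted in this sketch.
		 */

		barrier();
	} while (pc->lock != seq);

	return count;
}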
@@ -1806,6 +2079,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1806 struct perf_mmap_data *data; 2079 struct perf_mmap_data *data;
1807 int ret = VM_FAULT_SIGBUS; 2080 int ret = VM_FAULT_SIGBUS;
1808 2081
2082 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2083 if (vmf->pgoff == 0)
2084 ret = 0;
2085 return ret;
2086 }
2087
1809 rcu_read_lock(); 2088 rcu_read_lock();
1810 data = rcu_dereference(counter->data); 2089 data = rcu_dereference(counter->data);
1811 if (!data) 2090 if (!data)
@@ -1819,9 +2098,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1819 if ((unsigned)nr > data->nr_pages) 2098 if ((unsigned)nr > data->nr_pages)
1820 goto unlock; 2099 goto unlock;
1821 2100
2101 if (vmf->flags & FAULT_FLAG_WRITE)
2102 goto unlock;
2103
1822 vmf->page = virt_to_page(data->data_pages[nr]); 2104 vmf->page = virt_to_page(data->data_pages[nr]);
1823 } 2105 }
2106
1824 get_page(vmf->page); 2107 get_page(vmf->page);
2108 vmf->page->mapping = vma->vm_file->f_mapping;
2109 vmf->page->index = vmf->pgoff;
2110
1825 ret = 0; 2111 ret = 0;
1826unlock: 2112unlock:
1827 rcu_read_unlock(); 2113 rcu_read_unlock();
@@ -1874,6 +2160,14 @@ fail:
1874 return -ENOMEM; 2160 return -ENOMEM;
1875} 2161}
1876 2162
2163static void perf_mmap_free_page(unsigned long addr)
2164{
2165 struct page *page = virt_to_page((void *)addr);
2166
2167 page->mapping = NULL;
2168 __free_page(page);
2169}
2170
1877static void __perf_mmap_data_free(struct rcu_head *rcu_head) 2171static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1878{ 2172{
1879 struct perf_mmap_data *data; 2173 struct perf_mmap_data *data;
@@ -1881,9 +2175,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1881 2175
1882 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2176 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
1883 2177
1884 free_page((unsigned long)data->user_page); 2178 perf_mmap_free_page((unsigned long)data->user_page);
1885 for (i = 0; i < data->nr_pages; i++) 2179 for (i = 0; i < data->nr_pages; i++)
1886 free_page((unsigned long)data->data_pages[i]); 2180 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2181
1887 kfree(data); 2182 kfree(data);
1888} 2183}
1889 2184
@@ -1920,9 +2215,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
1920} 2215}
1921 2216
1922static struct vm_operations_struct perf_mmap_vmops = { 2217static struct vm_operations_struct perf_mmap_vmops = {
1923 .open = perf_mmap_open, 2218 .open = perf_mmap_open,
1924 .close = perf_mmap_close, 2219 .close = perf_mmap_close,
1925 .fault = perf_mmap_fault, 2220 .fault = perf_mmap_fault,
2221 .page_mkwrite = perf_mmap_fault,
1926}; 2222};
1927 2223
1928static int perf_mmap(struct file *file, struct vm_area_struct *vma) 2224static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1936,7 +2232,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1936 long user_extra, extra; 2232 long user_extra, extra;
1937 int ret = 0; 2233 int ret = 0;
1938 2234
1939 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) 2235 if (!(vma->vm_flags & VM_SHARED))
1940 return -EINVAL; 2236 return -EINVAL;
1941 2237
1942 vma_size = vma->vm_end - vma->vm_start; 2238 vma_size = vma->vm_end - vma->vm_start;
@@ -1995,10 +2291,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1995 atomic_long_add(user_extra, &user->locked_vm); 2291 atomic_long_add(user_extra, &user->locked_vm);
1996 vma->vm_mm->locked_vm += extra; 2292 vma->vm_mm->locked_vm += extra;
1997 counter->data->nr_locked = extra; 2293 counter->data->nr_locked = extra;
2294 if (vma->vm_flags & VM_WRITE)
2295 counter->data->writable = 1;
2296
1998unlock: 2297unlock:
1999 mutex_unlock(&counter->mmap_mutex); 2298 mutex_unlock(&counter->mmap_mutex);
2000 2299
2001 vma->vm_flags &= ~VM_MAYWRITE;
2002 vma->vm_flags |= VM_RESERVED; 2300 vma->vm_flags |= VM_RESERVED;
2003 vma->vm_ops = &perf_mmap_vmops; 2301 vma->vm_ops = &perf_mmap_vmops;
2004 2302
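With the VM_WRITE rejection above removed, the mapping protection becomes the userspace opt-in for flow control: a read-only mapping keeps the old overwrite behaviour, while a read-write mapping sets data->writable and makes the kernel account lost records instead of overwriting unread ones. A sketch of the matching call (hypothetical; 'fd' is a counter file descriptor and nr_pages is expected to be a power of two):

#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

static void *map_counter_buffer(int fd, unsigned int nr_pages)
{
	size_t page = sysconf(_SC_PAGESIZE);
	size_t len = (size_t)(nr_pages + 1) * page;	/* control page + data pages */
	void *base;

	/* PROT_WRITE marks the buffer writable, so data_tail will be honoured */
	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED) {
		perror("mmap");
		return NULL;
	}
	return base;
}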
@@ -2064,7 +2362,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)
2064 2362
2065 if (counter->pending_disable) { 2363 if (counter->pending_disable) {
2066 counter->pending_disable = 0; 2364 counter->pending_disable = 0;
2067 perf_counter_disable(counter); 2365 __perf_counter_disable(counter);
2068 } 2366 }
2069 2367
2070 if (counter->pending_wakeup) { 2368 if (counter->pending_wakeup) {
@@ -2175,11 +2473,38 @@ struct perf_output_handle {
2175 unsigned long head; 2473 unsigned long head;
2176 unsigned long offset; 2474 unsigned long offset;
2177 int nmi; 2475 int nmi;
2178 int overflow; 2476 int sample;
2179 int locked; 2477 int locked;
2180 unsigned long flags; 2478 unsigned long flags;
2181}; 2479};
2182 2480
2481static bool perf_output_space(struct perf_mmap_data *data,
2482 unsigned int offset, unsigned int head)
2483{
2484 unsigned long tail;
2485 unsigned long mask;
2486
2487 if (!data->writable)
2488 return true;
2489
2490 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2491 /*
 2492 * Userspace could choose to issue an mb() before updating the tail
 2493 * pointer, so that all reads are completed before the write is
 2494 * issued.
2495 */
2496 tail = ACCESS_ONCE(data->user_page->data_tail);
2497 smp_rmb();
2498
2499 offset = (offset - tail) & mask;
2500 head = (head - tail) & mask;
2501
2502 if ((int)(head - offset) < 0)
2503 return false;
2504
2505 return true;
2506}
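perf_output_space() is the kernel half of the protocol sketched in the comment above; the userspace half reads data_head, consumes records, and only then publishes data_tail behind a barrier so the space can be reused. A hedged consumer sketch (assumes the data_head/data_tail fields of struct perf_counter_mmap_page, 'data' pointing at the first data page of the mapping, a power-of-two data_size, and, for brevity, records that do not wrap the buffer edge):

#include <linux/perf_counter.h>

#define rmb()	__sync_synchronize()
#define mb()	__sync_synchronize()

static void drain_buffer(struct perf_counter_mmap_page *pc,
			 unsigned char *data, unsigned long long data_size,
			 void (*handle)(struct perf_event_header *))
{
	unsigned long long head = pc->data_head;
	unsigned long long tail = pc->data_tail;

	rmb();				/* read head before any record bytes */

	while (tail < head) {
		struct perf_event_header *hdr;

		hdr = (struct perf_event_header *)&data[tail & (data_size - 1)];
		handle(hdr);		/* a real consumer must also handle wrap */
		tail += hdr->size;
	}

	mb();				/* finish all reads before freeing the space */
	pc->data_tail = tail;
}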
2507
2183static void perf_output_wakeup(struct perf_output_handle *handle) 2508static void perf_output_wakeup(struct perf_output_handle *handle)
2184{ 2509{
2185 atomic_set(&handle->data->poll, POLL_IN); 2510 atomic_set(&handle->data->poll, POLL_IN);
@@ -2270,12 +2595,57 @@ out:
2270 local_irq_restore(handle->flags); 2595 local_irq_restore(handle->flags);
2271} 2596}
2272 2597
2598static void perf_output_copy(struct perf_output_handle *handle,
2599 const void *buf, unsigned int len)
2600{
2601 unsigned int pages_mask;
2602 unsigned int offset;
2603 unsigned int size;
2604 void **pages;
2605
2606 offset = handle->offset;
2607 pages_mask = handle->data->nr_pages - 1;
2608 pages = handle->data->data_pages;
2609
2610 do {
2611 unsigned int page_offset;
2612 int nr;
2613
2614 nr = (offset >> PAGE_SHIFT) & pages_mask;
2615 page_offset = offset & (PAGE_SIZE - 1);
2616 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2617
2618 memcpy(pages[nr] + page_offset, buf, size);
2619
2620 len -= size;
2621 buf += size;
2622 offset += size;
2623 } while (len);
2624
2625 handle->offset = offset;
2626
2627 /*
2628 * Check we didn't copy past our reservation window, taking the
2629 * possible unsigned int wrap into account.
2630 */
2631 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2632}
2633
2634#define perf_output_put(handle, x) \
2635 perf_output_copy((handle), &(x), sizeof(x))
2636
2273static int perf_output_begin(struct perf_output_handle *handle, 2637static int perf_output_begin(struct perf_output_handle *handle,
2274 struct perf_counter *counter, unsigned int size, 2638 struct perf_counter *counter, unsigned int size,
2275 int nmi, int overflow) 2639 int nmi, int sample)
2276{ 2640{
2277 struct perf_mmap_data *data; 2641 struct perf_mmap_data *data;
2278 unsigned int offset, head; 2642 unsigned int offset, head;
2643 int have_lost;
2644 struct {
2645 struct perf_event_header header;
2646 u64 id;
2647 u64 lost;
2648 } lost_event;
2279 2649
2280 /* 2650 /*
2281 * For inherited counters we send all the output towards the parent. 2651 * For inherited counters we send all the output towards the parent.
@@ -2288,19 +2658,25 @@ static int perf_output_begin(struct perf_output_handle *handle,
2288 if (!data) 2658 if (!data)
2289 goto out; 2659 goto out;
2290 2660
2291 handle->data = data; 2661 handle->data = data;
2292 handle->counter = counter; 2662 handle->counter = counter;
2293 handle->nmi = nmi; 2663 handle->nmi = nmi;
2294 handle->overflow = overflow; 2664 handle->sample = sample;
2295 2665
2296 if (!data->nr_pages) 2666 if (!data->nr_pages)
2297 goto fail; 2667 goto fail;
2298 2668
2669 have_lost = atomic_read(&data->lost);
2670 if (have_lost)
2671 size += sizeof(lost_event);
2672
2299 perf_output_lock(handle); 2673 perf_output_lock(handle);
2300 2674
2301 do { 2675 do {
2302 offset = head = atomic_long_read(&data->head); 2676 offset = head = atomic_long_read(&data->head);
2303 head += size; 2677 head += size;
2678 if (unlikely(!perf_output_space(data, offset, head)))
2679 goto fail;
2304 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 2680 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2305 2681
2306 handle->offset = offset; 2682 handle->offset = offset;
@@ -2309,55 +2685,27 @@ static int perf_output_begin(struct perf_output_handle *handle,
2309 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) 2685 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2310 atomic_set(&data->wakeup, 1); 2686 atomic_set(&data->wakeup, 1);
2311 2687
2688 if (have_lost) {
2689 lost_event.header.type = PERF_EVENT_LOST;
2690 lost_event.header.misc = 0;
2691 lost_event.header.size = sizeof(lost_event);
2692 lost_event.id = counter->id;
2693 lost_event.lost = atomic_xchg(&data->lost, 0);
2694
2695 perf_output_put(handle, lost_event);
2696 }
2697
2312 return 0; 2698 return 0;
2313 2699
2314fail: 2700fail:
2315 perf_output_wakeup(handle); 2701 atomic_inc(&data->lost);
2702 perf_output_unlock(handle);
2316out: 2703out:
2317 rcu_read_unlock(); 2704 rcu_read_unlock();
2318 2705
2319 return -ENOSPC; 2706 return -ENOSPC;
2320} 2707}
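The lost_event written above is what a non-overwriting reader sees after it fell behind. Its layout, mirrored as a consumer-side declaration (a sketch, not an exported kernel structure): 'id' is the emitting counter's id and 'lost' the number of records dropped since the last successful reservation.

#include <linux/types.h>
#include <linux/perf_counter.h>

struct lost_record {
	struct perf_event_header header;	/* .type == PERF_EVENT_LOST */
	__u64 id;				/* counter that dropped records */
	__u64 lost;				/* how many records were dropped */
};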
2321 2708
2322static void perf_output_copy(struct perf_output_handle *handle,
2323 const void *buf, unsigned int len)
2324{
2325 unsigned int pages_mask;
2326 unsigned int offset;
2327 unsigned int size;
2328 void **pages;
2329
2330 offset = handle->offset;
2331 pages_mask = handle->data->nr_pages - 1;
2332 pages = handle->data->data_pages;
2333
2334 do {
2335 unsigned int page_offset;
2336 int nr;
2337
2338 nr = (offset >> PAGE_SHIFT) & pages_mask;
2339 page_offset = offset & (PAGE_SIZE - 1);
2340 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2341
2342 memcpy(pages[nr] + page_offset, buf, size);
2343
2344 len -= size;
2345 buf += size;
2346 offset += size;
2347 } while (len);
2348
2349 handle->offset = offset;
2350
2351 /*
2352 * Check we didn't copy past our reservation window, taking the
2353 * possible unsigned int wrap into account.
2354 */
2355 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2356}
2357
2358#define perf_output_put(handle, x) \
2359 perf_output_copy((handle), &(x), sizeof(x))
2360
2361static void perf_output_end(struct perf_output_handle *handle) 2709static void perf_output_end(struct perf_output_handle *handle)
2362{ 2710{
2363 struct perf_counter *counter = handle->counter; 2711 struct perf_counter *counter = handle->counter;
@@ -2365,7 +2713,7 @@ static void perf_output_end(struct perf_output_handle *handle)
2365 2713
2366 int wakeup_events = counter->attr.wakeup_events; 2714 int wakeup_events = counter->attr.wakeup_events;
2367 2715
2368 if (handle->overflow && wakeup_events) { 2716 if (handle->sample && wakeup_events) {
2369 int events = atomic_inc_return(&data->events); 2717 int events = atomic_inc_return(&data->events);
2370 if (events >= wakeup_events) { 2718 if (events >= wakeup_events) {
2371 atomic_sub(wakeup_events, &data->events); 2719 atomic_sub(wakeup_events, &data->events);
@@ -2399,7 +2747,80 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2399 return task_pid_nr_ns(p, counter->ns); 2747 return task_pid_nr_ns(p, counter->ns);
2400} 2748}
2401 2749
2402static void perf_counter_output(struct perf_counter *counter, int nmi, 2750static void perf_output_read_one(struct perf_output_handle *handle,
2751 struct perf_counter *counter)
2752{
2753 u64 read_format = counter->attr.read_format;
2754 u64 values[4];
2755 int n = 0;
2756
2757 values[n++] = atomic64_read(&counter->count);
2758 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2759 values[n++] = counter->total_time_enabled +
2760 atomic64_read(&counter->child_total_time_enabled);
2761 }
2762 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2763 values[n++] = counter->total_time_running +
2764 atomic64_read(&counter->child_total_time_running);
2765 }
2766 if (read_format & PERF_FORMAT_ID)
2767 values[n++] = primary_counter_id(counter);
2768
2769 perf_output_copy(handle, values, n * sizeof(u64));
2770}
2771
2772/*
2773 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2774 */
2775static void perf_output_read_group(struct perf_output_handle *handle,
2776 struct perf_counter *counter)
2777{
2778 struct perf_counter *leader = counter->group_leader, *sub;
2779 u64 read_format = counter->attr.read_format;
2780 u64 values[5];
2781 int n = 0;
2782
2783 values[n++] = 1 + leader->nr_siblings;
2784
2785 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2786 values[n++] = leader->total_time_enabled;
2787
2788 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2789 values[n++] = leader->total_time_running;
2790
2791 if (leader != counter)
2792 leader->pmu->read(leader);
2793
2794 values[n++] = atomic64_read(&leader->count);
2795 if (read_format & PERF_FORMAT_ID)
2796 values[n++] = primary_counter_id(leader);
2797
2798 perf_output_copy(handle, values, n * sizeof(u64));
2799
2800 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2801 n = 0;
2802
2803 if (sub != counter)
2804 sub->pmu->read(sub);
2805
2806 values[n++] = atomic64_read(&sub->count);
2807 if (read_format & PERF_FORMAT_ID)
2808 values[n++] = primary_counter_id(sub);
2809
2810 perf_output_copy(handle, values, n * sizeof(u64));
2811 }
2812}
2813
2814static void perf_output_read(struct perf_output_handle *handle,
2815 struct perf_counter *counter)
2816{
2817 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2818 perf_output_read_group(handle, counter);
2819 else
2820 perf_output_read_one(handle, counter);
2821}
2822
2823void perf_counter_output(struct perf_counter *counter, int nmi,
2403 struct perf_sample_data *data) 2824 struct perf_sample_data *data)
2404{ 2825{
2405 int ret; 2826 int ret;
@@ -2410,10 +2831,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2410 struct { 2831 struct {
2411 u32 pid, tid; 2832 u32 pid, tid;
2412 } tid_entry; 2833 } tid_entry;
2413 struct {
2414 u64 id;
2415 u64 counter;
2416 } group_entry;
2417 struct perf_callchain_entry *callchain = NULL; 2834 struct perf_callchain_entry *callchain = NULL;
2418 int callchain_size = 0; 2835 int callchain_size = 0;
2419 u64 time; 2836 u64 time;
@@ -2421,15 +2838,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2421 u32 cpu, reserved; 2838 u32 cpu, reserved;
2422 } cpu_entry; 2839 } cpu_entry;
2423 2840
2424 header.type = 0; 2841 header.type = PERF_EVENT_SAMPLE;
2425 header.size = sizeof(header); 2842 header.size = sizeof(header);
2426 2843
2427 header.misc = PERF_EVENT_MISC_OVERFLOW; 2844 header.misc = 0;
2428 header.misc |= perf_misc_flags(data->regs); 2845 header.misc |= perf_misc_flags(data->regs);
2429 2846
2430 if (sample_type & PERF_SAMPLE_IP) { 2847 if (sample_type & PERF_SAMPLE_IP) {
2431 ip = perf_instruction_pointer(data->regs); 2848 ip = perf_instruction_pointer(data->regs);
2432 header.type |= PERF_SAMPLE_IP;
2433 header.size += sizeof(ip); 2849 header.size += sizeof(ip);
2434 } 2850 }
2435 2851
@@ -2438,7 +2854,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2438 tid_entry.pid = perf_counter_pid(counter, current); 2854 tid_entry.pid = perf_counter_pid(counter, current);
2439 tid_entry.tid = perf_counter_tid(counter, current); 2855 tid_entry.tid = perf_counter_tid(counter, current);
2440 2856
2441 header.type |= PERF_SAMPLE_TID;
2442 header.size += sizeof(tid_entry); 2857 header.size += sizeof(tid_entry);
2443 } 2858 }
2444 2859
@@ -2448,47 +2863,51 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2448 */ 2863 */
2449 time = sched_clock(); 2864 time = sched_clock();
2450 2865
2451 header.type |= PERF_SAMPLE_TIME;
2452 header.size += sizeof(u64); 2866 header.size += sizeof(u64);
2453 } 2867 }
2454 2868
2455 if (sample_type & PERF_SAMPLE_ADDR) { 2869 if (sample_type & PERF_SAMPLE_ADDR)
2456 header.type |= PERF_SAMPLE_ADDR;
2457 header.size += sizeof(u64); 2870 header.size += sizeof(u64);
2458 }
2459 2871
2460 if (sample_type & PERF_SAMPLE_ID) { 2872 if (sample_type & PERF_SAMPLE_ID)
2461 header.type |= PERF_SAMPLE_ID; 2873 header.size += sizeof(u64);
2874
2875 if (sample_type & PERF_SAMPLE_STREAM_ID)
2462 header.size += sizeof(u64); 2876 header.size += sizeof(u64);
2463 }
2464 2877
2465 if (sample_type & PERF_SAMPLE_CPU) { 2878 if (sample_type & PERF_SAMPLE_CPU) {
2466 header.type |= PERF_SAMPLE_CPU;
2467 header.size += sizeof(cpu_entry); 2879 header.size += sizeof(cpu_entry);
2468 2880
2469 cpu_entry.cpu = raw_smp_processor_id(); 2881 cpu_entry.cpu = raw_smp_processor_id();
2882 cpu_entry.reserved = 0;
2470 } 2883 }
2471 2884
2472 if (sample_type & PERF_SAMPLE_PERIOD) { 2885 if (sample_type & PERF_SAMPLE_PERIOD)
2473 header.type |= PERF_SAMPLE_PERIOD;
2474 header.size += sizeof(u64); 2886 header.size += sizeof(u64);
2475 }
2476 2887
2477 if (sample_type & PERF_SAMPLE_GROUP) { 2888 if (sample_type & PERF_SAMPLE_READ)
2478 header.type |= PERF_SAMPLE_GROUP; 2889 header.size += perf_counter_read_size(counter);
2479 header.size += sizeof(u64) +
2480 counter->nr_siblings * sizeof(group_entry);
2481 }
2482 2890
2483 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 2891 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2484 callchain = perf_callchain(data->regs); 2892 callchain = perf_callchain(data->regs);
2485 2893
2486 if (callchain) { 2894 if (callchain) {
2487 callchain_size = (1 + callchain->nr) * sizeof(u64); 2895 callchain_size = (1 + callchain->nr) * sizeof(u64);
2488
2489 header.type |= PERF_SAMPLE_CALLCHAIN;
2490 header.size += callchain_size; 2896 header.size += callchain_size;
2491 } 2897 } else
2898 header.size += sizeof(u64);
2899 }
2900
2901 if (sample_type & PERF_SAMPLE_RAW) {
2902 int size = sizeof(u32);
2903
2904 if (data->raw)
2905 size += data->raw->size;
2906 else
2907 size += sizeof(u32);
2908
2909 WARN_ON_ONCE(size & (sizeof(u64)-1));
2910 header.size += size;
2492 } 2911 }
2493 2912
2494 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2913 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2509,7 +2928,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2509 if (sample_type & PERF_SAMPLE_ADDR) 2928 if (sample_type & PERF_SAMPLE_ADDR)
2510 perf_output_put(&handle, data->addr); 2929 perf_output_put(&handle, data->addr);
2511 2930
2512 if (sample_type & PERF_SAMPLE_ID) 2931 if (sample_type & PERF_SAMPLE_ID) {
2932 u64 id = primary_counter_id(counter);
2933
2934 perf_output_put(&handle, id);
2935 }
2936
2937 if (sample_type & PERF_SAMPLE_STREAM_ID)
2513 perf_output_put(&handle, counter->id); 2938 perf_output_put(&handle, counter->id);
2514 2939
2515 if (sample_type & PERF_SAMPLE_CPU) 2940 if (sample_type & PERF_SAMPLE_CPU)
@@ -2518,76 +2943,125 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2518 if (sample_type & PERF_SAMPLE_PERIOD) 2943 if (sample_type & PERF_SAMPLE_PERIOD)
2519 perf_output_put(&handle, data->period); 2944 perf_output_put(&handle, data->period);
2520 2945
2521 /* 2946 if (sample_type & PERF_SAMPLE_READ)
2522 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 2947 perf_output_read(&handle, counter);
2523 */
2524 if (sample_type & PERF_SAMPLE_GROUP) {
2525 struct perf_counter *leader, *sub;
2526 u64 nr = counter->nr_siblings;
2527
2528 perf_output_put(&handle, nr);
2529
2530 leader = counter->group_leader;
2531 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2532 if (sub != counter)
2533 sub->pmu->read(sub);
2534 2948
2535 group_entry.id = sub->id; 2949 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2536 group_entry.counter = atomic64_read(&sub->count); 2950 if (callchain)
2951 perf_output_copy(&handle, callchain, callchain_size);
2952 else {
2953 u64 nr = 0;
2954 perf_output_put(&handle, nr);
2955 }
2956 }
2537 2957
2538 perf_output_put(&handle, group_entry); 2958 if (sample_type & PERF_SAMPLE_RAW) {
2959 if (data->raw) {
2960 perf_output_put(&handle, data->raw->size);
2961 perf_output_copy(&handle, data->raw->data, data->raw->size);
2962 } else {
2963 struct {
2964 u32 size;
2965 u32 data;
2966 } raw = {
2967 .size = sizeof(u32),
2968 .data = 0,
2969 };
2970 perf_output_put(&handle, raw);
2539 } 2971 }
2540 } 2972 }
2541 2973
2542 if (callchain) 2974 perf_output_end(&handle);
2543 perf_output_copy(&handle, callchain, callchain_size); 2975}
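Putting the header bookkeeping and the output calls above together, the body of a PERF_EVENT_SAMPLE record now carries these fields, in this order, each present only when the matching bit is set in attr.sample_type (a summary sketch, not an exported header):

/*
 *	u64 ip;				PERF_SAMPLE_IP
 *	u32 pid, tid;			PERF_SAMPLE_TID
 *	u64 time;			PERF_SAMPLE_TIME
 *	u64 addr;			PERF_SAMPLE_ADDR
 *	u64 id;				PERF_SAMPLE_ID (primary counter id)
 *	u64 stream_id;			PERF_SAMPLE_STREAM_ID (this counter instance)
 *	u32 cpu, res;			PERF_SAMPLE_CPU
 *	u64 period;			PERF_SAMPLE_PERIOD
 *	read_format payload		PERF_SAMPLE_READ
 *	u64 nr; u64 ips[nr];		PERF_SAMPLE_CALLCHAIN (nr == 0 when unavailable)
 *	u32 size; char data[size];	PERF_SAMPLE_RAW (padded to a u64 boundary)
 */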
2976
2977/*
2978 * read event
2979 */
2980
2981struct perf_read_event {
2982 struct perf_event_header header;
2983
2984 u32 pid;
2985 u32 tid;
2986};
2987
2988static void
2989perf_counter_read_event(struct perf_counter *counter,
2990 struct task_struct *task)
2991{
2992 struct perf_output_handle handle;
2993 struct perf_read_event event = {
2994 .header = {
2995 .type = PERF_EVENT_READ,
2996 .misc = 0,
2997 .size = sizeof(event) + perf_counter_read_size(counter),
2998 },
2999 .pid = perf_counter_pid(counter, task),
3000 .tid = perf_counter_tid(counter, task),
3001 };
3002 int ret;
3003
3004 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3005 if (ret)
3006 return;
3007
3008 perf_output_put(&handle, event);
3009 perf_output_read(&handle, counter);
2544 3010
2545 perf_output_end(&handle); 3011 perf_output_end(&handle);
2546} 3012}
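On the stream a PERF_EVENT_READ record is therefore the header, the pid/tid pair, and the same read_format payload that read() would return, sized via perf_counter_read_size(). A consumer-side mirror (hypothetical declaration):

#include <linux/types.h>
#include <linux/perf_counter.h>

struct read_record {
	struct perf_event_header header;	/* .type == PERF_EVENT_READ */
	__u32 pid;
	__u32 tid;
	__u64 values[];				/* laid out per attr.read_format */
};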
2547 3013
2548/* 3014/*
2549 * fork tracking 3015 * task tracking -- fork/exit
3016 *
3017 * enabled by: attr.comm | attr.mmap | attr.task
2550 */ 3018 */
2551 3019
2552struct perf_fork_event { 3020struct perf_task_event {
2553 struct task_struct *task; 3021 struct task_struct *task;
3022 struct perf_counter_context *task_ctx;
2554 3023
2555 struct { 3024 struct {
2556 struct perf_event_header header; 3025 struct perf_event_header header;
2557 3026
2558 u32 pid; 3027 u32 pid;
2559 u32 ppid; 3028 u32 ppid;
3029 u32 tid;
3030 u32 ptid;
2560 } event; 3031 } event;
2561}; 3032};
2562 3033
2563static void perf_counter_fork_output(struct perf_counter *counter, 3034static void perf_counter_task_output(struct perf_counter *counter,
2564 struct perf_fork_event *fork_event) 3035 struct perf_task_event *task_event)
2565{ 3036{
2566 struct perf_output_handle handle; 3037 struct perf_output_handle handle;
2567 int size = fork_event->event.header.size; 3038 int size = task_event->event.header.size;
2568 struct task_struct *task = fork_event->task; 3039 struct task_struct *task = task_event->task;
2569 int ret = perf_output_begin(&handle, counter, size, 0, 0); 3040 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2570 3041
2571 if (ret) 3042 if (ret)
2572 return; 3043 return;
2573 3044
2574 fork_event->event.pid = perf_counter_pid(counter, task); 3045 task_event->event.pid = perf_counter_pid(counter, task);
2575 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); 3046 task_event->event.ppid = perf_counter_pid(counter, current);
3047
3048 task_event->event.tid = perf_counter_tid(counter, task);
3049 task_event->event.ptid = perf_counter_tid(counter, current);
2576 3050
2577 perf_output_put(&handle, fork_event->event); 3051 perf_output_put(&handle, task_event->event);
2578 perf_output_end(&handle); 3052 perf_output_end(&handle);
2579} 3053}
2580 3054
2581static int perf_counter_fork_match(struct perf_counter *counter) 3055static int perf_counter_task_match(struct perf_counter *counter)
2582{ 3056{
2583 if (counter->attr.comm || counter->attr.mmap) 3057 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
2584 return 1; 3058 return 1;
2585 3059
2586 return 0; 3060 return 0;
2587} 3061}
2588 3062
2589static void perf_counter_fork_ctx(struct perf_counter_context *ctx, 3063static void perf_counter_task_ctx(struct perf_counter_context *ctx,
2590 struct perf_fork_event *fork_event) 3064 struct perf_task_event *task_event)
2591{ 3065{
2592 struct perf_counter *counter; 3066 struct perf_counter *counter;
2593 3067
@@ -2596,51 +3070,62 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2596 3070
2597 rcu_read_lock(); 3071 rcu_read_lock();
2598 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3072 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2599 if (perf_counter_fork_match(counter)) 3073 if (perf_counter_task_match(counter))
2600 perf_counter_fork_output(counter, fork_event); 3074 perf_counter_task_output(counter, task_event);
2601 } 3075 }
2602 rcu_read_unlock(); 3076 rcu_read_unlock();
2603} 3077}
2604 3078
2605static void perf_counter_fork_event(struct perf_fork_event *fork_event) 3079static void perf_counter_task_event(struct perf_task_event *task_event)
2606{ 3080{
2607 struct perf_cpu_context *cpuctx; 3081 struct perf_cpu_context *cpuctx;
2608 struct perf_counter_context *ctx; 3082 struct perf_counter_context *ctx = task_event->task_ctx;
2609 3083
2610 cpuctx = &get_cpu_var(perf_cpu_context); 3084 cpuctx = &get_cpu_var(perf_cpu_context);
2611 perf_counter_fork_ctx(&cpuctx->ctx, fork_event); 3085 perf_counter_task_ctx(&cpuctx->ctx, task_event);
2612 put_cpu_var(perf_cpu_context); 3086 put_cpu_var(perf_cpu_context);
2613 3087
2614 rcu_read_lock(); 3088 rcu_read_lock();
2615 /* 3089 if (!ctx)
2616 * doesn't really matter which of the child contexts the 3090 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
2617 * events ends up in.
2618 */
2619 ctx = rcu_dereference(current->perf_counter_ctxp);
2620 if (ctx) 3091 if (ctx)
2621 perf_counter_fork_ctx(ctx, fork_event); 3092 perf_counter_task_ctx(ctx, task_event);
2622 rcu_read_unlock(); 3093 rcu_read_unlock();
2623} 3094}
2624 3095
2625void perf_counter_fork(struct task_struct *task) 3096static void perf_counter_task(struct task_struct *task,
3097 struct perf_counter_context *task_ctx,
3098 int new)
2626{ 3099{
2627 struct perf_fork_event fork_event; 3100 struct perf_task_event task_event;
2628 3101
2629 if (!atomic_read(&nr_comm_counters) && 3102 if (!atomic_read(&nr_comm_counters) &&
2630 !atomic_read(&nr_mmap_counters)) 3103 !atomic_read(&nr_mmap_counters) &&
3104 !atomic_read(&nr_task_counters))
2631 return; 3105 return;
2632 3106
2633 fork_event = (struct perf_fork_event){ 3107 task_event = (struct perf_task_event){
2634 .task = task, 3108 .task = task,
2635 .event = { 3109 .task_ctx = task_ctx,
3110 .event = {
2636 .header = { 3111 .header = {
2637 .type = PERF_EVENT_FORK, 3112 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
2638 .size = sizeof(fork_event.event), 3113 .misc = 0,
3114 .size = sizeof(task_event.event),
2639 }, 3115 },
3116 /* .pid */
3117 /* .ppid */
3118 /* .tid */
3119 /* .ptid */
2640 }, 3120 },
2641 }; 3121 };
2642 3122
2643 perf_counter_fork_event(&fork_event); 3123 perf_counter_task_event(&task_event);
3124}
3125
3126void perf_counter_fork(struct task_struct *task)
3127{
3128 perf_counter_task(task, NULL, 1);
2644} 3129}
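PERF_EVENT_FORK and PERF_EVENT_EXIT now share one record body and differ only in header.type, so a single handler covers both. A hedged consumer sketch of that dispatch (the struct mirrors task_event.event above):

#include <stdio.h>
#include <linux/types.h>
#include <linux/perf_counter.h>

struct task_record {
	struct perf_event_header header;	/* PERF_EVENT_FORK or PERF_EVENT_EXIT */
	__u32 pid, ppid;			/* process and parent process id */
	__u32 tid, ptid;			/* thread and parent thread id */
};

static void handle_task_record(struct perf_event_header *hdr)
{
	struct task_record *ev = (struct task_record *)hdr;

	switch (hdr->type) {
	case PERF_EVENT_FORK:
		printf("fork: pid %u ppid %u\n", ev->pid, ev->ppid);
		break;
	case PERF_EVENT_EXIT:
		printf("exit: pid %u ppid %u\n", ev->pid, ev->ppid);
		break;
	}
}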
2645 3130
2646/* 3131/*
@@ -2708,8 +3193,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2708 struct perf_cpu_context *cpuctx; 3193 struct perf_cpu_context *cpuctx;
2709 struct perf_counter_context *ctx; 3194 struct perf_counter_context *ctx;
2710 unsigned int size; 3195 unsigned int size;
2711 char *comm = comm_event->task->comm; 3196 char comm[TASK_COMM_LEN];
2712 3197
3198 memset(comm, 0, sizeof(comm));
3199 strncpy(comm, comm_event->task->comm, sizeof(comm));
2713 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3200 size = ALIGN(strlen(comm)+1, sizeof(u64));
2714 3201
2715 comm_event->comm = comm; 3202 comm_event->comm = comm;
@@ -2736,13 +3223,24 @@ void perf_counter_comm(struct task_struct *task)
2736{ 3223{
2737 struct perf_comm_event comm_event; 3224 struct perf_comm_event comm_event;
2738 3225
3226 if (task->perf_counter_ctxp)
3227 perf_counter_enable_on_exec(task);
3228
2739 if (!atomic_read(&nr_comm_counters)) 3229 if (!atomic_read(&nr_comm_counters))
2740 return; 3230 return;
2741 3231
2742 comm_event = (struct perf_comm_event){ 3232 comm_event = (struct perf_comm_event){
2743 .task = task, 3233 .task = task,
3234 /* .comm */
3235 /* .comm_size */
2744 .event = { 3236 .event = {
2745 .header = { .type = PERF_EVENT_COMM, }, 3237 .header = {
3238 .type = PERF_EVENT_COMM,
3239 .misc = 0,
3240 /* .size */
3241 },
3242 /* .pid */
3243 /* .tid */
2746 }, 3244 },
2747 }; 3245 };
2748 3246
@@ -2825,8 +3323,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2825 char *buf = NULL; 3323 char *buf = NULL;
2826 const char *name; 3324 const char *name;
2827 3325
3326 memset(tmp, 0, sizeof(tmp));
3327
2828 if (file) { 3328 if (file) {
2829 buf = kzalloc(PATH_MAX, GFP_KERNEL); 3329 /*
3330 * d_path works from the end of the buffer backwards, so we
3331 * need to add enough zero bytes after the string to handle
3332 * the 64bit alignment we do later.
3333 */
3334 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
2830 if (!buf) { 3335 if (!buf) {
2831 name = strncpy(tmp, "//enomem", sizeof(tmp)); 3336 name = strncpy(tmp, "//enomem", sizeof(tmp));
2832 goto got_name; 3337 goto got_name;
@@ -2837,9 +3342,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2837 goto got_name; 3342 goto got_name;
2838 } 3343 }
2839 } else { 3344 } else {
2840 name = arch_vma_name(mmap_event->vma); 3345 if (arch_vma_name(mmap_event->vma)) {
2841 if (name) 3346 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3347 sizeof(tmp));
2842 goto got_name; 3348 goto got_name;
3349 }
2843 3350
2844 if (!vma->vm_mm) { 3351 if (!vma->vm_mm) {
2845 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3352 name = strncpy(tmp, "[vdso]", sizeof(tmp));
@@ -2884,8 +3391,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2884 3391
2885 mmap_event = (struct perf_mmap_event){ 3392 mmap_event = (struct perf_mmap_event){
2886 .vma = vma, 3393 .vma = vma,
3394 /* .file_name */
3395 /* .file_size */
2887 .event = { 3396 .event = {
2888 .header = { .type = PERF_EVENT_MMAP, }, 3397 .header = {
3398 .type = PERF_EVENT_MMAP,
3399 .misc = 0,
3400 /* .size */
3401 },
3402 /* .pid */
3403 /* .tid */
2889 .start = vma->vm_start, 3404 .start = vma->vm_start,
2890 .len = vma->vm_end - vma->vm_start, 3405 .len = vma->vm_end - vma->vm_start,
2891 .pgoff = vma->vm_pgoff, 3406 .pgoff = vma->vm_pgoff,
@@ -2896,49 +3411,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2896} 3411}
2897 3412
2898/* 3413/*
2899 * Log sample_period changes so that analyzing tools can re-normalize the
2900 * event flow.
2901 */
2902
2903struct freq_event {
2904 struct perf_event_header header;
2905 u64 time;
2906 u64 id;
2907 u64 period;
2908};
2909
2910static void perf_log_period(struct perf_counter *counter, u64 period)
2911{
2912 struct perf_output_handle handle;
2913 struct freq_event event;
2914 int ret;
2915
2916 if (counter->hw.sample_period == period)
2917 return;
2918
2919 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
2920 return;
2921
2922 event = (struct freq_event) {
2923 .header = {
2924 .type = PERF_EVENT_PERIOD,
2925 .misc = 0,
2926 .size = sizeof(event),
2927 },
2928 .time = sched_clock(),
2929 .id = counter->id,
2930 .period = period,
2931 };
2932
2933 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
2934 if (ret)
2935 return;
2936
2937 perf_output_put(&handle, event);
2938 perf_output_end(&handle);
2939}
2940
2941/*
2942 * IRQ throttle logging 3414 * IRQ throttle logging
2943 */ 3415 */
2944 3416
@@ -2951,16 +3423,21 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
2951 struct perf_event_header header; 3423 struct perf_event_header header;
2952 u64 time; 3424 u64 time;
2953 u64 id; 3425 u64 id;
3426 u64 stream_id;
2954 } throttle_event = { 3427 } throttle_event = {
2955 .header = { 3428 .header = {
2956 .type = PERF_EVENT_THROTTLE + 1, 3429 .type = PERF_EVENT_THROTTLE,
2957 .misc = 0, 3430 .misc = 0,
2958 .size = sizeof(throttle_event), 3431 .size = sizeof(throttle_event),
2959 }, 3432 },
2960 .time = sched_clock(), 3433 .time = sched_clock(),
2961 .id = counter->id, 3434 .id = primary_counter_id(counter),
3435 .stream_id = counter->id,
2962 }; 3436 };
2963 3437
3438 if (enable)
3439 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3440
2964 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); 3441 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
2965 if (ret) 3442 if (ret)
2966 return; 3443 return;
@@ -2970,7 +3447,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
2970} 3447}
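Note the id/stream_id pair introduced here: 'id' is the primary (inherited-from) counter that values are attributed to, while 'stream_id' names the concrete counter instance that was throttled, matching the PERF_SAMPLE_ID versus PERF_SAMPLE_STREAM_ID split in the sample output. A consumer-side mirror of throttle_event (hypothetical declaration):

#include <linux/types.h>
#include <linux/perf_counter.h>

struct throttle_record {
	struct perf_event_header header;	/* PERF_EVENT_THROTTLE or PERF_EVENT_UNTHROTTLE */
	__u64 time;				/* sched_clock() timestamp */
	__u64 id;				/* primary counter id */
	__u64 stream_id;			/* this counter instance */
};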
2971 3448
2972/* 3449/*
2973 * Generic counter overflow handling. 3450 * Generic counter overflow handling, sampling.
2974 */ 3451 */
2975 3452
2976int perf_counter_overflow(struct perf_counter *counter, int nmi, 3453int perf_counter_overflow(struct perf_counter *counter, int nmi,
@@ -3037,130 +3514,111 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
3037 * Generic software counter infrastructure 3514 * Generic software counter infrastructure
3038 */ 3515 */
3039 3516
3040static void perf_swcounter_update(struct perf_counter *counter) 3517/*
3518 * We directly increment counter->count and keep a second value in
3519 * counter->hw.period_left to count intervals. This period counter
3520 * is kept in the range [-sample_period, 0] so that we can use the
 3521 * sign as a trigger.
3522 */
3523
3524static u64 perf_swcounter_set_period(struct perf_counter *counter)
3041{ 3525{
3042 struct hw_perf_counter *hwc = &counter->hw; 3526 struct hw_perf_counter *hwc = &counter->hw;
3043 u64 prev, now; 3527 u64 period = hwc->last_period;
3044 s64 delta; 3528 u64 nr, offset;
3529 s64 old, val;
3530
3531 hwc->last_period = hwc->sample_period;
3045 3532
3046again: 3533again:
3047 prev = atomic64_read(&hwc->prev_count); 3534 old = val = atomic64_read(&hwc->period_left);
3048 now = atomic64_read(&hwc->count); 3535 if (val < 0)
3049 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) 3536 return 0;
3050 goto again;
3051 3537
3052 delta = now - prev; 3538 nr = div64_u64(period + val, period);
3539 offset = nr * period;
3540 val -= offset;
3541 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3542 goto again;
3053 3543
3054 atomic64_add(delta, &counter->count); 3544 return nr;
3055 atomic64_sub(delta, &hwc->period_left);
3056} 3545}
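The arithmetic above turns an over-run period_left into the number of whole periods that elapsed and leaves the remainder counting towards the next one. A small standalone illustration of the same computation, with plain integers instead of atomics and div64_u64():

#include <assert.h>

/* Same arithmetic as perf_swcounter_set_period(), on plain integers. */
static unsigned long long set_period(long long *period_left,
				     unsigned long long period)
{
	long long val = *period_left;
	unsigned long long nr;

	if (val < 0)
		return 0;			/* still inside the current period */

	nr = (period + val) / period;		/* whole periods elapsed */
	*period_left = val - (long long)(nr * period);

	return nr;
}

int main(void)
{
	long long left = 250;			/* period_left seen at overflow time */

	assert(set_period(&left, 100) == 3);	/* three overflows get reported */
	assert(left == -50);			/* 50 events count towards the next period */
	return 0;
}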
3057 3546
3058static void perf_swcounter_set_period(struct perf_counter *counter) 3547static void perf_swcounter_overflow(struct perf_counter *counter,
3548 int nmi, struct perf_sample_data *data)
3059{ 3549{
3060 struct hw_perf_counter *hwc = &counter->hw; 3550 struct hw_perf_counter *hwc = &counter->hw;
3061 s64 left = atomic64_read(&hwc->period_left); 3551 u64 overflow;
3062 s64 period = hwc->sample_period;
3063 3552
3064 if (unlikely(left <= -period)) { 3553 data->period = counter->hw.last_period;
3065 left = period; 3554 overflow = perf_swcounter_set_period(counter);
3066 atomic64_set(&hwc->period_left, left);
3067 hwc->last_period = period;
3068 }
3069 3555
3070 if (unlikely(left <= 0)) { 3556 if (hwc->interrupts == MAX_INTERRUPTS)
3071 left += period; 3557 return;
3072 atomic64_add(period, &hwc->period_left);
3073 hwc->last_period = period;
3074 }
3075 3558
3076 atomic64_set(&hwc->prev_count, -left); 3559 for (; overflow; overflow--) {
3077 atomic64_set(&hwc->count, -left); 3560 if (perf_counter_overflow(counter, nmi, data)) {
3561 /*
3562 * We inhibit the overflow from happening when
3563 * hwc->interrupts == MAX_INTERRUPTS.
3564 */
3565 break;
3566 }
3567 }
3078} 3568}
3079 3569
3080static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) 3570static void perf_swcounter_unthrottle(struct perf_counter *counter)
3081{ 3571{
3082 enum hrtimer_restart ret = HRTIMER_RESTART;
3083 struct perf_sample_data data;
3084 struct perf_counter *counter;
3085 u64 period;
3086
3087 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3088 counter->pmu->read(counter);
3089
3090 data.addr = 0;
3091 data.regs = get_irq_regs();
3092 /* 3572 /*
3093 * In case we exclude kernel IPs or are somehow not in interrupt 3573 * Nothing to do, we already reset hwc->interrupts.
3094 * context, provide the next best thing, the user IP.
3095 */ 3574 */
3096 if ((counter->attr.exclude_kernel || !data.regs) &&
3097 !counter->attr.exclude_user)
3098 data.regs = task_pt_regs(current);
3099
3100 if (data.regs) {
3101 if (perf_counter_overflow(counter, 0, &data))
3102 ret = HRTIMER_NORESTART;
3103 }
3104
3105 period = max_t(u64, 10000, counter->hw.sample_period);
3106 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3107
3108 return ret;
3109} 3575}
3110 3576
3111static void perf_swcounter_overflow(struct perf_counter *counter, 3577static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3112 int nmi, struct pt_regs *regs, u64 addr) 3578 int nmi, struct perf_sample_data *data)
3113{ 3579{
3114 struct perf_sample_data data = { 3580 struct hw_perf_counter *hwc = &counter->hw;
3115 .regs = regs, 3581
3116 .addr = addr, 3582 atomic64_add(nr, &counter->count);
3117 .period = counter->hw.last_period,
3118 };
3119 3583
3120 perf_swcounter_update(counter); 3584 if (!hwc->sample_period)
3121 perf_swcounter_set_period(counter); 3585 return;
3122 if (perf_counter_overflow(counter, nmi, &data)) 3586
3123 /* soft-disable the counter */ 3587 if (!data->regs)
3124 ; 3588 return;
3125 3589
3590 if (!atomic64_add_negative(nr, &hwc->period_left))
3591 perf_swcounter_overflow(counter, nmi, data);
3126} 3592}
3127 3593
3128static int perf_swcounter_is_counting(struct perf_counter *counter) 3594static int perf_swcounter_is_counting(struct perf_counter *counter)
3129{ 3595{
3130 struct perf_counter_context *ctx; 3596 /*
3131 unsigned long flags; 3597 * The counter is active, we're good!
3132 int count; 3598 */
3133
3134 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 3599 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3135 return 1; 3600 return 1;
3136 3601
3602 /*
3603 * The counter is off/error, not counting.
3604 */
3137 if (counter->state != PERF_COUNTER_STATE_INACTIVE) 3605 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3138 return 0; 3606 return 0;
3139 3607
3140 /* 3608 /*
3141 * If the counter is inactive, it could be just because 3609 * The counter is inactive, if the context is active
3142 * its task is scheduled out, or because it's in a group 3610 * we're part of a group that didn't make it on the 'pmu',
3143 * which could not go on the PMU. We want to count in 3611 * not counting.
3144 * the first case but not the second. If the context is
3145 * currently active then an inactive software counter must
3146 * be the second case. If it's not currently active then
3147 * we need to know whether the counter was active when the
3148 * context was last active, which we can determine by
3149 * comparing counter->tstamp_stopped with ctx->time.
3150 *
3151 * We are within an RCU read-side critical section,
3152 * which protects the existence of *ctx.
3153 */ 3612 */
3154 ctx = counter->ctx; 3613 if (counter->ctx->is_active)
3155 spin_lock_irqsave(&ctx->lock, flags); 3614 return 0;
3156 count = 1; 3615
3157 /* Re-check state now we have the lock */ 3616 /*
3158 if (counter->state < PERF_COUNTER_STATE_INACTIVE || 3617 * We're inactive and the context is too, this means the
3159 counter->ctx->is_active || 3618 * task is scheduled out, we're counting events that happen
3160 counter->tstamp_stopped < ctx->time) 3619 * to us, like migration events.
3161 count = 0; 3620 */
3162 spin_unlock_irqrestore(&ctx->lock, flags); 3621 return 1;
3163 return count;
3164} 3622}
3165 3623
3166static int perf_swcounter_match(struct perf_counter *counter, 3624static int perf_swcounter_match(struct perf_counter *counter,
@@ -3186,19 +3644,10 @@ static int perf_swcounter_match(struct perf_counter *counter,
3186 return 1; 3644 return 1;
3187} 3645}
3188 3646
3189static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3190 int nmi, struct pt_regs *regs, u64 addr)
3191{
3192 int neg = atomic64_add_negative(nr, &counter->hw.count);
3193
3194 if (counter->hw.sample_period && !neg && regs)
3195 perf_swcounter_overflow(counter, nmi, regs, addr);
3196}
3197
3198static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, 3647static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3199 enum perf_type_id type, u32 event, 3648 enum perf_type_id type,
3200 u64 nr, int nmi, struct pt_regs *regs, 3649 u32 event, u64 nr, int nmi,
3201 u64 addr) 3650 struct perf_sample_data *data)
3202{ 3651{
3203 struct perf_counter *counter; 3652 struct perf_counter *counter;
3204 3653
@@ -3207,8 +3656,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3207 3656
3208 rcu_read_lock(); 3657 rcu_read_lock();
3209 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3658 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3210 if (perf_swcounter_match(counter, type, event, regs)) 3659 if (perf_swcounter_match(counter, type, event, data->regs))
3211 perf_swcounter_add(counter, nr, nmi, regs, addr); 3660 perf_swcounter_add(counter, nr, nmi, data);
3212 } 3661 }
3213 rcu_read_unlock(); 3662 rcu_read_unlock();
3214} 3663}
@@ -3227,9 +3676,9 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3227 return &cpuctx->recursion[0]; 3676 return &cpuctx->recursion[0];
3228} 3677}
3229 3678
3230static void __perf_swcounter_event(enum perf_type_id type, u32 event, 3679static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3231 u64 nr, int nmi, struct pt_regs *regs, 3680 u64 nr, int nmi,
3232 u64 addr) 3681 struct perf_sample_data *data)
3233{ 3682{
3234 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3683 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3235 int *recursion = perf_swcounter_recursion_context(cpuctx); 3684 int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3242,7 +3691,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3242 barrier(); 3691 barrier();
3243 3692
3244 perf_swcounter_ctx_event(&cpuctx->ctx, type, event, 3693 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3245 nr, nmi, regs, addr); 3694 nr, nmi, data);
3246 rcu_read_lock(); 3695 rcu_read_lock();
3247 /* 3696 /*
3248 * doesn't really matter which of the child contexts the 3697 * doesn't really matter which of the child contexts the
@@ -3250,7 +3699,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3250 */ 3699 */
3251 ctx = rcu_dereference(current->perf_counter_ctxp); 3700 ctx = rcu_dereference(current->perf_counter_ctxp);
3252 if (ctx) 3701 if (ctx)
3253 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); 3702 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3254 rcu_read_unlock(); 3703 rcu_read_unlock();
3255 3704
3256 barrier(); 3705 barrier();
@@ -3260,35 +3709,79 @@ out:
3260 put_cpu_var(perf_cpu_context); 3709 put_cpu_var(perf_cpu_context);
3261} 3710}
3262 3711
3263void 3712void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3264perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) 3713 struct pt_regs *regs, u64 addr)
3265{ 3714{
3266 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); 3715 struct perf_sample_data data = {
3716 .regs = regs,
3717 .addr = addr,
3718 };
3719
3720 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3267} 3721}
3268 3722
3269static void perf_swcounter_read(struct perf_counter *counter) 3723static void perf_swcounter_read(struct perf_counter *counter)
3270{ 3724{
3271 perf_swcounter_update(counter);
3272} 3725}
3273 3726
3274static int perf_swcounter_enable(struct perf_counter *counter) 3727static int perf_swcounter_enable(struct perf_counter *counter)
3275{ 3728{
3276 perf_swcounter_set_period(counter); 3729 struct hw_perf_counter *hwc = &counter->hw;
3730
3731 if (hwc->sample_period) {
3732 hwc->last_period = hwc->sample_period;
3733 perf_swcounter_set_period(counter);
3734 }
3277 return 0; 3735 return 0;
3278} 3736}
3279 3737
3280static void perf_swcounter_disable(struct perf_counter *counter) 3738static void perf_swcounter_disable(struct perf_counter *counter)
3281{ 3739{
3282 perf_swcounter_update(counter);
3283} 3740}
3284 3741
3285static const struct pmu perf_ops_generic = { 3742static const struct pmu perf_ops_generic = {
3286 .enable = perf_swcounter_enable, 3743 .enable = perf_swcounter_enable,
3287 .disable = perf_swcounter_disable, 3744 .disable = perf_swcounter_disable,
3288 .read = perf_swcounter_read, 3745 .read = perf_swcounter_read,
3746 .unthrottle = perf_swcounter_unthrottle,
3289}; 3747};
3290 3748
3291/* 3749/*
3750 * hrtimer based swcounter callback
3751 */
3752
3753static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3754{
3755 enum hrtimer_restart ret = HRTIMER_RESTART;
3756 struct perf_sample_data data;
3757 struct perf_counter *counter;
3758 u64 period;
3759
3760 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3761 counter->pmu->read(counter);
3762
3763 data.addr = 0;
3764 data.regs = get_irq_regs();
3765 /*
3766 * In case we exclude kernel IPs or are somehow not in interrupt
3767 * context, provide the next best thing, the user IP.
3768 */
3769 if ((counter->attr.exclude_kernel || !data.regs) &&
3770 !counter->attr.exclude_user)
3771 data.regs = task_pt_regs(current);
3772
3773 if (data.regs) {
3774 if (perf_counter_overflow(counter, 0, &data))
3775 ret = HRTIMER_NORESTART;
3776 }
3777
3778 period = max_t(u64, 10000, counter->hw.sample_period);
3779 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3780
3781 return ret;
3782}
3783
3784/*
3292 * Software counter: cpu wall time clock 3785 * Software counter: cpu wall time clock
3293 */ 3786 */
3294 3787
@@ -3404,36 +3897,25 @@ static const struct pmu perf_ops_task_clock = {
3404 .read = task_clock_perf_counter_read, 3897 .read = task_clock_perf_counter_read,
3405}; 3898};
3406 3899
3407/*
3408 * Software counter: cpu migrations
3409 */
3410void perf_counter_task_migration(struct task_struct *task, int cpu)
3411{
3412 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3413 struct perf_counter_context *ctx;
3414
3415 perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
3416 PERF_COUNT_SW_CPU_MIGRATIONS,
3417 1, 1, NULL, 0);
3418
3419 ctx = perf_pin_task_context(task);
3420 if (ctx) {
3421 perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
3422 PERF_COUNT_SW_CPU_MIGRATIONS,
3423 1, 1, NULL, 0);
3424 perf_unpin_context(ctx);
3425 }
3426}
3427
3428#ifdef CONFIG_EVENT_PROFILE 3900#ifdef CONFIG_EVENT_PROFILE
3429void perf_tpcounter_event(int event_id) 3901void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3902 int entry_size)
3430{ 3903{
3431 struct pt_regs *regs = get_irq_regs(); 3904 struct perf_raw_record raw = {
3905 .size = entry_size,
3906 .data = record,
3907 };
3908
3909 struct perf_sample_data data = {
3910 .regs = get_irq_regs(),
3911 .addr = addr,
3912 .raw = &raw,
3913 };
3432 3914
3433 if (!regs) 3915 if (!data.regs)
3434 regs = task_pt_regs(current); 3916 data.regs = task_pt_regs(current);
3435 3917
3436 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); 3918 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3437} 3919}
3438EXPORT_SYMBOL_GPL(perf_tpcounter_event); 3920EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3439 3921
@@ -3442,16 +3924,20 @@ extern void ftrace_profile_disable(int);
3442 3924
3443static void tp_perf_counter_destroy(struct perf_counter *counter) 3925static void tp_perf_counter_destroy(struct perf_counter *counter)
3444{ 3926{
3445 ftrace_profile_disable(perf_event_id(&counter->attr)); 3927 ftrace_profile_disable(counter->attr.config);
3446} 3928}
3447 3929
3448static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) 3930static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3449{ 3931{
3450 int event_id = perf_event_id(&counter->attr); 3932 /*
3451 int ret; 3933 * Raw tracepoint data is a severe data leak, only allow root to
3934 * have these.
3935 */
3936 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3937 !capable(CAP_SYS_ADMIN))
3938 return ERR_PTR(-EPERM);
3452 3939
3453 ret = ftrace_profile_enable(event_id); 3940 if (ftrace_profile_enable(counter->attr.config))
3454 if (ret)
3455 return NULL; 3941 return NULL;
3456 3942
3457 counter->destroy = tp_perf_counter_destroy; 3943 counter->destroy = tp_perf_counter_destroy;
@@ -3465,9 +3951,21 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3465} 3951}
3466#endif 3952#endif
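For completeness, a tracepoint counter is requested with attr.type = PERF_TYPE_TRACEPOINT and attr.config set to the ftrace event id that ftrace_profile_enable() expects; raw sample data additionally requires CAP_SYS_ADMIN per the check above. A hedged attribute sketch (the event id is assumed to come from the tracepoint's 'id' file under debugfs tracing/events/, which is not part of this patch):

#include <string.h>
#include <linux/perf_counter.h>

static void init_tracepoint_attr(struct perf_counter_attr *attr,
				 __u64 tracepoint_id)
{
	memset(attr, 0, sizeof(*attr));

	attr->size          = sizeof(*attr);
	attr->type          = PERF_TYPE_TRACEPOINT;
	attr->config        = tracepoint_id;	/* ftrace event id */
	attr->sample_period = 1;		/* one sample per hit */
	attr->sample_type   = PERF_SAMPLE_RAW;	/* root only, see above */
}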
3467 3953
3954atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3955
3956static void sw_perf_counter_destroy(struct perf_counter *counter)
3957{
3958 u64 event = counter->attr.config;
3959
3960 WARN_ON(counter->parent);
3961
3962 atomic_dec(&perf_swcounter_enabled[event]);
3963}
3964
3468static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) 3965static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3469{ 3966{
3470 const struct pmu *pmu = NULL; 3967 const struct pmu *pmu = NULL;
3968 u64 event = counter->attr.config;
3471 3969
3472 /* 3970 /*
3473 * Software counters (currently) can't in general distinguish 3971 * Software counters (currently) can't in general distinguish
@@ -3476,7 +3974,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3476 * to be kernel events, and page faults are never hypervisor 3974 * to be kernel events, and page faults are never hypervisor
3477 * events. 3975 * events.
3478 */ 3976 */
3479 switch (counter->attr.config) { 3977 switch (event) {
3480 case PERF_COUNT_SW_CPU_CLOCK: 3978 case PERF_COUNT_SW_CPU_CLOCK:
3481 pmu = &perf_ops_cpu_clock; 3979 pmu = &perf_ops_cpu_clock;
3482 3980
@@ -3497,6 +3995,10 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3497 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 3995 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3498 case PERF_COUNT_SW_CONTEXT_SWITCHES: 3996 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3499 case PERF_COUNT_SW_CPU_MIGRATIONS: 3997 case PERF_COUNT_SW_CPU_MIGRATIONS:
3998 if (!counter->parent) {
3999 atomic_inc(&perf_swcounter_enabled[event]);
4000 counter->destroy = sw_perf_counter_destroy;
4001 }
3500 pmu = &perf_ops_generic; 4002 pmu = &perf_ops_generic;
3501 break; 4003 break;
3502 } 4004 }
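
The two hunks above give each software event type a global enable count: the first top-level counter of a type increments it, the destroy callback decrements it, and inherited child counters skip both so the count only tracks user-created counters. A minimal userspace sketch of that pattern, with illustrative names and C11 atomics standing in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

enum sw_event { SW_CPU_CLOCK, SW_PAGE_FAULTS, SW_MAX };

static atomic_int sw_enabled[SW_MAX];	/* live top-level counters per event type */

struct counter {
	enum sw_event event;
	struct counter *parent;		/* non-NULL for an inherited child */
};

static void counter_init(struct counter *c, enum sw_event ev, struct counter *parent)
{
	c->event = ev;
	c->parent = parent;
	if (!c->parent)			/* children piggyback on the parent's count */
		atomic_fetch_add(&sw_enabled[ev], 1);
}

static void counter_destroy(struct counter *c)
{
	if (!c->parent)
		atomic_fetch_sub(&sw_enabled[c->event], 1);
}

int main(void)
{
	struct counter parent, child;

	counter_init(&parent, SW_PAGE_FAULTS, NULL);
	counter_init(&child, SW_PAGE_FAULTS, &parent);
	printf("page-fault counters enabled: %d\n",
	       atomic_load(&sw_enabled[SW_PAGE_FAULTS]));
	counter_destroy(&child);
	counter_destroy(&parent);
	return 0;
}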
@@ -3512,6 +4014,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3512 int cpu, 4014 int cpu,
3513 struct perf_counter_context *ctx, 4015 struct perf_counter_context *ctx,
3514 struct perf_counter *group_leader, 4016 struct perf_counter *group_leader,
4017 struct perf_counter *parent_counter,
3515 gfp_t gfpflags) 4018 gfp_t gfpflags)
3516{ 4019{
3517 const struct pmu *pmu; 4020 const struct pmu *pmu;
@@ -3547,6 +4050,8 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3547 counter->ctx = ctx; 4050 counter->ctx = ctx;
3548 counter->oncpu = -1; 4051 counter->oncpu = -1;
3549 4052
4053 counter->parent = parent_counter;
4054
3550 counter->ns = get_pid_ns(current->nsproxy->pid_ns); 4055 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3551 counter->id = atomic64_inc_return(&perf_counter_id); 4056 counter->id = atomic64_inc_return(&perf_counter_id);
3552 4057
@@ -3561,13 +4066,14 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3561 hwc->sample_period = attr->sample_period; 4066 hwc->sample_period = attr->sample_period;
3562 if (attr->freq && attr->sample_freq) 4067 if (attr->freq && attr->sample_freq)
3563 hwc->sample_period = 1; 4068 hwc->sample_period = 1;
4069 hwc->last_period = hwc->sample_period;
3564 4070
3565 atomic64_set(&hwc->period_left, hwc->sample_period); 4071 atomic64_set(&hwc->period_left, hwc->sample_period);
3566 4072
3567 /* 4073 /*
3568 * we currently do not support PERF_SAMPLE_GROUP on inherited counters 4074 * we currently do not support PERF_FORMAT_GROUP on inherited counters
3569 */ 4075 */
3570 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) 4076 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
3571 goto done; 4077 goto done;
3572 4078
3573 switch (attr->type) { 4079 switch (attr->type) {
@@ -3604,11 +4110,15 @@ done:
3604 4110
3605 counter->pmu = pmu; 4111 counter->pmu = pmu;
3606 4112
3607 atomic_inc(&nr_counters); 4113 if (!counter->parent) {
3608 if (counter->attr.mmap) 4114 atomic_inc(&nr_counters);
3609 atomic_inc(&nr_mmap_counters); 4115 if (counter->attr.mmap)
3610 if (counter->attr.comm) 4116 atomic_inc(&nr_mmap_counters);
3611 atomic_inc(&nr_comm_counters); 4117 if (counter->attr.comm)
4118 atomic_inc(&nr_comm_counters);
4119 if (counter->attr.task)
4120 atomic_inc(&nr_task_counters);
4121 }
3612 4122
3613 return counter; 4123 return counter;
3614} 4124}
@@ -3771,7 +4281,7 @@ SYSCALL_DEFINE5(perf_counter_open,
3771 } 4281 }
3772 4282
3773 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4283 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3774 GFP_KERNEL); 4284 NULL, GFP_KERNEL);
3775 ret = PTR_ERR(counter); 4285 ret = PTR_ERR(counter);
3776 if (IS_ERR(counter)) 4286 if (IS_ERR(counter))
3777 goto err_put_context; 4287 goto err_put_context;
@@ -3837,7 +4347,8 @@ inherit_counter(struct perf_counter *parent_counter,
3837 4347
3838 child_counter = perf_counter_alloc(&parent_counter->attr, 4348 child_counter = perf_counter_alloc(&parent_counter->attr,
3839 parent_counter->cpu, child_ctx, 4349 parent_counter->cpu, child_ctx,
3840 group_leader, GFP_KERNEL); 4350 group_leader, parent_counter,
4351 GFP_KERNEL);
3841 if (IS_ERR(child_counter)) 4352 if (IS_ERR(child_counter))
3842 return child_counter; 4353 return child_counter;
3843 get_ctx(child_ctx); 4354 get_ctx(child_ctx);
@@ -3860,12 +4371,6 @@ inherit_counter(struct perf_counter *parent_counter,
3860 */ 4371 */
3861 add_counter_to_ctx(child_counter, child_ctx); 4372 add_counter_to_ctx(child_counter, child_ctx);
3862 4373
3863 child_counter->parent = parent_counter;
3864 /*
3865 * inherit into child's child as well:
3866 */
3867 child_counter->attr.inherit = 1;
3868
3869 /* 4374 /*
3870 * Get a reference to the parent filp - we will fput it 4375 * Get a reference to the parent filp - we will fput it
3871 * when the child counter exits. This is safe to do because 4376 * when the child counter exits. This is safe to do because
@@ -3909,10 +4414,14 @@ static int inherit_group(struct perf_counter *parent_counter,
3909} 4414}
3910 4415
3911static void sync_child_counter(struct perf_counter *child_counter, 4416static void sync_child_counter(struct perf_counter *child_counter,
3912 struct perf_counter *parent_counter) 4417 struct task_struct *child)
3913{ 4418{
4419 struct perf_counter *parent_counter = child_counter->parent;
3914 u64 child_val; 4420 u64 child_val;
3915 4421
4422 if (child_counter->attr.inherit_stat)
4423 perf_counter_read_event(child_counter, child);
4424
3916 child_val = atomic64_read(&child_counter->count); 4425 child_val = atomic64_read(&child_counter->count);
3917 4426
3918 /* 4427 /*
@@ -3941,7 +4450,8 @@ static void sync_child_counter(struct perf_counter *child_counter,
3941 4450
3942static void 4451static void
3943__perf_counter_exit_task(struct perf_counter *child_counter, 4452__perf_counter_exit_task(struct perf_counter *child_counter,
3944 struct perf_counter_context *child_ctx) 4453 struct perf_counter_context *child_ctx,
4454 struct task_struct *child)
3945{ 4455{
3946 struct perf_counter *parent_counter; 4456 struct perf_counter *parent_counter;
3947 4457
@@ -3955,7 +4465,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter,
3955 * counters need to be zapped - but otherwise linger. 4465 * counters need to be zapped - but otherwise linger.
3956 */ 4466 */
3957 if (parent_counter) { 4467 if (parent_counter) {
3958 sync_child_counter(child_counter, parent_counter); 4468 sync_child_counter(child_counter, child);
3959 free_counter(child_counter); 4469 free_counter(child_counter);
3960 } 4470 }
3961} 4471}
@@ -3969,8 +4479,10 @@ void perf_counter_exit_task(struct task_struct *child)
3969 struct perf_counter_context *child_ctx; 4479 struct perf_counter_context *child_ctx;
3970 unsigned long flags; 4480 unsigned long flags;
3971 4481
3972 if (likely(!child->perf_counter_ctxp)) 4482 if (likely(!child->perf_counter_ctxp)) {
4483 perf_counter_task(child, NULL, 0);
3973 return; 4484 return;
4485 }
3974 4486
3975 local_irq_save(flags); 4487 local_irq_save(flags);
3976 /* 4488 /*
@@ -3989,17 +4501,20 @@ void perf_counter_exit_task(struct task_struct *child)
 	 */
 	spin_lock(&child_ctx->lock);
 	child->perf_counter_ctxp = NULL;
-	if (child_ctx->parent_ctx) {
-		/*
-		 * This context is a clone; unclone it so it can't get
-		 * swapped to another process while we're removing all
-		 * the counters from it.
-		 */
-		put_ctx(child_ctx->parent_ctx);
-		child_ctx->parent_ctx = NULL;
-	}
-	spin_unlock(&child_ctx->lock);
-	local_irq_restore(flags);
+	/*
+	 * If this context is a clone; unclone it so it can't get
+	 * swapped to another process while we're removing all
+	 * the counters from it.
+	 */
+	unclone_ctx(child_ctx);
+	spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+	/*
+	 * Report the task dead after unscheduling the counters so that we
+	 * won't get any samples after PERF_EVENT_EXIT. We can however still
+	 * get a few PERF_EVENT_READ events.
+	 */
+	perf_counter_task(child, child_ctx, 0);
 
 	/*
 	 * We can recurse on the same lock type through:
@@ -4017,7 +4532,7 @@ void perf_counter_exit_task(struct task_struct *child)
4017again: 4532again:
4018 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, 4533 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4019 list_entry) 4534 list_entry)
4020 __perf_counter_exit_task(child_counter, child_ctx); 4535 __perf_counter_exit_task(child_counter, child_ctx, child);
4021 4536
4022 /* 4537 /*
4023 * If the last counter was a group counter, it will have appended all 4538 * If the last counter was a group counter, it will have appended all
@@ -4220,6 +4735,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4220 perf_counter_init_cpu(cpu); 4735 perf_counter_init_cpu(cpu);
4221 break; 4736 break;
4222 4737
4738 case CPU_ONLINE:
4739 case CPU_ONLINE_FROZEN:
4740 hw_perf_counter_setup_online(cpu);
4741 break;
4742
4223 case CPU_DOWN_PREPARE: 4743 case CPU_DOWN_PREPARE:
4224 case CPU_DOWN_PREPARE_FROZEN: 4744 case CPU_DOWN_PREPARE_FROZEN:
4225 perf_counter_exit_cpu(cpu); 4745 perf_counter_exit_cpu(cpu);
@@ -4244,6 +4764,8 @@ void __init perf_counter_init(void)
4244{ 4764{
4245 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 4765 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4246 (void *)(long)smp_processor_id()); 4766 (void *)(long)smp_processor_id());
4767 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4768 (void *)(long)smp_processor_id());
4247 register_cpu_notifier(&perf_cpu_nb); 4769 register_cpu_notifier(&perf_cpu_nb);
4248} 4770}
4249 4771
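
perf_counter_init() above now hands the notifier a synthetic CPU_ONLINE for the boot CPU, so the boot processor runs through the same setup hook as CPUs that come up later via hotplug. A rough userspace sketch of that "replay the event for what already exists" idea; the callback and event names here are made up for illustration:

#include <stdio.h>

enum cpu_event { CPU_UP_PREPARE, CPU_ONLINE };

/* the one hook both the boot path and the hotplug path go through */
static void cpu_notify(enum cpu_event ev, int cpu)
{
	switch (ev) {
	case CPU_UP_PREPARE:
		printf("cpu%d: allocate per-cpu state\n", cpu);
		break;
	case CPU_ONLINE:
		printf("cpu%d: finish setup now that it is running\n", cpu);
		break;
	}
}

int main(void)
{
	int boot_cpu = 0;

	/* the boot CPU never generates hotplug events, so replay them by hand */
	cpu_notify(CPU_UP_PREPARE, boot_cpu);
	cpu_notify(CPU_ONLINE, boot_cpu);
	/* real hotplug events for other CPUs would go through cpu_notify() too */
	return 0;
}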
diff --git a/kernel/pid.c b/kernel/pid.c
index b2e5f78fd281..31310b5d3f50 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -378,26 +378,15 @@ EXPORT_SYMBOL(pid_task);
 /*
  * Must be called under rcu_read_lock() or with tasklist_lock read-held.
  */
-struct task_struct *find_task_by_pid_type_ns(int type, int nr,
-		struct pid_namespace *ns)
+struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 {
-	return pid_task(find_pid_ns(nr, ns), type);
+	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
 }
 
-EXPORT_SYMBOL(find_task_by_pid_type_ns);
-
 struct task_struct *find_task_by_vpid(pid_t vnr)
 {
-	return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
-			current->nsproxy->pid_ns);
-}
-EXPORT_SYMBOL(find_task_by_vpid);
-
-struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
-{
-	return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
+	return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
 }
-EXPORT_SYMBOL(find_task_by_pid_ns);
 
 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 {
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 2d1001b4858d..821722ae58a7 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -67,9 +67,10 @@ err_alloc:
67 return NULL; 67 return NULL;
68} 68}
69 69
70static struct pid_namespace *create_pid_namespace(unsigned int level) 70static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
71{ 71{
72 struct pid_namespace *ns; 72 struct pid_namespace *ns;
73 unsigned int level = parent_pid_ns->level + 1;
73 int i; 74 int i;
74 75
75 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 76 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
@@ -86,6 +87,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
86 87
87 kref_init(&ns->kref); 88 kref_init(&ns->kref);
88 ns->level = level; 89 ns->level = level;
90 ns->parent = get_pid_ns(parent_pid_ns);
89 91
90 set_bit(0, ns->pidmap[0].page); 92 set_bit(0, ns->pidmap[0].page);
91 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 93 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -114,25 +116,11 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
 
 struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
 {
-	struct pid_namespace *new_ns;
-
-	BUG_ON(!old_ns);
-	new_ns = get_pid_ns(old_ns);
 	if (!(flags & CLONE_NEWPID))
-		goto out;
-
-	new_ns = ERR_PTR(-EINVAL);
+		return get_pid_ns(old_ns);
 	if (flags & CLONE_THREAD)
-		goto out_put;
-
-	new_ns = create_pid_namespace(old_ns->level + 1);
-	if (!IS_ERR(new_ns))
-		new_ns->parent = get_pid_ns(old_ns);
-
-out_put:
-	put_pid_ns(old_ns);
-out:
-	return new_ns;
+		return ERR_PTR(-EINVAL);
+	return create_pid_namespace(old_ns);
 }
 
 void free_pid_ns(struct kref *kref)
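
After this change create_pid_namespace() derives the nesting level from its parent and takes the parent reference itself, which is what lets copy_pid_ns() collapse to three early returns with no goto unwinding. A small userspace sketch of the same ownership shape, with a plain reference count instead of kref and all names illustrative:

#include <stdlib.h>
#include <stdio.h>

struct ns {
	int refcount;
	int level;
	struct ns *parent;
};

static struct ns *ns_get(struct ns *n) { n->refcount++; return n; }

static struct ns *create_child_ns(struct ns *parent)
{
	struct ns *n = calloc(1, sizeof(*n));

	if (!n)
		return NULL;
	n->refcount = 1;
	n->level = parent->level + 1;	/* level and parent link set in one place */
	n->parent = ns_get(parent);
	return n;
}

int main(void)
{
	struct ns init_ns = { .refcount = 1, .level = 0, .parent = NULL };
	struct ns *child = create_child_ns(&init_ns);

	printf("child level %d, parent refcount %d\n", child->level, init_ns.refcount);
	init_ns.refcount--;		/* what a destroy path would do */
	free(child);
	return 0;
}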
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0b67b2..e33a21cb9407 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
 {
-	struct task_cputime cputime;
+	struct signal_struct *const sig = tsk->signal;
 
-	thread_group_cputimer(tsk, &cputime);
 	cleanup_timers(tsk->signal->cpu_timers,
-		       cputime.utime, cputime.stime, cputime.sum_exec_runtime);
+		       cputime_add(tsk->utime, sig->utime),
+		       cputime_add(tsk->stime, sig->stime),
+		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
 }
 
 static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c7..d089d052c4a9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
202 return -EOPNOTSUPP; 202 return -EOPNOTSUPP;
203} 203}
204 204
205static int no_nsleep(const clockid_t which_clock, int flags,
206 struct timespec *tsave, struct timespec __user *rmtp)
207{
208 return -EOPNOTSUPP;
209}
210
205/* 211/*
206 * Return nonzero if we know a priori this clockid_t value is bogus. 212 * Return nonzero if we know a priori this clockid_t value is bogus.
207 */ 213 */
@@ -254,6 +260,7 @@ static __init int init_posix_timers(void)
254 .clock_get = posix_get_monotonic_raw, 260 .clock_get = posix_get_monotonic_raw,
255 .clock_set = do_posix_clock_nosettime, 261 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create, 262 .timer_create = no_timer_create,
263 .nsleep = no_nsleep,
257 }; 264 };
258 265
259 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 266 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
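
CLOCK_MONOTONIC_RAW gains a .nsleep stub here so clock_nanosleep() on that clock fails cleanly with EOPNOTSUPP rather than reaching a missing handler. The general shape, every slot of an ops table filled in and unsupported operations pointed at an explicit stub, sketched in userspace C with invented names:

#include <errno.h>
#include <stdio.h>

struct clock_ops {
	int (*gettime)(long *out);
	int (*nsleep)(long ns);
};

static int raw_gettime(long *out) { *out = 42; return 0; }

/* explicit stub: callers get a clear error instead of a NULL-pointer call */
static int no_nsleep(long ns) { (void)ns; return -EOPNOTSUPP; }

static const struct clock_ops clock_monotonic_raw = {
	.gettime = raw_gettime,
	.nsleep  = no_nsleep,
};

int main(void)
{
	long t;

	clock_monotonic_raw.gettime(&t);
	printf("gettime -> %ld, nsleep -> %d\n", t, clock_monotonic_raw.nsleep(1000));
	return 0;
}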
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ed97375daae9..bf0014d6a5f0 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,6 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index 69911b5745eb..419250ebec4d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -117,11 +117,12 @@ int __ref profile_init(void)
117 117
118 cpumask_copy(prof_cpu_mask, cpu_possible_mask); 118 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
119 119
120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
121 if (prof_buffer) 121 if (prof_buffer)
122 return 0; 122 return 0;
123 123
124 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); 124 prof_buffer = alloc_pages_exact(buffer_bytes,
125 GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
125 if (prof_buffer) 126 if (prof_buffer)
126 return 0; 127 return 0;
127 128
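
profile_init() tags both attempts with __GFP_NOWARN because the first, cheaper allocation is expected to fail for large buffers and the fallback handles it; a warning on the first failure would only be noise. A userspace sketch of the try-cheap-then-fall-back shape, substituting malloc() and mmap() for kzalloc() and alloc_pages_exact():

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <sys/mman.h>

/* Try the ordinary allocator first; fall back to mmap for large buffers. */
static void *alloc_profile_buffer(size_t bytes, int *used_mmap)
{
	void *buf = malloc(bytes);

	if (buf) {
		memset(buf, 0, bytes);
		*used_mmap = 0;
		return buf;
	}
	buf = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	*used_mmap = (buf != MAP_FAILED);
	return buf == MAP_FAILED ? NULL : buf;
}

int main(void)
{
	int used_mmap;
	void *buf = alloc_profile_buffer(1 << 20, &used_mmap);

	printf("buffer %p, via %s\n", buf, used_mmap ? "mmap" : "malloc");
	return buf ? 0 : 1;
}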
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f6d8b8cb5e34..082c320e4dbf 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -167,67 +167,82 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
167int ptrace_attach(struct task_struct *task) 167int ptrace_attach(struct task_struct *task)
168{ 168{
169 int retval; 169 int retval;
170 unsigned long flags;
171 170
172 audit_ptrace(task); 171 audit_ptrace(task);
173 172
174 retval = -EPERM; 173 retval = -EPERM;
174 if (unlikely(task->flags & PF_KTHREAD))
175 goto out;
175 if (same_thread_group(task, current)) 176 if (same_thread_group(task, current))
176 goto out; 177 goto out;
177 178
178 /* Protect the target's credential calculations against our 179 /*
180 * Protect exec's credential calculations against our interference;
179 * interference; SUID, SGID and LSM creds get determined differently 181 * interference; SUID, SGID and LSM creds get determined differently
180 * under ptrace. 182 * under ptrace.
181 */ 183 */
182 retval = mutex_lock_interruptible(&task->cred_guard_mutex); 184 retval = -ERESTARTNOINTR;
183 if (retval < 0) 185 if (mutex_lock_interruptible(&task->cred_guard_mutex))
184 goto out; 186 goto out;
185 187
186 retval = -EPERM;
187repeat:
188 /*
189 * Nasty, nasty.
190 *
191 * We want to hold both the task-lock and the
192 * tasklist_lock for writing at the same time.
193 * But that's against the rules (tasklist_lock
194 * is taken for reading by interrupts on other
195 * cpu's that may have task_lock).
196 */
197 task_lock(task); 188 task_lock(task);
198 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
199 task_unlock(task);
200 do {
201 cpu_relax();
202 } while (!write_can_lock(&tasklist_lock));
203 goto repeat;
204 }
205
206 if (!task->mm)
207 goto bad;
208 /* the same process cannot be attached many times */
209 if (task->ptrace & PT_PTRACED)
210 goto bad;
211 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); 189 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
190 task_unlock(task);
212 if (retval) 191 if (retval)
213 goto bad; 192 goto unlock_creds;
214 193
215 /* Go */ 194 write_lock_irq(&tasklist_lock);
216 task->ptrace |= PT_PTRACED; 195 retval = -EPERM;
196 if (unlikely(task->exit_state))
197 goto unlock_tasklist;
198 if (task->ptrace)
199 goto unlock_tasklist;
200
201 task->ptrace = PT_PTRACED;
217 if (capable(CAP_SYS_PTRACE)) 202 if (capable(CAP_SYS_PTRACE))
218 task->ptrace |= PT_PTRACE_CAP; 203 task->ptrace |= PT_PTRACE_CAP;
219 204
220 __ptrace_link(task, current); 205 __ptrace_link(task, current);
221
222 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 206 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
223bad: 207
224 write_unlock_irqrestore(&tasklist_lock, flags); 208 retval = 0;
225 task_unlock(task); 209unlock_tasklist:
210 write_unlock_irq(&tasklist_lock);
211unlock_creds:
226 mutex_unlock(&task->cred_guard_mutex); 212 mutex_unlock(&task->cred_guard_mutex);
227out: 213out:
228 return retval; 214 return retval;
229} 215}
230 216
217/**
218 * ptrace_traceme -- helper for PTRACE_TRACEME
219 *
220 * Performs checks and sets PT_PTRACED.
221 * Should be used by all ptrace implementations for PTRACE_TRACEME.
222 */
223int ptrace_traceme(void)
224{
225 int ret = -EPERM;
226
227 write_lock_irq(&tasklist_lock);
228 /* Are we already being traced? */
229 if (!current->ptrace) {
230 ret = security_ptrace_traceme(current->parent);
231 /*
232 * Check PF_EXITING to ensure ->real_parent has not passed
233 * exit_ptrace(). Otherwise we don't report the error but
234 * pretend ->real_parent untraces us right after return.
235 */
236 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
237 current->ptrace = PT_PTRACED;
238 __ptrace_link(current, current->real_parent);
239 }
240 }
241 write_unlock_irq(&tasklist_lock);
242
243 return ret;
244}
245
231/* 246/*
232 * Called with irqs disabled, returns true if childs should reap themselves. 247 * Called with irqs disabled, returns true if childs should reap themselves.
233 */ 248 */
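
The rewritten ptrace_attach() drops the write_trylock_irqsave()/cpu_relax() retry dance and takes its locks in one fixed order: the sleeping cred_guard_mutex first, task_lock() only around the permission check, then tasklist_lock just for the final state change and linking, unwinding through labels on error. A pthread sketch of that ordering discipline; the lock names mirror the kernel ones but the types are plain userspace primitives and the permission check is a placeholder:

#include <pthread.h>
#include <errno.h>

struct task {
	pthread_mutex_t cred_guard_mutex;	/* serializes credential changes */
	pthread_mutex_t task_lock;		/* protects per-task fields */
	int ptraced;
	int exiting;
};

static pthread_rwlock_t tasklist_lock = PTHREAD_RWLOCK_INITIALIZER;

static int may_attach(struct task *t) { (void)t; return 0; /* permission check */ }

static int attach(struct task *t)
{
	int ret;

	pthread_mutex_lock(&t->cred_guard_mutex);

	pthread_mutex_lock(&t->task_lock);
	ret = may_attach(t);
	pthread_mutex_unlock(&t->task_lock);
	if (ret)
		goto unlock_creds;

	pthread_rwlock_wrlock(&tasklist_lock);
	ret = -EPERM;
	if (t->exiting || t->ptraced)
		goto unlock_tasklist;
	t->ptraced = 1;				/* the actual state change, done last */
	ret = 0;
unlock_tasklist:
	pthread_rwlock_unlock(&tasklist_lock);
unlock_creds:
	pthread_mutex_unlock(&t->cred_guard_mutex);
	return ret;
}

int main(void)
{
	struct task t = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	return attach(&t);
}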
@@ -409,37 +424,33 @@ static int ptrace_setoptions(struct task_struct *child, long data)
409 424
410static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 425static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
411{ 426{
427 unsigned long flags;
412 int error = -ESRCH; 428 int error = -ESRCH;
413 429
414 read_lock(&tasklist_lock); 430 if (lock_task_sighand(child, &flags)) {
415 if (likely(child->sighand != NULL)) {
416 error = -EINVAL; 431 error = -EINVAL;
417 spin_lock_irq(&child->sighand->siglock);
418 if (likely(child->last_siginfo != NULL)) { 432 if (likely(child->last_siginfo != NULL)) {
419 *info = *child->last_siginfo; 433 *info = *child->last_siginfo;
420 error = 0; 434 error = 0;
421 } 435 }
422 spin_unlock_irq(&child->sighand->siglock); 436 unlock_task_sighand(child, &flags);
423 } 437 }
424 read_unlock(&tasklist_lock);
425 return error; 438 return error;
426} 439}
427 440
428static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) 441static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
429{ 442{
443 unsigned long flags;
430 int error = -ESRCH; 444 int error = -ESRCH;
431 445
432 read_lock(&tasklist_lock); 446 if (lock_task_sighand(child, &flags)) {
433 if (likely(child->sighand != NULL)) {
434 error = -EINVAL; 447 error = -EINVAL;
435 spin_lock_irq(&child->sighand->siglock);
436 if (likely(child->last_siginfo != NULL)) { 448 if (likely(child->last_siginfo != NULL)) {
437 *child->last_siginfo = *info; 449 *child->last_siginfo = *info;
438 error = 0; 450 error = 0;
439 } 451 }
440 spin_unlock_irq(&child->sighand->siglock); 452 unlock_task_sighand(child, &flags);
441 } 453 }
442 read_unlock(&tasklist_lock);
443 return error; 454 return error;
444} 455}
445 456
@@ -566,72 +577,16 @@ int ptrace_request(struct task_struct *child, long request,
566 return ret; 577 return ret;
567} 578}
568 579
569/** 580static struct task_struct *ptrace_get_task_struct(pid_t pid)
570 * ptrace_traceme -- helper for PTRACE_TRACEME
571 *
572 * Performs checks and sets PT_PTRACED.
573 * Should be used by all ptrace implementations for PTRACE_TRACEME.
574 */
575int ptrace_traceme(void)
576{
577 int ret = -EPERM;
578
579 /*
580 * Are we already being traced?
581 */
582repeat:
583 task_lock(current);
584 if (!(current->ptrace & PT_PTRACED)) {
585 /*
586 * See ptrace_attach() comments about the locking here.
587 */
588 unsigned long flags;
589 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
590 task_unlock(current);
591 do {
592 cpu_relax();
593 } while (!write_can_lock(&tasklist_lock));
594 goto repeat;
595 }
596
597 ret = security_ptrace_traceme(current->parent);
598
599 /*
600 * Check PF_EXITING to ensure ->real_parent has not passed
601 * exit_ptrace(). Otherwise we don't report the error but
602 * pretend ->real_parent untraces us right after return.
603 */
604 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
605 current->ptrace |= PT_PTRACED;
606 __ptrace_link(current, current->real_parent);
607 }
608
609 write_unlock_irqrestore(&tasklist_lock, flags);
610 }
611 task_unlock(current);
612 return ret;
613}
614
615/**
616 * ptrace_get_task_struct -- grab a task struct reference for ptrace
617 * @pid: process id to grab a task_struct reference of
618 *
619 * This function is a helper for ptrace implementations. It checks
620 * permissions and then grabs a task struct for use of the actual
621 * ptrace implementation.
622 *
623 * Returns the task_struct for @pid or an ERR_PTR() on failure.
624 */
625struct task_struct *ptrace_get_task_struct(pid_t pid)
626{ 581{
627 struct task_struct *child; 582 struct task_struct *child;
628 583
629 read_lock(&tasklist_lock); 584 rcu_read_lock();
630 child = find_task_by_vpid(pid); 585 child = find_task_by_vpid(pid);
631 if (child) 586 if (child)
632 get_task_struct(child); 587 get_task_struct(child);
588 rcu_read_unlock();
633 589
634 read_unlock(&tasklist_lock);
635 if (!child) 590 if (!child)
636 return ERR_PTR(-ESRCH); 591 return ERR_PTR(-ESRCH);
637 return child; 592 return child;
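
ptrace_get_task_struct() now does the PID lookup under rcu_read_lock() and bumps the task reference before leaving the read-side section, so the pointer stays valid once the lookup protection is gone. A userspace sketch of that escalate-to-a-reference idea, using a rwlock-protected table and an explicit refcount in place of RCU; the names are illustrative:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct task {
	int pid;
	atomic_int refcount;
};

#define NTASKS 4
static struct task *table[NTASKS];
static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Look the task up and return it with an extra reference, or NULL. */
static struct task *get_task(int pid)
{
	struct task *t = NULL;
	int i;

	pthread_rwlock_rdlock(&table_lock);
	for (i = 0; i < NTASKS; i++) {
		if (table[i] && table[i]->pid == pid) {
			t = table[i];
			atomic_fetch_add(&t->refcount, 1);	/* pin before unlocking */
			break;
		}
	}
	pthread_rwlock_unlock(&table_lock);
	return t;
}

static void put_task(struct task *t)
{
	atomic_fetch_sub(&t->refcount, 1);	/* a real version would free at zero */
}

int main(void)
{
	struct task t = { .pid = 1, .refcount = 1 };
	struct task *found;

	table[0] = &t;
	found = get_task(1);
	if (found)
		put_task(found);
	return 0;
}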
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0dccfbba6d26..7717b95c2027 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1533,7 +1533,7 @@ void __init __rcu_init(void)
1533 int j; 1533 int j;
1534 struct rcu_node *rnp; 1534 struct rcu_node *rnp;
1535 1535
1536 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); 1536 printk(KERN_INFO "Hierarchical RCU implementation.\n");
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
@@ -1546,7 +1546,6 @@ void __init __rcu_init(void)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); 1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */ 1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb); 1548 register_cpu_notifier(&rcu_nb);
1549 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1550} 1549}
1551 1550
1552module_param(blimit, int, 0); 1551module_param(blimit, int, 0);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bf8e7534c803..e1338f074314 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -18,7 +18,7 @@
18void res_counter_init(struct res_counter *counter, struct res_counter *parent) 18void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = (unsigned long long)LLONG_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
@@ -133,6 +133,16 @@ int res_counter_memparse_write_strategy(const char *buf,
133 unsigned long long *res) 133 unsigned long long *res)
134{ 134{
135 char *end; 135 char *end;
136
137 /* return RESOURCE_MAX(unlimited) if "-1" is specified */
138 if (*buf == '-') {
139 *res = simple_strtoull(buf + 1, &end, 10);
140 if (*res != 1 || *end != '\0')
141 return -EINVAL;
142 *res = RESOURCE_MAX;
143 return 0;
144 }
145
136 /* FIXME - make memparse() take const char* args */ 146 /* FIXME - make memparse() take const char* args */
137 *res = memparse((char *)buf, &end); 147 *res = memparse((char *)buf, &end);
138 if (*end != '\0') 148 if (*end != '\0')
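
The new branch in res_counter_memparse_write_strategy() accepts the single token "-1" and maps it to RESOURCE_MAX, so writing -1 to a cgroup limit means unlimited. A standalone sketch of that parse rule; parse_limit() and LIMIT_MAX are illustrative stand-ins, and the K/M/G suffix handling that memparse() provides is dropped here:

#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
#include <stdio.h>

#define LIMIT_MAX ULLONG_MAX	/* stand-in for RESOURCE_MAX */

static int parse_limit(const char *buf, unsigned long long *res)
{
	char *end;

	/* "-1" (and only "-1") means unlimited */
	if (*buf == '-') {
		*res = strtoull(buf + 1, &end, 10);
		if (*res != 1 || *end != '\0')
			return -EINVAL;
		*res = LIMIT_MAX;
		return 0;
	}
	*res = strtoull(buf, &end, 10);
	return *end == '\0' ? 0 : -EINVAL;
}

int main(void)
{
	unsigned long long v;

	parse_limit("-1", &v);
	printf("-1 -> %llu (unlimited)\n", v);
	parse_limit("4096", &v);
	printf("4096 -> %llu\n", v);
	return 0;
}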
diff --git a/kernel/resource.c b/kernel/resource.c
index ac5f3a36923f..78b087221c15 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -787,7 +787,7 @@ static int __init reserve_setup(char *str)
787 static struct resource reserve[MAXRESERVE]; 787 static struct resource reserve[MAXRESERVE];
788 788
789 for (;;) { 789 for (;;) {
790 int io_start, io_num; 790 unsigned int io_start, io_num;
791 int x = reserved; 791 int x = reserved;
792 792
793 if (get_option (&str, &io_start) != 2) 793 if (get_option (&str, &io_start) != 2)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index fcd107a78c5a..29bd4baf9e75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { 1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042
1043 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1044 1043 spin_unlock(&lock->wait_lock);
1045 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1046 return 1; 1045 return 1;
1047 } 1046 }
1048 1047
1049 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1050 1049
1051
1052 if (ret && !waiter->task) { 1050 if (ret && !waiter->task) {
1053 /* 1051 /*
1054 * Reset the return value. We might have 1052 * Reset the return value. We might have
diff --git a/kernel/sched.c b/kernel/sched.c
index 8fb88a906aaa..1b59e265273b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -493,6 +493,7 @@ struct rt_rq {
493#endif 493#endif
494#ifdef CONFIG_SMP 494#ifdef CONFIG_SMP
495 unsigned long rt_nr_migratory; 495 unsigned long rt_nr_migratory;
496 unsigned long rt_nr_total;
496 int overloaded; 497 int overloaded;
497 struct plist_head pushable_tasks; 498 struct plist_head pushable_tasks;
498#endif 499#endif
@@ -1978,7 +1979,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1978 if (task_hot(p, old_rq->clock, NULL)) 1979 if (task_hot(p, old_rq->clock, NULL))
1979 schedstat_inc(p, se.nr_forced2_migrations); 1980 schedstat_inc(p, se.nr_forced2_migrations);
1980#endif 1981#endif
1981 perf_counter_task_migration(p, new_cpu); 1982 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1983 1, 1, NULL, 0);
1982 } 1984 }
1983 p->se.vruntime -= old_cfsrq->min_vruntime - 1985 p->se.vruntime -= old_cfsrq->min_vruntime -
1984 new_cfsrq->min_vruntime; 1986 new_cfsrq->min_vruntime;
@@ -2570,15 +2572,37 @@ static void __sched_fork(struct task_struct *p)
2570 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2571 2573
2572#ifdef CONFIG_SCHEDSTATS 2574#ifdef CONFIG_SCHEDSTATS
2573 p->se.wait_start = 0; 2575 p->se.wait_start = 0;
2574 p->se.sum_sleep_runtime = 0; 2576 p->se.wait_max = 0;
2575 p->se.sleep_start = 0; 2577 p->se.wait_count = 0;
2576 p->se.block_start = 0; 2578 p->se.wait_sum = 0;
2577 p->se.sleep_max = 0; 2579
2578 p->se.block_max = 0; 2580 p->se.sleep_start = 0;
2579 p->se.exec_max = 0; 2581 p->se.sleep_max = 0;
2580 p->se.slice_max = 0; 2582 p->se.sum_sleep_runtime = 0;
2581 p->se.wait_max = 0; 2583
2584 p->se.block_start = 0;
2585 p->se.block_max = 0;
2586 p->se.exec_max = 0;
2587 p->se.slice_max = 0;
2588
2589 p->se.nr_migrations_cold = 0;
2590 p->se.nr_failed_migrations_affine = 0;
2591 p->se.nr_failed_migrations_running = 0;
2592 p->se.nr_failed_migrations_hot = 0;
2593 p->se.nr_forced_migrations = 0;
2594 p->se.nr_forced2_migrations = 0;
2595
2596 p->se.nr_wakeups = 0;
2597 p->se.nr_wakeups_sync = 0;
2598 p->se.nr_wakeups_migrate = 0;
2599 p->se.nr_wakeups_local = 0;
2600 p->se.nr_wakeups_remote = 0;
2601 p->se.nr_wakeups_affine = 0;
2602 p->se.nr_wakeups_affine_attempts = 0;
2603 p->se.nr_wakeups_passive = 0;
2604 p->se.nr_wakeups_idle = 0;
2605
2582#endif 2606#endif
2583 2607
2584 INIT_LIST_HEAD(&p->rt.run_list); 2608 INIT_LIST_HEAD(&p->rt.run_list);
@@ -6540,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
6540 return 0; 6564 return 0;
6541} 6565}
6542 6566
6567static inline int should_resched(void)
6568{
6569 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6570}
6571
6543static void __cond_resched(void) 6572static void __cond_resched(void)
6544{ 6573{
6545#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6559,8 +6588,7 @@ static void __cond_resched(void)
6559 6588
6560int __sched _cond_resched(void) 6589int __sched _cond_resched(void)
6561{ 6590{
6562 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 6591 if (should_resched()) {
6563 system_state == SYSTEM_RUNNING) {
6564 __cond_resched(); 6592 __cond_resched();
6565 return 1; 6593 return 1;
6566 } 6594 }
@@ -6578,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
6578 */ 6606 */
6579int cond_resched_lock(spinlock_t *lock) 6607int cond_resched_lock(spinlock_t *lock)
6580{ 6608{
6581 int resched = need_resched() && system_state == SYSTEM_RUNNING; 6609 int resched = should_resched();
6582 int ret = 0; 6610 int ret = 0;
6583 6611
6584 if (spin_needbreak(lock) || resched) { 6612 if (spin_needbreak(lock) || resched) {
6585 spin_unlock(lock); 6613 spin_unlock(lock);
6586 if (resched && need_resched()) 6614 if (resched)
6587 __cond_resched(); 6615 __cond_resched();
6588 else 6616 else
6589 cpu_relax(); 6617 cpu_relax();
@@ -6598,7 +6626,7 @@ int __sched cond_resched_softirq(void)
6598{ 6626{
6599 BUG_ON(!in_softirq()); 6627 BUG_ON(!in_softirq());
6600 6628
6601 if (need_resched() && system_state == SYSTEM_RUNNING) { 6629 if (should_resched()) {
6602 local_bh_enable(); 6630 local_bh_enable();
6603 __cond_resched(); 6631 __cond_resched();
6604 local_bh_disable(); 6632 local_bh_disable();
@@ -7045,7 +7073,7 @@ static int migration_thread(void *data)
7045 7073
7046 if (cpu_is_offline(cpu)) { 7074 if (cpu_is_offline(cpu)) {
7047 spin_unlock_irq(&rq->lock); 7075 spin_unlock_irq(&rq->lock);
7048 goto wait_to_die; 7076 break;
7049 } 7077 }
7050 7078
7051 if (rq->active_balance) { 7079 if (rq->active_balance) {
@@ -7071,16 +7099,7 @@ static int migration_thread(void *data)
7071 complete(&req->done); 7099 complete(&req->done);
7072 } 7100 }
7073 __set_current_state(TASK_RUNNING); 7101 __set_current_state(TASK_RUNNING);
7074 return 0;
7075 7102
7076wait_to_die:
7077 /* Wait for kthread_stop */
7078 set_current_state(TASK_INTERRUPTIBLE);
7079 while (!kthread_should_stop()) {
7080 schedule();
7081 set_current_state(TASK_INTERRUPTIBLE);
7082 }
7083 __set_current_state(TASK_RUNNING);
7084 return 0; 7103 return 0;
7085} 7104}
7086 7105
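
The migration thread above loses its separate wait_to_die loop: when its CPU goes offline it simply breaks out of the main loop and returns, and the hotplug path (later in this diff) pins the task with get_task_struct() so kthread_stop() still has a valid pointer. A small pthread sketch of the exit-by-breaking part only; the reference-counting half does not map onto pthreads, where pthread_join() already covers lifetime:

#include <pthread.h>
#include <stdatomic.h>
#include <sched.h>
#include <stdio.h>

static atomic_bool should_stop;

static void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		if (atomic_load(&should_stop))
			break;		/* leave the main loop directly; no second parking loop */
		/* ... do one unit of work ... */
		sched_yield();
	}
	return NULL;			/* fall straight off the end */
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, worker, NULL);
	atomic_store(&should_stop, true);
	pthread_join(tid, NULL);
	puts("worker exited cleanly");
	return 0;
}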
@@ -7270,6 +7289,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
7270static void calc_global_load_remove(struct rq *rq) 7289static void calc_global_load_remove(struct rq *rq)
7271{ 7290{
7272 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 7291 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7292 rq->calc_load_active = 0;
7273} 7293}
7274#endif /* CONFIG_HOTPLUG_CPU */ 7294#endif /* CONFIG_HOTPLUG_CPU */
7275 7295
@@ -7494,7 +7514,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7494 rq = task_rq_lock(p, &flags); 7514 rq = task_rq_lock(p, &flags);
7495 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7515 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7496 task_rq_unlock(rq, &flags); 7516 task_rq_unlock(rq, &flags);
7517 get_task_struct(p);
7497 cpu_rq(cpu)->migration_thread = p; 7518 cpu_rq(cpu)->migration_thread = p;
7519 rq->calc_load_update = calc_load_update;
7498 break; 7520 break;
7499 7521
7500 case CPU_ONLINE: 7522 case CPU_ONLINE:
@@ -7505,8 +7527,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7505 /* Update our root-domain */ 7527 /* Update our root-domain */
7506 rq = cpu_rq(cpu); 7528 rq = cpu_rq(cpu);
7507 spin_lock_irqsave(&rq->lock, flags); 7529 spin_lock_irqsave(&rq->lock, flags);
7508 rq->calc_load_update = calc_load_update;
7509 rq->calc_load_active = 0;
7510 if (rq->rd) { 7530 if (rq->rd) {
7511 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7531 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7512 7532
@@ -7524,6 +7544,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7524 kthread_bind(cpu_rq(cpu)->migration_thread, 7544 kthread_bind(cpu_rq(cpu)->migration_thread,
7525 cpumask_any(cpu_online_mask)); 7545 cpumask_any(cpu_online_mask));
7526 kthread_stop(cpu_rq(cpu)->migration_thread); 7546 kthread_stop(cpu_rq(cpu)->migration_thread);
7547 put_task_struct(cpu_rq(cpu)->migration_thread);
7527 cpu_rq(cpu)->migration_thread = NULL; 7548 cpu_rq(cpu)->migration_thread = NULL;
7528 break; 7549 break;
7529 7550
@@ -7533,6 +7554,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7533 migrate_live_tasks(cpu); 7554 migrate_live_tasks(cpu);
7534 rq = cpu_rq(cpu); 7555 rq = cpu_rq(cpu);
7535 kthread_stop(rq->migration_thread); 7556 kthread_stop(rq->migration_thread);
7557 put_task_struct(rq->migration_thread);
7536 rq->migration_thread = NULL; 7558 rq->migration_thread = NULL;
7537 /* Idle task back to normal (off runqueue, low prio) */ 7559 /* Idle task back to normal (off runqueue, low prio) */
7538 spin_lock_irq(&rq->lock); 7560 spin_lock_irq(&rq->lock);
@@ -7828,7 +7850,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7828 free_rootdomain(old_rd); 7850 free_rootdomain(old_rd);
7829} 7851}
7830 7852
7831static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7853static int init_rootdomain(struct root_domain *rd, bool bootmem)
7832{ 7854{
7833 gfp_t gfp = GFP_KERNEL; 7855 gfp_t gfp = GFP_KERNEL;
7834 7856
@@ -9075,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9075#ifdef CONFIG_SMP 9097#ifdef CONFIG_SMP
9076 rt_rq->rt_nr_migratory = 0; 9098 rt_rq->rt_nr_migratory = 0;
9077 rt_rq->overloaded = 0; 9099 rt_rq->overloaded = 0;
9078 plist_head_init(&rq->rt.pushable_tasks, &rq->lock); 9100 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
9079#endif 9101#endif
9080 9102
9081 rt_rq->rt_time = 0; 9103 rt_rq->rt_time = 0;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 7deffc9f0e5f..d014efbf947a 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 if (lowest_mask) 84 if (lowest_mask) {
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
86
87 /*
88 * We have to ensure that we have at least one bit
89 * still set in the array, since the map could have
90 * been concurrently emptied between the first and
91 * second reads of vec->mask. If we hit this
92 * condition, simply act as though we never hit this
93 * priority level and continue on.
94 */
95 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
96 continue;
97 }
98
86 return 1; 99 return 1;
87 } 100 }
88 101
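
The added check in cpupri_find() guards against the window where vec->mask is emptied between the first test and the cpumask_and() copy, which would hand the caller an empty result; if the snapshot comes back empty the priority level is simply skipped. A tiny sketch of that read-twice-and-revalidate pattern on a plain atomic bitmask, with made-up names:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long vec_mask;	/* updated concurrently elsewhere */

/* Return a non-empty snapshot of (vec_mask & wanted), or 0 to say "skip". */
static unsigned long pick_cpus(unsigned long wanted)
{
	unsigned long snap;

	if (!(atomic_load(&vec_mask) & wanted))	/* first, cheap test */
		return 0;

	snap = atomic_load(&vec_mask) & wanted;	/* second read: take the snapshot */

	/*
	 * The mask may have been emptied between the two reads, so the
	 * snapshot has to be re-validated before it is trusted.
	 */
	return snap;
}

int main(void)
{
	atomic_store(&vec_mask, 0x5UL);
	printf("snapshot: %#lx\n", pick_cpus(0x7UL));
	return 0;
}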
@@ -152,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
152 * 165 *
153 * Returns: -ENOMEM if memory fails. 166 * Returns: -ENOMEM if memory fails.
154 */ 167 */
155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 168int cpupri_init(struct cpupri *cp, bool bootmem)
156{ 169{
157 gfp_t gfp = GFP_KERNEL; 170 gfp_t gfp = GFP_KERNEL;
158 int i; 171 int i;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 467ca72f1657..70c7e0b79946 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162{ 162{
163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
164 spread, rq0_min_vruntime, spread0; 164 spread, rq0_min_vruntime, spread0;
165 struct rq *rq = &per_cpu(runqueues, cpu); 165 struct rq *rq = cpu_rq(cpu);
166 struct sched_entity *last; 166 struct sched_entity *last;
167 unsigned long flags; 167 unsigned long flags;
168 168
@@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
191 if (last) 191 if (last)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
@@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
248 248
249static void print_cpu(struct seq_file *m, int cpu) 249static void print_cpu(struct seq_file *m, int cpu)
250{ 250{
251 struct rq *rq = &per_cpu(runqueues, cpu); 251 struct rq *rq = cpu_rq(cpu);
252 252
253#ifdef CONFIG_X86 253#ifdef CONFIG_X86
254 { 254 {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5f9650e8fe75..652e8bdef9aa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
266 return min_vruntime; 266 return min_vruntime;
267} 267}
268 268
269static inline int entity_before(struct sched_entity *a,
270 struct sched_entity *b)
271{
272 return (s64)(a->vruntime - b->vruntime) < 0;
273}
274
269static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 275static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
270{ 276{
271 return se->vruntime - cfs_rq->min_vruntime; 277 return se->vruntime - cfs_rq->min_vruntime;
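
entity_before() compares two monotonically increasing u64 vruntimes through a signed difference, which keeps giving the right answer even after the unsigned values wrap. A standalone version of the same comparison, just to show why the cast matters:

#include <stdint.h>
#include <stdio.h>

/* "a is earlier than b", robust to u64 wraparound */
static int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t a = UINT64_MAX - 5;	/* just before the wrap */
	uint64_t b = a + 10;		/* wraps around to a small value */

	/* a direct comparison gets this wrong once b has wrapped */
	printf("a < b (naive):  %d\n", a < b);
	printf("before(a, b):   %d\n", before(a, b));
	return 0;
}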
@@ -430,12 +436,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
430 436
431 for_each_sched_entity(se) { 437 for_each_sched_entity(se) {
432 struct load_weight *load; 438 struct load_weight *load;
439 struct load_weight lw;
433 440
434 cfs_rq = cfs_rq_of(se); 441 cfs_rq = cfs_rq_of(se);
435 load = &cfs_rq->load; 442 load = &cfs_rq->load;
436 443
437 if (unlikely(!se->on_rq)) { 444 if (unlikely(!se->on_rq)) {
438 struct load_weight lw = cfs_rq->load; 445 lw = cfs_rq->load;
439 446
440 update_load_add(&lw, se->load.weight); 447 update_load_add(&lw, se->load.weight);
441 load = &lw; 448 load = &lw;
@@ -604,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
604static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 611static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
605{ 612{
606#ifdef CONFIG_SCHEDSTATS 613#ifdef CONFIG_SCHEDSTATS
614 struct task_struct *tsk = NULL;
615
616 if (entity_is_task(se))
617 tsk = task_of(se);
618
607 if (se->sleep_start) { 619 if (se->sleep_start) {
608 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 620 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
609 struct task_struct *tsk = task_of(se);
610 621
611 if ((s64)delta < 0) 622 if ((s64)delta < 0)
612 delta = 0; 623 delta = 0;
@@ -617,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
617 se->sleep_start = 0; 628 se->sleep_start = 0;
618 se->sum_sleep_runtime += delta; 629 se->sum_sleep_runtime += delta;
619 630
620 account_scheduler_latency(tsk, delta >> 10, 1); 631 if (tsk)
632 account_scheduler_latency(tsk, delta >> 10, 1);
621 } 633 }
622 if (se->block_start) { 634 if (se->block_start) {
623 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 635 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
624 struct task_struct *tsk = task_of(se);
625 636
626 if ((s64)delta < 0) 637 if ((s64)delta < 0)
627 delta = 0; 638 delta = 0;
@@ -632,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
632 se->block_start = 0; 643 se->block_start = 0;
633 se->sum_sleep_runtime += delta; 644 se->sum_sleep_runtime += delta;
634 645
635 /* 646 if (tsk) {
636 * Blocking time is in units of nanosecs, so shift by 20 to 647 /*
637 * get a milliseconds-range estimation of the amount of 648 * Blocking time is in units of nanosecs, so shift by
638 * time that the task spent sleeping: 649 * 20 to get a milliseconds-range estimation of the
639 */ 650 * amount of time that the task spent sleeping:
640 if (unlikely(prof_on == SLEEP_PROFILING)) { 651 */
641 652 if (unlikely(prof_on == SLEEP_PROFILING)) {
642 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 653 profile_hits(SLEEP_PROFILING,
643 delta >> 20); 654 (void *)get_wchan(tsk),
655 delta >> 20);
656 }
657 account_scheduler_latency(tsk, delta >> 10, 0);
644 } 658 }
645 account_scheduler_latency(tsk, delta >> 10, 0);
646 } 659 }
647#endif 660#endif
648} 661}
@@ -686,7 +699,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
686 * all of which have the same weight. 699 * all of which have the same weight.
687 */ 700 */
688 if (sched_feat(NORMALIZED_SLEEPER) && 701 if (sched_feat(NORMALIZED_SLEEPER) &&
689 task_of(se)->policy != SCHED_IDLE) 702 (!entity_is_task(se) ||
703 task_of(se)->policy != SCHED_IDLE))
690 thresh = calc_delta_fair(thresh, se); 704 thresh = calc_delta_fair(thresh, se);
691 705
692 vruntime -= thresh; 706 vruntime -= thresh;
@@ -1015,7 +1029,7 @@ static void yield_task_fair(struct rq *rq)
1015 /* 1029 /*
1016 * Already in the rightmost position? 1030 * Already in the rightmost position?
1017 */ 1031 */
1018 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) 1032 if (unlikely(!rightmost || entity_before(rightmost, se)))
1019 return; 1033 return;
1020 1034
1021 /* 1035 /*
@@ -1711,7 +1725,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1711 1725
1712 /* 'curr' will be NULL if the child belongs to a different group */ 1726 /* 'curr' will be NULL if the child belongs to a different group */
1713 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && 1727 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1714 curr && curr->vruntime < se->vruntime) { 1728 curr && entity_before(curr, se)) {
1715 /* 1729 /*
1716 * Upon rescheduling, sched_class::put_prev_task() will place 1730 * Upon rescheduling, sched_class::put_prev_task() will place
1717 * 'current' within the tree based on its new key value. 1731 * 'current' within the tree based on its new key value.
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9bf0d2a73045..3918e01994e0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -10,6 +10,8 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
10 10
11#ifdef CONFIG_RT_GROUP_SCHED 11#ifdef CONFIG_RT_GROUP_SCHED
12 12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{ 16{
15 return rt_rq->rq; 17 return rt_rq->rq;
@@ -22,6 +24,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
22 24
23#else /* CONFIG_RT_GROUP_SCHED */ 25#else /* CONFIG_RT_GROUP_SCHED */
24 26
27#define rt_entity_is_task(rt_se) (1)
28
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{ 30{
27 return container_of(rt_rq, struct rq, rt); 31 return container_of(rt_rq, struct rq, rt);
@@ -73,7 +77,7 @@ static inline void rt_clear_overload(struct rq *rq)
73 77
74static void update_rt_migration(struct rt_rq *rt_rq) 78static void update_rt_migration(struct rt_rq *rt_rq)
75{ 79{
76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { 80 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
77 if (!rt_rq->overloaded) { 81 if (!rt_rq->overloaded) {
78 rt_set_overload(rq_of_rt_rq(rt_rq)); 82 rt_set_overload(rq_of_rt_rq(rt_rq));
79 rt_rq->overloaded = 1; 83 rt_rq->overloaded = 1;
@@ -86,6 +90,12 @@ static void update_rt_migration(struct rt_rq *rt_rq)
86 90
87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 91static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{ 92{
93 if (!rt_entity_is_task(rt_se))
94 return;
95
96 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
97
98 rt_rq->rt_nr_total++;
89 if (rt_se->nr_cpus_allowed > 1) 99 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++; 100 rt_rq->rt_nr_migratory++;
91 101
@@ -94,6 +104,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
94 104
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 105static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{ 106{
107 if (!rt_entity_is_task(rt_se))
108 return;
109
110 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
111
112 rt_rq->rt_nr_total--;
97 if (rt_se->nr_cpus_allowed > 1) 113 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--; 114 rt_rq->rt_nr_migratory--;
99 115
diff --git a/kernel/signal.c b/kernel/signal.c
index d81f4952eebb..64c5deeaca5d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1410,7 +1410,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1410 /* do_notify_parent_cldstop should have been called instead. */ 1410 /* do_notify_parent_cldstop should have been called instead. */
1411 BUG_ON(task_is_stopped_or_traced(tsk)); 1411 BUG_ON(task_is_stopped_or_traced(tsk));
1412 1412
1413 BUG_ON(!tsk->ptrace && 1413 BUG_ON(!task_ptrace(tsk) &&
1414 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1414 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1415 1415
1416 info.si_signo = sig; 1416 info.si_signo = sig;
@@ -1449,7 +1449,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1449 1449
1450 psig = tsk->parent->sighand; 1450 psig = tsk->parent->sighand;
1451 spin_lock_irqsave(&psig->siglock, flags); 1451 spin_lock_irqsave(&psig->siglock, flags);
1452 if (!tsk->ptrace && sig == SIGCHLD && 1452 if (!task_ptrace(tsk) && sig == SIGCHLD &&
1453 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1453 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1454 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1454 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1455 /* 1455 /*
@@ -1486,7 +1486,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1486 struct task_struct *parent; 1486 struct task_struct *parent;
1487 struct sighand_struct *sighand; 1487 struct sighand_struct *sighand;
1488 1488
1489 if (tsk->ptrace & PT_PTRACED) 1489 if (task_ptrace(tsk))
1490 parent = tsk->parent; 1490 parent = tsk->parent;
1491 else { 1491 else {
1492 tsk = tsk->group_leader; 1492 tsk = tsk->group_leader;
@@ -1499,7 +1499,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1499 * see comment in do_notify_parent() abot the following 3 lines 1499 * see comment in do_notify_parent() abot the following 3 lines
1500 */ 1500 */
1501 rcu_read_lock(); 1501 rcu_read_lock();
1502 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1502 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1503 info.si_uid = __task_cred(tsk)->uid; 1503 info.si_uid = __task_cred(tsk)->uid;
1504 rcu_read_unlock(); 1504 rcu_read_unlock();
1505 1505
@@ -1535,7 +1535,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1535 1535
1536static inline int may_ptrace_stop(void) 1536static inline int may_ptrace_stop(void)
1537{ 1537{
1538 if (!likely(current->ptrace & PT_PTRACED)) 1538 if (!likely(task_ptrace(current)))
1539 return 0; 1539 return 0;
1540 /* 1540 /*
1541 * Are we in the middle of do_coredump? 1541 * Are we in the middle of do_coredump?
@@ -1753,7 +1753,7 @@ static int do_signal_stop(int signr)
1753static int ptrace_signal(int signr, siginfo_t *info, 1753static int ptrace_signal(int signr, siginfo_t *info,
1754 struct pt_regs *regs, void *cookie) 1754 struct pt_regs *regs, void *cookie)
1755{ 1755{
1756 if (!(current->ptrace & PT_PTRACED)) 1756 if (!task_ptrace(current))
1757 return signr; 1757 return signr;
1758 1758
1759 ptrace_signal_deliver(regs, cookie); 1759 ptrace_signal_deliver(regs, cookie);
@@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2454 stack_t oss; 2454 stack_t oss;
2455 int error; 2455 int error;
2456 2456
2457 if (uoss) { 2457 oss.ss_sp = (void __user *) current->sas_ss_sp;
2458 oss.ss_sp = (void __user *) current->sas_ss_sp; 2458 oss.ss_size = current->sas_ss_size;
2459 oss.ss_size = current->sas_ss_size; 2459 oss.ss_flags = sas_ss_flags(sp);
2460 oss.ss_flags = sas_ss_flags(sp);
2461 }
2462 2460
2463 if (uss) { 2461 if (uss) {
2464 void __user *ss_sp; 2462 void __user *ss_sp;
@@ -2466,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2466 int ss_flags; 2464 int ss_flags;
2467 2465
2468 error = -EFAULT; 2466 error = -EFAULT;
2469 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) 2467 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
2470 || __get_user(ss_sp, &uss->ss_sp) 2468 goto out;
2471 || __get_user(ss_flags, &uss->ss_flags) 2469 error = __get_user(ss_sp, &uss->ss_sp) |
2472 || __get_user(ss_size, &uss->ss_size)) 2470 __get_user(ss_flags, &uss->ss_flags) |
2471 __get_user(ss_size, &uss->ss_size);
2472 if (error)
2473 goto out; 2473 goto out;
2474 2474
2475 error = -EPERM; 2475 error = -EPERM;
@@ -2501,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2501 current->sas_ss_size = ss_size; 2501 current->sas_ss_size = ss_size;
2502 } 2502 }
2503 2503
2504 error = 0;
2504 if (uoss) { 2505 if (uoss) {
2505 error = -EFAULT; 2506 error = -EFAULT;
2506 if (copy_to_user(uoss, &oss, sizeof(oss))) 2507 if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
2507 goto out; 2508 goto out;
2509 error = __put_user(oss.ss_sp, &uoss->ss_sp) |
2510 __put_user(oss.ss_size, &uoss->ss_size) |
2511 __put_user(oss.ss_flags, &uoss->ss_flags);
2508 } 2512 }
2509 2513
2510 error = 0;
2511out: 2514out:
2512 return error; 2515 return error;
2513} 2516}
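
The do_sigaltstack() rework above only changes how the kernel copies the old and new stack descriptions to and from user space; the user-visible contract of sigaltstack(2) stays the same. For reference, a minimal program exercising the path this code serves, installing an alternate stack and a SA_ONSTACK handler:

#include <signal.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

static void handler(int sig)
{
	static const char msg[] = "handler ran on the alternate stack\n";

	(void)sig;
	write(1, msg, sizeof(msg) - 1);	/* async-signal-safe */
}

int main(void)
{
	stack_t ss = { .ss_sp = malloc(SIGSTKSZ), .ss_size = SIGSTKSZ, .ss_flags = 0 };
	struct sigaction sa = { .sa_handler = handler, .sa_flags = SA_ONSTACK };

	if (!ss.ss_sp || sigaltstack(&ss, NULL) != 0) {
		perror("sigaltstack");
		return 1;
	}
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);
	raise(SIGUSR1);
	return 0;
}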
diff --git a/kernel/smp.c b/kernel/smp.c
index ad63d8501207..94188b8ecc33 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
59 59
60#ifdef CONFIG_CPU_HOTPLUG 60#ifdef CONFIG_HOTPLUG_CPU
61 case CPU_UP_CANCELED: 61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN: 62 case CPU_UP_CANCELED_FROZEN:
63 63
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b41fb710e114..eb5e131a0485 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -213,6 +213,7 @@ restart:
213 do { 213 do {
214 if (pending & 1) { 214 if (pending & 1) {
215 int prev_count = preempt_count(); 215 int prev_count = preempt_count();
216 kstat_incr_softirqs_this_cpu(h - softirq_vec);
216 217
217 trace_softirq_entry(h, softirq_vec); 218 trace_softirq_entry(h, softirq_vec);
218 h->action(h); 219 h->action(h);
@@ -344,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
344 softirq_vec[nr].action = action; 345 softirq_vec[nr].action = action;
345} 346}
346 347
347/* Tasklets */ 348/*
349 * Tasklets
350 */
348struct tasklet_head 351struct tasklet_head
349{ 352{
350 struct tasklet_struct *head; 353 struct tasklet_struct *head;
@@ -492,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)
492 495
493EXPORT_SYMBOL(tasklet_kill); 496EXPORT_SYMBOL(tasklet_kill);
494 497
498/*
499 * tasklet_hrtimer
500 */
501
502/*
503 * The trampoline is called when the hrtimer expires. If this is
504 * called from the hrtimer interrupt then we schedule the tasklet as
505 * the timer callback function expects to run in softirq context. If
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{
511 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer);
513
514 if (hrtimer_is_hres_active(timer)) {
515 tasklet_hi_schedule(&ttimer->tasklet);
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519}
520
521/*
522 * Helper function which calls the hrtimer callback from
523 * tasklet/softirq context
524 */
525static void __tasklet_hrtimer_trampoline(unsigned long data)
526{
527 struct tasklet_hrtimer *ttimer = (void *)data;
528 enum hrtimer_restart restart;
529
530 restart = ttimer->function(&ttimer->timer);
531 if (restart != HRTIMER_NORESTART)
532 hrtimer_restart(&ttimer->timer);
533}
534
535/**
536 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
537 * @ttimer: tasklet_hrtimer which is initialized
 538 * @function: hrtimer callback function which gets called from softirq context
539 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
540 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
541 */
542void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
543 enum hrtimer_restart (*function)(struct hrtimer *),
544 clockid_t which_clock, enum hrtimer_mode mode)
545{
546 hrtimer_init(&ttimer->timer, which_clock, mode);
547 ttimer->timer.function = __hrtimer_tasklet_trampoline;
548 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
549 (unsigned long)ttimer);
550 ttimer->function = function;
551}
552EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
553
554/*
555 * Remote softirq bits
556 */
557
495DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 558DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
496EXPORT_PER_CPU_SYMBOL(softirq_work_list); 559EXPORT_PER_CPU_SYMBOL(softirq_work_list);
497 560
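
tasklet_hrtimer_init() above wires a generic hrtimer callback to a private trampoline that recovers the wrapping structure with container_of() and then decides where to run the user's function. The container_of() trampoline is the reusable part; here is a userspace rendering of it with a fake timer type, purely to show the pointer arithmetic:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct timer {				/* stand-in for struct hrtimer */
	void (*fn)(struct timer *);
};

struct tasklet_timer {			/* stand-in for struct tasklet_hrtimer */
	struct timer timer;		/* embedded generic object */
	void (*user_fn)(struct tasklet_timer *);
};

/* Generic callback: recover the wrapper, then dispatch to the user function. */
static void trampoline(struct timer *t)
{
	struct tasklet_timer *tt = container_of(t, struct tasklet_timer, timer);

	tt->user_fn(tt);
}

static void my_handler(struct tasklet_timer *tt)
{
	printf("handler called for %p\n", (void *)tt);
}

int main(void)
{
	struct tasklet_timer tt = { .timer = { .fn = trampoline }, .user_fn = my_handler };

	tt.timer.fn(&tt.timer);		/* what the timer core would do on expiry */
	return 0;
}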
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ab462b9968d5..58be76017fd0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,6 +49,7 @@
49#include <linux/acpi.h> 49#include <linux/acpi.h>
50#include <linux/reboot.h> 50#include <linux/reboot.h>
51#include <linux/ftrace.h> 51#include <linux/ftrace.h>
52#include <linux/security.h>
52#include <linux/slow-work.h> 53#include <linux/slow-work.h>
53#include <linux/perf_counter.h> 54#include <linux/perf_counter.h>
54 55
@@ -335,7 +336,10 @@ static struct ctl_table kern_table[] = {
335 .data = &sysctl_timer_migration, 336 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int), 337 .maxlen = sizeof(unsigned int),
337 .mode = 0644, 338 .mode = 0644,
338 .proc_handler = &proc_dointvec, 339 .proc_handler = &proc_dointvec_minmax,
340 .strategy = &sysctl_intvec,
341 .extra1 = &zero,
342 .extra2 = &one,
339 }, 343 },
340#endif 344#endif
341 { 345 {
@@ -744,6 +748,14 @@ static struct ctl_table kern_table[] = {
744 .proc_handler = &proc_dointvec, 748 .proc_handler = &proc_dointvec,
745 }, 749 },
746 { 750 {
751 .ctl_name = CTL_UNNUMBERED,
752 .procname = "panic_on_io_nmi",
753 .data = &panic_on_io_nmi,
754 .maxlen = sizeof(int),
755 .mode = 0644,
756 .proc_handler = &proc_dointvec,
757 },
758 {
747 .ctl_name = KERN_BOOTLOADER_TYPE, 759 .ctl_name = KERN_BOOTLOADER_TYPE,
748 .procname = "bootloader_type", 760 .procname = "bootloader_type",
749 .data = &bootloader_type, 761 .data = &bootloader_type,
@@ -1295,10 +1307,10 @@ static struct ctl_table vm_table[] = {
1295 { 1307 {
1296 .ctl_name = CTL_UNNUMBERED, 1308 .ctl_name = CTL_UNNUMBERED,
1297 .procname = "mmap_min_addr", 1309 .procname = "mmap_min_addr",
1298 .data = &mmap_min_addr, 1310 .data = &dac_mmap_min_addr,
1299 .maxlen = sizeof(unsigned long), 1311 .maxlen = sizeof(unsigned long),
1300 .mode = 0644, 1312 .mode = 0644,
1301 .proc_handler = &proc_doulongvec_minmax, 1313 .proc_handler = &mmap_min_addr_handler,
1302 }, 1314 },
1303#ifdef CONFIG_NUMA 1315#ifdef CONFIG_NUMA
1304 { 1316 {
@@ -2283,7 +2295,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2283 void *data) 2295 void *data)
2284{ 2296{
2285#define TMPBUFLEN 21 2297#define TMPBUFLEN 21
2286 int *i, vleft, first=1, neg, val; 2298 int *i, vleft, first = 1, neg;
2287 unsigned long lval; 2299 unsigned long lval;
2288 size_t left, len; 2300 size_t left, len;
2289 2301
@@ -2336,8 +2348,6 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2336 len = p-buf; 2348 len = p-buf;
2337 if ((len < left) && *p && !isspace(*p)) 2349 if ((len < left) && *p && !isspace(*p))
2338 break; 2350 break;
2339 if (neg)
2340 val = -val;
2341 s += len; 2351 s += len;
2342 left -= len; 2352 left -= len;
2343 2353
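The timer_migration hunk above shows the standard way to clamp an integer sysctl: switch the handler to proc_dointvec_minmax and point extra1/extra2 at the bounds. A minimal table entry following that pattern; the knob name and bounds are hypothetical (real entries in kern_table reuse the file-static zero/one).

#include <linux/sysctl.h>

static int my_knob;
static int my_min;		/* 0 */
static int my_max = 1;

static struct ctl_table my_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "my_knob",
		.data		= &my_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_minmax,
		.strategy	= &sysctl_intvec,
		.extra1		= &my_min,	/* lower bound */
		.extra2		= &my_max,	/* upper bound */
	},
	{ .ctl_name = 0 }	/* terminator */
};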
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ad6dd461119..620b58abdc32 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -137,11 +137,12 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
137 */ 137 */
138int clockevents_register_notifier(struct notifier_block *nb) 138int clockevents_register_notifier(struct notifier_block *nb)
139{ 139{
140 unsigned long flags;
140 int ret; 141 int ret;
141 142
142 spin_lock(&clockevents_lock); 143 spin_lock_irqsave(&clockevents_lock, flags);
143 ret = raw_notifier_chain_register(&clockevents_chain, nb); 144 ret = raw_notifier_chain_register(&clockevents_chain, nb);
144 spin_unlock(&clockevents_lock); 145 spin_unlock_irqrestore(&clockevents_lock, flags);
145 146
146 return ret; 147 return ret;
147} 148}
@@ -178,16 +179,18 @@ static void clockevents_notify_released(void)
178 */ 179 */
179void clockevents_register_device(struct clock_event_device *dev) 180void clockevents_register_device(struct clock_event_device *dev)
180{ 181{
182 unsigned long flags;
183
181 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
182 BUG_ON(!dev->cpumask); 185 BUG_ON(!dev->cpumask);
183 186
184 spin_lock(&clockevents_lock); 187 spin_lock_irqsave(&clockevents_lock, flags);
185 188
186 list_add(&dev->list, &clockevent_devices); 189 list_add(&dev->list, &clockevent_devices);
187 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
188 clockevents_notify_released(); 191 clockevents_notify_released();
189 192
190 spin_unlock(&clockevents_lock); 193 spin_unlock_irqrestore(&clockevents_lock, flags);
191} 194}
192EXPORT_SYMBOL_GPL(clockevents_register_device); 195EXPORT_SYMBOL_GPL(clockevents_register_device);
193 196
@@ -235,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old,
235void clockevents_notify(unsigned long reason, void *arg) 238void clockevents_notify(unsigned long reason, void *arg)
236{ 239{
237 struct list_head *node, *tmp; 240 struct list_head *node, *tmp;
241 unsigned long flags;
238 242
239 spin_lock(&clockevents_lock); 243 spin_lock_irqsave(&clockevents_lock, flags);
240 clockevents_do_notify(reason, arg); 244 clockevents_do_notify(reason, arg);
241 245
242 switch (reason) { 246 switch (reason) {
@@ -251,18 +255,7 @@ void clockevents_notify(unsigned long reason, void *arg)
251 default: 255 default:
252 break; 256 break;
253 } 257 }
254 spin_unlock(&clockevents_lock); 258 spin_unlock_irqrestore(&clockevents_lock, flags);
255} 259}
256EXPORT_SYMBOL_GPL(clockevents_notify); 260EXPORT_SYMBOL_GPL(clockevents_notify);
257
258ktime_t clockevents_get_next_event(int cpu)
259{
260 struct tick_device *td;
261 struct clock_event_device *dev;
262
263 td = &per_cpu(tick_cpu_device, cpu);
264 dev = td->evtdev;
265
266 return dev->next_event;
267}
268#endif 261#endif
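The clockevents hunks above all make the same change: clockevents_lock is taken from callers whose interrupt state varies, so the plain spin_lock() calls become spin_lock_irqsave() to rule out deadlock against interrupt context. The general pattern, as a generic illustration rather than clockevents code; the lock and state names are invented.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);
static int example_state;	/* hypothetical shared state */

static void example_update(int v)
{
	unsigned long flags;

	/*
	 * Save and disable local interrupts for the critical section,
	 * so the lock holder cannot be interrupted by a context that
	 * would try to take the same lock, then restore the old state.
	 */
	spin_lock_irqsave(&example_lock, flags);
	example_state = v;
	spin_unlock_irqrestore(&example_lock, flags);
}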
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 592bf584d1d2..7466cb811251 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -513,7 +513,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
513 * Check to make sure we don't switch to a non-highres capable 513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz) 514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */ 515 */
516 if (tick_oneshot_mode_active() && 516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { 517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. " 518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name); 519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 877dbedc3118..c2ec25087a35 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -205,11 +205,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
205 * Powerstate information: The system enters/leaves a state, where 205 * Powerstate information: The system enters/leaves a state, where
206 * affected devices might stop 206 * affected devices might stop
207 */ 207 */
208static void tick_do_broadcast_on_off(void *why) 208static void tick_do_broadcast_on_off(unsigned long *reason)
209{ 209{
210 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
211 struct tick_device *td; 211 struct tick_device *td;
212 unsigned long flags, *reason = why; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -276,8 +276,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for " 276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
277 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
278 else 278 else
279 smp_call_function_single(*oncpu, tick_do_broadcast_on_off, 279 tick_do_broadcast_on_off(&reason);
280 &reason, 1);
281} 280}
282 281
283/* 282/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2aff39c6f10c..e0f59a21c061 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle)
222 222
223 cpu = smp_processor_id(); 223 cpu = smp_processor_id();
224 ts = &per_cpu(tick_cpu_sched, cpu); 224 ts = &per_cpu(tick_cpu_sched, cpu);
225
226 /*
227 * Call to tick_nohz_start_idle stops the last_update_time from being
228 * updated. Thus, it must not be called in the event we are called from
229 * irq_exit() with the prior state different than idle.
230 */
231 if (!inidle && !ts->inidle)
232 goto end;
233
225 now = tick_nohz_start_idle(ts); 234 now = tick_nohz_start_idle(ts);
226 235
227 /* 236 /*
@@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
240 goto end; 249 goto end;
241 250
242 if (!inidle && !ts->inidle)
243 goto end;
244
245 ts->inidle = 1; 251 ts->inidle = 1;
246 252
247 if (need_resched()) 253 if (need_resched())
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a999b92a1277..fddd69d16e03 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -286,7 +286,7 @@ static int __init init_timer_list_procfs(void)
286{ 286{
287 struct proc_dir_entry *pe; 287 struct proc_dir_entry *pe;
288 288
289 pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); 289 pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
290 if (!pe) 290 if (!pe)
291 return -ENOMEM; 291 return -ENOMEM;
292 return 0; 292 return 0;
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
96/* 96/*
97 * Collection status, active/inactive: 97 * Collection status, active/inactive:
98 */ 98 */
99static int __read_mostly active; 99int __read_mostly timer_stats_active;
100 100
101/* 101/*
102 * Beginning/end timestamps of measurement: 102 * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
257 if (!active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
260 entry = tstat_lookup(&input, comm); 260 entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
290 /* 290 /*
291 * If still active then calculate up to now: 291 * If still active then calculate up to now:
292 */ 292 */
293 if (active) 293 if (timer_stats_active)
294 time_stop = ktime_get(); 294 time_stop = ktime_get();
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
368 mutex_lock(&show_mutex); 368 mutex_lock(&show_mutex);
369 switch (ctl[0]) { 369 switch (ctl[0]) {
370 case '0': 370 case '0':
371 if (active) { 371 if (timer_stats_active) {
372 active = 0; 372 timer_stats_active = 0;
373 time_stop = ktime_get(); 373 time_stop = ktime_get();
374 sync_access(); 374 sync_access();
375 } 375 }
376 break; 376 break;
377 case '1': 377 case '1':
378 if (!active) { 378 if (!timer_stats_active) {
379 reset_entries(); 379 reset_entries();
380 time_start = ktime_get(); 380 time_start = ktime_get();
381 smp_mb(); 381 smp_mb();
382 active = 1; 382 timer_stats_active = 1;
383 } 383 }
384 break; 384 break;
385 default: 385 default:
diff --git a/kernel/timer.c b/kernel/timer.c
index 54d3912f8cad..a7f07d5a6241 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
380{ 380{
381 unsigned int flag = 0; 381 unsigned int flag = 0;
382 382
383 if (likely(!timer->start_site))
384 return;
383 if (unlikely(tbase_get_deferrable(timer->base))) 385 if (unlikely(tbase_get_deferrable(timer->base)))
384 flag |= TIMER_STATS_FLAG_DEFERRABLE; 386 flag |= TIMER_STATS_FLAG_DEFERRABLE;
385 387
@@ -712,7 +714,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
712 * networking code - if the timer is re-modified 714 * networking code - if the timer is re-modified
713 * to be the same thing then just return: 715 * to be the same thing then just return:
714 */ 716 */
715 if (timer->expires == expires && timer_pending(timer)) 717 if (timer_pending(timer) && timer->expires == expires)
716 return 1; 718 return 1;
717 719
718 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 720 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ae048a2dbbe8..5efeb4229ea0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER
18config HAVE_FUNCTION_GRAPH_TRACER 18config HAVE_FUNCTION_GRAPH_TRACER
19 bool 19 bool
20 20
21config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool
23 help
24 An arch may pass in a unique value (frame pointer) to both the
25 entering and exiting of a function. On exit, the value is compared
26 and if it does not match, then it will panic the kernel.
27
21config HAVE_FUNCTION_TRACE_MCOUNT_TEST 28config HAVE_FUNCTION_TRACE_MCOUNT_TEST
22 bool 29 bool
23 help 30 help
@@ -34,7 +41,7 @@ config HAVE_FTRACE_MCOUNT_RECORD
34config HAVE_HW_BRANCH_TRACER 41config HAVE_HW_BRANCH_TRACER
35 bool 42 bool
36 43
37config HAVE_FTRACE_SYSCALLS 44config HAVE_SYSCALL_TRACEPOINTS
38 bool 45 bool
39 46
40config TRACER_MAX_TRACE 47config TRACER_MAX_TRACE
@@ -53,9 +60,14 @@ config EVENT_TRACING
53 bool 60 bool
54 61
55config CONTEXT_SWITCH_TRACER 62config CONTEXT_SWITCH_TRACER
56 select MARKERS
57 bool 63 bool
58 64
65config RING_BUFFER_ALLOW_SWAP
66 bool
67 help
68 Allow the use of ring_buffer_swap_cpu.
69 Adds a very slight overhead to tracing when enabled.
70
59# All tracer options should select GENERIC_TRACER. For those options that are 71# All tracer options should select GENERIC_TRACER. For those options that are
60# enabled by all tracers (context switch and event tracer) they select TRACING. 72# enabled by all tracers (context switch and event tracer) they select TRACING.
61# This allows those options to appear when no other tracer is selected. But the 73# This allows those options to appear when no other tracer is selected. But the
@@ -121,6 +133,7 @@ config FUNCTION_GRAPH_TRACER
121 bool "Kernel Function Graph Tracer" 133 bool "Kernel Function Graph Tracer"
122 depends on HAVE_FUNCTION_GRAPH_TRACER 134 depends on HAVE_FUNCTION_GRAPH_TRACER
123 depends on FUNCTION_TRACER 135 depends on FUNCTION_TRACER
136 depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
124 default y 137 default y
125 help 138 help
126 Enable the kernel to trace a function at both its return 139 Enable the kernel to trace a function at both its return
@@ -139,6 +152,7 @@ config IRQSOFF_TRACER
139 select TRACE_IRQFLAGS 152 select TRACE_IRQFLAGS
140 select GENERIC_TRACER 153 select GENERIC_TRACER
141 select TRACER_MAX_TRACE 154 select TRACER_MAX_TRACE
155 select RING_BUFFER_ALLOW_SWAP
142 help 156 help
143 This option measures the time spent in irqs-off critical 157 This option measures the time spent in irqs-off critical
144 sections, with microsecond accuracy. 158 sections, with microsecond accuracy.
@@ -160,6 +174,7 @@ config PREEMPT_TRACER
160 depends on PREEMPT 174 depends on PREEMPT
161 select GENERIC_TRACER 175 select GENERIC_TRACER
162 select TRACER_MAX_TRACE 176 select TRACER_MAX_TRACE
177 select RING_BUFFER_ALLOW_SWAP
163 help 178 help
164 This option measures the time spent in preemption off critical 179 This option measures the time spent in preemption off critical
165 sections, with microsecond accuracy. 180 sections, with microsecond accuracy.
@@ -203,7 +218,7 @@ config ENABLE_DEFAULT_TRACERS
203 218
204config FTRACE_SYSCALLS 219config FTRACE_SYSCALLS
205 bool "Trace syscalls" 220 bool "Trace syscalls"
206 depends on HAVE_FTRACE_SYSCALLS 221 depends on HAVE_SYSCALL_TRACEPOINTS
207 select GENERIC_TRACER 222 select GENERIC_TRACER
208 select KALLSYMS 223 select KALLSYMS
209 help 224 help
@@ -218,13 +233,13 @@ config BOOT_TRACER
218 the timings of the initcalls and traces key events and the identity 233 the timings of the initcalls and traces key events and the identity
219 of tasks that can cause boot delays, such as context-switches. 234 of tasks that can cause boot delays, such as context-switches.
220 235
221 Its aim is to be parsed by the /scripts/bootgraph.pl tool to 236 Its aim is to be parsed by the scripts/bootgraph.pl tool to
222 produce pretty graphics about boot inefficiencies, giving a visual 237 produce pretty graphics about boot inefficiencies, giving a visual
223 representation of the delays during initcalls - but the raw 238 representation of the delays during initcalls - but the raw
224 /debug/tracing/trace text output is readable too. 239 /debug/tracing/trace text output is readable too.
225 240
226 You must pass in ftrace=initcall to the kernel command line 241 You must pass in initcall_debug and ftrace=initcall to the kernel
227 to enable this on bootup. 242 command line to enable this on bootup.
228 243
229config TRACE_BRANCH_PROFILING 244config TRACE_BRANCH_PROFILING
230 bool 245 bool
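RING_BUFFER_ALLOW_SWAP, selected above by the irqsoff and preempt tracers, gates ring_buffer_swap_cpu(), which lets a latency tracer snapshot a single CPU by exchanging its per-cpu buffer with a spare one. A rough sketch, assuming ring_buffer_swap_cpu() keeps its (buffer_a, buffer_b, cpu) shape from <linux/ring_buffer.h>; the buffer names are hypothetical and the caller is assumed to run with preemption disabled.

#include <linux/kernel.h>
#include <linux/ring_buffer.h>
#include <linux/smp.h>

static void snapshot_this_cpu(struct ring_buffer *live,
			      struct ring_buffer *snapshot)
{
	int cpu = raw_smp_processor_id();
	int ret;

	/* Exchange only this CPU's per-cpu buffer between the two. */
	ret = ring_buffer_swap_cpu(snapshot, live, cpu);
	if (ret)
		pr_warning("cpu buffer swap failed: %d\n", ret);
	/* On success, "snapshot" now holds the events recorded here. */
}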
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 39af8af6fc30..3eb159c277c8 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -22,6 +22,7 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/smp_lock.h>
25#include <linux/time.h> 26#include <linux/time.h>
26#include <linux/uaccess.h> 27#include <linux/uaccess.h>
27 28
@@ -64,13 +65,15 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
64{ 65{
65 struct blk_io_trace *t; 66 struct blk_io_trace *t;
66 struct ring_buffer_event *event = NULL; 67 struct ring_buffer_event *event = NULL;
68 struct ring_buffer *buffer = NULL;
67 int pc = 0; 69 int pc = 0;
68 int cpu = smp_processor_id(); 70 int cpu = smp_processor_id();
69 bool blk_tracer = blk_tracer_enabled; 71 bool blk_tracer = blk_tracer_enabled;
70 72
71 if (blk_tracer) { 73 if (blk_tracer) {
74 buffer = blk_tr->buffer;
72 pc = preempt_count(); 75 pc = preempt_count();
73 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 76 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
74 sizeof(*t) + len, 77 sizeof(*t) + len,
75 0, pc); 78 0, pc);
76 if (!event) 79 if (!event)
@@ -95,7 +98,7 @@ record_it:
95 memcpy((void *) t + sizeof(*t), data, len); 98 memcpy((void *) t + sizeof(*t), data, len);
96 99
97 if (blk_tracer) 100 if (blk_tracer)
98 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 101 trace_buffer_unlock_commit(buffer, event, 0, pc);
99 } 102 }
100} 103}
101 104
@@ -178,6 +181,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
178{ 181{
179 struct task_struct *tsk = current; 182 struct task_struct *tsk = current;
180 struct ring_buffer_event *event = NULL; 183 struct ring_buffer_event *event = NULL;
184 struct ring_buffer *buffer = NULL;
181 struct blk_io_trace *t; 185 struct blk_io_trace *t;
182 unsigned long flags = 0; 186 unsigned long flags = 0;
183 unsigned long *sequence; 187 unsigned long *sequence;
@@ -203,8 +207,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
203 if (blk_tracer) { 207 if (blk_tracer) {
204 tracing_record_cmdline(current); 208 tracing_record_cmdline(current);
205 209
210 buffer = blk_tr->buffer;
206 pc = preempt_count(); 211 pc = preempt_count();
207 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 212 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
208 sizeof(*t) + pdu_len, 213 sizeof(*t) + pdu_len,
209 0, pc); 214 0, pc);
210 if (!event) 215 if (!event)
@@ -251,7 +256,7 @@ record_it:
251 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); 256 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
252 257
253 if (blk_tracer) { 258 if (blk_tracer) {
254 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 259 trace_buffer_unlock_commit(buffer, event, 0, pc);
255 return; 260 return;
256 } 261 }
257 } 262 }
@@ -266,8 +271,8 @@ static void blk_trace_free(struct blk_trace *bt)
266{ 271{
267 debugfs_remove(bt->msg_file); 272 debugfs_remove(bt->msg_file);
268 debugfs_remove(bt->dropped_file); 273 debugfs_remove(bt->dropped_file);
269 debugfs_remove(bt->dir);
270 relay_close(bt->rchan); 274 relay_close(bt->rchan);
275 debugfs_remove(bt->dir);
271 free_percpu(bt->sequence); 276 free_percpu(bt->sequence);
272 free_percpu(bt->msg_data); 277 free_percpu(bt->msg_data);
273 kfree(bt); 278 kfree(bt);
@@ -377,18 +382,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
377 382
378static int blk_remove_buf_file_callback(struct dentry *dentry) 383static int blk_remove_buf_file_callback(struct dentry *dentry)
379{ 384{
380 struct dentry *parent = dentry->d_parent;
381 debugfs_remove(dentry); 385 debugfs_remove(dentry);
382 386
383 /*
384 * this will fail for all but the last file, but that is ok. what we
385 * care about is the top level buts->name directory going away, when
386 * the last trace file is gone. Then we don't have to rmdir() that
387 * manually on trace stop, so it nicely solves the issue with
388 * force killing of running traces.
389 */
390
391 debugfs_remove(parent);
392 return 0; 387 return 0;
393} 388}
394 389
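The blktrace hunks above are part of the ring-buffer API switch this merge adopts: trace_buffer_lock_reserve()/trace_buffer_unlock_commit() now take the struct ring_buffer pointer (tr->buffer) rather than the trace_array. The reserve/fill/commit pattern, sketched with only the signatures visible in this diff; the event type and payload struct are hypothetical, and the helpers are internal to kernel/trace/, so this is not buildable elsewhere.

#define MY_EVENT_TYPE	99		/* hypothetical event type id */

struct my_payload {			/* hypothetical event payload */
	unsigned long val;
};

static void my_record(struct trace_array *tr, unsigned long val)
{
	struct ring_buffer *buffer = tr->buffer;	/* as in the hunks above */
	struct ring_buffer_event *event;
	struct my_payload *entry;
	int pc = preempt_count();

	event = trace_buffer_lock_reserve(buffer, MY_EVENT_TYPE,
					  sizeof(*entry), 0, pc);
	if (!event)
		return;			/* buffer full or tracing disabled */

	entry = ring_buffer_event_data(event);
	entry->val = val;

	trace_buffer_unlock_commit(buffer, event, 0, pc);
}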
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bb60732ade0c..8c804e24f96f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -291,7 +291,9 @@ function_stat_next(void *v, int idx)
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); 291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292 292
293 again: 293 again:
294 rec++; 294 if (idx != 0)
295 rec++;
296
295 if ((void *)rec >= (void *)&pg->records[pg->index]) { 297 if ((void *)rec >= (void *)&pg->records[pg->index]) {
296 pg = pg->next; 298 pg = pg->next;
297 if (!pg) 299 if (!pg)
@@ -766,7 +768,7 @@ static struct tracer_stat function_stats __initdata = {
766 .stat_show = function_stat_show 768 .stat_show = function_stat_show
767}; 769};
768 770
769static void ftrace_profile_debugfs(struct dentry *d_tracer) 771static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
770{ 772{
771 struct ftrace_profile_stat *stat; 773 struct ftrace_profile_stat *stat;
772 struct dentry *entry; 774 struct dentry *entry;
@@ -784,7 +786,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
784 * The files created are permanent, if something happens 786 * The files created are permanent, if something happens
785 * we still do not free memory. 787 * we still do not free memory.
786 */ 788 */
787 kfree(stat);
788 WARN(1, 789 WARN(1,
789 "Could not allocate stat file for cpu %d\n", 790 "Could not allocate stat file for cpu %d\n",
790 cpu); 791 cpu);
@@ -811,7 +812,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
811} 812}
812 813
813#else /* CONFIG_FUNCTION_PROFILER */ 814#else /* CONFIG_FUNCTION_PROFILER */
814static void ftrace_profile_debugfs(struct dentry *d_tracer) 815static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
815{ 816{
816} 817}
817#endif /* CONFIG_FUNCTION_PROFILER */ 818#endif /* CONFIG_FUNCTION_PROFILER */
@@ -1015,71 +1016,35 @@ static int
1015__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1016{ 1017{
1017 unsigned long ftrace_addr; 1018 unsigned long ftrace_addr;
1018 unsigned long ip, fl; 1019 unsigned long flag = 0UL;
1019 1020
1020 ftrace_addr = (unsigned long)FTRACE_ADDR; 1021 ftrace_addr = (unsigned long)FTRACE_ADDR;
1021 1022
1022 ip = rec->ip;
1023
1024 /* 1023 /*
1025 * If this record is not to be traced and 1024 * If this record is not to be traced or we want to disable it,
1026 * it is not enabled then do nothing. 1025 * then disable it.
1027 * 1026 *
1028 * If this record is not to be traced and 1027 * If we want to enable it and filtering is off, then enable it.
1029 * it is enabled then disable it.
1030 * 1028 *
1029 * If we want to enable it and filtering is on, enable it only if
1030 * it's filtered
1031 */ 1031 */
1032 if (rec->flags & FTRACE_FL_NOTRACE) { 1032 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
1033 if (rec->flags & FTRACE_FL_ENABLED) 1033 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
1034 rec->flags &= ~FTRACE_FL_ENABLED; 1034 flag = FTRACE_FL_ENABLED;
1035 else 1035 }
1036 return 0;
1037
1038 } else if (ftrace_filtered && enable) {
1039 /*
1040 * Filtering is on:
1041 */
1042
1043 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
1044
1045 /* Record is filtered and enabled, do nothing */
1046 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
1047 return 0;
1048
1049 /* Record is not filtered or enabled, do nothing */
1050 if (!fl)
1051 return 0;
1052
1053 /* Record is not filtered but enabled, disable it */
1054 if (fl == FTRACE_FL_ENABLED)
1055 rec->flags &= ~FTRACE_FL_ENABLED;
1056 else
1057 /* Otherwise record is filtered but not enabled, enable it */
1058 rec->flags |= FTRACE_FL_ENABLED;
1059 } else {
1060 /* Disable or not filtered */
1061
1062 if (enable) {
1063 /* if record is enabled, do nothing */
1064 if (rec->flags & FTRACE_FL_ENABLED)
1065 return 0;
1066
1067 rec->flags |= FTRACE_FL_ENABLED;
1068
1069 } else {
1070 1036
1071 /* if record is not enabled, do nothing */ 1037 /* If the state of this record hasn't changed, then do nothing */
1072 if (!(rec->flags & FTRACE_FL_ENABLED)) 1038 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1073 return 0; 1039 return 0;
1074 1040
1075 rec->flags &= ~FTRACE_FL_ENABLED; 1041 if (flag) {
1076 } 1042 rec->flags |= FTRACE_FL_ENABLED;
1043 return ftrace_make_call(rec, ftrace_addr);
1077 } 1044 }
1078 1045
1079 if (rec->flags & FTRACE_FL_ENABLED) 1046 rec->flags &= ~FTRACE_FL_ENABLED;
1080 return ftrace_make_call(rec, ftrace_addr); 1047 return ftrace_make_nop(NULL, rec, ftrace_addr);
1081 else
1082 return ftrace_make_nop(NULL, rec, ftrace_addr);
1083} 1048}
1084 1049
1085static void ftrace_replace_code(int enable) 1050static void ftrace_replace_code(int enable)
@@ -1224,6 +1189,13 @@ static void ftrace_shutdown(int command)
1224 return; 1189 return;
1225 1190
1226 ftrace_start_up--; 1191 ftrace_start_up--;
1192 /*
1193 * Just warn in case of an imbalance; no need to kill ftrace, it's not
1194 * critical, but the ftrace_call callers may never be nopped again after
1195 * further ftrace uses.
1196 */
1197 WARN_ON_ONCE(ftrace_start_up < 0);
1198
1227 if (!ftrace_start_up) 1199 if (!ftrace_start_up)
1228 command |= FTRACE_DISABLE_CALLS; 1200 command |= FTRACE_DISABLE_CALLS;
1229 1201
@@ -1367,7 +1339,6 @@ struct ftrace_iterator {
1367 unsigned flags; 1339 unsigned flags;
1368 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1340 unsigned char buffer[FTRACE_BUFF_MAX+1];
1369 unsigned buffer_idx; 1341 unsigned buffer_idx;
1370 unsigned filtered;
1371}; 1342};
1372 1343
1373static void * 1344static void *
@@ -1410,28 +1381,33 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1410{ 1381{
1411 struct ftrace_iterator *iter = m->private; 1382 struct ftrace_iterator *iter = m->private;
1412 void *p = NULL; 1383 void *p = NULL;
1384 loff_t l;
1385
1386 if (!(iter->flags & FTRACE_ITER_HASH))
1387 *pos = 0;
1413 1388
1414 iter->flags |= FTRACE_ITER_HASH; 1389 iter->flags |= FTRACE_ITER_HASH;
1415 1390
1416 return t_hash_next(m, p, pos); 1391 iter->hidx = 0;
1392 for (l = 0; l <= *pos; ) {
1393 p = t_hash_next(m, p, &l);
1394 if (!p)
1395 break;
1396 }
1397 return p;
1417} 1398}
1418 1399
1419static int t_hash_show(struct seq_file *m, void *v) 1400static int t_hash_show(struct seq_file *m, void *v)
1420{ 1401{
1421 struct ftrace_func_probe *rec; 1402 struct ftrace_func_probe *rec;
1422 struct hlist_node *hnd = v; 1403 struct hlist_node *hnd = v;
1423 char str[KSYM_SYMBOL_LEN];
1424 1404
1425 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1405 rec = hlist_entry(hnd, struct ftrace_func_probe, node);
1426 1406
1427 if (rec->ops->print) 1407 if (rec->ops->print)
1428 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1408 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1429 1409
1430 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1410 seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func);
1431 seq_printf(m, "%s:", str);
1432
1433 kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
1434 seq_printf(m, "%s", str);
1435 1411
1436 if (rec->data) 1412 if (rec->data)
1437 seq_printf(m, ":%p", rec->data); 1413 seq_printf(m, ":%p", rec->data);
@@ -1460,8 +1436,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1460 iter->pg = iter->pg->next; 1436 iter->pg = iter->pg->next;
1461 iter->idx = 0; 1437 iter->idx = 0;
1462 goto retry; 1438 goto retry;
1463 } else {
1464 iter->idx = -1;
1465 } 1439 }
1466 } else { 1440 } else {
1467 rec = &iter->pg->records[iter->idx++]; 1441 rec = &iter->pg->records[iter->idx++];
@@ -1490,6 +1464,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1490{ 1464{
1491 struct ftrace_iterator *iter = m->private; 1465 struct ftrace_iterator *iter = m->private;
1492 void *p = NULL; 1466 void *p = NULL;
1467 loff_t l;
1493 1468
1494 mutex_lock(&ftrace_lock); 1469 mutex_lock(&ftrace_lock);
1495 /* 1470 /*
@@ -1501,23 +1476,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1501 if (*pos > 0) 1476 if (*pos > 0)
1502 return t_hash_start(m, pos); 1477 return t_hash_start(m, pos);
1503 iter->flags |= FTRACE_ITER_PRINTALL; 1478 iter->flags |= FTRACE_ITER_PRINTALL;
1504 (*pos)++;
1505 return iter; 1479 return iter;
1506 } 1480 }
1507 1481
1508 if (iter->flags & FTRACE_ITER_HASH) 1482 if (iter->flags & FTRACE_ITER_HASH)
1509 return t_hash_start(m, pos); 1483 return t_hash_start(m, pos);
1510 1484
1511 if (*pos > 0) { 1485 iter->pg = ftrace_pages_start;
1512 if (iter->idx < 0) 1486 iter->idx = 0;
1513 return p; 1487 for (l = 0; l <= *pos; ) {
1514 (*pos)--; 1488 p = t_next(m, p, &l);
1515 iter->idx--; 1489 if (!p)
1490 break;
1516 } 1491 }
1517 1492
1518 p = t_next(m, p, pos); 1493 if (!p && iter->flags & FTRACE_ITER_FILTER)
1519
1520 if (!p)
1521 return t_hash_start(m, pos); 1494 return t_hash_start(m, pos);
1522 1495
1523 return p; 1496 return p;
@@ -1532,7 +1505,6 @@ static int t_show(struct seq_file *m, void *v)
1532{ 1505{
1533 struct ftrace_iterator *iter = m->private; 1506 struct ftrace_iterator *iter = m->private;
1534 struct dyn_ftrace *rec = v; 1507 struct dyn_ftrace *rec = v;
1535 char str[KSYM_SYMBOL_LEN];
1536 1508
1537 if (iter->flags & FTRACE_ITER_HASH) 1509 if (iter->flags & FTRACE_ITER_HASH)
1538 return t_hash_show(m, v); 1510 return t_hash_show(m, v);
@@ -1545,9 +1517,7 @@ static int t_show(struct seq_file *m, void *v)
1545 if (!rec) 1517 if (!rec)
1546 return 0; 1518 return 0;
1547 1519
1548 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1520 seq_printf(m, "%pf\n", (void *)rec->ip);
1549
1550 seq_printf(m, "%s\n", str);
1551 1521
1552 return 0; 1522 return 0;
1553} 1523}
@@ -1586,17 +1556,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1586 return ret; 1556 return ret;
1587} 1557}
1588 1558
1589int ftrace_avail_release(struct inode *inode, struct file *file)
1590{
1591 struct seq_file *m = (struct seq_file *)file->private_data;
1592 struct ftrace_iterator *iter = m->private;
1593
1594 seq_release(inode, file);
1595 kfree(iter);
1596
1597 return 0;
1598}
1599
1600static int 1559static int
1601ftrace_failures_open(struct inode *inode, struct file *file) 1560ftrace_failures_open(struct inode *inode, struct file *file)
1602{ 1561{
@@ -1647,7 +1606,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1647 1606
1648 mutex_lock(&ftrace_regex_lock); 1607 mutex_lock(&ftrace_regex_lock);
1649 if ((file->f_mode & FMODE_WRITE) && 1608 if ((file->f_mode & FMODE_WRITE) &&
1650 !(file->f_flags & O_APPEND)) 1609 (file->f_flags & O_TRUNC))
1651 ftrace_filter_reset(enable); 1610 ftrace_filter_reset(enable);
1652 1611
1653 if (file->f_mode & FMODE_READ) { 1612 if (file->f_mode & FMODE_READ) {
@@ -2263,7 +2222,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2263 read++; 2222 read++;
2264 cnt--; 2223 cnt--;
2265 2224
2266 if (!(iter->flags & ~FTRACE_ITER_CONT)) { 2225 /*
2226 * If the parser hasn't finished with the last write,
2227 * continue reading the user input without skipping spaces.
2228 */
2229 if (!(iter->flags & FTRACE_ITER_CONT)) {
2267 /* skip white space */ 2230 /* skip white space */
2268 while (cnt && isspace(ch)) { 2231 while (cnt && isspace(ch)) {
2269 ret = get_user(ch, ubuf++); 2232 ret = get_user(ch, ubuf++);
@@ -2273,8 +2236,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2273 cnt--; 2236 cnt--;
2274 } 2237 }
2275 2238
2239 /* only spaces were written */
2276 if (isspace(ch)) { 2240 if (isspace(ch)) {
2277 file->f_pos += read; 2241 *ppos += read;
2278 ret = read; 2242 ret = read;
2279 goto out; 2243 goto out;
2280 } 2244 }
@@ -2297,19 +2261,18 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2297 } 2261 }
2298 2262
2299 if (isspace(ch)) { 2263 if (isspace(ch)) {
2300 iter->filtered++;
2301 iter->buffer[iter->buffer_idx] = 0; 2264 iter->buffer[iter->buffer_idx] = 0;
2302 ret = ftrace_process_regex(iter->buffer, 2265 ret = ftrace_process_regex(iter->buffer,
2303 iter->buffer_idx, enable); 2266 iter->buffer_idx, enable);
2304 if (ret) 2267 if (ret)
2305 goto out; 2268 goto out;
2306 iter->buffer_idx = 0; 2269 iter->buffer_idx = 0;
2307 } else 2270 } else {
2308 iter->flags |= FTRACE_ITER_CONT; 2271 iter->flags |= FTRACE_ITER_CONT;
2272 iter->buffer[iter->buffer_idx++] = ch;
2273 }
2309 2274
2310 2275 *ppos += read;
2311 file->f_pos += read;
2312
2313 ret = read; 2276 ret = read;
2314 out: 2277 out:
2315 mutex_unlock(&ftrace_regex_lock); 2278 mutex_unlock(&ftrace_regex_lock);
@@ -2428,7 +2391,6 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2428 iter = file->private_data; 2391 iter = file->private_data;
2429 2392
2430 if (iter->buffer_idx) { 2393 if (iter->buffer_idx) {
2431 iter->filtered++;
2432 iter->buffer[iter->buffer_idx] = 0; 2394 iter->buffer[iter->buffer_idx] = 0;
2433 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2395 ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
2434 } 2396 }
@@ -2459,14 +2421,14 @@ static const struct file_operations ftrace_avail_fops = {
2459 .open = ftrace_avail_open, 2421 .open = ftrace_avail_open,
2460 .read = seq_read, 2422 .read = seq_read,
2461 .llseek = seq_lseek, 2423 .llseek = seq_lseek,
2462 .release = ftrace_avail_release, 2424 .release = seq_release_private,
2463}; 2425};
2464 2426
2465static const struct file_operations ftrace_failures_fops = { 2427static const struct file_operations ftrace_failures_fops = {
2466 .open = ftrace_failures_open, 2428 .open = ftrace_failures_open,
2467 .read = seq_read, 2429 .read = seq_read,
2468 .llseek = seq_lseek, 2430 .llseek = seq_lseek,
2469 .release = ftrace_avail_release, 2431 .release = seq_release_private,
2470}; 2432};
2471 2433
2472static const struct file_operations ftrace_filter_fops = { 2434static const struct file_operations ftrace_filter_fops = {
@@ -2493,32 +2455,31 @@ int ftrace_graph_count;
2493unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2455unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2494 2456
2495static void * 2457static void *
2496g_next(struct seq_file *m, void *v, loff_t *pos) 2458__g_next(struct seq_file *m, loff_t *pos)
2497{ 2459{
2498 unsigned long *array = m->private; 2460 unsigned long *array = m->private;
2499 int index = *pos;
2500
2501 (*pos)++;
2502 2461
2503 if (index >= ftrace_graph_count) 2462 if (*pos >= ftrace_graph_count)
2504 return NULL; 2463 return NULL;
2464 return &array[*pos];
2465}
2505 2466
2506 return &array[index]; 2467static void *
2468g_next(struct seq_file *m, void *v, loff_t *pos)
2469{
2470 (*pos)++;
2471 return __g_next(m, pos);
2507} 2472}
2508 2473
2509static void *g_start(struct seq_file *m, loff_t *pos) 2474static void *g_start(struct seq_file *m, loff_t *pos)
2510{ 2475{
2511 void *p = NULL;
2512
2513 mutex_lock(&graph_lock); 2476 mutex_lock(&graph_lock);
2514 2477
2515 /* Nothing, tell g_show to print all functions are enabled */ 2478 /* Nothing, tell g_show to print all functions are enabled */
2516 if (!ftrace_graph_count && !*pos) 2479 if (!ftrace_graph_count && !*pos)
2517 return (void *)1; 2480 return (void *)1;
2518 2481
2519 p = g_next(m, p, pos); 2482 return __g_next(m, pos);
2520
2521 return p;
2522} 2483}
2523 2484
2524static void g_stop(struct seq_file *m, void *p) 2485static void g_stop(struct seq_file *m, void *p)
@@ -2529,7 +2490,6 @@ static void g_stop(struct seq_file *m, void *p)
2529static int g_show(struct seq_file *m, void *v) 2490static int g_show(struct seq_file *m, void *v)
2530{ 2491{
2531 unsigned long *ptr = v; 2492 unsigned long *ptr = v;
2532 char str[KSYM_SYMBOL_LEN];
2533 2493
2534 if (!ptr) 2494 if (!ptr)
2535 return 0; 2495 return 0;
@@ -2539,9 +2499,7 @@ static int g_show(struct seq_file *m, void *v)
2539 return 0; 2499 return 0;
2540 } 2500 }
2541 2501
2542 kallsyms_lookup(*ptr, NULL, NULL, NULL, str); 2502 seq_printf(m, "%pf\n", v);
2543
2544 seq_printf(m, "%s\n", str);
2545 2503
2546 return 0; 2504 return 0;
2547} 2505}
@@ -2563,7 +2521,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2563 2521
2564 mutex_lock(&graph_lock); 2522 mutex_lock(&graph_lock);
2565 if ((file->f_mode & FMODE_WRITE) && 2523 if ((file->f_mode & FMODE_WRITE) &&
2566 !(file->f_flags & O_APPEND)) { 2524 (file->f_flags & O_TRUNC)) {
2567 ftrace_graph_count = 0; 2525 ftrace_graph_count = 0;
2568 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2526 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2569 } 2527 }
@@ -2582,6 +2540,14 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2582} 2540}
2583 2541
2584static int 2542static int
2543ftrace_graph_release(struct inode *inode, struct file *file)
2544{
2545 if (file->f_mode & FMODE_READ)
2546 seq_release(inode, file);
2547 return 0;
2548}
2549
2550static int
2585ftrace_set_func(unsigned long *array, int *idx, char *buffer) 2551ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2586{ 2552{
2587 struct dyn_ftrace *rec; 2553 struct dyn_ftrace *rec;
@@ -2710,9 +2676,10 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2710} 2676}
2711 2677
2712static const struct file_operations ftrace_graph_fops = { 2678static const struct file_operations ftrace_graph_fops = {
2713 .open = ftrace_graph_open, 2679 .open = ftrace_graph_open,
2714 .read = seq_read, 2680 .read = seq_read,
2715 .write = ftrace_graph_write, 2681 .write = ftrace_graph_write,
2682 .release = ftrace_graph_release,
2716}; 2683};
2717#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2684#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2718 2685
@@ -3145,10 +3112,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3145 3112
3146 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3113 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
3147 3114
3148 if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) 3115 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3149 goto out; 3116 goto out;
3150 3117
3151 last_ftrace_enabled = ftrace_enabled; 3118 last_ftrace_enabled = !!ftrace_enabled;
3152 3119
3153 if (ftrace_enabled) { 3120 if (ftrace_enabled) {
3154 3121
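Several hunks above replace the kallsyms_lookup() plus KSYM_SYMBOL_LEN scratch-buffer dance with the %pf/%pF vsprintf extensions, which resolve a code address to its symbol name at format time (%pf prints the bare name, %pF normally adds the offset). Minimal illustration; my_func and show_callsite are hypothetical.

#include <linux/kernel.h>
#include <linux/seq_file.h>

static void my_func(void)
{
}

static void show_callsite(struct seq_file *m)
{
	unsigned long ip = (unsigned long)my_func;

	seq_printf(m, "%pf\n", (void *)ip);	/* prints "my_func" */
}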
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 86cdf671d7e2..81b1645c8549 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -183,11 +183,9 @@ static void kmemtrace_stop_probes(void)
183 183
184static int kmem_trace_init(struct trace_array *tr) 184static int kmem_trace_init(struct trace_array *tr)
185{ 185{
186 int cpu;
187 kmemtrace_array = tr; 186 kmemtrace_array = tr;
188 187
189 for_each_cpu_mask(cpu, cpu_possible_map) 188 tracing_reset_online_cpus(tr);
190 tracing_reset(tr, cpu);
191 189
192 kmemtrace_start_probes(); 190 kmemtrace_start_probes();
193 191
@@ -239,12 +237,52 @@ struct kmemtrace_user_event_alloc {
239}; 237};
240 238
241static enum print_line_t 239static enum print_line_t
242kmemtrace_print_alloc_user(struct trace_iterator *iter, 240kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
243 struct kmemtrace_alloc_entry *entry)
244{ 241{
245 struct kmemtrace_user_event_alloc *ev_alloc;
246 struct trace_seq *s = &iter->seq; 242 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry;
244 int ret;
245
246 trace_assign_type(entry, iter->ent);
247
248 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
249 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
250 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
251 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
252 (unsigned long)entry->gfp_flags, entry->node);
253
254 if (!ret)
255 return TRACE_TYPE_PARTIAL_LINE;
256 return TRACE_TYPE_HANDLED;
257}
258
259static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags)
261{
262 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry;
264 int ret;
265
266 trace_assign_type(entry, iter->ent);
267
268 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
269 entry->type_id, (void *)entry->call_site,
270 (unsigned long)entry->ptr);
271
272 if (!ret)
273 return TRACE_TYPE_PARTIAL_LINE;
274 return TRACE_TYPE_HANDLED;
275}
276
277static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
279{
280 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry;
247 struct kmemtrace_user_event *ev; 282 struct kmemtrace_user_event *ev;
283 struct kmemtrace_user_event_alloc *ev_alloc;
284
285 trace_assign_type(entry, iter->ent);
248 286
249 ev = trace_seq_reserve(s, sizeof(*ev)); 287 ev = trace_seq_reserve(s, sizeof(*ev));
250 if (!ev) 288 if (!ev)
@@ -271,12 +309,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter,
271} 309}
272 310
273static enum print_line_t 311static enum print_line_t
274kmemtrace_print_free_user(struct trace_iterator *iter, 312kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
275 struct kmemtrace_free_entry *entry)
276{ 313{
277 struct trace_seq *s = &iter->seq; 314 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry;
278 struct kmemtrace_user_event *ev; 316 struct kmemtrace_user_event *ev;
279 317
318 trace_assign_type(entry, iter->ent);
319
280 ev = trace_seq_reserve(s, sizeof(*ev)); 320 ev = trace_seq_reserve(s, sizeof(*ev));
281 if (!ev) 321 if (!ev)
282 return TRACE_TYPE_PARTIAL_LINE; 322 return TRACE_TYPE_PARTIAL_LINE;
@@ -294,12 +334,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
294 334
295/* The following two provide a more minimalistic output */ 335 /* The following two provide a more minimalistic output */
296static enum print_line_t 336static enum print_line_t
297kmemtrace_print_alloc_compress(struct trace_iterator *iter, 337kmemtrace_print_alloc_compress(struct trace_iterator *iter)
298 struct kmemtrace_alloc_entry *entry)
299{ 338{
339 struct kmemtrace_alloc_entry *entry;
300 struct trace_seq *s = &iter->seq; 340 struct trace_seq *s = &iter->seq;
301 int ret; 341 int ret;
302 342
343 trace_assign_type(entry, iter->ent);
344
303 /* Alloc entry */ 345 /* Alloc entry */
304 ret = trace_seq_printf(s, " + "); 346 ret = trace_seq_printf(s, " + ");
305 if (!ret) 347 if (!ret)
@@ -345,29 +387,24 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
345 if (!ret) 387 if (!ret)
346 return TRACE_TYPE_PARTIAL_LINE; 388 return TRACE_TYPE_PARTIAL_LINE;
347 389
348 /* Node */ 390 /* Node and call site */
349 ret = trace_seq_printf(s, "%4d ", entry->node); 391 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
350 if (!ret) 392 (void *)entry->call_site);
351 return TRACE_TYPE_PARTIAL_LINE;
352
353 /* Call site */
354 ret = seq_print_ip_sym(s, entry->call_site, 0);
355 if (!ret) 393 if (!ret)
356 return TRACE_TYPE_PARTIAL_LINE; 394 return TRACE_TYPE_PARTIAL_LINE;
357 395
358 if (!trace_seq_printf(s, "\n"))
359 return TRACE_TYPE_PARTIAL_LINE;
360
361 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
362} 397}
363 398
364static enum print_line_t 399static enum print_line_t
365kmemtrace_print_free_compress(struct trace_iterator *iter, 400kmemtrace_print_free_compress(struct trace_iterator *iter)
366 struct kmemtrace_free_entry *entry)
367{ 401{
402 struct kmemtrace_free_entry *entry;
368 struct trace_seq *s = &iter->seq; 403 struct trace_seq *s = &iter->seq;
369 int ret; 404 int ret;
370 405
406 trace_assign_type(entry, iter->ent);
407
371 /* Free entry */ 408 /* Free entry */
372 ret = trace_seq_printf(s, " - "); 409 ret = trace_seq_printf(s, " - ");
373 if (!ret) 410 if (!ret)
@@ -401,19 +438,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter,
401 if (!ret) 438 if (!ret)
402 return TRACE_TYPE_PARTIAL_LINE; 439 return TRACE_TYPE_PARTIAL_LINE;
403 440
404 /* Skip node */ 441 /* Skip node and print call site */
405 ret = trace_seq_printf(s, " "); 442 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
406 if (!ret) 443 if (!ret)
407 return TRACE_TYPE_PARTIAL_LINE; 444 return TRACE_TYPE_PARTIAL_LINE;
408 445
409 /* Call site */
410 ret = seq_print_ip_sym(s, entry->call_site, 0);
411 if (!ret)
412 return TRACE_TYPE_PARTIAL_LINE;
413
414 if (!trace_seq_printf(s, "\n"))
415 return TRACE_TYPE_PARTIAL_LINE;
416
417 return TRACE_TYPE_HANDLED; 446 return TRACE_TYPE_HANDLED;
418} 447}
419 448
@@ -421,32 +450,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
421{ 450{
422 struct trace_entry *entry = iter->ent; 451 struct trace_entry *entry = iter->ent;
423 452
424 switch (entry->type) { 453 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
425 case TRACE_KMEM_ALLOC: { 454 return TRACE_TYPE_UNHANDLED;
426 struct kmemtrace_alloc_entry *field;
427
428 trace_assign_type(field, entry);
429 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
430 return kmemtrace_print_alloc_compress(iter, field);
431 else
432 return kmemtrace_print_alloc_user(iter, field);
433 }
434
435 case TRACE_KMEM_FREE: {
436 struct kmemtrace_free_entry *field;
437
438 trace_assign_type(field, entry);
439 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
440 return kmemtrace_print_free_compress(iter, field);
441 else
442 return kmemtrace_print_free_user(iter, field);
443 }
444 455
456 switch (entry->type) {
457 case TRACE_KMEM_ALLOC:
458 return kmemtrace_print_alloc_compress(iter);
459 case TRACE_KMEM_FREE:
460 return kmemtrace_print_free_compress(iter);
445 default: 461 default:
446 return TRACE_TYPE_UNHANDLED; 462 return TRACE_TYPE_UNHANDLED;
447 } 463 }
448} 464}
449 465
466static struct trace_event kmem_trace_alloc = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user,
470};
471
472static struct trace_event kmem_trace_free = {
473 .type = TRACE_KMEM_FREE,
474 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user,
476};
477
450static struct tracer kmem_tracer __read_mostly = { 478static struct tracer kmem_tracer __read_mostly = {
451 .name = "kmemtrace", 479 .name = "kmemtrace",
452 .init = kmem_trace_init, 480 .init = kmem_trace_init,
@@ -463,6 +491,21 @@ void kmemtrace_init(void)
463 491
464static int __init init_kmem_tracer(void) 492static int __init init_kmem_tracer(void)
465{ 493{
466 return register_tracer(&kmem_tracer); 494 if (!register_ftrace_event(&kmem_trace_alloc)) {
495 pr_warning("Warning: could not register kmem events\n");
496 return 1;
497 }
498
499 if (!register_ftrace_event(&kmem_trace_free)) {
500 pr_warning("Warning: could not register kmem events\n");
501 return 1;
502 }
503
504 if (register_tracer(&kmem_tracer) != 0) {
505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1;
507 }
508
509 return 0;
467} 510}
468device_initcall(init_kmem_tracer); 511device_initcall(init_kmem_tracer);
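The kmemtrace hunk above moves its text output behind struct trace_event handlers registered with register_ftrace_event(): .trace for the default human-readable format and .binary for the raw user-event layout. The shape of such a registration, with a hypothetical event type and output callback; register_ftrace_event() returns 0 on failure, hence the negated checks above.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/ftrace_event.h>

#define MY_TRACE_TYPE	99	/* hypothetical; must match the recorder's entry->type */

static enum print_line_t my_event_output(struct trace_iterator *iter, int flags)
{
	if (!trace_seq_printf(&iter->seq, "my event\n"))
		return TRACE_TYPE_PARTIAL_LINE;
	return TRACE_TYPE_HANDLED;
}

static struct trace_event my_trace_event = {
	.type	= MY_TRACE_TYPE,
	.trace	= my_event_output,	/* default human-readable output */
};

static int __init my_event_init(void)
{
	if (!register_ftrace_event(&my_trace_event))
		pr_warning("could not register my_trace_event\n");
	return 0;
}
device_initcall(my_event_init);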
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dc4dc70171ce..454e74e718cf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -206,6 +206,7 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 207#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
209 210
210/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 211/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
211#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 212#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -217,17 +218,12 @@ enum {
217 218
218static inline int rb_null_event(struct ring_buffer_event *event) 219static inline int rb_null_event(struct ring_buffer_event *event)
219{ 220{
220 return event->type_len == RINGBUF_TYPE_PADDING 221 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
221 && event->time_delta == 0;
222}
223
224static inline int rb_discarded_event(struct ring_buffer_event *event)
225{
226 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
227} 222}
228 223
229static void rb_event_set_padding(struct ring_buffer_event *event) 224static void rb_event_set_padding(struct ring_buffer_event *event)
230{ 225{
226 /* padding has a NULL time_delta */
231 event->type_len = RINGBUF_TYPE_PADDING; 227 event->type_len = RINGBUF_TYPE_PADDING;
232 event->time_delta = 0; 228 event->time_delta = 0;
233} 229}
@@ -321,6 +317,14 @@ struct buffer_data_page {
321 unsigned char data[]; /* data of buffer page */ 317 unsigned char data[]; /* data of buffer page */
322}; 318};
323 319
320/*
321 * Note, the buffer_page list must be first. The buffer pages
322 * are allocated in cache lines, which means that each buffer
323 * page will be at the beginning of a cache line, and thus
324 * the least significant bits will be zero. We use this to
325 * add flags in the list struct pointers, to make the ring buffer
326 * lockless.
327 */
324struct buffer_page { 328struct buffer_page {
325 struct list_head list; /* list of buffer pages */ 329 struct list_head list; /* list of buffer pages */
326 local_t write; /* index for next write */ 330 local_t write; /* index for next write */
@@ -329,6 +333,21 @@ struct buffer_page {
329 struct buffer_data_page *page; /* Actual data page */ 333 struct buffer_data_page *page; /* Actual data page */
330}; 334};
331 335
336/*
337 * The buffer page counters, write and entries, must be reset
338 * atomically when crossing page boundaries. To synchronize this
339 * update, two counters are packed into the same word. One is
340 * the actual counter for the write position or count on the page.
341 *
342 * The other is a counter of updaters. Before an update happens
343 * the update partition of the counter is incremented. This will
344 * allow the updater to update the counter atomically.
345 *
346 * The counter is 20 bits, and the state data is 12.
347 */
348#define RB_WRITE_MASK 0xfffff
349#define RB_WRITE_INTCNT (1 << 20)
350
332static void rb_init_page(struct buffer_data_page *bpage) 351static void rb_init_page(struct buffer_data_page *bpage)
333{ 352{
334 local_set(&bpage->commit, 0); 353 local_set(&bpage->commit, 0);
@@ -402,19 +421,20 @@ int ring_buffer_print_page_header(struct trace_seq *s)
402struct ring_buffer_per_cpu { 421struct ring_buffer_per_cpu {
403 int cpu; 422 int cpu;
404 struct ring_buffer *buffer; 423 struct ring_buffer *buffer;
405 spinlock_t reader_lock; /* serialize readers */ 424 spinlock_t reader_lock; /* serialize readers */
406 raw_spinlock_t lock; 425 raw_spinlock_t lock;
407 struct lock_class_key lock_key; 426 struct lock_class_key lock_key;
408 struct list_head pages; 427 struct list_head *pages;
409 struct buffer_page *head_page; /* read from head */ 428 struct buffer_page *head_page; /* read from head */
410 struct buffer_page *tail_page; /* write to tail */ 429 struct buffer_page *tail_page; /* write to tail */
411 struct buffer_page *commit_page; /* committed pages */ 430 struct buffer_page *commit_page; /* committed pages */
412 struct buffer_page *reader_page; 431 struct buffer_page *reader_page;
413 unsigned long nmi_dropped; 432 local_t commit_overrun;
414 unsigned long commit_overrun; 433 local_t overrun;
415 unsigned long overrun;
416 unsigned long read;
417 local_t entries; 434 local_t entries;
435 local_t committing;
436 local_t commits;
437 unsigned long read;
418 u64 write_stamp; 438 u64 write_stamp;
419 u64 read_stamp; 439 u64 read_stamp;
420 atomic_t record_disabled; 440 atomic_t record_disabled;
@@ -447,14 +467,19 @@ struct ring_buffer_iter {
447}; 467};
448 468
449/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 469/* buffer may be either ring_buffer or ring_buffer_per_cpu */
450#define RB_WARN_ON(buffer, cond) \ 470#define RB_WARN_ON(b, cond) \
451 ({ \ 471 ({ \
452 int _____ret = unlikely(cond); \ 472 int _____ret = unlikely(cond); \
453 if (_____ret) { \ 473 if (_____ret) { \
454 atomic_inc(&buffer->record_disabled); \ 474 if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
455 WARN_ON(1); \ 475 struct ring_buffer_per_cpu *__b = \
456 } \ 476 (void *)b; \
457 _____ret; \ 477 atomic_inc(&__b->buffer->record_disabled); \
478 } else \
479 atomic_inc(&b->record_disabled); \
480 WARN_ON(1); \
481 } \
482 _____ret; \
458 }) 483 })
459 484
460/* Up this if you want to test the TIME_EXTENTS and normalization */ 485/* Up this if you want to test the TIME_EXTENTS and normalization */
@@ -486,6 +511,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
486} 511}
487EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); 512EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
488 513
514/*
515 * Making the ring buffer lockless makes things tricky.
516 * Writes only happen on the CPU that they are on, so they only
517 * need to worry about interrupts. Reads, however, can
518 * happen on any CPU.
519 *
520 * The reader page is always off the ring buffer, but when the
521 * reader finishes with a page, it needs to swap its page with
522 * a new one from the buffer. The reader needs to take from
523 * the head (writes go to the tail). But if a writer is in overwrite
524 * mode and wraps, it must push the head page forward.
525 *
526 * Here lies the problem.
527 *
528 * The reader must be careful to replace only the head page, and
529 * not another one. As described at the top of the file in the
530 * ASCII art, the reader sets its old page to point to the next
531 * page after head. It then sets the page after head to point to
532 * the old reader page. But if the writer moves the head page
533 * during this operation, the reader could end up with the tail.
534 *
535 * We use cmpxchg to help prevent this race. We also do something
536 * special with the page before head. We set the LSB to 1.
537 *
538 * When the writer must push the page forward, it will clear the
539 * bit that points to the head page, move the head, and then set
540 * the bit that points to the new head page.
541 *
542 * We also don't want an interrupt coming in and moving the head page
543 * out from under another writer. Thus we use the second LSB to catch
544 * that too. Thus:
545 *
546 *  head->list->prev->next        bit 1    bit 0
547 *                               -------  -------
548 *  Normal page                      0        0
549 *  Points to head page              0        1
550 *  New head page                    1        0
551 *
552 * Note we can not trust the prev pointer of the head page, because:
553 *
554 *   +----+       +-----+        +-----+
555 *   |    |------>|  T  |---X--->|  N  |
556 *   |    |<------|     |        |     |
557 *   +----+       +-----+        +-----+
558 *     ^                           ^ |
559 *     |             +-----+       | |
560 *     +-------------|  R  |-------+ |
561 *                   |     |<--------+
562 *                   +-----+
563 *
564 * Key:  ---X-->  HEAD flag set in pointer
565 *         T      Tail page
566 *         R      Reader page
567 *         N      Next page
568 *
569 * (see __rb_reserve_next() to see where this happens)
570 *
571 * What the above shows is that the reader just swapped out
572 * the reader page with a page in the buffer, but before it
573 * could make the new header point back to the new page added
574 * it was preempted by a writer. The writer moved forward onto
575 * the new page added by the reader and is about to move forward
576 * again.
577 *
578 * You can see, it is legitimate for the previous pointer of
579 * the head (or any page) not to point back to itself. But only
 580 * temporarily.
581 */
582
583#define RB_PAGE_NORMAL 0UL
584#define RB_PAGE_HEAD 1UL
585#define RB_PAGE_UPDATE 2UL
586
587
588#define RB_FLAG_MASK 3UL
589
590/* PAGE_MOVED is not part of the mask */
591#define RB_PAGE_MOVED 4UL
592
593/*
 594 * rb_list_head - remove any flag bits from the pointer
595 */
596static struct list_head *rb_list_head(struct list_head *list)
597{
598 unsigned long val = (unsigned long)list;
599
600 return (struct list_head *)(val & ~RB_FLAG_MASK);
601}
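
rb_list_head() above works because buffer pages are at least 4-byte aligned, so the two low bits of a ->next pointer are free to carry the NORMAL/HEAD/UPDATE state described in the big comment. A small stand-alone sketch of the same pointer-tagging idea, with illustrative names rather than the kernel's types:

/* Sketch: tagging the low bits of an aligned pointer, as the ring buffer
 * does with RB_PAGE_HEAD/RB_PAGE_UPDATE. Plain C, no kernel headers. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_NORMAL 0UL
#define PAGE_HEAD   1UL
#define FLAG_MASK   3UL

struct node { struct node *next; int id; };

static struct node *clean(struct node *p)      /* ~ rb_list_head()        */
{
        return (struct node *)((uintptr_t)p & ~FLAG_MASK);
}

static unsigned long flags(struct node *p)     /* state carried in bits   */
{
        return (uintptr_t)p & FLAG_MASK;
}

static struct node *tag(struct node *p, unsigned long f)
{
        return (struct node *)(((uintptr_t)p & ~FLAG_MASK) | f);
}

int main(void)
{
        static struct node a = { .id = 1 }, b = { .id = 2 };

        /* static/heap storage is suitably aligned, so bits 0-1 are free */
        assert(((uintptr_t)&b & FLAG_MASK) == 0);

        a.next = tag(&b, PAGE_HEAD);           /* "the page after a is head" */
        printf("flags=%lu id=%d\n", flags(a.next), clean(a.next)->id);

        a.next = tag(clean(a.next), PAGE_NORMAL);   /* writer moved the head */
        printf("flags=%lu id=%d\n", flags(a.next), clean(a.next)->id);
        return 0;
}
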
602
603/*
 604 * rb_is_head_page - test if the given page is the head page
605 *
606 * Because the reader may move the head_page pointer, we can
607 * not trust what the head page is (it may be pointing to
608 * the reader page). But if the next page is a header page,
609 * its flags will be non zero.
610 */
 611static inline int
612rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
613 struct buffer_page *page, struct list_head *list)
614{
615 unsigned long val;
616
617 val = (unsigned long)list->next;
618
619 if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
620 return RB_PAGE_MOVED;
621
622 return val & RB_FLAG_MASK;
623}
624
625/*
626 * rb_is_reader_page
627 *
 628 * The unique thing about the reader page is that, if the
629 * writer is ever on it, the previous pointer never points
630 * back to the reader page.
631 */
632static int rb_is_reader_page(struct buffer_page *page)
633{
634 struct list_head *list = page->list.prev;
635
636 return rb_list_head(list->next) != &page->list;
637}
638
639/*
640 * rb_set_list_to_head - set a list_head to be pointing to head.
641 */
642static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
643 struct list_head *list)
644{
645 unsigned long *ptr;
646
647 ptr = (unsigned long *)&list->next;
648 *ptr |= RB_PAGE_HEAD;
649 *ptr &= ~RB_PAGE_UPDATE;
650}
651
652/*
653 * rb_head_page_activate - sets up head page
654 */
655static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
656{
657 struct buffer_page *head;
658
659 head = cpu_buffer->head_page;
660 if (!head)
661 return;
662
663 /*
664 * Set the previous list pointer to have the HEAD flag.
665 */
666 rb_set_list_to_head(cpu_buffer, head->list.prev);
667}
668
669static void rb_list_head_clear(struct list_head *list)
670{
671 unsigned long *ptr = (unsigned long *)&list->next;
672
673 *ptr &= ~RB_FLAG_MASK;
674}
675
676/*
 677 * rb_head_page_deactivate - clears head page ptr (for free list)
678 */
679static void
680rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
681{
682 struct list_head *hd;
683
684 /* Go through the whole list and clear any pointers found. */
685 rb_list_head_clear(cpu_buffer->pages);
686
687 list_for_each(hd, cpu_buffer->pages)
688 rb_list_head_clear(hd);
689}
690
691static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
692 struct buffer_page *head,
693 struct buffer_page *prev,
694 int old_flag, int new_flag)
695{
696 struct list_head *list;
697 unsigned long val = (unsigned long)&head->list;
698 unsigned long ret;
699
700 list = &prev->list;
701
702 val &= ~RB_FLAG_MASK;
703
704 ret = (unsigned long)cmpxchg(&list->next,
705 val | old_flag, val | new_flag);
706
707 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val)
709 return RB_PAGE_MOVED;
710
711 return ret & RB_FLAG_MASK;
712}
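
rb_head_page_set() only flips the flag when the pointer, flags stripped, still refers to the page it expects; a reader that swapped that page in the meantime makes the cmpxchg miss and the caller gets RB_PAGE_MOVED. A user-space sketch of that tagged-pointer compare-and-swap, with GCC's __sync_val_compare_and_swap standing in for the kernel's cmpxchg() (names are illustrative):

/* Sketch: atomically moving a tagged pointer from old_flag to new_flag,
 * mirroring rb_head_page_set(). */
#include <stdint.h>
#include <stdio.h>

#define FLAG_MASK 3UL
#define F_HEAD    1UL
#define F_UPDATE  2UL
#define F_MOVED   4UL   /* never stored in the pointer, only returned */

struct node { struct node *next; };

static unsigned long
set_flag(struct node **slot, struct node *expect_page,
         unsigned long old_flag, unsigned long new_flag)
{
        uintptr_t val = (uintptr_t)expect_page & ~FLAG_MASK;
        uintptr_t ret;

        ret = (uintptr_t)__sync_val_compare_and_swap(
                slot,
                (struct node *)(val | old_flag),
                (struct node *)(val | new_flag));

        if ((ret & ~FLAG_MASK) != val)
                return F_MOVED;        /* someone swapped the page itself */
        return ret & FLAG_MASK;        /* flag that was there before us   */
}

int main(void)
{
        static struct node head, prev;

        prev.next = (struct node *)((uintptr_t)&head | F_HEAD);

        /* We expected HEAD and found it: transition to UPDATE succeeds. */
        printf("first  try: saw flag %lu\n",
               set_flag(&prev.next, &head, F_HEAD, F_UPDATE));

        /* A second caller also expects HEAD, but it is now UPDATE: no change. */
        printf("second try: saw flag %lu\n",
               set_flag(&prev.next, &head, F_HEAD, F_UPDATE));
        return 0;
}
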
713
714static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
715 struct buffer_page *head,
716 struct buffer_page *prev,
717 int old_flag)
718{
719 return rb_head_page_set(cpu_buffer, head, prev,
720 old_flag, RB_PAGE_UPDATE);
721}
722
723static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
724 struct buffer_page *head,
725 struct buffer_page *prev,
726 int old_flag)
727{
728 return rb_head_page_set(cpu_buffer, head, prev,
729 old_flag, RB_PAGE_HEAD);
730}
731
732static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
733 struct buffer_page *head,
734 struct buffer_page *prev,
735 int old_flag)
736{
737 return rb_head_page_set(cpu_buffer, head, prev,
738 old_flag, RB_PAGE_NORMAL);
739}
740
741static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
742 struct buffer_page **bpage)
743{
744 struct list_head *p = rb_list_head((*bpage)->list.next);
745
746 *bpage = list_entry(p, struct buffer_page, list);
747}
748
749static struct buffer_page *
750rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
751{
752 struct buffer_page *head;
753 struct buffer_page *page;
754 struct list_head *list;
755 int i;
756
757 if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
758 return NULL;
759
760 /* sanity check */
761 list = cpu_buffer->pages;
762 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
763 return NULL;
764
765 page = head = cpu_buffer->head_page;
766 /*
767 * It is possible that the writer moves the header behind
768 * where we started, and we miss in one loop.
769 * A second loop should grab the header, but we'll do
770 * three loops just because I'm paranoid.
771 */
772 for (i = 0; i < 3; i++) {
773 do {
774 if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
775 cpu_buffer->head_page = page;
776 return page;
777 }
778 rb_inc_page(cpu_buffer, &page);
779 } while (page != head);
780 }
781
782 RB_WARN_ON(cpu_buffer, 1);
783
784 return NULL;
785}
786
787static int rb_head_page_replace(struct buffer_page *old,
788 struct buffer_page *new)
789{
790 unsigned long *ptr = (unsigned long *)&old->list.prev->next;
791 unsigned long val;
792 unsigned long ret;
793
794 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD;
796
797 ret = cmpxchg(ptr, val, &new->list);
798
799 return ret == val;
800}
801
802/*
803 * rb_tail_page_update - move the tail page forward
804 *
805 * Returns 1 if moved tail page, 0 if someone else did.
806 */
807static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
808 struct buffer_page *tail_page,
809 struct buffer_page *next_page)
810{
811 struct buffer_page *old_tail;
812 unsigned long old_entries;
813 unsigned long old_write;
814 int ret = 0;
815
816 /*
817 * The tail page now needs to be moved forward.
818 *
819 * We need to reset the tail page, but without messing
820 * with possible erasing of data brought in by interrupts
821 * that have moved the tail page and are currently on it.
822 *
823 * We add a counter to the write field to denote this.
824 */
825 old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
826 old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
827
828 /*
829 * Just make sure we have seen our old_write and synchronize
830 * with any interrupts that come in.
831 */
832 barrier();
833
834 /*
835 * If the tail page is still the same as what we think
836 * it is, then it is up to us to update the tail
837 * pointer.
838 */
839 if (tail_page == cpu_buffer->tail_page) {
840 /* Zero the write counter */
841 unsigned long val = old_write & ~RB_WRITE_MASK;
842 unsigned long eval = old_entries & ~RB_WRITE_MASK;
843
844 /*
845 * This will only succeed if an interrupt did
846 * not come in and change it. In which case, we
847 * do not want to modify it.
848 *
849 * We add (void) to let the compiler know that we do not care
850 * about the return value of these functions. We use the
851 * cmpxchg to only update if an interrupt did not already
852 * do it for us. If the cmpxchg fails, we don't care.
853 */
854 (void)local_cmpxchg(&next_page->write, old_write, val);
855 (void)local_cmpxchg(&next_page->entries, old_entries, eval);
856
857 /*
 858	 * No need to worry about races with clearing out the commit:
 859	 * it can only increment when a commit takes place. But that
 860	 * only happens in the outermost nested commit.
861 */
862 local_set(&next_page->page->commit, 0);
863
864 old_tail = cmpxchg(&cpu_buffer->tail_page,
865 tail_page, next_page);
866
867 if (old_tail == tail_page)
868 ret = 1;
869 }
870
871 return ret;
872}
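
rb_tail_page_update() leans on the split write field: the bits above RB_WRITE_MASK count in-flight updaters (bumped by RB_WRITE_INTCNT), so the later local_cmpxchg() refuses to zero a field that an interrupt has already advanced. A sketch of that high-bits counter, assuming a 20-bit index and with the atomics reduced to plain operations in a single-threaded demo:

/* Sketch: keeping an "updater count" above the write index so a racing
 * update can be detected before the index is reset. */
#include <stdio.h>

#define WRITE_MASK   0xfffffUL
#define WRITE_INTCNT (1UL << 20)

static unsigned long page_write;        /* stands in for next_page->write */

/* What an interrupting writer would do: reserve some bytes. */
static void interrupt_writes(unsigned long bytes)
{
        page_write += bytes;
}

static void tail_update(int interrupted)
{
        /* Announce our update by bumping the count above the index. */
        unsigned long old = (page_write += WRITE_INTCNT);
        unsigned long want = old & ~WRITE_MASK;    /* index zeroed, count kept */

        if (interrupted)
                interrupt_writes(64);   /* an IRQ sneaks in here */

        /* Only reset the index if nobody changed the field since 'old'. */
        if (page_write == old)
                page_write = want;      /* the cmpxchg would succeed        */
        /* else: leave it alone, the interrupt's data must survive          */

        printf("index now %lu, updaters %lu\n",
               page_write & WRITE_MASK, page_write >> 20);
}

int main(void)
{
        page_write = 500;         /* pretend 500 bytes already on the page */
        tail_update(0);           /* uncontended: index resets to 0        */

        page_write = 500;
        tail_update(1);           /* interrupted: index keeps the IRQ data */
        return 0;
}
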
873
874static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
875 struct buffer_page *bpage)
876{
877 unsigned long val = (unsigned long)bpage;
878
879 if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
880 return 1;
881
882 return 0;
883}
884
885/**
 886 * rb_check_list - make sure a pointer to a list has the flag bits zero
887 */
888static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
889 struct list_head *list)
890{
891 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
892 return 1;
893 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
894 return 1;
895 return 0;
896}
897
489/** 898/**
490 * check_pages - integrity check of buffer pages 899 * check_pages - integrity check of buffer pages
491 * @cpu_buffer: CPU buffer with pages to test 900 * @cpu_buffer: CPU buffer with pages to test
@@ -495,14 +904,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
495 */ 904 */
496static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 905static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
497{ 906{
498 struct list_head *head = &cpu_buffer->pages; 907 struct list_head *head = cpu_buffer->pages;
499 struct buffer_page *bpage, *tmp; 908 struct buffer_page *bpage, *tmp;
500 909
910 rb_head_page_deactivate(cpu_buffer);
911
501 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 912 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
502 return -1; 913 return -1;
503 if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) 914 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
504 return -1; 915 return -1;
505 916
917 if (rb_check_list(cpu_buffer, head))
918 return -1;
919
506 list_for_each_entry_safe(bpage, tmp, head, list) { 920 list_for_each_entry_safe(bpage, tmp, head, list) {
507 if (RB_WARN_ON(cpu_buffer, 921 if (RB_WARN_ON(cpu_buffer,
508 bpage->list.next->prev != &bpage->list)) 922 bpage->list.next->prev != &bpage->list))
@@ -510,25 +924,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
510 if (RB_WARN_ON(cpu_buffer, 924 if (RB_WARN_ON(cpu_buffer,
511 bpage->list.prev->next != &bpage->list)) 925 bpage->list.prev->next != &bpage->list))
512 return -1; 926 return -1;
927 if (rb_check_list(cpu_buffer, &bpage->list))
928 return -1;
513 } 929 }
514 930
931 rb_head_page_activate(cpu_buffer);
932
515 return 0; 933 return 0;
516} 934}
517 935
518static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 936static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
519 unsigned nr_pages) 937 unsigned nr_pages)
520{ 938{
521 struct list_head *head = &cpu_buffer->pages;
522 struct buffer_page *bpage, *tmp; 939 struct buffer_page *bpage, *tmp;
523 unsigned long addr; 940 unsigned long addr;
524 LIST_HEAD(pages); 941 LIST_HEAD(pages);
525 unsigned i; 942 unsigned i;
526 943
944 WARN_ON(!nr_pages);
945
527 for (i = 0; i < nr_pages; i++) { 946 for (i = 0; i < nr_pages; i++) {
528 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 947 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
529 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 948 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
530 if (!bpage) 949 if (!bpage)
531 goto free_pages; 950 goto free_pages;
951
952 rb_check_bpage(cpu_buffer, bpage);
953
532 list_add(&bpage->list, &pages); 954 list_add(&bpage->list, &pages);
533 955
534 addr = __get_free_page(GFP_KERNEL); 956 addr = __get_free_page(GFP_KERNEL);
@@ -538,7 +960,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
538 rb_init_page(bpage->page); 960 rb_init_page(bpage->page);
539 } 961 }
540 962
541 list_splice(&pages, head); 963 /*
964 * The ring buffer page list is a circular list that does not
965 * start and end with a list head. All page list items point to
966 * other pages.
967 */
968 cpu_buffer->pages = pages.next;
969 list_del(&pages);
542 970
543 rb_check_pages(cpu_buffer); 971 rb_check_pages(cpu_buffer);
544 972
@@ -570,13 +998,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
570 spin_lock_init(&cpu_buffer->reader_lock); 998 spin_lock_init(&cpu_buffer->reader_lock);
571 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 999 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
572 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1000 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
573 INIT_LIST_HEAD(&cpu_buffer->pages);
574 1001
575 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1002 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
576 GFP_KERNEL, cpu_to_node(cpu)); 1003 GFP_KERNEL, cpu_to_node(cpu));
577 if (!bpage) 1004 if (!bpage)
578 goto fail_free_buffer; 1005 goto fail_free_buffer;
579 1006
1007 rb_check_bpage(cpu_buffer, bpage);
1008
580 cpu_buffer->reader_page = bpage; 1009 cpu_buffer->reader_page = bpage;
581 addr = __get_free_page(GFP_KERNEL); 1010 addr = __get_free_page(GFP_KERNEL);
582 if (!addr) 1011 if (!addr)
@@ -591,9 +1020,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
591 goto fail_free_reader; 1020 goto fail_free_reader;
592 1021
593 cpu_buffer->head_page 1022 cpu_buffer->head_page
594 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 1023 = list_entry(cpu_buffer->pages, struct buffer_page, list);
595 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; 1024 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
596 1025
1026 rb_head_page_activate(cpu_buffer);
1027
597 return cpu_buffer; 1028 return cpu_buffer;
598 1029
599 fail_free_reader: 1030 fail_free_reader:
@@ -606,24 +1037,25 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
606 1037
607static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 1038static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
608{ 1039{
609 struct list_head *head = &cpu_buffer->pages; 1040 struct list_head *head = cpu_buffer->pages;
610 struct buffer_page *bpage, *tmp; 1041 struct buffer_page *bpage, *tmp;
611 1042
612 free_buffer_page(cpu_buffer->reader_page); 1043 free_buffer_page(cpu_buffer->reader_page);
613 1044
614 list_for_each_entry_safe(bpage, tmp, head, list) { 1045 rb_head_page_deactivate(cpu_buffer);
615 list_del_init(&bpage->list); 1046
1047 if (head) {
1048 list_for_each_entry_safe(bpage, tmp, head, list) {
1049 list_del_init(&bpage->list);
1050 free_buffer_page(bpage);
1051 }
1052 bpage = list_entry(head, struct buffer_page, list);
616 free_buffer_page(bpage); 1053 free_buffer_page(bpage);
617 } 1054 }
1055
618 kfree(cpu_buffer); 1056 kfree(cpu_buffer);
619} 1057}
620 1058
621/*
622 * Causes compile errors if the struct buffer_page gets bigger
623 * than the struct page.
624 */
625extern int ring_buffer_page_too_big(void);
626
627#ifdef CONFIG_HOTPLUG_CPU 1059#ifdef CONFIG_HOTPLUG_CPU
628static int rb_cpu_notify(struct notifier_block *self, 1060static int rb_cpu_notify(struct notifier_block *self,
629 unsigned long action, void *hcpu); 1061 unsigned long action, void *hcpu);
@@ -646,11 +1078,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
646 int bsize; 1078 int bsize;
647 int cpu; 1079 int cpu;
648 1080
649 /* Paranoid! Optimizes out when all is well */
650 if (sizeof(struct buffer_page) > sizeof(struct page))
651 ring_buffer_page_too_big();
652
653
654 /* keep it in its own cache line */ 1081 /* keep it in its own cache line */
655 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 1082 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
656 GFP_KERNEL); 1083 GFP_KERNEL);
@@ -666,8 +1093,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
666 buffer->reader_lock_key = key; 1093 buffer->reader_lock_key = key;
667 1094
668 /* need at least two pages */ 1095 /* need at least two pages */
669 if (buffer->pages == 1) 1096 if (buffer->pages < 2)
670 buffer->pages++; 1097 buffer->pages = 2;
671 1098
672 /* 1099 /*
673 * In case of non-hotplug cpu, if the ring-buffer is allocated 1100 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -743,6 +1170,7 @@ ring_buffer_free(struct ring_buffer *buffer)
743 1170
744 put_online_cpus(); 1171 put_online_cpus();
745 1172
1173 kfree(buffer->buffers);
746 free_cpumask_var(buffer->cpumask); 1174 free_cpumask_var(buffer->cpumask);
747 1175
748 kfree(buffer); 1176 kfree(buffer);
@@ -767,15 +1195,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
767 atomic_inc(&cpu_buffer->record_disabled); 1195 atomic_inc(&cpu_buffer->record_disabled);
768 synchronize_sched(); 1196 synchronize_sched();
769 1197
1198 rb_head_page_deactivate(cpu_buffer);
1199
770 for (i = 0; i < nr_pages; i++) { 1200 for (i = 0; i < nr_pages; i++) {
771 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1201 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
772 return; 1202 return;
773 p = cpu_buffer->pages.next; 1203 p = cpu_buffer->pages->next;
774 bpage = list_entry(p, struct buffer_page, list); 1204 bpage = list_entry(p, struct buffer_page, list);
775 list_del_init(&bpage->list); 1205 list_del_init(&bpage->list);
776 free_buffer_page(bpage); 1206 free_buffer_page(bpage);
777 } 1207 }
778 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1208 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
779 return; 1209 return;
780 1210
781 rb_reset_cpu(cpu_buffer); 1211 rb_reset_cpu(cpu_buffer);
@@ -797,15 +1227,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
797 atomic_inc(&cpu_buffer->record_disabled); 1227 atomic_inc(&cpu_buffer->record_disabled);
798 synchronize_sched(); 1228 synchronize_sched();
799 1229
1230 spin_lock_irq(&cpu_buffer->reader_lock);
1231 rb_head_page_deactivate(cpu_buffer);
1232
800 for (i = 0; i < nr_pages; i++) { 1233 for (i = 0; i < nr_pages; i++) {
801 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1234 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
802 return; 1235 return;
803 p = pages->next; 1236 p = pages->next;
804 bpage = list_entry(p, struct buffer_page, list); 1237 bpage = list_entry(p, struct buffer_page, list);
805 list_del_init(&bpage->list); 1238 list_del_init(&bpage->list);
806 list_add_tail(&bpage->list, &cpu_buffer->pages); 1239 list_add_tail(&bpage->list, cpu_buffer->pages);
807 } 1240 }
808 rb_reset_cpu(cpu_buffer); 1241 rb_reset_cpu(cpu_buffer);
1242 spin_unlock_irq(&cpu_buffer->reader_lock);
809 1243
810 rb_check_pages(cpu_buffer); 1244 rb_check_pages(cpu_buffer);
811 1245
@@ -956,21 +1390,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
956} 1390}
957 1391
958static inline struct ring_buffer_event * 1392static inline struct ring_buffer_event *
959rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
960{
961 return __rb_page_index(cpu_buffer->head_page,
962 cpu_buffer->head_page->read);
963}
964
965static inline struct ring_buffer_event *
966rb_iter_head_event(struct ring_buffer_iter *iter) 1393rb_iter_head_event(struct ring_buffer_iter *iter)
967{ 1394{
968 return __rb_page_index(iter->head_page, iter->head); 1395 return __rb_page_index(iter->head_page, iter->head);
969} 1396}
970 1397
971static inline unsigned rb_page_write(struct buffer_page *bpage) 1398static inline unsigned long rb_page_write(struct buffer_page *bpage)
972{ 1399{
973 return local_read(&bpage->write); 1400 return local_read(&bpage->write) & RB_WRITE_MASK;
974} 1401}
975 1402
976static inline unsigned rb_page_commit(struct buffer_page *bpage) 1403static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -978,6 +1405,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
978 return local_read(&bpage->page->commit); 1405 return local_read(&bpage->page->commit);
979} 1406}
980 1407
1408static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1409{
1410 return local_read(&bpage->entries) & RB_WRITE_MASK;
1411}
1412
 981/* Size is determined by what has been committed */ 1413
982static inline unsigned rb_page_size(struct buffer_page *bpage) 1414static inline unsigned rb_page_size(struct buffer_page *bpage)
983{ 1415{
@@ -990,33 +1422,17 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
990 return rb_page_commit(cpu_buffer->commit_page); 1422 return rb_page_commit(cpu_buffer->commit_page);
991} 1423}
992 1424
993static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
994{
995 return rb_page_commit(cpu_buffer->head_page);
996}
997
998static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
999 struct buffer_page **bpage)
1000{
1001 struct list_head *p = (*bpage)->list.next;
1002
1003 if (p == &cpu_buffer->pages)
1004 p = p->next;
1005
1006 *bpage = list_entry(p, struct buffer_page, list);
1007}
1008
1009static inline unsigned 1425static inline unsigned
1010rb_event_index(struct ring_buffer_event *event) 1426rb_event_index(struct ring_buffer_event *event)
1011{ 1427{
1012 unsigned long addr = (unsigned long)event; 1428 unsigned long addr = (unsigned long)event;
1013 1429
1014 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 1430 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
1015} 1431}
1016 1432
1017static inline int 1433static inline int
1018rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 1434rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1019 struct ring_buffer_event *event) 1435 struct ring_buffer_event *event)
1020{ 1436{
1021 unsigned long addr = (unsigned long)event; 1437 unsigned long addr = (unsigned long)event;
1022 unsigned long index; 1438 unsigned long index;
@@ -1029,33 +1445,10 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1029} 1445}
1030 1446
1031static void 1447static void
1032rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
1033 struct ring_buffer_event *event)
1034{
1035 unsigned long addr = (unsigned long)event;
1036 unsigned long index;
1037
1038 index = rb_event_index(event);
1039 addr &= PAGE_MASK;
1040
1041 while (cpu_buffer->commit_page->page != (void *)addr) {
1042 if (RB_WARN_ON(cpu_buffer,
1043 cpu_buffer->commit_page == cpu_buffer->tail_page))
1044 return;
1045 cpu_buffer->commit_page->page->commit =
1046 cpu_buffer->commit_page->write;
1047 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1048 cpu_buffer->write_stamp =
1049 cpu_buffer->commit_page->page->time_stamp;
1050 }
1051
1052 /* Now set the commit to the event's index */
1053 local_set(&cpu_buffer->commit_page->page->commit, index);
1054}
1055
1056static void
1057rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1448rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1058{ 1449{
1450 unsigned long max_count;
1451
1059 /* 1452 /*
1060 * We only race with interrupts and NMIs on this CPU. 1453 * We only race with interrupts and NMIs on this CPU.
1061 * If we own the commit event, then we can commit 1454 * If we own the commit event, then we can commit
@@ -1065,9 +1458,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1065 * assign the commit to the tail. 1458 * assign the commit to the tail.
1066 */ 1459 */
1067 again: 1460 again:
1461 max_count = cpu_buffer->buffer->pages * 100;
1462
1068 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1463 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1069 cpu_buffer->commit_page->page->commit = 1464 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1070 cpu_buffer->commit_page->write; 1465 return;
1466 if (RB_WARN_ON(cpu_buffer,
1467 rb_is_reader_page(cpu_buffer->tail_page)))
1468 return;
1469 local_set(&cpu_buffer->commit_page->page->commit,
1470 rb_page_write(cpu_buffer->commit_page));
1071 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 1471 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1072 cpu_buffer->write_stamp = 1472 cpu_buffer->write_stamp =
1073 cpu_buffer->commit_page->page->time_stamp; 1473 cpu_buffer->commit_page->page->time_stamp;
@@ -1076,8 +1476,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1076 } 1476 }
1077 while (rb_commit_index(cpu_buffer) != 1477 while (rb_commit_index(cpu_buffer) !=
1078 rb_page_write(cpu_buffer->commit_page)) { 1478 rb_page_write(cpu_buffer->commit_page)) {
1079 cpu_buffer->commit_page->page->commit = 1479
1080 cpu_buffer->commit_page->write; 1480 local_set(&cpu_buffer->commit_page->page->commit,
1481 rb_page_write(cpu_buffer->commit_page));
1482 RB_WARN_ON(cpu_buffer,
1483 local_read(&cpu_buffer->commit_page->page->commit) &
1484 ~RB_WRITE_MASK);
1081 barrier(); 1485 barrier();
1082 } 1486 }
1083 1487
@@ -1110,7 +1514,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1110 * to the head page instead of next. 1514 * to the head page instead of next.
1111 */ 1515 */
1112 if (iter->head_page == cpu_buffer->reader_page) 1516 if (iter->head_page == cpu_buffer->reader_page)
1113 iter->head_page = cpu_buffer->head_page; 1517 iter->head_page = rb_set_head_page(cpu_buffer);
1114 else 1518 else
1115 rb_inc_page(cpu_buffer, &iter->head_page); 1519 rb_inc_page(cpu_buffer, &iter->head_page);
1116 1520
@@ -1154,6 +1558,163 @@ rb_update_event(struct ring_buffer_event *event,
1154 } 1558 }
1155} 1559}
1156 1560
1561/*
1562 * rb_handle_head_page - writer hit the head page
1563 *
1564 * Returns: +1 to retry page
1565 * 0 to continue
1566 * -1 on error
1567 */
1568static int
1569rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1570 struct buffer_page *tail_page,
1571 struct buffer_page *next_page)
1572{
1573 struct buffer_page *new_head;
1574 int entries;
1575 int type;
1576 int ret;
1577
1578 entries = rb_page_entries(next_page);
1579
1580 /*
1581 * The hard part is here. We need to move the head
1582 * forward, and protect against both readers on
1583 * other CPUs and writers coming in via interrupts.
1584 */
1585 type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
1586 RB_PAGE_HEAD);
1587
1588 /*
1589 * type can be one of four:
1590 * NORMAL - an interrupt already moved it for us
1591 * HEAD - we are the first to get here.
1592 * UPDATE - we are the interrupt interrupting
1593 * a current move.
1594 * MOVED - a reader on another CPU moved the next
1595 * pointer to its reader page. Give up
1596 * and try again.
1597 */
1598
1599 switch (type) {
1600 case RB_PAGE_HEAD:
1601 /*
1602 * We changed the head to UPDATE, thus
1603 * it is our responsibility to update
1604 * the counters.
1605 */
1606 local_add(entries, &cpu_buffer->overrun);
1607
1608 /*
1609 * The entries will be zeroed out when we move the
1610 * tail page.
1611 */
1612
1613 /* still more to do */
1614 break;
1615
1616 case RB_PAGE_UPDATE:
1617 /*
1618 * This is an interrupt that interrupt the
1619 * previous update. Still more to do.
1620 */
1621 break;
1622 case RB_PAGE_NORMAL:
1623 /*
1624 * An interrupt came in before the update
1625 * and processed this for us.
1626 * Nothing left to do.
1627 */
1628 return 1;
1629 case RB_PAGE_MOVED:
1630 /*
1631 * The reader is on another CPU and just did
1632 * a swap with our next_page.
1633 * Try again.
1634 */
1635 return 1;
1636 default:
1637 RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
1638 return -1;
1639 }
1640
1641 /*
1642 * Now that we are here, the old head pointer is
1643 * set to UPDATE. This will keep the reader from
1644 * swapping the head page with the reader page.
1645 * The reader (on another CPU) will spin till
1646 * we are finished.
1647 *
1648 * We just need to protect against interrupts
1649 * doing the job. We will set the next pointer
1650 * to HEAD. After that, we set the old pointer
1651 * to NORMAL, but only if it was HEAD before.
1652	 * Otherwise we are an interrupt, and only
1653	 * want the outermost commit to reset it.
1654 */
1655 new_head = next_page;
1656 rb_inc_page(cpu_buffer, &new_head);
1657
1658 ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
1659 RB_PAGE_NORMAL);
1660
1661 /*
1662 * Valid returns are:
1663 * HEAD - an interrupt came in and already set it.
1664 * NORMAL - One of two things:
1665 * 1) We really set it.
1666 * 2) A bunch of interrupts came in and moved
1667 * the page forward again.
1668 */
1669 switch (ret) {
1670 case RB_PAGE_HEAD:
1671 case RB_PAGE_NORMAL:
1672 /* OK */
1673 break;
1674 default:
1675 RB_WARN_ON(cpu_buffer, 1);
1676 return -1;
1677 }
1678
1679 /*
1680 * It is possible that an interrupt came in,
1681 * set the head up, then more interrupts came in
1682 * and moved it again. When we get back here,
1683 * the page would have been set to NORMAL but we
1684 * just set it back to HEAD.
1685 *
1686 * How do you detect this? Well, if that happened
1687 * the tail page would have moved.
1688 */
1689 if (ret == RB_PAGE_NORMAL) {
1690 /*
1691		 * If the tail had moved past next, then we need
1692 * to reset the pointer.
1693 */
1694 if (cpu_buffer->tail_page != tail_page &&
1695 cpu_buffer->tail_page != next_page)
1696 rb_head_page_set_normal(cpu_buffer, new_head,
1697 next_page,
1698 RB_PAGE_HEAD);
1699 }
1700
1701 /*
1702	 * If this was the outermost commit (the one that
1703 * changed the original pointer from HEAD to UPDATE),
1704 * then it is up to us to reset it to NORMAL.
1705 */
1706 if (type == RB_PAGE_HEAD) {
1707 ret = rb_head_page_set_normal(cpu_buffer, next_page,
1708 tail_page,
1709 RB_PAGE_UPDATE);
1710 if (RB_WARN_ON(cpu_buffer,
1711 ret != RB_PAGE_UPDATE))
1712 return -1;
1713 }
1714
1715 return 0;
1716}
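
The +1/0/-1 return contract lets the caller tell "someone else already moved the head, start over" apart from a real inconsistency; rb_move_tail() further down uses it exactly that way. A tiny sketch of that calling convention, with a hypothetical handle() standing in for rb_handle_head_page():

/* Sketch of the retry contract: +1 retry, 0 done, -1 hard error. */
#include <stdio.h>

static int attempts;

static int handle(void)          /* stand-in for rb_handle_head_page() */
{
        /* Pretend an interrupt beat us to it on the first try. */
        return ++attempts < 2 ? 1 : 0;
}

int main(void)
{
        int ret;

again:
        ret = handle();
        if (ret < 0) {
                fprintf(stderr, "corrupted list\n");
                return 1;
        }
        if (ret)                 /* page moved under us: start over */
                goto again;

        printf("head page advanced after %d attempt(s)\n", attempts);
        return 0;
}
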
1717
1157static unsigned rb_calculate_event_length(unsigned length) 1718static unsigned rb_calculate_event_length(unsigned length)
1158{ 1719{
1159 struct ring_buffer_event event; /* Used only for sizeof array */ 1720 struct ring_buffer_event event; /* Used only for sizeof array */
@@ -1171,6 +1732,57 @@ static unsigned rb_calculate_event_length(unsigned length)
1171 return length; 1732 return length;
1172} 1733}
1173 1734
1735static inline void
1736rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1737 struct buffer_page *tail_page,
1738 unsigned long tail, unsigned long length)
1739{
1740 struct ring_buffer_event *event;
1741
1742 /*
1743 * Only the event that crossed the page boundary
1744 * must fill the old tail_page with padding.
1745 */
1746 if (tail >= BUF_PAGE_SIZE) {
1747 local_sub(length, &tail_page->write);
1748 return;
1749 }
1750
1751 event = __rb_page_index(tail_page, tail);
1752 kmemcheck_annotate_bitfield(event, bitfield);
1753
1754 /*
1755 * If this event is bigger than the minimum size, then
1756 * we need to be careful that we don't subtract the
1757 * write counter enough to allow another writer to slip
1758 * in on this page.
1759 * We put in a discarded commit instead, to make sure
1760 * that this space is not used again.
1761 *
1762 * If we are less than the minimum size, we don't need to
1763 * worry about it.
1764 */
1765 if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
1766 /* No room for any events */
1767
1768 /* Mark the rest of the page with padding */
1769 rb_event_set_padding(event);
1770
1771 /* Set the write back to the previous setting */
1772 local_sub(length, &tail_page->write);
1773 return;
1774 }
1775
1776 /* Put in a discarded event */
1777 event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
1778 event->type_len = RINGBUF_TYPE_PADDING;
1779 /* time delta must be non zero */
1780 event->time_delta = 1;
1781
1782 /* Set write to end of buffer */
1783 length = (tail + length) - BUF_PAGE_SIZE;
1784 local_sub(length, &tail_page->write);
1785}
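
rb_reset_tail() turns the unusable remainder of the old page into one padding event whose array[0] carries the payload length, and pulls the write index back so it never points past BUF_PAGE_SIZE. A worked example of that arithmetic; the sizes below are illustrative, not the values from the headers:

/* Sketch of the tail-reset arithmetic with made-up sizes. */
#include <stdio.h>

#define BUF_PAGE_SIZE    4080u   /* illustrative page payload size */
#define RB_EVNT_HDR_SIZE 4u      /* illustrative event header size */

int main(void)
{
        unsigned tail   = 4000;  /* index where the oversized event began */
        unsigned length = 120;   /* its reserved length                   */
        unsigned write  = tail + length;        /* 4120: ran off the page */

        /* Padding event that fills [tail, BUF_PAGE_SIZE) on the old page. */
        unsigned pad_payload = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;

        /* Pull the write index back so it stops at the end of the page.  */
        write -= (tail + length) - BUF_PAGE_SIZE;

        printf("padding payload = %u bytes, write index = %u\n",
               pad_payload, write);   /* prints 76 and 4080 */
        return 0;
}
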
1174 1786
1175static struct ring_buffer_event * 1787static struct ring_buffer_event *
1176rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1788rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1178,128 +1790,101 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1178 struct buffer_page *commit_page, 1790 struct buffer_page *commit_page,
1179 struct buffer_page *tail_page, u64 *ts) 1791 struct buffer_page *tail_page, u64 *ts)
1180{ 1792{
1181 struct buffer_page *next_page, *head_page, *reader_page;
1182 struct ring_buffer *buffer = cpu_buffer->buffer; 1793 struct ring_buffer *buffer = cpu_buffer->buffer;
1183 struct ring_buffer_event *event; 1794 struct buffer_page *next_page;
1184 bool lock_taken = false; 1795 int ret;
1185 unsigned long flags;
1186 1796
1187 next_page = tail_page; 1797 next_page = tail_page;
1188 1798
1189 local_irq_save(flags);
1190 /*
1191 * Since the write to the buffer is still not
1192 * fully lockless, we must be careful with NMIs.
1193 * The locks in the writers are taken when a write
1194 * crosses to a new page. The locks protect against
1195 * races with the readers (this will soon be fixed
1196 * with a lockless solution).
1197 *
1198 * Because we can not protect against NMIs, and we
1199 * want to keep traces reentrant, we need to manage
1200 * what happens when we are in an NMI.
1201 *
1202 * NMIs can happen after we take the lock.
1203 * If we are in an NMI, only take the lock
1204 * if it is not already taken. Otherwise
1205 * simply fail.
1206 */
1207 if (unlikely(in_nmi())) {
1208 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1209 cpu_buffer->nmi_dropped++;
1210 goto out_reset;
1211 }
1212 } else
1213 __raw_spin_lock(&cpu_buffer->lock);
1214
1215 lock_taken = true;
1216
1217 rb_inc_page(cpu_buffer, &next_page); 1799 rb_inc_page(cpu_buffer, &next_page);
1218 1800
1219 head_page = cpu_buffer->head_page;
1220 reader_page = cpu_buffer->reader_page;
1221
1222 /* we grabbed the lock before incrementing */
1223 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1224 goto out_reset;
1225
1226 /* 1801 /*
1227 * If for some reason, we had an interrupt storm that made 1802 * If for some reason, we had an interrupt storm that made
1228 * it all the way around the buffer, bail, and warn 1803 * it all the way around the buffer, bail, and warn
1229 * about it. 1804 * about it.
1230 */ 1805 */
1231 if (unlikely(next_page == commit_page)) { 1806 if (unlikely(next_page == commit_page)) {
1232 cpu_buffer->commit_overrun++; 1807 local_inc(&cpu_buffer->commit_overrun);
1233 goto out_reset; 1808 goto out_reset;
1234 } 1809 }
1235 1810
1236 if (next_page == head_page) {
1237 if (!(buffer->flags & RB_FL_OVERWRITE))
1238 goto out_reset;
1239
1240 /* tail_page has not moved yet? */
1241 if (tail_page == cpu_buffer->tail_page) {
1242 /* count overflows */
1243 cpu_buffer->overrun +=
1244 local_read(&head_page->entries);
1245
1246 rb_inc_page(cpu_buffer, &head_page);
1247 cpu_buffer->head_page = head_page;
1248 cpu_buffer->head_page->read = 0;
1249 }
1250 }
1251
1252 /* 1811 /*
1253 * If the tail page is still the same as what we think 1812 * This is where the fun begins!
1254 * it is, then it is up to us to update the tail 1813 *
1255 * pointer. 1814 * We are fighting against races between a reader that
1815 * could be on another CPU trying to swap its reader
1816 * page with the buffer head.
1817 *
1818 * We are also fighting against interrupts coming in and
1819 * moving the head or tail on us as well.
1820 *
1821 * If the next page is the head page then we have filled
1822 * the buffer, unless the commit page is still on the
1823 * reader page.
1256 */ 1824 */
1257 if (tail_page == cpu_buffer->tail_page) { 1825 if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
1258 local_set(&next_page->write, 0);
1259 local_set(&next_page->entries, 0);
1260 local_set(&next_page->page->commit, 0);
1261 cpu_buffer->tail_page = next_page;
1262 1826
1263 /* reread the time stamp */ 1827 /*
1264 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1828 * If the commit is not on the reader page, then
1265 cpu_buffer->tail_page->page->time_stamp = *ts; 1829 * move the header page.
1830 */
1831 if (!rb_is_reader_page(cpu_buffer->commit_page)) {
1832 /*
1833 * If we are not in overwrite mode,
1834 * this is easy, just stop here.
1835 */
1836 if (!(buffer->flags & RB_FL_OVERWRITE))
1837 goto out_reset;
1838
1839 ret = rb_handle_head_page(cpu_buffer,
1840 tail_page,
1841 next_page);
1842 if (ret < 0)
1843 goto out_reset;
1844 if (ret)
1845 goto out_again;
1846 } else {
1847 /*
1848 * We need to be careful here too. The
1849 * commit page could still be on the reader
1850 * page. We could have a small buffer, and
1851 * have filled up the buffer with events
1852 * from interrupts and such, and wrapped.
1853 *
1856 * Note, if the tail page is also on the
1855 * reader_page, we let it move out.
1856 */
1857 if (unlikely((cpu_buffer->commit_page !=
1858 cpu_buffer->tail_page) &&
1859 (cpu_buffer->commit_page ==
1860 cpu_buffer->reader_page))) {
1861 local_inc(&cpu_buffer->commit_overrun);
1862 goto out_reset;
1863 }
1864 }
1266 } 1865 }
1267 1866
1268 /* 1867 ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
1269 * The actual tail page has moved forward. 1868 if (ret) {
1270 */ 1869 /*
1271 if (tail < BUF_PAGE_SIZE) { 1870 * Nested commits always have zero deltas, so
1272 /* Mark the rest of the page with padding */ 1871 * just reread the time stamp
1273 event = __rb_page_index(tail_page, tail); 1872 */
1274 kmemcheck_annotate_bitfield(event, bitfield); 1873 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1275 rb_event_set_padding(event); 1874 next_page->page->time_stamp = *ts;
1276 } 1875 }
1277 1876
1278 /* Set the write back to the previous setting */ 1877 out_again:
1279 local_sub(length, &tail_page->write);
1280 1878
1281 /* 1879 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1282 * If this was a commit entry that failed,
1283 * increment that too
1284 */
1285 if (tail_page == cpu_buffer->commit_page &&
1286 tail == rb_commit_index(cpu_buffer)) {
1287 rb_set_commit_to_write(cpu_buffer);
1288 }
1289
1290 __raw_spin_unlock(&cpu_buffer->lock);
1291 local_irq_restore(flags);
1292 1880
1293 /* fail and let the caller try again */ 1881 /* fail and let the caller try again */
1294 return ERR_PTR(-EAGAIN); 1882 return ERR_PTR(-EAGAIN);
1295 1883
1296 out_reset: 1884 out_reset:
1297 /* reset write */ 1885 /* reset write */
1298 local_sub(length, &tail_page->write); 1886 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1299 1887
1300 if (likely(lock_taken))
1301 __raw_spin_unlock(&cpu_buffer->lock);
1302 local_irq_restore(flags);
1303 return NULL; 1888 return NULL;
1304} 1889}
1305 1890
@@ -1316,6 +1901,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1316 barrier(); 1901 barrier();
1317 tail_page = cpu_buffer->tail_page; 1902 tail_page = cpu_buffer->tail_page;
1318 write = local_add_return(length, &tail_page->write); 1903 write = local_add_return(length, &tail_page->write);
1904
1905 /* set write to only the index of the write */
1906 write &= RB_WRITE_MASK;
1319 tail = write - length; 1907 tail = write - length;
1320 1908
1321 /* See if we shot past the end of this buffer page */ 1909
@@ -1325,9 +1913,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1325 1913
1326 /* We reserved something on the buffer */ 1914 /* We reserved something on the buffer */
1327 1915
1328 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1329 return NULL;
1330
1331 event = __rb_page_index(tail_page, tail); 1916 event = __rb_page_index(tail_page, tail);
1332 kmemcheck_annotate_bitfield(event, bitfield); 1917 kmemcheck_annotate_bitfield(event, bitfield);
1333 rb_update_event(event, type, length); 1918 rb_update_event(event, type, length);
@@ -1337,11 +1922,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1337 local_inc(&tail_page->entries); 1922 local_inc(&tail_page->entries);
1338 1923
1339 /* 1924 /*
1340 * If this is a commit and the tail is zero, then update 1925 * If this is the first commit on the page, then update
1341 * this page's time stamp. 1926 * its timestamp.
1342 */ 1927 */
1343 if (!tail && rb_is_commit(cpu_buffer, event)) 1928 if (!tail)
1344 cpu_buffer->commit_page->page->time_stamp = *ts; 1929 tail_page->page->time_stamp = *ts;
1345 1930
1346 return event; 1931 return event;
1347} 1932}
@@ -1363,12 +1948,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1363 bpage = cpu_buffer->tail_page; 1948 bpage = cpu_buffer->tail_page;
1364 1949
1365 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 1950 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1951 unsigned long write_mask =
1952 local_read(&bpage->write) & ~RB_WRITE_MASK;
1366 /* 1953 /*
1367 * This is on the tail page. It is possible that 1954 * This is on the tail page. It is possible that
1368 * a write could come in and move the tail page 1955 * a write could come in and move the tail page
1369 * and write to the next page. That is fine 1956 * and write to the next page. That is fine
1370 * because we just shorten what is on this page. 1957 * because we just shorten what is on this page.
1371 */ 1958 */
1959 old_index += write_mask;
1960 new_index += write_mask;
1372 index = local_cmpxchg(&bpage->write, old_index, new_index); 1961 index = local_cmpxchg(&bpage->write, old_index, new_index);
1373 if (index == old_index) 1962 if (index == old_index)
1374 return 1; 1963 return 1;
@@ -1410,16 +1999,16 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1410 return -EAGAIN; 1999 return -EAGAIN;
1411 2000
1412 /* Only a committed time event can update the write stamp */ 2001
1413 if (rb_is_commit(cpu_buffer, event)) { 2002 if (rb_event_is_commit(cpu_buffer, event)) {
1414 /* 2003 /*
1415 * If this is the first on the page, then we need to 2004 * If this is the first on the page, then it was
1416 * update the page itself, and just put in a zero. 2005 * updated with the page itself. Try to discard it
2006 * and if we can't, just make it zero.
1417 */ 2007 */
1418 if (rb_event_index(event)) { 2008 if (rb_event_index(event)) {
1419 event->time_delta = *delta & TS_MASK; 2009 event->time_delta = *delta & TS_MASK;
1420 event->array[0] = *delta >> TS_SHIFT; 2010 event->array[0] = *delta >> TS_SHIFT;
1421 } else { 2011 } else {
1422 cpu_buffer->commit_page->page->time_stamp = *ts;
1423 /* try to discard, since we do not need this */ 2012 /* try to discard, since we do not need this */
1424 if (!rb_try_to_discard(cpu_buffer, event)) { 2013 if (!rb_try_to_discard(cpu_buffer, event)) {
1425 /* nope, just zero it */ 2014 /* nope, just zero it */
@@ -1445,8 +2034,47 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1445 return ret; 2034 return ret;
1446} 2035}
1447 2036
2037static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2038{
2039 local_inc(&cpu_buffer->committing);
2040 local_inc(&cpu_buffer->commits);
2041}
2042
2043static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2044{
2045 unsigned long commits;
2046
2047 if (RB_WARN_ON(cpu_buffer,
2048 !local_read(&cpu_buffer->committing)))
2049 return;
2050
2051 again:
2052 commits = local_read(&cpu_buffer->commits);
2053 /* synchronize with interrupts */
2054 barrier();
2055 if (local_read(&cpu_buffer->committing) == 1)
2056 rb_set_commit_to_write(cpu_buffer);
2057
2058 local_dec(&cpu_buffer->committing);
2059
2060 /* synchronize with interrupts */
2061 barrier();
2062
2063 /*
2064 * Need to account for interrupts coming in between the
2065 * updating of the commit page and the clearing of the
2066 * committing counter.
2067 */
2068 if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
2069 !local_read(&cpu_buffer->committing)) {
2070 local_inc(&cpu_buffer->committing);
2071 goto again;
2072 }
2073}
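
rb_start_commit()/rb_end_commit() replace the old "is this the commit event" test with a nesting count: every writer increments committing, but only the outermost one (committing == 1) pushes the commit pointer forward, and the commits counter is re-checked afterwards in case an interrupt slipped in between the push and the decrement. A user-space sketch of the nesting rule, with interrupts simulated by recursion and the commits re-check omitted for brevity:

/* Sketch: only the outermost of nested commits finalizes. */
#include <stdio.h>

static int committing;   /* nesting depth, ~ cpu_buffer->committing */
static int finalized;    /* how many times the "commit" was pushed  */

static void start_commit(void)
{
        committing++;
}

static void end_commit(void)
{
        if (committing == 1)     /* outermost writer only */
                finalized++;
        committing--;
}

static void do_write(int depth)
{
        start_commit();
        if (depth)               /* pretend an interrupt writes too */
                do_write(depth - 1);
        end_commit();
}

int main(void)
{
        do_write(3);             /* one write interrupted three levels deep */
        printf("commits finalized: %d (nesting depth was 4)\n", finalized);
        return 0;
}
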
2074
1448static struct ring_buffer_event * 2075static struct ring_buffer_event *
1449rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 2076rb_reserve_next_event(struct ring_buffer *buffer,
2077 struct ring_buffer_per_cpu *cpu_buffer,
1450 unsigned long length) 2078 unsigned long length)
1451{ 2079{
1452 struct ring_buffer_event *event; 2080 struct ring_buffer_event *event;
@@ -1454,6 +2082,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1454 int commit = 0; 2082 int commit = 0;
1455 int nr_loops = 0; 2083 int nr_loops = 0;
1456 2084
2085 rb_start_commit(cpu_buffer);
2086
2087#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2088 /*
2089 * Due to the ability to swap a cpu buffer from a buffer,
2090 * it is possible it was swapped before we committed
2091 * (committing stops a swap). We check for it here and,
2092 * if it happened, we have to fail the write.
2093 */
2094 barrier();
2095 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2096 local_dec(&cpu_buffer->committing);
2097 local_dec(&cpu_buffer->commits);
2098 return NULL;
2099 }
2100#endif
2101
1457 length = rb_calculate_event_length(length); 2102 length = rb_calculate_event_length(length);
1458 again: 2103 again:
1459 /* 2104 /*
@@ -1466,7 +2111,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1466 * Bail! 2111 * Bail!
1467 */ 2112 */
1468 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 2113 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1469 return NULL; 2114 goto out_fail;
1470 2115
1471 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 2116 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1472 2117
@@ -1497,7 +2142,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1497 2142
1498 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2143 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1499 if (commit == -EBUSY) 2144 if (commit == -EBUSY)
1500 return NULL; 2145 goto out_fail;
1501 2146
1502 if (commit == -EAGAIN) 2147 if (commit == -EAGAIN)
1503 goto again; 2148 goto again;
@@ -1511,30 +2156,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1511 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2156 if (unlikely(PTR_ERR(event) == -EAGAIN))
1512 goto again; 2157 goto again;
1513 2158
1514 if (!event) { 2159 if (!event)
1515 if (unlikely(commit)) 2160 goto out_fail;
1516 /*
1517 * Ouch! We needed a timestamp and it was commited. But
1518 * we didn't get our event reserved.
1519 */
1520 rb_set_commit_to_write(cpu_buffer);
1521 return NULL;
1522 }
1523 2161
1524 /* 2162 if (!rb_event_is_commit(cpu_buffer, event))
1525 * If the timestamp was commited, make the commit our entry
1526 * now so that we will update it when needed.
1527 */
1528 if (unlikely(commit))
1529 rb_set_commit_event(cpu_buffer, event);
1530 else if (!rb_is_commit(cpu_buffer, event))
1531 delta = 0; 2163 delta = 0;
1532 2164
1533 event->time_delta = delta; 2165 event->time_delta = delta;
1534 2166
1535 return event; 2167 return event;
2168
2169 out_fail:
2170 rb_end_commit(cpu_buffer);
2171 return NULL;
1536} 2172}
1537 2173
2174#ifdef CONFIG_TRACING
2175
1538#define TRACE_RECURSIVE_DEPTH 16 2176#define TRACE_RECURSIVE_DEPTH 16
1539 2177
1540static int trace_recursive_lock(void) 2178static int trace_recursive_lock(void)
@@ -1565,6 +2203,13 @@ static void trace_recursive_unlock(void)
1565 current->trace_recursion--; 2203 current->trace_recursion--;
1566} 2204}
1567 2205
2206#else
2207
2208#define trace_recursive_lock() (0)
2209#define trace_recursive_unlock() do { } while (0)
2210
2211#endif
2212
1568static DEFINE_PER_CPU(int, rb_need_resched); 2213static DEFINE_PER_CPU(int, rb_need_resched);
1569 2214
1570/** 2215/**
@@ -1614,7 +2259,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1614 if (length > BUF_MAX_DATA_SIZE) 2259 if (length > BUF_MAX_DATA_SIZE)
1615 goto out; 2260 goto out;
1616 2261
1617 event = rb_reserve_next_event(cpu_buffer, length); 2262 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1618 if (!event) 2263 if (!event)
1619 goto out; 2264 goto out;
1620 2265
@@ -1637,18 +2282,24 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1637} 2282}
1638EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2283EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1639 2284
2285static void
2286rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2287 struct ring_buffer_event *event)
2288{
2289 /*
2290 * The first event in the commit queue updates the
2291 * time stamp.
2292 */
2293 if (rb_event_is_commit(cpu_buffer, event))
2294 cpu_buffer->write_stamp += event->time_delta;
2295}
2296
1640static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2297static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1641 struct ring_buffer_event *event) 2298 struct ring_buffer_event *event)
1642{ 2299{
1643 local_inc(&cpu_buffer->entries); 2300 local_inc(&cpu_buffer->entries);
1644 2301 rb_update_write_stamp(cpu_buffer, event);
1645 /* Only process further if we own the commit */ 2302 rb_end_commit(cpu_buffer);
1646 if (!rb_is_commit(cpu_buffer, event))
1647 return;
1648
1649 cpu_buffer->write_stamp += event->time_delta;
1650
1651 rb_set_commit_to_write(cpu_buffer);
1652} 2303}
1653 2304
1654/** 2305/**
@@ -1694,32 +2345,57 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
1694 event->time_delta = 1; 2345 event->time_delta = 1;
1695} 2346}
1696 2347
1697/** 2348/*
1698 * ring_buffer_event_discard - discard any event in the ring buffer 2349 * Decrement the entries to the page that an event is on.
1699 * @event: the event to discard 2350 * The event does not even need to exist, only the pointer
1700 * 2351 * to the page it is on. This may only be called before the commit
1701 * Sometimes a event that is in the ring buffer needs to be ignored. 2352 * takes place.
1702 * This function lets the user discard an event in the ring buffer
1703 * and then that event will not be read later.
1704 *
1705 * Note, it is up to the user to be careful with this, and protect
1706 * against races. If the user discards an event that has been consumed
1707 * it is possible that it could corrupt the ring buffer.
1708 */ 2353 */
1709void ring_buffer_event_discard(struct ring_buffer_event *event) 2354static inline void
2355rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
2356 struct ring_buffer_event *event)
1710{ 2357{
1711 rb_event_discard(event); 2358 unsigned long addr = (unsigned long)event;
2359 struct buffer_page *bpage = cpu_buffer->commit_page;
2360 struct buffer_page *start;
2361
2362 addr &= PAGE_MASK;
2363
2364 /* Do the likely case first */
2365 if (likely(bpage->page == (void *)addr)) {
2366 local_dec(&bpage->entries);
2367 return;
2368 }
2369
2370 /*
2371 * Because the commit page may be on the reader page we
2372 * start with the next page and check the end loop there.
2373 */
2374 rb_inc_page(cpu_buffer, &bpage);
2375 start = bpage;
2376 do {
2377 if (bpage->page == (void *)addr) {
2378 local_dec(&bpage->entries);
2379 return;
2380 }
2381 rb_inc_page(cpu_buffer, &bpage);
2382 } while (bpage != start);
2383
2384 /* commit not part of this buffer?? */
2385 RB_WARN_ON(cpu_buffer, 1);
1712} 2386}
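
Because the page ring has no list head, rb_decrement_entry() checks the commit page first and then walks the circular list starting at the page after it, stopping when it wraps back around. A stand-alone sketch of that bounded scan, with the page/address match reduced to an integer id:

/* Sketch: find a page in a headless circular list and decrement it. */
#include <stdio.h>

struct page { struct page *next; int id; int entries; };

static int decrement_entry(struct page *commit, int want_id)
{
        struct page *p, *start;

        if (commit->id == want_id) {        /* the likely case first */
                commit->entries--;
                return 1;
        }
        p = start = commit->next;           /* begin after the commit page */
        do {
                if (p->id == want_id) {
                        p->entries--;
                        return 1;
                }
                p = p->next;
        } while (p != start);
        return 0;                           /* "commit not part of this buffer??" */
}

int main(void)
{
        struct page a = { .id = 0, .entries = 2 };
        struct page b = { .id = 1, .entries = 5 };
        struct page c = { .id = 2, .entries = 1 };

        a.next = &b; b.next = &c; c.next = &a;     /* circular, no list head */

        printf("found=%d, page1 entries=%d\n", decrement_entry(&a, 1), b.entries);
        printf("found=%d (bogus id)\n", decrement_entry(&a, 9));
        return 0;
}
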
1713EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
1714 2387
1715/** 2388/**
1716 * ring_buffer_commit_discard - discard an event that has not been committed 2389 * ring_buffer_commit_discard - discard an event that has not been committed
1717 * @buffer: the ring buffer 2390 * @buffer: the ring buffer
1718 * @event: non committed event to discard 2391 * @event: non committed event to discard
1719 * 2392 *
1720 * This is similar to ring_buffer_event_discard but must only be 2393 * Sometimes an event that is in the ring buffer needs to be ignored.
1721 * performed on an event that has not been committed yet. The difference 2394 * This function lets the user discard an event in the ring buffer
1722 * is that this will also try to free the event from the ring buffer 2395 * and then that event will not be read later.
2396 *
2397 * This function only works if it is called before the the item has been
2398 * committed. It will try to free the event from the ring buffer
1723 * if another event has not been added behind it. 2399 * if another event has not been added behind it.
1724 * 2400 *
1725 * If another event has been added behind it, it will set the event 2401 * If another event has been added behind it, it will set the event
@@ -1737,32 +2413,27 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1737 /* The event is discarded regardless */ 2413 /* The event is discarded regardless */
1738 rb_event_discard(event); 2414 rb_event_discard(event);
1739 2415
2416 cpu = smp_processor_id();
2417 cpu_buffer = buffer->buffers[cpu];
2418
1740 /* 2419 /*
1741 * This must only be called if the event has not been 2420 * This must only be called if the event has not been
1742 * committed yet. Thus we can assume that preemption 2421 * committed yet. Thus we can assume that preemption
1743 * is still disabled. 2422 * is still disabled.
1744 */ 2423 */
1745 RB_WARN_ON(buffer, preemptible()); 2424 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1746 2425
1747 cpu = smp_processor_id(); 2426 rb_decrement_entry(cpu_buffer, event);
1748 cpu_buffer = buffer->buffers[cpu]; 2427 if (rb_try_to_discard(cpu_buffer, event))
1749
1750 if (!rb_try_to_discard(cpu_buffer, event))
1751 goto out; 2428 goto out;
1752 2429
1753 /* 2430 /*
1754 * The commit is still visible by the reader, so we 2431 * The commit is still visible by the reader, so we
1755 * must increment entries. 2432 * must still update the timestamp.
1756 */ 2433 */
1757 local_inc(&cpu_buffer->entries); 2434 rb_update_write_stamp(cpu_buffer, event);
1758 out: 2435 out:
1759 /* 2436 rb_end_commit(cpu_buffer);
1760 * If a write came in and pushed the tail page
1761 * we still need to update the commit pointer
1762 * if we were the commit.
1763 */
1764 if (rb_is_commit(cpu_buffer, event))
1765 rb_set_commit_to_write(cpu_buffer);
1766 2437
1767 trace_recursive_unlock(); 2438 trace_recursive_unlock();
1768 2439
@@ -1821,7 +2492,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1821 if (length > BUF_MAX_DATA_SIZE) 2492 if (length > BUF_MAX_DATA_SIZE)
1822 goto out; 2493 goto out;
1823 2494
1824 event = rb_reserve_next_event(cpu_buffer, length); 2495 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1825 if (!event) 2496 if (!event)
1826 goto out; 2497 goto out;
1827 2498
@@ -1842,9 +2513,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
1842static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 2513static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1843{ 2514{
1844 struct buffer_page *reader = cpu_buffer->reader_page; 2515 struct buffer_page *reader = cpu_buffer->reader_page;
1845 struct buffer_page *head = cpu_buffer->head_page; 2516 struct buffer_page *head = rb_set_head_page(cpu_buffer);
1846 struct buffer_page *commit = cpu_buffer->commit_page; 2517 struct buffer_page *commit = cpu_buffer->commit_page;
1847 2518
2519 /* In case of error, head will be NULL */
2520 if (unlikely(!head))
2521 return 1;
2522
1848 return reader->read == rb_page_commit(reader) && 2523 return reader->read == rb_page_commit(reader) &&
1849 (commit == reader || 2524 (commit == reader ||
1850 (commit == head && 2525 (commit == head &&
@@ -1935,7 +2610,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1935 return 0; 2610 return 0;
1936 2611
1937 cpu_buffer = buffer->buffers[cpu]; 2612 cpu_buffer = buffer->buffers[cpu];
1938 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) 2613 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
1939 - cpu_buffer->read; 2614 - cpu_buffer->read;
1940 2615
1941 return ret; 2616 return ret;
@@ -1956,33 +2631,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1956 return 0; 2631 return 0;
1957 2632
1958 cpu_buffer = buffer->buffers[cpu]; 2633 cpu_buffer = buffer->buffers[cpu];
1959 ret = cpu_buffer->overrun; 2634 ret = local_read(&cpu_buffer->overrun);
1960 2635
1961 return ret; 2636 return ret;
1962} 2637}
1963EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2638EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1964 2639
1965/** 2640/**
1966 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
1967 * @buffer: The ring buffer
1968 * @cpu: The per CPU buffer to get the number of overruns from
1969 */
1970unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
1971{
1972 struct ring_buffer_per_cpu *cpu_buffer;
1973 unsigned long ret;
1974
1975 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1976 return 0;
1977
1978 cpu_buffer = buffer->buffers[cpu];
1979 ret = cpu_buffer->nmi_dropped;
1980
1981 return ret;
1982}
1983EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
1984
1985/**
1986 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 2641 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
1987 * @buffer: The ring buffer 2642 * @buffer: The ring buffer
1988 * @cpu: The per CPU buffer to get the number of overruns from 2643 * @cpu: The per CPU buffer to get the number of overruns from
@@ -1997,7 +2652,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
1997 return 0; 2652 return 0;
1998 2653
1999 cpu_buffer = buffer->buffers[cpu]; 2654 cpu_buffer = buffer->buffers[cpu];
2000 ret = cpu_buffer->commit_overrun; 2655 ret = local_read(&cpu_buffer->commit_overrun);
2001 2656
2002 return ret; 2657 return ret;
2003} 2658}
@@ -2020,7 +2675,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2020 for_each_buffer_cpu(buffer, cpu) { 2675 for_each_buffer_cpu(buffer, cpu) {
2021 cpu_buffer = buffer->buffers[cpu]; 2676 cpu_buffer = buffer->buffers[cpu];
2022 entries += (local_read(&cpu_buffer->entries) - 2677 entries += (local_read(&cpu_buffer->entries) -
2023 cpu_buffer->overrun) - cpu_buffer->read; 2678 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2024 } 2679 }
2025 2680
2026 return entries; 2681 return entries;
@@ -2043,7 +2698,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
2043 /* if you care about this being correct, lock the buffer */ 2698 /* if you care about this being correct, lock the buffer */
2044 for_each_buffer_cpu(buffer, cpu) { 2699 for_each_buffer_cpu(buffer, cpu) {
2045 cpu_buffer = buffer->buffers[cpu]; 2700 cpu_buffer = buffer->buffers[cpu];
2046 overruns += cpu_buffer->overrun; 2701 overruns += local_read(&cpu_buffer->overrun);
2047 } 2702 }
2048 2703
2049 return overruns; 2704 return overruns;
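
The hunks above switch the per-cpu statistics (overrun, commit_overrun, entries) from plain integers to local_t, so the stat readers now go through local_read() instead of loading the field directly while the writer updates it from its own CPU. A rough userspace analogy using C11 relaxed atomics in place of the kernel's local_t (the atomic type here is an assumption for illustration, not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>

/* The writer bumps the counter from its own CPU; a statistics reader
 * such as ring_buffer_overrun_cpu() only does a plain load, so neither
 * side needs to take the reader lock. */
static _Atomic unsigned long overrun;

static void writer_drops_page(unsigned long lost_entries)
{
	atomic_fetch_add_explicit(&overrun, lost_entries,
				  memory_order_relaxed);
}

static unsigned long read_overruns(void)
{
	return atomic_load_explicit(&overrun, memory_order_relaxed);
}

int main(void)
{
	writer_drops_page(3);
	printf("overruns: %lu\n", read_overruns());
	return 0;
}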
@@ -2056,8 +2711,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2056 2711
2057 /* Iterator usage is expected to have record disabled */ 2712 /* Iterator usage is expected to have record disabled */
2058 if (list_empty(&cpu_buffer->reader_page->list)) { 2713 if (list_empty(&cpu_buffer->reader_page->list)) {
2059 iter->head_page = cpu_buffer->head_page; 2714 iter->head_page = rb_set_head_page(cpu_buffer);
2060 iter->head = cpu_buffer->head_page->read; 2715 if (unlikely(!iter->head_page))
2716 return;
2717 iter->head = iter->head_page->read;
2061 } else { 2718 } else {
2062 iter->head_page = cpu_buffer->reader_page; 2719 iter->head_page = cpu_buffer->reader_page;
2063 iter->head = cpu_buffer->reader_page->read; 2720 iter->head = cpu_buffer->reader_page->read;
@@ -2174,6 +2831,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2174 struct buffer_page *reader = NULL; 2831 struct buffer_page *reader = NULL;
2175 unsigned long flags; 2832 unsigned long flags;
2176 int nr_loops = 0; 2833 int nr_loops = 0;
2834 int ret;
2177 2835
2178 local_irq_save(flags); 2836 local_irq_save(flags);
2179 __raw_spin_lock(&cpu_buffer->lock); 2837 __raw_spin_lock(&cpu_buffer->lock);
@@ -2207,30 +2865,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2207 goto out; 2865 goto out;
2208 2866
2209 /* 2867 /*
2210 * Splice the empty reader page into the list around the head.
2211 * Reset the reader page to size zero. 2868 * Reset the reader page to size zero.
2212 */ 2869 */
2870 local_set(&cpu_buffer->reader_page->write, 0);
2871 local_set(&cpu_buffer->reader_page->entries, 0);
2872 local_set(&cpu_buffer->reader_page->page->commit, 0);
2213 2873
2214 reader = cpu_buffer->head_page; 2874 spin:
2875 /*
2876 * Splice the empty reader page into the list around the head.
2877 */
2878 reader = rb_set_head_page(cpu_buffer);
2215 cpu_buffer->reader_page->list.next = reader->list.next; 2879 cpu_buffer->reader_page->list.next = reader->list.next;
2216 cpu_buffer->reader_page->list.prev = reader->list.prev; 2880 cpu_buffer->reader_page->list.prev = reader->list.prev;
2217 2881
2218 local_set(&cpu_buffer->reader_page->write, 0); 2882 /*
2219 local_set(&cpu_buffer->reader_page->entries, 0); 2883 * cpu_buffer->pages just needs to point to the buffer, it
2220 local_set(&cpu_buffer->reader_page->page->commit, 0); 2884 * has no specific buffer page to point to. Let's move it out
 2885 * of our way so we don't accidentally swap it.
2886 */
2887 cpu_buffer->pages = reader->list.prev;
2221 2888
2222 /* Make the reader page now replace the head */ 2889 /* The reader page will be pointing to the new head */
2223 reader->list.prev->next = &cpu_buffer->reader_page->list; 2890 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2224 reader->list.next->prev = &cpu_buffer->reader_page->list; 2891
2892 /*
2893 * Here's the tricky part.
2894 *
2895 * We need to move the pointer past the header page.
2896 * But we can only do that if a writer is not currently
2897 * moving it. The page before the header page has the
2898 * flag bit '1' set if it is pointing to the page we want.
 2899 * But if the writer is in the process of moving it
 2900 * then it will be '2' or already moved '0'.
2901 */
2902
2903 ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
2225 2904
2226 /* 2905 /*
2227 * If the tail is on the reader, then we must set the head 2906 * If we did not convert it, then we must try again.
2228 * to the inserted page, otherwise we set it one before.
2229 */ 2907 */
2230 cpu_buffer->head_page = cpu_buffer->reader_page; 2908 if (!ret)
2909 goto spin;
2231 2910
2232 if (cpu_buffer->commit_page != reader) 2911 /*
2233 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2912 * Yeah! We succeeded in replacing the page.
2913 *
2914 * Now make the new head point back to the reader page.
2915 */
2916 reader->list.next->prev = &cpu_buffer->reader_page->list;
2917 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2234 2918
2235 /* Finally update the reader page to the new head */ 2919 /* Finally update the reader page to the new head */
2236 cpu_buffer->reader_page = reader; 2920 cpu_buffer->reader_page = reader;
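
The spin/retry logic in the hunk above leans on the new lockless head-page scheme: the list pointer that targets the head page carries a flag in its low bits (1 = head, 2 = a writer is moving it, 0 = already moved), and the reader claims the head with a single compare-and-swap, looping back to the spin: label when the writer wins the race. A minimal userspace sketch of that tagged-pointer cmpxchg idea (the flag names echo the kernel's, but the code is an illustration, not rb_head_page_replace() itself):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RB_PAGE_HEAD   ((uintptr_t)1)	/* link targets the head page       */
#define RB_PAGE_UPDATE ((uintptr_t)2)	/* a writer is busy moving the head */
#define RB_FLAG_MASK   ((uintptr_t)3)

struct page { int id; };

/* Retarget a tagged "next" link from the old head to the reader page.
 * Fails (returns 0) when the writer already changed the flag, in which
 * case the caller loops and picks up the new head page. */
static int head_page_replace(_Atomic uintptr_t *link,
			     struct page *old_head, struct page *reader)
{
	uintptr_t expect = (uintptr_t)old_head | RB_PAGE_HEAD;
	uintptr_t desired = (uintptr_t)reader | RB_PAGE_HEAD;

	return atomic_compare_exchange_strong(link, &expect, desired);
}

int main(void)
{
	struct page head = { 1 }, reader = { 2 };
	_Atomic uintptr_t link = (uintptr_t)&head | RB_PAGE_HEAD;

	if (head_page_replace(&link, &head, &reader)) {
		struct page *now =
			(struct page *)(atomic_load(&link) & ~RB_FLAG_MASK);

		printf("page %d is now the head\n", now->id);
	}
	return 0;
}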
@@ -2259,8 +2943,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2259 2943
2260 event = rb_reader_event(cpu_buffer); 2944 event = rb_reader_event(cpu_buffer);
2261 2945
2262 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX 2946 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
2263 || rb_discarded_event(event))
2264 cpu_buffer->read++; 2947 cpu_buffer->read++;
2265 2948
2266 rb_update_read_stamp(cpu_buffer, event); 2949 rb_update_read_stamp(cpu_buffer, event);
@@ -2351,7 +3034,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2351 * the box. Return the padding, and we will release 3034 * the box. Return the padding, and we will release
2352 * the current locks, and try again. 3035 * the current locks, and try again.
2353 */ 3036 */
2354 rb_advance_reader(cpu_buffer);
2355 return event; 3037 return event;
2356 3038
2357 case RINGBUF_TYPE_TIME_EXTEND: 3039 case RINGBUF_TYPE_TIME_EXTEND:
@@ -2446,6 +3128,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2446} 3128}
2447EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); 3129EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
2448 3130
3131static inline int rb_ok_to_lock(void)
3132{
3133 /*
3134 * If an NMI die dumps out the content of the ring buffer
3135 * do not grab locks. We also permanently disable the ring
3136 * buffer too. A one time deal is all you get from reading
3137 * the ring buffer from an NMI.
3138 */
3139 if (likely(!in_nmi()))
3140 return 1;
3141
3142 tracing_off_permanent();
3143 return 0;
3144}
3145
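
rb_ok_to_lock() captures the rule spelled out in its comment: an NMI that dumps the buffer must not take the reader spinlock, so locking is skipped and tracing is turned off permanently, making the NMI read a one-shot operation. Sketched below with stand-ins for in_nmi() and tracing_off_permanent() (both stand-ins are assumptions for illustration, not the kernel helpers):

#include <stdbool.h>
#include <stdio.h>

static bool tracing_enabled = true;
static bool in_nmi_context;			/* stand-in for in_nmi() */

static void disable_tracing_forever(void)	/* tracing_off_permanent() */
{
	tracing_enabled = false;
}

/* Return nonzero when it is safe to take the reader lock. */
static int ok_to_lock(void)
{
	if (!in_nmi_context)
		return 1;

	/* One lockless read from NMI, then the buffer stays off. */
	disable_tracing_forever();
	return 0;
}

int main(void)
{
	int dolock = ok_to_lock();

	printf("dolock=%d, tracing still enabled: %d\n",
	       dolock, tracing_enabled);
	return 0;
}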
2449/** 3146/**
2450 * ring_buffer_peek - peek at the next event to be read 3147 * ring_buffer_peek - peek at the next event to be read
2451 * @buffer: The ring buffer to read 3148 * @buffer: The ring buffer to read
@@ -2461,19 +3158,25 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2461 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3158 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2462 struct ring_buffer_event *event; 3159 struct ring_buffer_event *event;
2463 unsigned long flags; 3160 unsigned long flags;
3161 int dolock;
2464 3162
2465 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3163 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2466 return NULL; 3164 return NULL;
2467 3165
3166 dolock = rb_ok_to_lock();
2468 again: 3167 again:
2469 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3168 local_irq_save(flags);
3169 if (dolock)
3170 spin_lock(&cpu_buffer->reader_lock);
2470 event = rb_buffer_peek(buffer, cpu, ts); 3171 event = rb_buffer_peek(buffer, cpu, ts);
2471 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3172 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3173 rb_advance_reader(cpu_buffer);
3174 if (dolock)
3175 spin_unlock(&cpu_buffer->reader_lock);
3176 local_irq_restore(flags);
2472 3177
2473 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3178 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2474 cpu_relax();
2475 goto again; 3179 goto again;
2476 }
2477 3180
2478 return event; 3181 return event;
2479} 3182}
@@ -2498,10 +3201,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2498 event = rb_iter_peek(iter, ts); 3201 event = rb_iter_peek(iter, ts);
2499 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3202 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2500 3203
2501 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3204 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2502 cpu_relax();
2503 goto again; 3205 goto again;
2504 }
2505 3206
2506 return event; 3207 return event;
2507} 3208}
@@ -2520,6 +3221,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2520 struct ring_buffer_per_cpu *cpu_buffer; 3221 struct ring_buffer_per_cpu *cpu_buffer;
2521 struct ring_buffer_event *event = NULL; 3222 struct ring_buffer_event *event = NULL;
2522 unsigned long flags; 3223 unsigned long flags;
3224 int dolock;
3225
3226 dolock = rb_ok_to_lock();
2523 3227
2524 again: 3228 again:
2525 /* might be called in atomic */ 3229 /* might be called in atomic */
@@ -2529,24 +3233,23 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2529 goto out; 3233 goto out;
2530 3234
2531 cpu_buffer = buffer->buffers[cpu]; 3235 cpu_buffer = buffer->buffers[cpu];
2532 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3236 local_irq_save(flags);
3237 if (dolock)
3238 spin_lock(&cpu_buffer->reader_lock);
2533 3239
2534 event = rb_buffer_peek(buffer, cpu, ts); 3240 event = rb_buffer_peek(buffer, cpu, ts);
2535 if (!event) 3241 if (event)
2536 goto out_unlock; 3242 rb_advance_reader(cpu_buffer);
2537
2538 rb_advance_reader(cpu_buffer);
2539 3243
2540 out_unlock: 3244 if (dolock)
2541 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3245 spin_unlock(&cpu_buffer->reader_lock);
3246 local_irq_restore(flags);
2542 3247
2543 out: 3248 out:
2544 preempt_enable(); 3249 preempt_enable();
2545 3250
2546 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3251 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2547 cpu_relax();
2548 goto again; 3252 goto again;
2549 }
2550 3253
2551 return event; 3254 return event;
2552} 3255}
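
ring_buffer_peek() and ring_buffer_consume() above now share the same shape: save irqs, take the reader lock only when rb_ok_to_lock() allowed it, advance past the event (padding included) while still under the lock, and retry from again: when a padding record came back. A condensed, self-contained sketch of that control flow, with a plain mutex standing in for the reader spinlock and a toy event array (all names below are hypothetical):

#include <pthread.h>
#include <stdio.h>

enum ev_type { EV_PADDING, EV_DATA, EV_NONE };

static pthread_mutex_t reader_lock = PTHREAD_MUTEX_INITIALIZER;
static enum ev_type events[] = { EV_PADDING, EV_DATA };
static int read_pos;

static enum ev_type peek(void)
{
	return read_pos < 2 ? events[read_pos] : EV_NONE;
}

/* Consume the next real event: lock only if allowed, always advance
 * past what was peeked, and loop until something other than padding
 * shows up -- padding never reaches the caller. */
static enum ev_type consume(int dolock)
{
	enum ev_type ev;

again:
	if (dolock)
		pthread_mutex_lock(&reader_lock);

	ev = peek();
	if (ev != EV_NONE)
		read_pos++;			/* rb_advance_reader() */

	if (dolock)
		pthread_mutex_unlock(&reader_lock);

	if (ev == EV_PADDING)
		goto again;

	return ev;
}

int main(void)
{
	printf("consumed event type %d\n", consume(1));
	return 0;
}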
@@ -2626,21 +3329,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2626 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3329 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2627 unsigned long flags; 3330 unsigned long flags;
2628 3331
2629 again:
2630 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3332 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3333 again:
2631 event = rb_iter_peek(iter, ts); 3334 event = rb_iter_peek(iter, ts);
2632 if (!event) 3335 if (!event)
2633 goto out; 3336 goto out;
2634 3337
3338 if (event->type_len == RINGBUF_TYPE_PADDING)
3339 goto again;
3340
2635 rb_advance_iter(iter); 3341 rb_advance_iter(iter);
2636 out: 3342 out:
2637 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3343 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2638 3344
2639 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2640 cpu_relax();
2641 goto again;
2642 }
2643
2644 return event; 3345 return event;
2645} 3346}
2646EXPORT_SYMBOL_GPL(ring_buffer_read); 3347EXPORT_SYMBOL_GPL(ring_buffer_read);
@@ -2658,8 +3359,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
2658static void 3359static void
2659rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 3360rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2660{ 3361{
3362 rb_head_page_deactivate(cpu_buffer);
3363
2661 cpu_buffer->head_page 3364 cpu_buffer->head_page
2662 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 3365 = list_entry(cpu_buffer->pages, struct buffer_page, list);
2663 local_set(&cpu_buffer->head_page->write, 0); 3366 local_set(&cpu_buffer->head_page->write, 0);
2664 local_set(&cpu_buffer->head_page->entries, 0); 3367 local_set(&cpu_buffer->head_page->entries, 0);
2665 local_set(&cpu_buffer->head_page->page->commit, 0); 3368 local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2675,14 +3378,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2675 local_set(&cpu_buffer->reader_page->page->commit, 0); 3378 local_set(&cpu_buffer->reader_page->page->commit, 0);
2676 cpu_buffer->reader_page->read = 0; 3379 cpu_buffer->reader_page->read = 0;
2677 3380
2678 cpu_buffer->nmi_dropped = 0; 3381 local_set(&cpu_buffer->commit_overrun, 0);
2679 cpu_buffer->commit_overrun = 0; 3382 local_set(&cpu_buffer->overrun, 0);
2680 cpu_buffer->overrun = 0;
2681 cpu_buffer->read = 0;
2682 local_set(&cpu_buffer->entries, 0); 3383 local_set(&cpu_buffer->entries, 0);
3384 local_set(&cpu_buffer->committing, 0);
3385 local_set(&cpu_buffer->commits, 0);
3386 cpu_buffer->read = 0;
2683 3387
2684 cpu_buffer->write_stamp = 0; 3388 cpu_buffer->write_stamp = 0;
2685 cpu_buffer->read_stamp = 0; 3389 cpu_buffer->read_stamp = 0;
3390
3391 rb_head_page_activate(cpu_buffer);
2686} 3392}
2687 3393
2688/** 3394/**
@@ -2702,12 +3408,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2702 3408
2703 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3409 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2704 3410
3411 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3412 goto out;
3413
2705 __raw_spin_lock(&cpu_buffer->lock); 3414 __raw_spin_lock(&cpu_buffer->lock);
2706 3415
2707 rb_reset_cpu(cpu_buffer); 3416 rb_reset_cpu(cpu_buffer);
2708 3417
2709 __raw_spin_unlock(&cpu_buffer->lock); 3418 __raw_spin_unlock(&cpu_buffer->lock);
2710 3419
3420 out:
2711 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3421 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2712 3422
2713 atomic_dec(&cpu_buffer->record_disabled); 3423 atomic_dec(&cpu_buffer->record_disabled);
@@ -2734,12 +3444,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
2734int ring_buffer_empty(struct ring_buffer *buffer) 3444int ring_buffer_empty(struct ring_buffer *buffer)
2735{ 3445{
2736 struct ring_buffer_per_cpu *cpu_buffer; 3446 struct ring_buffer_per_cpu *cpu_buffer;
3447 unsigned long flags;
3448 int dolock;
2737 int cpu; 3449 int cpu;
3450 int ret;
3451
3452 dolock = rb_ok_to_lock();
2738 3453
2739 /* yes this is racy, but if you don't like the race, lock the buffer */ 3454 /* yes this is racy, but if you don't like the race, lock the buffer */
2740 for_each_buffer_cpu(buffer, cpu) { 3455 for_each_buffer_cpu(buffer, cpu) {
2741 cpu_buffer = buffer->buffers[cpu]; 3456 cpu_buffer = buffer->buffers[cpu];
2742 if (!rb_per_cpu_empty(cpu_buffer)) 3457 local_irq_save(flags);
3458 if (dolock)
3459 spin_lock(&cpu_buffer->reader_lock);
3460 ret = rb_per_cpu_empty(cpu_buffer);
3461 if (dolock)
3462 spin_unlock(&cpu_buffer->reader_lock);
3463 local_irq_restore(flags);
3464
3465 if (!ret)
2743 return 0; 3466 return 0;
2744 } 3467 }
2745 3468
@@ -2755,19 +3478,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2755int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 3478int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2756{ 3479{
2757 struct ring_buffer_per_cpu *cpu_buffer; 3480 struct ring_buffer_per_cpu *cpu_buffer;
3481 unsigned long flags;
3482 int dolock;
2758 int ret; 3483 int ret;
2759 3484
2760 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3485 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2761 return 1; 3486 return 1;
2762 3487
3488 dolock = rb_ok_to_lock();
3489
2763 cpu_buffer = buffer->buffers[cpu]; 3490 cpu_buffer = buffer->buffers[cpu];
3491 local_irq_save(flags);
3492 if (dolock)
3493 spin_lock(&cpu_buffer->reader_lock);
2764 ret = rb_per_cpu_empty(cpu_buffer); 3494 ret = rb_per_cpu_empty(cpu_buffer);
2765 3495 if (dolock)
3496 spin_unlock(&cpu_buffer->reader_lock);
3497 local_irq_restore(flags);
2766 3498
2767 return ret; 3499 return ret;
2768} 3500}
2769EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); 3501EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2770 3502
3503#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2771/** 3504/**
2772 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 3505 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
2773 * @buffer_a: One buffer to swap with 3506 * @buffer_a: One buffer to swap with
@@ -2822,20 +3555,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2822 atomic_inc(&cpu_buffer_a->record_disabled); 3555 atomic_inc(&cpu_buffer_a->record_disabled);
2823 atomic_inc(&cpu_buffer_b->record_disabled); 3556 atomic_inc(&cpu_buffer_b->record_disabled);
2824 3557
3558 ret = -EBUSY;
3559 if (local_read(&cpu_buffer_a->committing))
3560 goto out_dec;
3561 if (local_read(&cpu_buffer_b->committing))
3562 goto out_dec;
3563
2825 buffer_a->buffers[cpu] = cpu_buffer_b; 3564 buffer_a->buffers[cpu] = cpu_buffer_b;
2826 buffer_b->buffers[cpu] = cpu_buffer_a; 3565 buffer_b->buffers[cpu] = cpu_buffer_a;
2827 3566
2828 cpu_buffer_b->buffer = buffer_a; 3567 cpu_buffer_b->buffer = buffer_a;
2829 cpu_buffer_a->buffer = buffer_b; 3568 cpu_buffer_a->buffer = buffer_b;
2830 3569
3570 ret = 0;
3571
3572out_dec:
2831 atomic_dec(&cpu_buffer_a->record_disabled); 3573 atomic_dec(&cpu_buffer_a->record_disabled);
2832 atomic_dec(&cpu_buffer_b->record_disabled); 3574 atomic_dec(&cpu_buffer_b->record_disabled);
2833
2834 ret = 0;
2835out: 3575out:
2836 return ret; 3576 return ret;
2837} 3577}
2838EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 3578EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3579#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
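
ring_buffer_swap_cpu() now refuses the swap with -EBUSY whenever either per-cpu buffer has a commit in flight, with recording disabled around the check and re-enabled on the way out. A small userspace sketch of that guard (struct cpu_buf and swap_cpu_bufs() are invented for illustration; the kernel additionally bumps record_disabled first):

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

struct cpu_buf {
	_Atomic int committing;	/* nonzero while a writer is mid-commit */
	int id;
};

/* Refuse to swap while either side has a commit in flight; the caller
 * can retry later, mirroring the new -EBUSY path above. */
static int swap_cpu_bufs(struct cpu_buf **a, struct cpu_buf **b)
{
	struct cpu_buf *tmp;

	if (atomic_load(&(*a)->committing) || atomic_load(&(*b)->committing))
		return -EBUSY;

	tmp = *a;
	*a = *b;
	*b = tmp;
	return 0;
}

int main(void)
{
	struct cpu_buf x = { .id = 1 }, y = { .id = 2 };
	struct cpu_buf *pa = &x, *pb = &y;
	int ret = swap_cpu_bufs(&pa, &pb);

	printf("swap returned %d, a now holds buffer %d\n", ret, pa->id);
	return 0;
}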
2839 3580
2840/** 3581/**
2841 * ring_buffer_alloc_read_page - allocate a page to read from buffer 3582 * ring_buffer_alloc_read_page - allocate a page to read from buffer
@@ -3008,7 +3749,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3008 read = 0; 3749 read = 0;
3009 } else { 3750 } else {
3010 /* update the entry counter */ 3751 /* update the entry counter */
3011 cpu_buffer->read += local_read(&reader->entries); 3752 cpu_buffer->read += rb_page_entries(reader);
3012 3753
3013 /* swap the pages */ 3754 /* swap the pages */
3014 rb_init_page(bpage); 3755 rb_init_page(bpage);
@@ -3029,6 +3770,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3029} 3770}
3030EXPORT_SYMBOL_GPL(ring_buffer_read_page); 3771EXPORT_SYMBOL_GPL(ring_buffer_read_page);
3031 3772
3773#ifdef CONFIG_TRACING
3032static ssize_t 3774static ssize_t
3033rb_simple_read(struct file *filp, char __user *ubuf, 3775rb_simple_read(struct file *filp, char __user *ubuf,
3034 size_t cnt, loff_t *ppos) 3776 size_t cnt, loff_t *ppos)
@@ -3096,6 +3838,7 @@ static __init int rb_init_debugfs(void)
3096} 3838}
3097 3839
3098fs_initcall(rb_init_debugfs); 3840fs_initcall(rb_init_debugfs);
3841#endif
3099 3842
3100#ifdef CONFIG_HOTPLUG_CPU 3843#ifdef CONFIG_HOTPLUG_CPU
3101static int rb_cpu_notify(struct notifier_block *self, 3844static int rb_cpu_notify(struct notifier_block *self,
@@ -3108,7 +3851,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3108 switch (action) { 3851 switch (action) {
3109 case CPU_UP_PREPARE: 3852 case CPU_UP_PREPARE:
3110 case CPU_UP_PREPARE_FROZEN: 3853 case CPU_UP_PREPARE_FROZEN:
3111 if (cpu_isset(cpu, *buffer->cpumask)) 3854 if (cpumask_test_cpu(cpu, buffer->cpumask))
3112 return NOTIFY_OK; 3855 return NOTIFY_OK;
3113 3856
3114 buffer->buffers[cpu] = 3857 buffer->buffers[cpu] =
@@ -3119,7 +3862,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3119 return NOTIFY_OK; 3862 return NOTIFY_OK;
3120 } 3863 }
3121 smp_wmb(); 3864 smp_wmb();
3122 cpu_set(cpu, *buffer->cpumask); 3865 cpumask_set_cpu(cpu, buffer->cpumask);
3123 break; 3866 break;
3124 case CPU_DOWN_PREPARE: 3867 case CPU_DOWN_PREPARE:
3125 case CPU_DOWN_PREPARE_FROZEN: 3868 case CPU_DOWN_PREPARE_FROZEN:
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 8d68e149a8b3..573d3cc762c3 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -102,8 +102,10 @@ static enum event_status read_page(int cpu)
102 event = (void *)&rpage->data[i]; 102 event = (void *)&rpage->data[i];
103 switch (event->type_len) { 103 switch (event->type_len) {
104 case RINGBUF_TYPE_PADDING: 104 case RINGBUF_TYPE_PADDING:
105 /* We don't expect any padding */ 105 /* failed writes may be discarded events */
106 KILL_TEST(); 106 if (!event->time_delta)
107 KILL_TEST();
108 inc = event->array[0] + 4;
107 break; 109 break;
108 case RINGBUF_TYPE_TIME_EXTEND: 110 case RINGBUF_TYPE_TIME_EXTEND:
109 inc = 8; 111 inc = 8;
@@ -119,7 +121,7 @@ static enum event_status read_page(int cpu)
119 KILL_TEST(); 121 KILL_TEST();
120 break; 122 break;
121 } 123 }
122 inc = event->array[0]; 124 inc = event->array[0] + 4;
123 break; 125 break;
124 default: 126 default:
125 entry = ring_buffer_event_data(event); 127 entry = ring_buffer_event_data(event);
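
The benchmark fix above is about how far the cursor advances per record: the stored length covers only the payload, so the walker must also count the event header word, hence the "+ 4". A simplified, self-contained walker over a hypothetical page layout (a 4-byte length word followed by the payload) showing the same accounting; this is not the real ring-buffer event format:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Walking the page has to advance by payload plus header, i.e.
 * len + 4 -- forgetting the header word is exactly the off-by-four
 * that the benchmark change corrects. */
static void walk_page(const uint8_t *page, size_t commit)
{
	size_t pos = 0;

	while (pos < commit) {
		uint32_t len;

		memcpy(&len, page + pos, sizeof(len));
		printf("record at %zu, %u payload bytes\n", pos, len);
		pos += len + 4;
	}
}

int main(void)
{
	uint8_t page[16] = { 0 };
	uint32_t len = 4;		/* one record with a 4-byte payload */

	memcpy(page, &len, sizeof(len));
	walk_page(page, 8);		/* 4-byte header + 4-byte payload */
	return 0;
}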
@@ -201,7 +203,7 @@ static void ring_buffer_producer(void)
201 * Hammer the buffer for 10 secs (this may 203 * Hammer the buffer for 10 secs (this may
202 * make the system stall) 204 * make the system stall)
203 */ 205 */
204 pr_info("Starting ring buffer hammer\n"); 206 trace_printk("Starting ring buffer hammer\n");
205 do_gettimeofday(&start_tv); 207 do_gettimeofday(&start_tv);
206 do { 208 do {
207 struct ring_buffer_event *event; 209 struct ring_buffer_event *event;
@@ -237,7 +239,7 @@ static void ring_buffer_producer(void)
237#endif 239#endif
238 240
239 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); 241 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
240 pr_info("End ring buffer hammer\n"); 242 trace_printk("End ring buffer hammer\n");
241 243
242 if (consumer) { 244 if (consumer) {
243 /* Init both completions here to avoid races */ 245 /* Init both completions here to avoid races */
@@ -260,49 +262,50 @@ static void ring_buffer_producer(void)
260 overruns = ring_buffer_overruns(buffer); 262 overruns = ring_buffer_overruns(buffer);
261 263
262 if (kill_test) 264 if (kill_test)
263 pr_info("ERROR!\n"); 265 trace_printk("ERROR!\n");
264 pr_info("Time: %lld (usecs)\n", time); 266 trace_printk("Time: %lld (usecs)\n", time);
265 pr_info("Overruns: %lld\n", overruns); 267 trace_printk("Overruns: %lld\n", overruns);
266 if (disable_reader) 268 if (disable_reader)
267 pr_info("Read: (reader disabled)\n"); 269 trace_printk("Read: (reader disabled)\n");
268 else 270 else
269 pr_info("Read: %ld (by %s)\n", read, 271 trace_printk("Read: %ld (by %s)\n", read,
270 read_events ? "events" : "pages"); 272 read_events ? "events" : "pages");
271 pr_info("Entries: %lld\n", entries); 273 trace_printk("Entries: %lld\n", entries);
272 pr_info("Total: %lld\n", entries + overruns + read); 274 trace_printk("Total: %lld\n", entries + overruns + read);
273 pr_info("Missed: %ld\n", missed); 275 trace_printk("Missed: %ld\n", missed);
274 pr_info("Hit: %ld\n", hit); 276 trace_printk("Hit: %ld\n", hit);
275 277
276 /* Convert time from usecs to millisecs */ 278 /* Convert time from usecs to millisecs */
277 do_div(time, USEC_PER_MSEC); 279 do_div(time, USEC_PER_MSEC);
278 if (time) 280 if (time)
279 hit /= (long)time; 281 hit /= (long)time;
280 else 282 else
281 pr_info("TIME IS ZERO??\n"); 283 trace_printk("TIME IS ZERO??\n");
282 284
283 pr_info("Entries per millisec: %ld\n", hit); 285 trace_printk("Entries per millisec: %ld\n", hit);
284 286
285 if (hit) { 287 if (hit) {
286 /* Calculate the average time in nanosecs */ 288 /* Calculate the average time in nanosecs */
287 avg = NSEC_PER_MSEC / hit; 289 avg = NSEC_PER_MSEC / hit;
288 pr_info("%ld ns per entry\n", avg); 290 trace_printk("%ld ns per entry\n", avg);
289 } 291 }
290 292
291 if (missed) { 293 if (missed) {
292 if (time) 294 if (time)
293 missed /= (long)time; 295 missed /= (long)time;
294 296
295 pr_info("Total iterations per millisec: %ld\n", hit + missed); 297 trace_printk("Total iterations per millisec: %ld\n",
298 hit + missed);
296 299
297 /* it is possible that hit + missed will overflow and be zero */ 300 /* it is possible that hit + missed will overflow and be zero */
298 if (!(hit + missed)) { 301 if (!(hit + missed)) {
299 pr_info("hit + missed overflowed and totalled zero!\n"); 302 trace_printk("hit + missed overflowed and totalled zero!\n");
300 hit--; /* make it non zero */ 303 hit--; /* make it non zero */
301 } 304 }
302 305
303 /* Calculate the average time in nanosecs */ 306
304 avg = NSEC_PER_MSEC / (hit + missed); 307 avg = NSEC_PER_MSEC / (hit + missed);
305 pr_info("%ld ns per entry\n", avg); 308 trace_printk("%ld ns per entry\n", avg);
306 } 309 }
307} 310}
308 311
@@ -353,7 +356,7 @@ static int ring_buffer_producer_thread(void *arg)
353 356
354 ring_buffer_producer(); 357 ring_buffer_producer();
355 358
356 pr_info("Sleeping for 10 secs\n"); 359 trace_printk("Sleeping for 10 secs\n");
357 set_current_state(TASK_INTERRUPTIBLE); 360 set_current_state(TASK_INTERRUPTIBLE);
358 schedule_timeout(HZ * SLEEP_TIME); 361 schedule_timeout(HZ * SLEEP_TIME);
359 __set_current_state(TASK_RUNNING); 362 __set_current_state(TASK_RUNNING);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c1878bfb2e1e..5c75deeefe30 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
20#include <linux/notifier.h> 21#include <linux/notifier.h>
21#include <linux/irqflags.h> 22#include <linux/irqflags.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
@@ -42,14 +43,11 @@
42 43
43#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
44 45
45unsigned long __read_mostly tracing_max_latency;
46unsigned long __read_mostly tracing_thresh;
47
48/* 46/*
49 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
50 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
51 */ 49 */
52static int ring_buffer_expanded; 50int ring_buffer_expanded;
53 51
54/* 52/*
55 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -63,7 +61,7 @@ static bool __read_mostly tracing_selftest_running;
63/* 61/*
64 * If a tracer is running, we do not want to run SELFTEST. 62 * If a tracer is running, we do not want to run SELFTEST.
65 */ 63 */
66static bool __read_mostly tracing_selftest_disabled; 64bool __read_mostly tracing_selftest_disabled;
67 65
68/* For tracers that don't implement custom flags */ 66/* For tracers that don't implement custom flags */
69static struct tracer_opt dummy_tracer_opt[] = { 67static struct tracer_opt dummy_tracer_opt[] = {
@@ -88,7 +86,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
88 */ 86 */
89static int tracing_disabled = 1; 87static int tracing_disabled = 1;
90 88
91static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
92 90
93static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
94{ 92{
@@ -171,10 +169,11 @@ static struct trace_array global_trace;
171 169
172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 170static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
173 171
174int filter_current_check_discard(struct ftrace_event_call *call, void *rec, 172int filter_current_check_discard(struct ring_buffer *buffer,
173 struct ftrace_event_call *call, void *rec,
175 struct ring_buffer_event *event) 174 struct ring_buffer_event *event)
176{ 175{
177 return filter_check_discard(call, rec, global_trace.buffer, event); 176 return filter_check_discard(call, rec, buffer, event);
178} 177}
179EXPORT_SYMBOL_GPL(filter_current_check_discard); 178EXPORT_SYMBOL_GPL(filter_current_check_discard);
180 179
@@ -265,6 +264,9 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
265 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 264 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
266 TRACE_ITER_GRAPH_TIME; 265 TRACE_ITER_GRAPH_TIME;
267 266
267static int trace_stop_count;
268static DEFINE_SPINLOCK(tracing_start_lock);
269
268/** 270/**
269 * trace_wake_up - wake up tasks waiting for trace input 271 * trace_wake_up - wake up tasks waiting for trace input
270 * 272 *
@@ -284,13 +286,12 @@ void trace_wake_up(void)
284static int __init set_buf_size(char *str) 286static int __init set_buf_size(char *str)
285{ 287{
286 unsigned long buf_size; 288 unsigned long buf_size;
287 int ret;
288 289
289 if (!str) 290 if (!str)
290 return 0; 291 return 0;
291 ret = strict_strtoul(str, 0, &buf_size); 292 buf_size = memparse(str, &str);
292 /* nr_entries can not be zero */ 293 /* nr_entries can not be zero */
293 if (ret < 0 || buf_size == 0) 294 if (buf_size == 0)
294 return 0; 295 return 0;
295 trace_buf_size = buf_size; 296 trace_buf_size = buf_size;
296 return 1; 297 return 1;
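
set_buf_size() now goes through memparse() instead of strict_strtoul(), so the trace_buf_size= boot parameter accepts suffixed values such as 1441k or 16M. A userspace analogue of that parsing (parse_size() below is an illustration, not memparse(), and only handles the K/M/G suffixes):

#include <stdio.h>
#include <stdlib.h>

/* Parse a number with an optional K/M/G suffix. */
static unsigned long parse_size(const char *str)
{
	char *end;
	unsigned long val = strtoul(str, &end, 0);

	switch (*end) {
	case 'G': case 'g':
		val <<= 10;	/* fall through */
	case 'M': case 'm':
		val <<= 10;	/* fall through */
	case 'K': case 'k':
		val <<= 10;
		break;
	}
	return val;
}

int main(void)
{
	printf("1441k -> %lu bytes\n", parse_size("1441k"));
	printf("16M   -> %lu bytes\n", parse_size("16M"));
	return 0;
}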
@@ -323,50 +324,20 @@ static const char *trace_options[] = {
323 "printk-msg-only", 324 "printk-msg-only",
324 "context-info", 325 "context-info",
325 "latency-format", 326 "latency-format",
326 "global-clock",
327 "sleep-time", 327 "sleep-time",
328 "graph-time", 328 "graph-time",
329 NULL 329 NULL
330}; 330};
331 331
332/* 332static struct {
333 * ftrace_max_lock is used to protect the swapping of buffers 333 u64 (*func)(void);
334 * when taking a max snapshot. The buffers themselves are 334 const char *name;
335 * protected by per_cpu spinlocks. But the action of the swap 335} trace_clocks[] = {
336 * needs its own lock. 336 { trace_clock_local, "local" },
337 * 337 { trace_clock_global, "global" },
338 * This is defined as a raw_spinlock_t in order to help 338};
339 * with performance when lockdep debugging is enabled.
340 */
341static raw_spinlock_t ftrace_max_lock =
342 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
343
344/*
345 * Copy the new maximum trace into the separate maximum-trace
346 * structure. (this way the maximum trace is permanently saved,
347 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
348 */
349static void
350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
351{
352 struct trace_array_cpu *data = tr->data[cpu];
353
354 max_tr.cpu = cpu;
355 max_tr.time_start = data->preempt_timestamp;
356
357 data = max_tr.data[cpu];
358 data->saved_latency = tracing_max_latency;
359
360 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
361 data->pid = tsk->pid;
362 data->uid = task_uid(tsk);
363 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
364 data->policy = tsk->policy;
365 data->rt_priority = tsk->rt_priority;
366 339
367 /* record this tasks comm */ 340int trace_clock_id;
368 tracing_record_cmdline(tsk);
369}
370 341
371ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 342ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
372{ 343{
@@ -411,6 +382,56 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
411 return cnt; 382 return cnt;
412} 383}
413 384
385/*
386 * ftrace_max_lock is used to protect the swapping of buffers
387 * when taking a max snapshot. The buffers themselves are
388 * protected by per_cpu spinlocks. But the action of the swap
389 * needs its own lock.
390 *
391 * This is defined as a raw_spinlock_t in order to help
392 * with performance when lockdep debugging is enabled.
393 *
394 * It is also used in other places outside the update_max_tr
395 * so it needs to be defined outside of the
396 * CONFIG_TRACER_MAX_TRACE.
397 */
398static raw_spinlock_t ftrace_max_lock =
399 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
400
401#ifdef CONFIG_TRACER_MAX_TRACE
402unsigned long __read_mostly tracing_max_latency;
403unsigned long __read_mostly tracing_thresh;
404
405/*
406 * Copy the new maximum trace into the separate maximum-trace
407 * structure. (this way the maximum trace is permanently saved,
408 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
409 */
410static void
411__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
412{
413 struct trace_array_cpu *data = tr->data[cpu];
414 struct trace_array_cpu *max_data = tr->data[cpu];
415
416 max_tr.cpu = cpu;
417 max_tr.time_start = data->preempt_timestamp;
418
419 max_data = max_tr.data[cpu];
420 max_data->saved_latency = tracing_max_latency;
421 max_data->critical_start = data->critical_start;
422 max_data->critical_end = data->critical_end;
423
424 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
425 max_data->pid = tsk->pid;
426 max_data->uid = task_uid(tsk);
427 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
428 max_data->policy = tsk->policy;
429 max_data->rt_priority = tsk->rt_priority;
430
431 /* record this tasks comm */
432 tracing_record_cmdline(tsk);
433}
434
414/** 435/**
415 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 436 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
416 * @tr: tracer 437 * @tr: tracer
@@ -425,16 +446,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
425{ 446{
426 struct ring_buffer *buf = tr->buffer; 447 struct ring_buffer *buf = tr->buffer;
427 448
449 if (trace_stop_count)
450 return;
451
428 WARN_ON_ONCE(!irqs_disabled()); 452 WARN_ON_ONCE(!irqs_disabled());
429 __raw_spin_lock(&ftrace_max_lock); 453 __raw_spin_lock(&ftrace_max_lock);
430 454
431 tr->buffer = max_tr.buffer; 455 tr->buffer = max_tr.buffer;
432 max_tr.buffer = buf; 456 max_tr.buffer = buf;
433 457
434 ftrace_disable_cpu();
435 ring_buffer_reset(tr->buffer);
436 ftrace_enable_cpu();
437
438 __update_max_tr(tr, tsk, cpu); 458 __update_max_tr(tr, tsk, cpu);
439 __raw_spin_unlock(&ftrace_max_lock); 459 __raw_spin_unlock(&ftrace_max_lock);
440} 460}
@@ -452,21 +472,35 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
452{ 472{
453 int ret; 473 int ret;
454 474
475 if (trace_stop_count)
476 return;
477
455 WARN_ON_ONCE(!irqs_disabled()); 478 WARN_ON_ONCE(!irqs_disabled());
456 __raw_spin_lock(&ftrace_max_lock); 479 __raw_spin_lock(&ftrace_max_lock);
457 480
458 ftrace_disable_cpu(); 481 ftrace_disable_cpu();
459 482
460 ring_buffer_reset(max_tr.buffer);
461 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 483 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
462 484
485 if (ret == -EBUSY) {
486 /*
487 * We failed to swap the buffer due to a commit taking
488 * place on this CPU. We fail to record, but we reset
489 * the max trace buffer (no one writes directly to it)
490 * and flag that it failed.
491 */
492 trace_array_printk(&max_tr, _THIS_IP_,
493 "Failed to swap buffers due to commit in progress\n");
494 }
495
463 ftrace_enable_cpu(); 496 ftrace_enable_cpu();
464 497
465 WARN_ON_ONCE(ret && ret != -EAGAIN); 498 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
466 499
467 __update_max_tr(tr, tsk, cpu); 500 __update_max_tr(tr, tsk, cpu);
468 __raw_spin_unlock(&ftrace_max_lock); 501 __raw_spin_unlock(&ftrace_max_lock);
469} 502}
503#endif /* CONFIG_TRACER_MAX_TRACE */
470 504
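
update_max_tr() now bails out early while tracing is stopped and, as before, snapshots by swapping buffer pointers under ftrace_max_lock rather than copying data; update_max_tr_single() additionally tolerates -EBUSY from the per-cpu swap and logs the failure instead of warning. A toy version of the pointer-swap-with-guard pattern (the types and names below are hypothetical):

#include <pthread.h>
#include <stdio.h>

struct trace_buffers {
	int *live;	/* buffer currently being written           */
	int *max;	/* frozen snapshot of the worst-case trace  */
};

static pthread_mutex_t max_lock = PTHREAD_MUTEX_INITIALIZER;
static int trace_stopped;	/* plays the role of trace_stop_count */

/* Snapshot by swapping pointers instead of copying data, and refuse
 * to do it while tracing is stopped -- the new early return above. */
static void update_max(struct trace_buffers *tb)
{
	int *tmp;

	if (trace_stopped)
		return;

	pthread_mutex_lock(&max_lock);
	tmp = tb->live;
	tb->live = tb->max;
	tb->max = tmp;
	pthread_mutex_unlock(&max_lock);
}

int main(void)
{
	int a = 1, b = 2;
	struct trace_buffers tb = { &a, &b };

	update_max(&tb);
	printf("live=%d max=%d\n", *tb.live, *tb.max);
	return 0;
}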
471/** 505/**
472 * register_tracer - register a tracer with the ftrace system. 506 * register_tracer - register a tracer with the ftrace system.
@@ -523,7 +557,6 @@ __acquires(kernel_lock)
523 if (type->selftest && !tracing_selftest_disabled) { 557 if (type->selftest && !tracing_selftest_disabled) {
524 struct tracer *saved_tracer = current_trace; 558 struct tracer *saved_tracer = current_trace;
525 struct trace_array *tr = &global_trace; 559 struct trace_array *tr = &global_trace;
526 int i;
527 560
528 /* 561 /*
529 * Run a selftest on this tracer. 562 * Run a selftest on this tracer.
@@ -532,8 +565,7 @@ __acquires(kernel_lock)
532 * internal tracing to verify that everything is in order. 565 * internal tracing to verify that everything is in order.
533 * If we fail, we do not register this tracer. 566 * If we fail, we do not register this tracer.
534 */ 567 */
535 for_each_tracing_cpu(i) 568 tracing_reset_online_cpus(tr);
536 tracing_reset(tr, i);
537 569
538 current_trace = type; 570 current_trace = type;
539 /* the test is responsible for initializing and enabling */ 571 /* the test is responsible for initializing and enabling */
@@ -546,8 +578,7 @@ __acquires(kernel_lock)
546 goto out; 578 goto out;
547 } 579 }
548 /* Only reset on passing, to avoid touching corrupted buffers */ 580 /* Only reset on passing, to avoid touching corrupted buffers */
549 for_each_tracing_cpu(i) 581 tracing_reset_online_cpus(tr);
550 tracing_reset(tr, i);
551 582
552 printk(KERN_CONT "PASSED\n"); 583 printk(KERN_CONT "PASSED\n");
553 } 584 }
@@ -622,21 +653,42 @@ void unregister_tracer(struct tracer *type)
622 mutex_unlock(&trace_types_lock); 653 mutex_unlock(&trace_types_lock);
623} 654}
624 655
625void tracing_reset(struct trace_array *tr, int cpu) 656static void __tracing_reset(struct trace_array *tr, int cpu)
626{ 657{
627 ftrace_disable_cpu(); 658 ftrace_disable_cpu();
628 ring_buffer_reset_cpu(tr->buffer, cpu); 659 ring_buffer_reset_cpu(tr->buffer, cpu);
629 ftrace_enable_cpu(); 660 ftrace_enable_cpu();
630} 661}
631 662
663void tracing_reset(struct trace_array *tr, int cpu)
664{
665 struct ring_buffer *buffer = tr->buffer;
666
667 ring_buffer_record_disable(buffer);
668
669 /* Make sure all commits have finished */
670 synchronize_sched();
671 __tracing_reset(tr, cpu);
672
673 ring_buffer_record_enable(buffer);
674}
675
632void tracing_reset_online_cpus(struct trace_array *tr) 676void tracing_reset_online_cpus(struct trace_array *tr)
633{ 677{
678 struct ring_buffer *buffer = tr->buffer;
634 int cpu; 679 int cpu;
635 680
681 ring_buffer_record_disable(buffer);
682
683 /* Make sure all commits have finished */
684 synchronize_sched();
685
636 tr->time_start = ftrace_now(tr->cpu); 686 tr->time_start = ftrace_now(tr->cpu);
637 687
638 for_each_online_cpu(cpu) 688 for_each_online_cpu(cpu)
639 tracing_reset(tr, cpu); 689 __tracing_reset(tr, cpu);
690
691 ring_buffer_record_enable(buffer);
640} 692}
641 693
642void tracing_reset_current(int cpu) 694void tracing_reset_current(int cpu)
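
The reworked tracing_reset()/tracing_reset_online_cpus() first disable recording, then wait for every writer already inside a commit to finish (synchronize_sched() provides that wait in the kernel), and only then clear the per-cpu buffers and re-enable recording. A userspace sketch of that quiesce-then-reset ordering, with a spin on an in-flight counter standing in for synchronize_sched() (an assumption made purely for illustration):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int record_disabled;	/* writers check this before writing */
static _Atomic int writers_in_flight;	/* commits currently in progress     */

static void writer(void)
{
	atomic_fetch_add(&writers_in_flight, 1);
	if (!atomic_load(&record_disabled)) {
		/* ... copy an event into the buffer ... */
	}
	atomic_fetch_sub(&writers_in_flight, 1);
}

/* Reset safely: stop new writers, wait until the in-flight ones have
 * drained, then clear the buffer and re-enable recording. */
static void reset_buffer(void)
{
	atomic_store(&record_disabled, 1);
	while (atomic_load(&writers_in_flight))
		;	/* stand-in for synchronize_sched() */
	/* ... __tracing_reset() equivalent: reset all pages ... */
	atomic_store(&record_disabled, 0);
}

int main(void)
{
	writer();
	reset_buffer();
	puts("buffer reset with no writer mid-commit");
	return 0;
}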
@@ -667,9 +719,6 @@ static void trace_init_cmdlines(void)
667 cmdline_idx = 0; 719 cmdline_idx = 0;
668} 720}
669 721
670static int trace_stop_count;
671static DEFINE_SPINLOCK(tracing_start_lock);
672
673/** 722/**
674 * ftrace_off_permanent - disable all ftrace code permanently 723 * ftrace_off_permanent - disable all ftrace code permanently
675 * 724 *
@@ -848,15 +897,17 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
848 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 897 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
849 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 898 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
850} 899}
900EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
851 901
852struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 902struct ring_buffer_event *
853 int type, 903trace_buffer_lock_reserve(struct ring_buffer *buffer,
854 unsigned long len, 904 int type,
855 unsigned long flags, int pc) 905 unsigned long len,
906 unsigned long flags, int pc)
856{ 907{
857 struct ring_buffer_event *event; 908 struct ring_buffer_event *event;
858 909
859 event = ring_buffer_lock_reserve(tr->buffer, len); 910 event = ring_buffer_lock_reserve(buffer, len);
860 if (event != NULL) { 911 if (event != NULL) {
861 struct trace_entry *ent = ring_buffer_event_data(event); 912 struct trace_entry *ent = ring_buffer_event_data(event);
862 913
@@ -866,58 +917,60 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
866 917
867 return event; 918 return event;
868} 919}
869static void ftrace_trace_stack(struct trace_array *tr,
870 unsigned long flags, int skip, int pc);
871static void ftrace_trace_userstack(struct trace_array *tr,
872 unsigned long flags, int pc);
873 920
874static inline void __trace_buffer_unlock_commit(struct trace_array *tr, 921static inline void
875 struct ring_buffer_event *event, 922__trace_buffer_unlock_commit(struct ring_buffer *buffer,
876 unsigned long flags, int pc, 923 struct ring_buffer_event *event,
877 int wake) 924 unsigned long flags, int pc,
925 int wake)
878{ 926{
879 ring_buffer_unlock_commit(tr->buffer, event); 927 ring_buffer_unlock_commit(buffer, event);
880 928
881 ftrace_trace_stack(tr, flags, 6, pc); 929 ftrace_trace_stack(buffer, flags, 6, pc);
882 ftrace_trace_userstack(tr, flags, pc); 930 ftrace_trace_userstack(buffer, flags, pc);
883 931
884 if (wake) 932 if (wake)
885 trace_wake_up(); 933 trace_wake_up();
886} 934}
887 935
888void trace_buffer_unlock_commit(struct trace_array *tr, 936void trace_buffer_unlock_commit(struct ring_buffer *buffer,
889 struct ring_buffer_event *event, 937 struct ring_buffer_event *event,
890 unsigned long flags, int pc) 938 unsigned long flags, int pc)
891{ 939{
892 __trace_buffer_unlock_commit(tr, event, flags, pc, 1); 940 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
893} 941}
894 942
895struct ring_buffer_event * 943struct ring_buffer_event *
896trace_current_buffer_lock_reserve(int type, unsigned long len, 944trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
945 int type, unsigned long len,
897 unsigned long flags, int pc) 946 unsigned long flags, int pc)
898{ 947{
899 return trace_buffer_lock_reserve(&global_trace, 948 *current_rb = global_trace.buffer;
949 return trace_buffer_lock_reserve(*current_rb,
900 type, len, flags, pc); 950 type, len, flags, pc);
901} 951}
902EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); 952EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
903 953
904void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 954void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
955 struct ring_buffer_event *event,
905 unsigned long flags, int pc) 956 unsigned long flags, int pc)
906{ 957{
907 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); 958 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
908} 959}
909EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 960EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
910 961
911void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 962void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
912 unsigned long flags, int pc) 963 struct ring_buffer_event *event,
964 unsigned long flags, int pc)
913{ 965{
914 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); 966 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
915} 967}
916EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 968EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
917 969
918void trace_current_buffer_discard_commit(struct ring_buffer_event *event) 970void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
971 struct ring_buffer_event *event)
919{ 972{
920 ring_buffer_discard_commit(global_trace.buffer, event); 973 ring_buffer_discard_commit(buffer, event);
921} 974}
922EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); 975EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
923 976
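
The API change running through this hunk is that every reserve/commit helper now takes the struct ring_buffer it should write to as an explicit argument instead of reaching for global_trace internally; trace_current_buffer_lock_reserve() hands the chosen buffer back through *current_rb so the matching commit can target the same one. A toy reserve/commit pair showing that explicit-buffer shape (struct buf and both functions are invented for illustration):

#include <stdio.h>
#include <string.h>

struct buf {
	char data[64];
	size_t head;
};

static void *reserve(struct buf *b, size_t len)
{
	if (b->head + len > sizeof(b->data))
		return NULL;		/* no room: the event is dropped */
	return b->data + b->head;
}

static void commit(struct buf *b, size_t len)
{
	b->head += len;			/* make the reserved event visible */
}

int main(void)
{
	struct buf b = { .head = 0 };
	char *slot = reserve(&b, 5);

	if (slot) {
		memcpy(slot, "event", 5);
		commit(&b, 5);
	}
	printf("committed %zu bytes\n", b.head);
	return 0;
}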
@@ -927,6 +980,7 @@ trace_function(struct trace_array *tr,
927 int pc) 980 int pc)
928{ 981{
929 struct ftrace_event_call *call = &event_function; 982 struct ftrace_event_call *call = &event_function;
983 struct ring_buffer *buffer = tr->buffer;
930 struct ring_buffer_event *event; 984 struct ring_buffer_event *event;
931 struct ftrace_entry *entry; 985 struct ftrace_entry *entry;
932 986
@@ -934,7 +988,7 @@ trace_function(struct trace_array *tr,
934 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 988 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
935 return; 989 return;
936 990
937 event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), 991 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
938 flags, pc); 992 flags, pc);
939 if (!event) 993 if (!event)
940 return; 994 return;
@@ -942,57 +996,9 @@ trace_function(struct trace_array *tr,
942 entry->ip = ip; 996 entry->ip = ip;
943 entry->parent_ip = parent_ip; 997 entry->parent_ip = parent_ip;
944 998
945 if (!filter_check_discard(call, entry, tr->buffer, event)) 999 if (!filter_check_discard(call, entry, buffer, event))
946 ring_buffer_unlock_commit(tr->buffer, event); 1000 ring_buffer_unlock_commit(buffer, event);
947}
948
949#ifdef CONFIG_FUNCTION_GRAPH_TRACER
950static int __trace_graph_entry(struct trace_array *tr,
951 struct ftrace_graph_ent *trace,
952 unsigned long flags,
953 int pc)
954{
955 struct ftrace_event_call *call = &event_funcgraph_entry;
956 struct ring_buffer_event *event;
957 struct ftrace_graph_ent_entry *entry;
958
959 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
960 return 0;
961
962 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
963 sizeof(*entry), flags, pc);
964 if (!event)
965 return 0;
966 entry = ring_buffer_event_data(event);
967 entry->graph_ent = *trace;
968 if (!filter_current_check_discard(call, entry, event))
969 ring_buffer_unlock_commit(global_trace.buffer, event);
970
971 return 1;
972}
973
974static void __trace_graph_return(struct trace_array *tr,
975 struct ftrace_graph_ret *trace,
976 unsigned long flags,
977 int pc)
978{
979 struct ftrace_event_call *call = &event_funcgraph_exit;
980 struct ring_buffer_event *event;
981 struct ftrace_graph_ret_entry *entry;
982
983 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
984 return;
985
986 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
987 sizeof(*entry), flags, pc);
988 if (!event)
989 return;
990 entry = ring_buffer_event_data(event);
991 entry->ret = *trace;
992 if (!filter_current_check_discard(call, entry, event))
993 ring_buffer_unlock_commit(global_trace.buffer, event);
994} 1001}
995#endif
996 1002
997void 1003void
998ftrace(struct trace_array *tr, struct trace_array_cpu *data, 1004ftrace(struct trace_array *tr, struct trace_array_cpu *data,
@@ -1003,17 +1009,17 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1003 trace_function(tr, ip, parent_ip, flags, pc); 1009 trace_function(tr, ip, parent_ip, flags, pc);
1004} 1010}
1005 1011
1006static void __ftrace_trace_stack(struct trace_array *tr, 1012#ifdef CONFIG_STACKTRACE
1013static void __ftrace_trace_stack(struct ring_buffer *buffer,
1007 unsigned long flags, 1014 unsigned long flags,
1008 int skip, int pc) 1015 int skip, int pc)
1009{ 1016{
1010#ifdef CONFIG_STACKTRACE
1011 struct ftrace_event_call *call = &event_kernel_stack; 1017 struct ftrace_event_call *call = &event_kernel_stack;
1012 struct ring_buffer_event *event; 1018 struct ring_buffer_event *event;
1013 struct stack_entry *entry; 1019 struct stack_entry *entry;
1014 struct stack_trace trace; 1020 struct stack_trace trace;
1015 1021
1016 event = trace_buffer_lock_reserve(tr, TRACE_STACK, 1022 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1017 sizeof(*entry), flags, pc); 1023 sizeof(*entry), flags, pc);
1018 if (!event) 1024 if (!event)
1019 return; 1025 return;
@@ -1026,32 +1032,28 @@ static void __ftrace_trace_stack(struct trace_array *tr,
1026 trace.entries = entry->caller; 1032 trace.entries = entry->caller;
1027 1033
1028 save_stack_trace(&trace); 1034 save_stack_trace(&trace);
1029 if (!filter_check_discard(call, entry, tr->buffer, event)) 1035 if (!filter_check_discard(call, entry, buffer, event))
1030 ring_buffer_unlock_commit(tr->buffer, event); 1036 ring_buffer_unlock_commit(buffer, event);
1031#endif
1032} 1037}
1033 1038
1034static void ftrace_trace_stack(struct trace_array *tr, 1039void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1035 unsigned long flags, 1040 int skip, int pc)
1036 int skip, int pc)
1037{ 1041{
1038 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1042 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1039 return; 1043 return;
1040 1044
1041 __ftrace_trace_stack(tr, flags, skip, pc); 1045 __ftrace_trace_stack(buffer, flags, skip, pc);
1042} 1046}
1043 1047
1044void __trace_stack(struct trace_array *tr, 1048void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1045 unsigned long flags, 1049 int pc)
1046 int skip, int pc)
1047{ 1050{
1048 __ftrace_trace_stack(tr, flags, skip, pc); 1051 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1049} 1052}
1050 1053
1051static void ftrace_trace_userstack(struct trace_array *tr, 1054void
1052 unsigned long flags, int pc) 1055ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1053{ 1056{
1054#ifdef CONFIG_STACKTRACE
1055 struct ftrace_event_call *call = &event_user_stack; 1057 struct ftrace_event_call *call = &event_user_stack;
1056 struct ring_buffer_event *event; 1058 struct ring_buffer_event *event;
1057 struct userstack_entry *entry; 1059 struct userstack_entry *entry;
@@ -1060,7 +1062,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1060 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1062 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1061 return; 1063 return;
1062 1064
1063 event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, 1065 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1064 sizeof(*entry), flags, pc); 1066 sizeof(*entry), flags, pc);
1065 if (!event) 1067 if (!event)
1066 return; 1068 return;
@@ -1074,9 +1076,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1074 trace.entries = entry->caller; 1076 trace.entries = entry->caller;
1075 1077
1076 save_stack_trace_user(&trace); 1078 save_stack_trace_user(&trace);
1077 if (!filter_check_discard(call, entry, tr->buffer, event)) 1079 if (!filter_check_discard(call, entry, buffer, event))
1078 ring_buffer_unlock_commit(tr->buffer, event); 1080 ring_buffer_unlock_commit(buffer, event);
1079#endif
1080} 1081}
1081 1082
1082#ifdef UNUSED 1083#ifdef UNUSED
@@ -1086,6 +1087,8 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1086} 1087}
1087#endif /* UNUSED */ 1088#endif /* UNUSED */
1088 1089
1090#endif /* CONFIG_STACKTRACE */
1091
1089static void 1092static void
1090ftrace_trace_special(void *__tr, 1093ftrace_trace_special(void *__tr,
1091 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1094 unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -1093,9 +1096,10 @@ ftrace_trace_special(void *__tr,
1093{ 1096{
1094 struct ring_buffer_event *event; 1097 struct ring_buffer_event *event;
1095 struct trace_array *tr = __tr; 1098 struct trace_array *tr = __tr;
1099 struct ring_buffer *buffer = tr->buffer;
1096 struct special_entry *entry; 1100 struct special_entry *entry;
1097 1101
1098 event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, 1102 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1099 sizeof(*entry), 0, pc); 1103 sizeof(*entry), 0, pc);
1100 if (!event) 1104 if (!event)
1101 return; 1105 return;
@@ -1103,7 +1107,7 @@ ftrace_trace_special(void *__tr,
1103 entry->arg1 = arg1; 1107 entry->arg1 = arg1;
1104 entry->arg2 = arg2; 1108 entry->arg2 = arg2;
1105 entry->arg3 = arg3; 1109 entry->arg3 = arg3;
1106 trace_buffer_unlock_commit(tr, event, 0, pc); 1110 trace_buffer_unlock_commit(buffer, event, 0, pc);
1107} 1111}
1108 1112
1109void 1113void
@@ -1114,62 +1118,6 @@ __trace_special(void *__tr, void *__data,
1114} 1118}
1115 1119
1116void 1120void
1117tracing_sched_switch_trace(struct trace_array *tr,
1118 struct task_struct *prev,
1119 struct task_struct *next,
1120 unsigned long flags, int pc)
1121{
1122 struct ftrace_event_call *call = &event_context_switch;
1123 struct ring_buffer_event *event;
1124 struct ctx_switch_entry *entry;
1125
1126 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
1127 sizeof(*entry), flags, pc);
1128 if (!event)
1129 return;
1130 entry = ring_buffer_event_data(event);
1131 entry->prev_pid = prev->pid;
1132 entry->prev_prio = prev->prio;
1133 entry->prev_state = prev->state;
1134 entry->next_pid = next->pid;
1135 entry->next_prio = next->prio;
1136 entry->next_state = next->state;
1137 entry->next_cpu = task_cpu(next);
1138
1139 if (!filter_check_discard(call, entry, tr->buffer, event))
1140 trace_buffer_unlock_commit(tr, event, flags, pc);
1141}
1142
1143void
1144tracing_sched_wakeup_trace(struct trace_array *tr,
1145 struct task_struct *wakee,
1146 struct task_struct *curr,
1147 unsigned long flags, int pc)
1148{
1149 struct ftrace_event_call *call = &event_wakeup;
1150 struct ring_buffer_event *event;
1151 struct ctx_switch_entry *entry;
1152
1153 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
1154 sizeof(*entry), flags, pc);
1155 if (!event)
1156 return;
1157 entry = ring_buffer_event_data(event);
1158 entry->prev_pid = curr->pid;
1159 entry->prev_prio = curr->prio;
1160 entry->prev_state = curr->state;
1161 entry->next_pid = wakee->pid;
1162 entry->next_prio = wakee->prio;
1163 entry->next_state = wakee->state;
1164 entry->next_cpu = task_cpu(wakee);
1165
1166 if (!filter_check_discard(call, entry, tr->buffer, event))
1167 ring_buffer_unlock_commit(tr->buffer, event);
1168 ftrace_trace_stack(tr, flags, 6, pc);
1169 ftrace_trace_userstack(tr, flags, pc);
1170}
1171
1172void
1173ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) 1121ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1174{ 1122{
1175 struct trace_array *tr = &global_trace; 1123 struct trace_array *tr = &global_trace;
@@ -1193,68 +1141,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1193 local_irq_restore(flags); 1141 local_irq_restore(flags);
1194} 1142}
1195 1143
1196#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1197int trace_graph_entry(struct ftrace_graph_ent *trace)
1198{
1199 struct trace_array *tr = &global_trace;
1200 struct trace_array_cpu *data;
1201 unsigned long flags;
1202 long disabled;
1203 int ret;
1204 int cpu;
1205 int pc;
1206
1207 if (!ftrace_trace_task(current))
1208 return 0;
1209
1210 if (!ftrace_graph_addr(trace->func))
1211 return 0;
1212
1213 local_irq_save(flags);
1214 cpu = raw_smp_processor_id();
1215 data = tr->data[cpu];
1216 disabled = atomic_inc_return(&data->disabled);
1217 if (likely(disabled == 1)) {
1218 pc = preempt_count();
1219 ret = __trace_graph_entry(tr, trace, flags, pc);
1220 } else {
1221 ret = 0;
1222 }
1223 /* Only do the atomic if it is not already set */
1224 if (!test_tsk_trace_graph(current))
1225 set_tsk_trace_graph(current);
1226
1227 atomic_dec(&data->disabled);
1228 local_irq_restore(flags);
1229
1230 return ret;
1231}
1232
1233void trace_graph_return(struct ftrace_graph_ret *trace)
1234{
1235 struct trace_array *tr = &global_trace;
1236 struct trace_array_cpu *data;
1237 unsigned long flags;
1238 long disabled;
1239 int cpu;
1240 int pc;
1241
1242 local_irq_save(flags);
1243 cpu = raw_smp_processor_id();
1244 data = tr->data[cpu];
1245 disabled = atomic_inc_return(&data->disabled);
1246 if (likely(disabled == 1)) {
1247 pc = preempt_count();
1248 __trace_graph_return(tr, trace, flags, pc);
1249 }
1250 if (!trace->depth)
1251 clear_tsk_trace_graph(current);
1252 atomic_dec(&data->disabled);
1253 local_irq_restore(flags);
1254}
1255#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1256
1257
1258/** 1144/**
1259 * trace_vbprintk - write binary msg to tracing buffer 1145 * trace_vbprintk - write binary msg to tracing buffer
1260 * 1146 *
@@ -1267,6 +1153,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1267 1153
1268 struct ftrace_event_call *call = &event_bprint; 1154 struct ftrace_event_call *call = &event_bprint;
1269 struct ring_buffer_event *event; 1155 struct ring_buffer_event *event;
1156 struct ring_buffer *buffer;
1270 struct trace_array *tr = &global_trace; 1157 struct trace_array *tr = &global_trace;
1271 struct trace_array_cpu *data; 1158 struct trace_array_cpu *data;
1272 struct bprint_entry *entry; 1159 struct bprint_entry *entry;
@@ -1299,7 +1186,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1299 goto out_unlock; 1186 goto out_unlock;
1300 1187
1301 size = sizeof(*entry) + sizeof(u32) * len; 1188 size = sizeof(*entry) + sizeof(u32) * len;
1302 event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); 1189 buffer = tr->buffer;
1190 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1191 flags, pc);
1303 if (!event) 1192 if (!event)
1304 goto out_unlock; 1193 goto out_unlock;
1305 entry = ring_buffer_event_data(event); 1194 entry = ring_buffer_event_data(event);
@@ -1307,8 +1196,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1307 entry->fmt = fmt; 1196 entry->fmt = fmt;
1308 1197
1309 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1198 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1310 if (!filter_check_discard(call, entry, tr->buffer, event)) 1199 if (!filter_check_discard(call, entry, buffer, event))
1311 ring_buffer_unlock_commit(tr->buffer, event); 1200 ring_buffer_unlock_commit(buffer, event);
1312 1201
1313out_unlock: 1202out_unlock:
1314 __raw_spin_unlock(&trace_buf_lock); 1203 __raw_spin_unlock(&trace_buf_lock);
@@ -1323,14 +1212,30 @@ out:
1323} 1212}
1324EXPORT_SYMBOL_GPL(trace_vbprintk); 1213EXPORT_SYMBOL_GPL(trace_vbprintk);
1325 1214
1326int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1215int trace_array_printk(struct trace_array *tr,
1216 unsigned long ip, const char *fmt, ...)
1217{
1218 int ret;
1219 va_list ap;
1220
1221 if (!(trace_flags & TRACE_ITER_PRINTK))
1222 return 0;
1223
1224 va_start(ap, fmt);
1225 ret = trace_array_vprintk(tr, ip, fmt, ap);
1226 va_end(ap);
1227 return ret;
1228}
1229
1230int trace_array_vprintk(struct trace_array *tr,
1231 unsigned long ip, const char *fmt, va_list args)
1327{ 1232{
1328 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1233 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1329 static char trace_buf[TRACE_BUF_SIZE]; 1234 static char trace_buf[TRACE_BUF_SIZE];
1330 1235
1331 struct ftrace_event_call *call = &event_print; 1236 struct ftrace_event_call *call = &event_print;
1332 struct ring_buffer_event *event; 1237 struct ring_buffer_event *event;
1333 struct trace_array *tr = &global_trace; 1238 struct ring_buffer *buffer;
1334 struct trace_array_cpu *data; 1239 struct trace_array_cpu *data;
1335 int cpu, len = 0, size, pc; 1240 int cpu, len = 0, size, pc;
1336 struct print_entry *entry; 1241 struct print_entry *entry;
@@ -1358,7 +1263,9 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1358 trace_buf[len] = 0; 1263 trace_buf[len] = 0;
1359 1264
1360 size = sizeof(*entry) + len + 1; 1265 size = sizeof(*entry) + len + 1;
1361 event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); 1266 buffer = tr->buffer;
1267 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1268 irq_flags, pc);
1362 if (!event) 1269 if (!event)
1363 goto out_unlock; 1270 goto out_unlock;
1364 entry = ring_buffer_event_data(event); 1271 entry = ring_buffer_event_data(event);
@@ -1366,8 +1273,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1366 1273
1367 memcpy(&entry->buf, trace_buf, len); 1274 memcpy(&entry->buf, trace_buf, len);
1368 entry->buf[len] = 0; 1275 entry->buf[len] = 0;
1369 if (!filter_check_discard(call, entry, tr->buffer, event)) 1276 if (!filter_check_discard(call, entry, buffer, event))
1370 ring_buffer_unlock_commit(tr->buffer, event); 1277 ring_buffer_unlock_commit(buffer, event);
1371 1278
1372 out_unlock: 1279 out_unlock:
1373 __raw_spin_unlock(&trace_buf_lock); 1280 __raw_spin_unlock(&trace_buf_lock);
@@ -1379,6 +1286,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1379 1286
1380 return len; 1287 return len;
1381} 1288}
1289
1290int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1291{
1292 return trace_array_vprintk(&global_trace, ip, fmt, args);
1293}
1382EXPORT_SYMBOL_GPL(trace_vprintk); 1294EXPORT_SYMBOL_GPL(trace_vprintk);
1383 1295
1384enum trace_file_type { 1296enum trace_file_type {
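
The new trace_array_printk()/trace_array_vprintk() pair follows the usual printf/vprintf split: the variadic front end only packages its arguments into a va_list and forwards them to the va_list variant, while a caller that already holds a va_list (such as the trace_vprintk() wrapper above) must use the v-variant directly, because handing a va_list to a "..." parameter is undefined behaviour. A minimal user-space sketch of that forwarding pattern, using plain vsnprintf() instead of the tracing machinery:

#include <stdarg.h>
#include <stdio.h>

/* va_list back end: does the real formatting work */
static int demo_vlog(const char *prefix, const char *fmt, va_list args)
{
	char buf[256];
	int len;

	len = vsnprintf(buf, sizeof(buf), fmt, args);
	printf("[%s] %s\n", prefix, buf);
	return len;
}

/* variadic front end: only packages the arguments and forwards them */
static int demo_log(const char *prefix, const char *fmt, ...)
{
	va_list ap;
	int ret;

	va_start(ap, fmt);
	ret = demo_vlog(prefix, fmt, ap);
	va_end(ap);
	return ret;
}

int main(void)
{
	demo_log("demo", "%d entries on cpu %d", 42, 3);
	return 0;
}
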
@@ -1518,6 +1430,37 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1518 return ent; 1430 return ent;
1519} 1431}
1520 1432
1433static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1434{
1435 struct trace_array *tr = iter->tr;
1436 struct ring_buffer_event *event;
1437 struct ring_buffer_iter *buf_iter;
1438 unsigned long entries = 0;
1439 u64 ts;
1440
1441 tr->data[cpu]->skipped_entries = 0;
1442
1443 if (!iter->buffer_iter[cpu])
1444 return;
1445
1446 buf_iter = iter->buffer_iter[cpu];
1447 ring_buffer_iter_reset(buf_iter);
1448
1449 /*
1450 * We could have the case with the max latency tracers
1451 * that a reset never took place on a cpu. This is evident
1452 * by the timestamp being before the start of the buffer.
1453 */
1454 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1455 if (ts >= iter->tr->time_start)
1456 break;
1457 entries++;
1458 ring_buffer_read(buf_iter, NULL);
1459 }
1460
1461 tr->data[cpu]->skipped_entries = entries;
1462}
1463
1521/* 1464/*
1522 * No necessary locking here. The worst thing which can 1465 * No necessary locking here. The worst thing which can
1523 * happen is losing events consumed at the same time 1466
@@ -1556,10 +1499,9 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1556 1499
1557 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1500 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1558 for_each_tracing_cpu(cpu) 1501 for_each_tracing_cpu(cpu)
1559 ring_buffer_iter_reset(iter->buffer_iter[cpu]); 1502 tracing_iter_reset(iter, cpu);
1560 } else 1503 } else
1561 ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); 1504 tracing_iter_reset(iter, cpu_file);
1562
1563 1505
1564 ftrace_enable_cpu(); 1506 ftrace_enable_cpu();
1565 1507
@@ -1608,16 +1550,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1608 struct trace_array *tr = iter->tr; 1550 struct trace_array *tr = iter->tr;
1609 struct trace_array_cpu *data = tr->data[tr->cpu]; 1551 struct trace_array_cpu *data = tr->data[tr->cpu];
1610 struct tracer *type = current_trace; 1552 struct tracer *type = current_trace;
1611 unsigned long total; 1553 unsigned long entries = 0;
1612 unsigned long entries; 1554 unsigned long total = 0;
1555 unsigned long count;
1613 const char *name = "preemption"; 1556 const char *name = "preemption";
1557 int cpu;
1614 1558
1615 if (type) 1559 if (type)
1616 name = type->name; 1560 name = type->name;
1617 1561
1618 entries = ring_buffer_entries(iter->tr->buffer); 1562
1619 total = entries + 1563 for_each_tracing_cpu(cpu) {
1620 ring_buffer_overruns(iter->tr->buffer); 1564 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1565 /*
1566 * If this buffer has skipped entries, then we hold all
1567 * entries for the trace and we need to ignore the
1568 * ones before the time stamp.
1569 */
1570 if (tr->data[cpu]->skipped_entries) {
1571 count -= tr->data[cpu]->skipped_entries;
1572 /* total is the same as the entries */
1573 total += count;
1574 } else
1575 total += count +
1576 ring_buffer_overrun_cpu(tr->buffer, cpu);
1577 entries += count;
1578 }
1621 1579
1622 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1580 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1623 name, UTS_RELEASE); 1581 name, UTS_RELEASE);
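
The header accounting above folds the new skipped_entries bookkeeping into the per-CPU totals: when entries were skipped while resetting the iterator, every event is still held in the buffer, so the total is just the adjusted entry count; otherwise the per-CPU overrun has to be added back on top. A stand-alone sketch of that arithmetic, with invented per-CPU sample numbers standing in for the ring-buffer queries:

#include <stdio.h>

#define NR_CPUS 4

/* made-up per-CPU counters in place of ring_buffer_entries_cpu() etc. */
static unsigned long buffer_entries[NR_CPUS] = { 100, 80, 120, 60 };
static unsigned long skipped[NR_CPUS]        = {  10,  0,   5,  0 };
static unsigned long overruns[NR_CPUS]       = {   0, 25,   0,  7 };

int main(void)
{
	unsigned long entries = 0, total = 0, count;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		count = buffer_entries[cpu];
		if (skipped[cpu]) {
			/* all events are still held; only hide the skipped ones */
			count -= skipped[cpu];
			total += count;
		} else {
			/* events lost to overruns must be counted on top */
			total += count + overruns[cpu];
		}
		entries += count;
	}

	printf("entries: %lu, total: %lu\n", entries, total);
	return 0;
}
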
@@ -1659,7 +1617,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1659 seq_puts(m, "\n# => ended at: "); 1617 seq_puts(m, "\n# => ended at: ");
1660 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); 1618 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1661 trace_print_seq(m, &iter->seq); 1619 trace_print_seq(m, &iter->seq);
1662 seq_puts(m, "#\n"); 1620 seq_puts(m, "\n#\n");
1663 } 1621 }
1664 1622
1665 seq_puts(m, "#\n"); 1623 seq_puts(m, "#\n");
@@ -1678,6 +1636,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1678 if (cpumask_test_cpu(iter->cpu, iter->started)) 1636 if (cpumask_test_cpu(iter->cpu, iter->started))
1679 return; 1637 return;
1680 1638
1639 if (iter->tr->data[iter->cpu]->skipped_entries)
1640 return;
1641
1681 cpumask_set_cpu(iter->cpu, iter->started); 1642 cpumask_set_cpu(iter->cpu, iter->started);
1682 1643
1683 /* Don't print started cpu buffer for the first entry of the trace */ 1644 /* Don't print started cpu buffer for the first entry of the trace */
@@ -1940,19 +1901,23 @@ __tracing_open(struct inode *inode, struct file *file)
1940 if (ring_buffer_overruns(iter->tr->buffer)) 1901 if (ring_buffer_overruns(iter->tr->buffer))
1941 iter->iter_flags |= TRACE_FILE_ANNOTATE; 1902 iter->iter_flags |= TRACE_FILE_ANNOTATE;
1942 1903
1904 /* stop the trace while dumping */
1905 tracing_stop();
1906
1943 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 1907 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
1944 for_each_tracing_cpu(cpu) { 1908 for_each_tracing_cpu(cpu) {
1945 1909
1946 iter->buffer_iter[cpu] = 1910 iter->buffer_iter[cpu] =
1947 ring_buffer_read_start(iter->tr->buffer, cpu); 1911 ring_buffer_read_start(iter->tr->buffer, cpu);
1912 tracing_iter_reset(iter, cpu);
1948 } 1913 }
1949 } else { 1914 } else {
1950 cpu = iter->cpu_file; 1915 cpu = iter->cpu_file;
1951 iter->buffer_iter[cpu] = 1916 iter->buffer_iter[cpu] =
1952 ring_buffer_read_start(iter->tr->buffer, cpu); 1917 ring_buffer_read_start(iter->tr->buffer, cpu);
1918 tracing_iter_reset(iter, cpu);
1953 } 1919 }
1954 1920
1955 /* TODO stop tracer */
1956 ret = seq_open(file, &tracer_seq_ops); 1921 ret = seq_open(file, &tracer_seq_ops);
1957 if (ret < 0) { 1922 if (ret < 0) {
1958 fail_ret = ERR_PTR(ret); 1923 fail_ret = ERR_PTR(ret);
@@ -1962,9 +1927,6 @@ __tracing_open(struct inode *inode, struct file *file)
1962 m = file->private_data; 1927 m = file->private_data;
1963 m->private = iter; 1928 m->private = iter;
1964 1929
1965 /* stop the trace while dumping */
1966 tracing_stop();
1967
1968 mutex_unlock(&trace_types_lock); 1930 mutex_unlock(&trace_types_lock);
1969 1931
1970 return iter; 1932 return iter;
@@ -1975,6 +1937,7 @@ __tracing_open(struct inode *inode, struct file *file)
1975 ring_buffer_read_finish(iter->buffer_iter[cpu]); 1937 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1976 } 1938 }
1977 free_cpumask_var(iter->started); 1939 free_cpumask_var(iter->started);
1940 tracing_start();
1978 fail: 1941 fail:
1979 mutex_unlock(&trace_types_lock); 1942 mutex_unlock(&trace_types_lock);
1980 kfree(iter->trace); 1943 kfree(iter->trace);
@@ -2031,7 +1994,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2031 1994
2032 /* If this file was open for write, then erase contents */ 1995 /* If this file was open for write, then erase contents */
2033 if ((file->f_mode & FMODE_WRITE) && 1996 if ((file->f_mode & FMODE_WRITE) &&
2034 !(file->f_flags & O_APPEND)) { 1997 (file->f_flags & O_TRUNC)) {
2035 long cpu = (long) inode->i_private; 1998 long cpu = (long) inode->i_private;
2036 1999
2037 if (cpu == TRACE_PIPE_ALL_CPU) 2000 if (cpu == TRACE_PIPE_ALL_CPU)
@@ -2053,25 +2016,23 @@ static int tracing_open(struct inode *inode, struct file *file)
2053static void * 2016static void *
2054t_next(struct seq_file *m, void *v, loff_t *pos) 2017t_next(struct seq_file *m, void *v, loff_t *pos)
2055{ 2018{
2056 struct tracer *t = m->private; 2019 struct tracer *t = v;
2057 2020
2058 (*pos)++; 2021 (*pos)++;
2059 2022
2060 if (t) 2023 if (t)
2061 t = t->next; 2024 t = t->next;
2062 2025
2063 m->private = t;
2064
2065 return t; 2026 return t;
2066} 2027}
2067 2028
2068static void *t_start(struct seq_file *m, loff_t *pos) 2029static void *t_start(struct seq_file *m, loff_t *pos)
2069{ 2030{
2070 struct tracer *t = m->private; 2031 struct tracer *t;
2071 loff_t l = 0; 2032 loff_t l = 0;
2072 2033
2073 mutex_lock(&trace_types_lock); 2034 mutex_lock(&trace_types_lock);
2074 for (; t && l < *pos; t = t_next(m, t, &l)) 2035 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
2075 ; 2036 ;
2076 2037
2077 return t; 2038 return t;
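
The t_start()/t_next() rework stops caching the cursor in m->private and instead re-walks the tracer list from trace_types up to *pos on every ->start() call, which is what the seq_file contract expects when the file is read in several chunks. A small user-space sketch of the same positioning idiom over a singly linked list (the list and names are made up for the example):

#include <stdio.h>
#include <stddef.h>

struct node {
	const char *name;
	struct node *next;
};

static struct node c = { "nop",      NULL };
static struct node b = { "wakeup",   &c };
static struct node a = { "function", &b };
static struct node *head = &a;

/* ->next(): advance one element and bump the position */
static struct node *demo_next(struct node *n, long *pos)
{
	(*pos)++;
	return n ? n->next : NULL;
}

/* ->start(): always re-derive the element for *pos from the list head */
static struct node *demo_start(long *pos)
{
	struct node *n = head;
	long l = 0;

	while (n && l < *pos)
		n = demo_next(n, &l);
	return n;
}

int main(void)
{
	long pos;

	for (pos = 0; ; ) {
		struct node *n = demo_start(&pos);

		if (!n)
			break;
		printf("%s\n", n->name);
		demo_next(n, &pos);
	}
	return 0;
}

Calling demo_start() afresh for every element mimics a seq_file being read in many small chunks, where ->start() runs again for each chunk.
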
@@ -2107,18 +2068,10 @@ static struct seq_operations show_traces_seq_ops = {
2107 2068
2108static int show_traces_open(struct inode *inode, struct file *file) 2069static int show_traces_open(struct inode *inode, struct file *file)
2109{ 2070{
2110 int ret;
2111
2112 if (tracing_disabled) 2071 if (tracing_disabled)
2113 return -ENODEV; 2072 return -ENODEV;
2114 2073
2115 ret = seq_open(file, &show_traces_seq_ops); 2074 return seq_open(file, &show_traces_seq_ops);
2116 if (!ret) {
2117 struct seq_file *m = file->private_data;
2118 m->private = trace_types;
2119 }
2120
2121 return ret;
2122} 2075}
2123 2076
2124static ssize_t 2077static ssize_t
@@ -2191,11 +2144,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2191 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 2144 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2192 return -ENOMEM; 2145 return -ENOMEM;
2193 2146
2194 mutex_lock(&tracing_cpumask_update_lock);
2195 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2147 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2196 if (err) 2148 if (err)
2197 goto err_unlock; 2149 goto err_unlock;
2198 2150
2151 mutex_lock(&tracing_cpumask_update_lock);
2152
2199 local_irq_disable(); 2153 local_irq_disable();
2200 __raw_spin_lock(&ftrace_max_lock); 2154 __raw_spin_lock(&ftrace_max_lock);
2201 for_each_tracing_cpu(cpu) { 2155 for_each_tracing_cpu(cpu) {
@@ -2223,8 +2177,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2223 return count; 2177 return count;
2224 2178
2225err_unlock: 2179err_unlock:
2226 mutex_unlock(&tracing_cpumask_update_lock); 2180 free_cpumask_var(tracing_cpumask_new);
2227 free_cpumask_var(tracing_cpumask);
2228 2181
2229 return err; 2182 return err;
2230} 2183}
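
The cpumask_write fix above reorders the work so the user input is parsed and validated before the update mutex is taken, and the error path now frees only the newly allocated mask instead of the live one. A minimal user-space sketch of that "validate first, lock late, free the temporary on failure" shape (the setting and parser are invented; build with -pthread):

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
static char current_setting[32] = "default";

/* stand-in for cpumask_parse_user(): reject empty input */
static int parse_setting(const char *in, char *out, size_t len)
{
	if (!in || !*in)
		return -1;
	snprintf(out, len, "%s", in);
	return 0;
}

static int update_setting(const char *user_input)
{
	char *tmp = malloc(32);
	int err;

	if (!tmp)
		return -1;

	/* validate before taking the lock ... */
	err = parse_setting(user_input, tmp, 32);
	if (err)
		goto err_free;	/* ... so the error path only frees the temporary */

	pthread_mutex_lock(&update_lock);
	snprintf(current_setting, sizeof(current_setting), "%s", tmp);
	pthread_mutex_unlock(&update_lock);

	free(tmp);
	return 0;

err_free:
	free(tmp);
	return err;
}

int main(void)
{
	printf("update \"\": %d\n", update_setting(""));
	printf("update \"0-3\": %d, setting now %s\n",
	       update_setting("0-3"), current_setting);
	return 0;
}
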
@@ -2266,8 +2219,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2266 len += 3; /* "no" and newline */ 2219 len += 3; /* "no" and newline */
2267 } 2220 }
2268 2221
2269 /* +2 for \n and \0 */ 2222 /* +1 for \0 */
2270 buf = kmalloc(len + 2, GFP_KERNEL); 2223 buf = kmalloc(len + 1, GFP_KERNEL);
2271 if (!buf) { 2224 if (!buf) {
2272 mutex_unlock(&trace_types_lock); 2225 mutex_unlock(&trace_types_lock);
2273 return -ENOMEM; 2226 return -ENOMEM;
@@ -2290,7 +2243,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2290 } 2243 }
2291 mutex_unlock(&trace_types_lock); 2244 mutex_unlock(&trace_types_lock);
2292 2245
2293 WARN_ON(r >= len + 2); 2246 WARN_ON(r >= len + 1);
2294 2247
2295 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2248 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2296 2249
@@ -2301,23 +2254,23 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2301/* Try to assign a tracer specific option */ 2254/* Try to assign a tracer specific option */
2302static int set_tracer_option(struct tracer *trace, char *cmp, int neg) 2255static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2303{ 2256{
2304 struct tracer_flags *trace_flags = trace->flags; 2257 struct tracer_flags *tracer_flags = trace->flags;
2305 struct tracer_opt *opts = NULL; 2258 struct tracer_opt *opts = NULL;
2306 int ret = 0, i = 0; 2259 int ret = 0, i = 0;
2307 int len; 2260 int len;
2308 2261
2309 for (i = 0; trace_flags->opts[i].name; i++) { 2262 for (i = 0; tracer_flags->opts[i].name; i++) {
2310 opts = &trace_flags->opts[i]; 2263 opts = &tracer_flags->opts[i];
2311 len = strlen(opts->name); 2264 len = strlen(opts->name);
2312 2265
2313 if (strncmp(cmp, opts->name, len) == 0) { 2266 if (strncmp(cmp, opts->name, len) == 0) {
2314 ret = trace->set_flag(trace_flags->val, 2267 ret = trace->set_flag(tracer_flags->val,
2315 opts->bit, !neg); 2268 opts->bit, !neg);
2316 break; 2269 break;
2317 } 2270 }
2318 } 2271 }
2319 /* Not found */ 2272 /* Not found */
2320 if (!trace_flags->opts[i].name) 2273 if (!tracer_flags->opts[i].name)
2321 return -EINVAL; 2274 return -EINVAL;
2322 2275
2323 /* Refused to handle */ 2276 /* Refused to handle */
@@ -2325,9 +2278,9 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2325 return ret; 2278 return ret;
2326 2279
2327 if (neg) 2280 if (neg)
2328 trace_flags->val &= ~opts->bit; 2281 tracer_flags->val &= ~opts->bit;
2329 else 2282 else
2330 trace_flags->val |= opts->bit; 2283 tracer_flags->val |= opts->bit;
2331 2284
2332 return 0; 2285 return 0;
2333} 2286}
@@ -2342,22 +2295,6 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2342 trace_flags |= mask; 2295 trace_flags |= mask;
2343 else 2296 else
2344 trace_flags &= ~mask; 2297 trace_flags &= ~mask;
2345
2346 if (mask == TRACE_ITER_GLOBAL_CLK) {
2347 u64 (*func)(void);
2348
2349 if (enabled)
2350 func = trace_clock_global;
2351 else
2352 func = trace_clock_local;
2353
2354 mutex_lock(&trace_types_lock);
2355 ring_buffer_set_clock(global_trace.buffer, func);
2356
2357 if (max_tr.buffer)
2358 ring_buffer_set_clock(max_tr.buffer, func);
2359 mutex_unlock(&trace_types_lock);
2360 }
2361} 2298}
2362 2299
2363static ssize_t 2300static ssize_t
@@ -3095,7 +3032,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3095 break; 3032 break;
3096 } 3033 }
3097 3034
3098 trace_consume(iter); 3035 if (ret != TRACE_TYPE_NO_CONSUME)
3036 trace_consume(iter);
3099 rem -= count; 3037 rem -= count;
3100 if (!find_next_entry_inc(iter)) { 3038 if (!find_next_entry_inc(iter)) {
3101 rem = 0; 3039 rem = 0;
@@ -3324,6 +3262,62 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3324 return cnt; 3262 return cnt;
3325} 3263}
3326 3264
3265static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf,
3266 size_t cnt, loff_t *ppos)
3267{
3268 char buf[64];
3269 int bufiter = 0;
3270 int i;
3271
3272 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3273 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter,
3274 "%s%s%s%s", i ? " " : "",
3275 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3276 i == trace_clock_id ? "]" : "");
3277 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n");
3278
3279 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter);
3280}
3281
3282static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3283 size_t cnt, loff_t *fpos)
3284{
3285 char buf[64];
3286 const char *clockstr;
3287 int i;
3288
3289 if (cnt >= sizeof(buf))
3290 return -EINVAL;
3291
3292 if (copy_from_user(&buf, ubuf, cnt))
3293 return -EFAULT;
3294
3295 buf[cnt] = 0;
3296
3297 clockstr = strstrip(buf);
3298
3299 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
3300 if (strcmp(trace_clocks[i].name, clockstr) == 0)
3301 break;
3302 }
3303 if (i == ARRAY_SIZE(trace_clocks))
3304 return -EINVAL;
3305
3306 trace_clock_id = i;
3307
3308 mutex_lock(&trace_types_lock);
3309
3310 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
3311 if (max_tr.buffer)
3312 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
3313
3314 mutex_unlock(&trace_types_lock);
3315
3316 *fpos += cnt;
3317
3318 return cnt;
3319}
3320
3327static const struct file_operations tracing_max_lat_fops = { 3321static const struct file_operations tracing_max_lat_fops = {
3328 .open = tracing_open_generic, 3322 .open = tracing_open_generic,
3329 .read = tracing_max_lat_read, 3323 .read = tracing_max_lat_read,
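
tracing_clock_read() lists the available clocks with the active one in brackets (e.g. "[local] global"), and tracing_clock_write() switches both the live buffer and, when present, the max-latency buffer to the chosen clock. Assuming debugfs is mounted at /sys/kernel/debug (the path is an assumption of this sketch, not something the patch establishes), the new file can be driven from user space like this:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* assumed debugfs mount point; adjust if debugfs lives elsewhere */
#define TRACE_CLOCK_FILE "/sys/kernel/debug/tracing/trace_clock"

int main(void)
{
	char buf[128];
	ssize_t n;
	int fd;

	/* read the clock list, active clock shown in brackets */
	fd = open(TRACE_CLOCK_FILE, O_RDONLY);
	if (fd < 0) {
		perror("open " TRACE_CLOCK_FILE);
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("available clocks: %s", buf);
	}
	close(fd);

	/* select the cross-CPU ordered clock */
	fd = open(TRACE_CLOCK_FILE, O_WRONLY);
	if (fd < 0) {
		perror("open " TRACE_CLOCK_FILE);
		return 1;
	}
	if (write(fd, "global\n", strlen("global\n")) < 0)
		perror("write");
	close(fd);
	return 0;
}
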
@@ -3361,6 +3355,12 @@ static const struct file_operations tracing_mark_fops = {
3361 .write = tracing_mark_write, 3355 .write = tracing_mark_write,
3362}; 3356};
3363 3357
3358static const struct file_operations trace_clock_fops = {
3359 .open = tracing_open_generic,
3360 .read = tracing_clock_read,
3361 .write = tracing_clock_write,
3362};
3363
3364struct ftrace_buffer_info { 3364struct ftrace_buffer_info {
3365 struct trace_array *tr; 3365 struct trace_array *tr;
3366 void *spare; 3366 void *spare;
@@ -3626,7 +3626,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3626 struct trace_seq *s; 3626 struct trace_seq *s;
3627 unsigned long cnt; 3627 unsigned long cnt;
3628 3628
3629 s = kmalloc(sizeof(*s), GFP_ATOMIC); 3629 s = kmalloc(sizeof(*s), GFP_KERNEL);
3630 if (!s) 3630 if (!s)
3631 return -ENOMEM; 3631 return -ENOMEM;
3632 3632
@@ -3641,9 +3641,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3641 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 3641 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3642 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 3642 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3643 3643
3644 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3645 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3646
3647 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 3644 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3648 3645
3649 kfree(s); 3646 kfree(s);
@@ -3904,17 +3901,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
3904 if (ret < 0) 3901 if (ret < 0)
3905 return ret; 3902 return ret;
3906 3903
3907 switch (val) { 3904 if (val != 0 && val != 1)
3908 case 0:
3909 trace_flags &= ~(1 << index);
3910 break;
3911 case 1:
3912 trace_flags |= 1 << index;
3913 break;
3914
3915 default:
3916 return -EINVAL; 3905 return -EINVAL;
3917 } 3906 set_tracer_flags(1 << index, val);
3918 3907
3919 *ppos += cnt; 3908 *ppos += cnt;
3920 3909
@@ -4082,11 +4071,13 @@ static __init int tracer_init_debugfs(void)
4082 trace_create_file("current_tracer", 0644, d_tracer, 4071 trace_create_file("current_tracer", 0644, d_tracer,
4083 &global_trace, &set_tracer_fops); 4072 &global_trace, &set_tracer_fops);
4084 4073
4074#ifdef CONFIG_TRACER_MAX_TRACE
4085 trace_create_file("tracing_max_latency", 0644, d_tracer, 4075 trace_create_file("tracing_max_latency", 0644, d_tracer,
4086 &tracing_max_latency, &tracing_max_lat_fops); 4076 &tracing_max_latency, &tracing_max_lat_fops);
4087 4077
4088 trace_create_file("tracing_thresh", 0644, d_tracer, 4078 trace_create_file("tracing_thresh", 0644, d_tracer,
4089 &tracing_thresh, &tracing_max_lat_fops); 4079 &tracing_thresh, &tracing_max_lat_fops);
4080#endif
4090 4081
4091 trace_create_file("README", 0444, d_tracer, 4082 trace_create_file("README", 0444, d_tracer,
4092 NULL, &tracing_readme_fops); 4083 NULL, &tracing_readme_fops);
@@ -4103,6 +4094,9 @@ static __init int tracer_init_debugfs(void)
4103 trace_create_file("saved_cmdlines", 0444, d_tracer, 4094 trace_create_file("saved_cmdlines", 0444, d_tracer,
4104 NULL, &tracing_saved_cmdlines_fops); 4095 NULL, &tracing_saved_cmdlines_fops);
4105 4096
4097 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4098 &trace_clock_fops);
4099
4106#ifdef CONFIG_DYNAMIC_FTRACE 4100#ifdef CONFIG_DYNAMIC_FTRACE
4107 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4101 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4108 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4102 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -4243,8 +4237,11 @@ static void __ftrace_dump(bool disable_tracing)
4243 iter.pos = -1; 4237 iter.pos = -1;
4244 4238
4245 if (find_next_entry_inc(&iter) != NULL) { 4239 if (find_next_entry_inc(&iter) != NULL) {
4246 print_trace_line(&iter); 4240 int ret;
4247 trace_consume(&iter); 4241
4242 ret = print_trace_line(&iter);
4243 if (ret != TRACE_TYPE_NO_CONSUME)
4244 trace_consume(&iter);
4248 } 4245 }
4249 4246
4250 trace_printk_seq(&iter.seq); 4247 trace_printk_seq(&iter.seq);
@@ -4278,7 +4275,6 @@ void ftrace_dump(void)
4278 4275
4279__init static int tracer_alloc_buffers(void) 4276__init static int tracer_alloc_buffers(void)
4280{ 4277{
4281 struct trace_array_cpu *data;
4282 int ring_buf_size; 4278 int ring_buf_size;
4283 int i; 4279 int i;
4284 int ret = -ENOMEM; 4280 int ret = -ENOMEM;
@@ -4328,7 +4324,7 @@ __init static int tracer_alloc_buffers(void)
4328 4324
4329 /* Allocate the first page for all buffers */ 4325 /* Allocate the first page for all buffers */
4330 for_each_tracing_cpu(i) { 4326 for_each_tracing_cpu(i) {
4331 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4327 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4332 max_tr.data[i] = &per_cpu(max_data, i); 4328 max_tr.data[i] = &per_cpu(max_data, i);
4333 } 4329 }
4334 4330
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ff1ef411a176..ea7e0bcbd539 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -38,8 +38,6 @@ enum trace_type {
38 TRACE_GRAPH_ENT, 38 TRACE_GRAPH_ENT,
39 TRACE_USER_STACK, 39 TRACE_USER_STACK,
40 TRACE_HW_BRANCHES, 40 TRACE_HW_BRANCHES,
41 TRACE_SYSCALL_ENTER,
42 TRACE_SYSCALL_EXIT,
43 TRACE_KMEM_ALLOC, 41 TRACE_KMEM_ALLOC,
44 TRACE_KMEM_FREE, 42 TRACE_KMEM_FREE,
45 TRACE_POWER, 43 TRACE_POWER,
@@ -251,9 +249,6 @@ struct trace_array_cpu {
251 atomic_t disabled; 249 atomic_t disabled;
252 void *buffer_page; /* ring buffer spare */ 250 void *buffer_page; /* ring buffer spare */
253 251
254 /* these fields get copied into max-trace: */
255 unsigned long trace_idx;
256 unsigned long overrun;
257 unsigned long saved_latency; 252 unsigned long saved_latency;
258 unsigned long critical_start; 253 unsigned long critical_start;
259 unsigned long critical_end; 254 unsigned long critical_end;
@@ -261,6 +256,7 @@ struct trace_array_cpu {
261 unsigned long nice; 256 unsigned long nice;
262 unsigned long policy; 257 unsigned long policy;
263 unsigned long rt_priority; 258 unsigned long rt_priority;
259 unsigned long skipped_entries;
264 cycle_t preempt_timestamp; 260 cycle_t preempt_timestamp;
265 pid_t pid; 261 pid_t pid;
266 uid_t uid; 262 uid_t uid;
@@ -334,10 +330,6 @@ extern void __ftrace_bad_type(void);
334 TRACE_KMEM_ALLOC); \ 330 TRACE_KMEM_ALLOC); \
335 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 331 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
336 TRACE_KMEM_FREE); \ 332 TRACE_KMEM_FREE); \
337 IF_ASSIGN(var, ent, struct syscall_trace_enter, \
338 TRACE_SYSCALL_ENTER); \
339 IF_ASSIGN(var, ent, struct syscall_trace_exit, \
340 TRACE_SYSCALL_EXIT); \
341 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ 333 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
342 __ftrace_bad_type(); \ 334 __ftrace_bad_type(); \
343 } while (0) 335 } while (0)
@@ -439,12 +431,13 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
439 431
440struct ring_buffer_event; 432struct ring_buffer_event;
441 433
442struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 434struct ring_buffer_event *
443 int type, 435trace_buffer_lock_reserve(struct ring_buffer *buffer,
444 unsigned long len, 436 int type,
445 unsigned long flags, 437 unsigned long len,
446 int pc); 438 unsigned long flags,
447void trace_buffer_unlock_commit(struct trace_array *tr, 439 int pc);
440void trace_buffer_unlock_commit(struct ring_buffer *buffer,
448 struct ring_buffer_event *event, 441 struct ring_buffer_event *event,
449 unsigned long flags, int pc); 442 unsigned long flags, int pc);
450 443
@@ -454,10 +447,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
454struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 447struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
455 int *ent_cpu, u64 *ent_ts); 448 int *ent_cpu, u64 *ent_ts);
456 449
457void tracing_generic_entry_update(struct trace_entry *entry,
458 unsigned long flags,
459 int pc);
460
461void default_wait_pipe(struct trace_iterator *iter); 450void default_wait_pipe(struct trace_iterator *iter);
462void poll_wait_pipe(struct trace_iterator *iter); 451void poll_wait_pipe(struct trace_iterator *iter);
463 452
@@ -487,6 +476,7 @@ void trace_function(struct trace_array *tr,
487 476
488void trace_graph_return(struct ftrace_graph_ret *trace); 477void trace_graph_return(struct ftrace_graph_ret *trace);
489int trace_graph_entry(struct ftrace_graph_ent *trace); 478int trace_graph_entry(struct ftrace_graph_ent *trace);
479void set_graph_array(struct trace_array *tr);
490 480
491void tracing_start_cmdline_record(void); 481void tracing_start_cmdline_record(void);
492void tracing_stop_cmdline_record(void); 482void tracing_stop_cmdline_record(void);
@@ -498,16 +488,40 @@ void unregister_tracer(struct tracer *type);
498 488
499extern unsigned long nsecs_to_usecs(unsigned long nsecs); 489extern unsigned long nsecs_to_usecs(unsigned long nsecs);
500 490
491#ifdef CONFIG_TRACER_MAX_TRACE
501extern unsigned long tracing_max_latency; 492extern unsigned long tracing_max_latency;
502extern unsigned long tracing_thresh; 493extern unsigned long tracing_thresh;
503 494
504void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 495void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
505void update_max_tr_single(struct trace_array *tr, 496void update_max_tr_single(struct trace_array *tr,
506 struct task_struct *tsk, int cpu); 497 struct task_struct *tsk, int cpu);
498#endif /* CONFIG_TRACER_MAX_TRACE */
499
500#ifdef CONFIG_STACKTRACE
501void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
502 int skip, int pc);
503
504void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
505 int pc);
507 506
508void __trace_stack(struct trace_array *tr, 507void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
509 unsigned long flags, 508 int pc);
510 int skip, int pc); 509#else
510static inline void ftrace_trace_stack(struct trace_array *tr,
511 unsigned long flags, int skip, int pc)
512{
513}
514
515static inline void ftrace_trace_userstack(struct trace_array *tr,
516 unsigned long flags, int pc)
517{
518}
519
520static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
521 int skip, int pc)
522{
523}
524#endif /* CONFIG_STACKTRACE */
511 525
512extern cycle_t ftrace_now(int cpu); 526extern cycle_t ftrace_now(int cpu);
513 527
@@ -533,6 +547,10 @@ extern unsigned long ftrace_update_tot_cnt;
533extern int DYN_FTRACE_TEST_NAME(void); 547extern int DYN_FTRACE_TEST_NAME(void);
534#endif 548#endif
535 549
550extern int ring_buffer_expanded;
551extern bool tracing_selftest_disabled;
552DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
553
536#ifdef CONFIG_FTRACE_STARTUP_TEST 554#ifdef CONFIG_FTRACE_STARTUP_TEST
537extern int trace_selftest_startup_function(struct tracer *trace, 555extern int trace_selftest_startup_function(struct tracer *trace,
538 struct trace_array *tr); 556 struct trace_array *tr);
@@ -566,9 +584,16 @@ extern int
566trace_vbprintk(unsigned long ip, const char *fmt, va_list args); 584trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
567extern int 585extern int
568trace_vprintk(unsigned long ip, const char *fmt, va_list args); 586trace_vprintk(unsigned long ip, const char *fmt, va_list args);
587extern int
588trace_array_vprintk(struct trace_array *tr,
589 unsigned long ip, const char *fmt, va_list args);
590int trace_array_printk(struct trace_array *tr,
591 unsigned long ip, const char *fmt, ...);
569 592
570extern unsigned long trace_flags; 593extern unsigned long trace_flags;
571 594
595extern int trace_clock_id;
596
572/* Standard output formatting function used for function return traces */ 597/* Standard output formatting function used for function return traces */
573#ifdef CONFIG_FUNCTION_GRAPH_TRACER 598#ifdef CONFIG_FUNCTION_GRAPH_TRACER
574extern enum print_line_t print_graph_function(struct trace_iterator *iter); 599extern enum print_line_t print_graph_function(struct trace_iterator *iter);
@@ -615,6 +640,7 @@ print_graph_function(struct trace_iterator *iter)
615 640
616extern struct pid *ftrace_pid_trace; 641extern struct pid *ftrace_pid_trace;
617 642
643#ifdef CONFIG_FUNCTION_TRACER
618static inline int ftrace_trace_task(struct task_struct *task) 644static inline int ftrace_trace_task(struct task_struct *task)
619{ 645{
620 if (!ftrace_pid_trace) 646 if (!ftrace_pid_trace)
@@ -622,6 +648,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
622 648
623 return test_tsk_trace_trace(task); 649 return test_tsk_trace_trace(task);
624} 650}
651#else
652static inline int ftrace_trace_task(struct task_struct *task)
653{
654 return 1;
655}
656#endif
625 657
626/* 658/*
627 * trace_iterator_flags is an enumeration that defines bit 659 * trace_iterator_flags is an enumeration that defines bit
@@ -650,9 +682,8 @@ enum trace_iterator_flags {
650 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 682 TRACE_ITER_PRINTK_MSGONLY = 0x10000,
651 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 683 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
652 TRACE_ITER_LATENCY_FMT = 0x40000, 684 TRACE_ITER_LATENCY_FMT = 0x40000,
653 TRACE_ITER_GLOBAL_CLK = 0x80000, 685 TRACE_ITER_SLEEP_TIME = 0x80000,
654 TRACE_ITER_SLEEP_TIME = 0x100000, 686 TRACE_ITER_GRAPH_TIME = 0x100000,
655 TRACE_ITER_GRAPH_TIME = 0x200000,
656}; 687};
657 688
658/* 689/*
@@ -749,6 +780,7 @@ struct ftrace_event_field {
749 struct list_head link; 780 struct list_head link;
750 char *name; 781 char *name;
751 char *type; 782 char *type;
783 int filter_type;
752 int offset; 784 int offset;
753 int size; 785 int size;
754 int is_signed; 786 int is_signed;
@@ -758,13 +790,15 @@ struct event_filter {
758 int n_preds; 790 int n_preds;
759 struct filter_pred **preds; 791 struct filter_pred **preds;
760 char *filter_string; 792 char *filter_string;
793 bool no_reset;
761}; 794};
762 795
763struct event_subsystem { 796struct event_subsystem {
764 struct list_head list; 797 struct list_head list;
765 const char *name; 798 const char *name;
766 struct dentry *entry; 799 struct dentry *entry;
767 void *filter; 800 struct event_filter *filter;
801 int nr_events;
768}; 802};
769 803
770struct filter_pred; 804struct filter_pred;
@@ -792,6 +826,7 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system,
792 char *filter_string); 826 char *filter_string);
793extern void print_subsystem_event_filter(struct event_subsystem *system, 827extern void print_subsystem_event_filter(struct event_subsystem *system,
794 struct trace_seq *s); 828 struct trace_seq *s);
829extern int filter_assign_type(const char *type);
795 830
796static inline int 831static inline int
797filter_check_discard(struct ftrace_event_call *call, void *rec, 832filter_check_discard(struct ftrace_event_call *call, void *rec,
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index a29ef23ffb47..19bfc75d467e 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -41,14 +41,12 @@ void disable_boot_trace(void)
41 41
42static int boot_trace_init(struct trace_array *tr) 42static int boot_trace_init(struct trace_array *tr)
43{ 43{
44 int cpu;
45 boot_trace = tr; 44 boot_trace = tr;
46 45
47 if (!tr) 46 if (!tr)
48 return 0; 47 return 0;
49 48
50 for_each_cpu(cpu, cpu_possible_mask) 49 tracing_reset_online_cpus(tr);
51 tracing_reset(tr, cpu);
52 50
53 tracing_sched_switch_assign_trace(tr); 51 tracing_sched_switch_assign_trace(tr);
54 return 0; 52 return 0;
@@ -132,6 +130,7 @@ struct tracer boot_tracer __read_mostly =
132void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
133{ 131{
134 struct ring_buffer_event *event; 132 struct ring_buffer_event *event;
133 struct ring_buffer *buffer;
135 struct trace_boot_call *entry; 134 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace; 135 struct trace_array *tr = boot_trace;
137 136
@@ -144,13 +143,14 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
144 sprint_symbol(bt->func, (unsigned long)fn); 143 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable(); 144 preempt_disable();
146 145
147 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, 146 buffer = tr->buffer;
147 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
148 sizeof(*entry), 0, 0); 148 sizeof(*entry), 0, 0);
149 if (!event) 149 if (!event)
150 goto out; 150 goto out;
151 entry = ring_buffer_event_data(event); 151 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 152 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(tr, event, 0, 0); 153 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 154 out:
155 preempt_enable(); 155 preempt_enable();
156} 156}
@@ -158,6 +158,7 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 159{
160 struct ring_buffer_event *event; 160 struct ring_buffer_event *event;
161 struct ring_buffer *buffer;
161 struct trace_boot_ret *entry; 162 struct trace_boot_ret *entry;
162 struct trace_array *tr = boot_trace; 163 struct trace_array *tr = boot_trace;
163 164
@@ -167,13 +168,14 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
167 sprint_symbol(bt->func, (unsigned long)fn); 168 sprint_symbol(bt->func, (unsigned long)fn);
168 preempt_disable(); 169 preempt_disable();
169 170
170 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, 171 buffer = tr->buffer;
172 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
171 sizeof(*entry), 0, 0); 173 sizeof(*entry), 0, 0);
172 if (!event) 174 if (!event)
173 goto out; 175 goto out;
174 entry = ring_buffer_event_data(event); 176 entry = ring_buffer_event_data(event);
175 entry->boot_ret = *bt; 177 entry->boot_ret = *bt;
176 trace_buffer_unlock_commit(tr, event, 0, 0); 178 trace_buffer_unlock_commit(buffer, event, 0, 0);
177 out: 179 out:
178 preempt_enable(); 180 preempt_enable();
179} 181}
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 5b5895afecfe..11ba5bb4ed0a 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id)
14 14
15 mutex_lock(&event_mutex); 15 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 16 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id) { 17 if (event->id == event_id && event->profile_enable) {
18 ret = event->profile_enable(event); 18 ret = event->profile_enable(event);
19 break; 19 break;
20 } 20 }
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 5e32e375134d..6db005e12487 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore, 26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT( 27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func) 28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
29 TRACE_FIELD(int, ret.depth, depth) 32 TRACE_FIELD(int, ret.depth, depth)
30 ), 33 ),
31 TP_RAW_FMT("<-- %lx (%d)") 34 TP_RAW_FMT("<-- %lx (%d)")
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa08be69a1b6..78b1ed230177 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -17,6 +17,8 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19 19
20#include <asm/setup.h>
21
20#include "trace_output.h" 22#include "trace_output.h"
21 23
22#define TRACE_SYSTEM "TRACE_SYSTEM" 24#define TRACE_SYSTEM "TRACE_SYSTEM"
@@ -25,8 +27,9 @@ DEFINE_MUTEX(event_mutex);
25 27
26LIST_HEAD(ftrace_events); 28LIST_HEAD(ftrace_events);
27 29
28int trace_define_field(struct ftrace_event_call *call, char *type, 30int trace_define_field(struct ftrace_event_call *call, const char *type,
29 char *name, int offset, int size, int is_signed) 31 const char *name, int offset, int size, int is_signed,
32 int filter_type)
30{ 33{
31 struct ftrace_event_field *field; 34 struct ftrace_event_field *field;
32 35
@@ -42,9 +45,15 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
42 if (!field->type) 45 if (!field->type)
43 goto err; 46 goto err;
44 47
48 if (filter_type == FILTER_OTHER)
49 field->filter_type = filter_assign_type(type);
50 else
51 field->filter_type = filter_type;
52
45 field->offset = offset; 53 field->offset = offset;
46 field->size = size; 54 field->size = size;
47 field->is_signed = is_signed; 55 field->is_signed = is_signed;
56
48 list_add(&field->link, &call->fields); 57 list_add(&field->link, &call->fields);
49 58
50 return 0; 59 return 0;
@@ -60,6 +69,29 @@ err:
60} 69}
61EXPORT_SYMBOL_GPL(trace_define_field); 70EXPORT_SYMBOL_GPL(trace_define_field);
62 71
72#define __common_field(type, item) \
73 ret = trace_define_field(call, #type, "common_" #item, \
74 offsetof(typeof(ent), item), \
75 sizeof(ent.item), \
76 is_signed_type(type), FILTER_OTHER); \
77 if (ret) \
78 return ret;
79
80int trace_define_common_fields(struct ftrace_event_call *call)
81{
82 int ret;
83 struct trace_entry ent;
84
85 __common_field(unsigned short, type);
86 __common_field(unsigned char, flags);
87 __common_field(unsigned char, preempt_count);
88 __common_field(int, pid);
89 __common_field(int, tgid);
90
91 return ret;
92}
93EXPORT_SYMBOL_GPL(trace_define_common_fields);
94
63#ifdef CONFIG_MODULES 95#ifdef CONFIG_MODULES
64 96
65static void trace_destroy_fields(struct ftrace_event_call *call) 97static void trace_destroy_fields(struct ftrace_event_call *call)
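
trace_define_common_fields() registers the fields shared by every event by letting the __common_field() macro derive the name, offset, size, and signedness directly from struct trace_entry, so the description can never drift from the actual layout. A compile-and-run sketch of the same self-describing technique against a made-up record type:

#include <stdio.h>
#include <stddef.h>

/* same trick the kernel uses: a type is signed if (type)-1 compares below 1 */
#define is_signed_type(type)	(((type)(-1)) < (type)1)

struct demo_entry {
	unsigned short	type;
	unsigned char	flags;
	int		pid;
};

struct field_desc {
	const char	*name;
	size_t		offset;
	size_t		size;
	int		is_signed;
};

/* record name/offset/size/signedness for one member of struct demo_entry */
#define DEMO_FIELD(type, item)						\
	{ #item, offsetof(struct demo_entry, item),			\
	  sizeof(((struct demo_entry *)0)->item), is_signed_type(type) }

static const struct field_desc demo_fields[] = {
	DEMO_FIELD(unsigned short, type),
	DEMO_FIELD(unsigned char, flags),
	DEMO_FIELD(int, pid),
};

int main(void)
{
	size_t i;

	for (i = 0; i < sizeof(demo_fields) / sizeof(demo_fields[0]); i++)
		printf("%-6s offset=%zu size=%zu signed=%d\n",
		       demo_fields[i].name, demo_fields[i].offset,
		       demo_fields[i].size, demo_fields[i].is_signed);
	return 0;
}
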
@@ -84,14 +116,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
84 if (call->enabled) { 116 if (call->enabled) {
85 call->enabled = 0; 117 call->enabled = 0;
86 tracing_stop_cmdline_record(); 118 tracing_stop_cmdline_record();
87 call->unregfunc(); 119 call->unregfunc(call->data);
88 } 120 }
89 break; 121 break;
90 case 1: 122 case 1:
91 if (!call->enabled) { 123 if (!call->enabled) {
92 call->enabled = 1; 124 call->enabled = 1;
93 tracing_start_cmdline_record(); 125 tracing_start_cmdline_record();
94 call->regfunc(); 126 call->regfunc(call->data);
95 } 127 }
96 break; 128 break;
97 } 129 }
@@ -300,10 +332,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
300 332
301static void *t_start(struct seq_file *m, loff_t *pos) 333static void *t_start(struct seq_file *m, loff_t *pos)
302{ 334{
335 struct ftrace_event_call *call = NULL;
336 loff_t l;
337
303 mutex_lock(&event_mutex); 338 mutex_lock(&event_mutex);
304 if (*pos == 0) 339
305 m->private = ftrace_events.next; 340 m->private = ftrace_events.next;
306 return t_next(m, NULL, pos); 341 for (l = 0; l <= *pos; ) {
342 call = t_next(m, NULL, &l);
343 if (!call)
344 break;
345 }
346 return call;
307} 347}
308 348
309static void * 349static void *
@@ -332,10 +372,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
332 372
333static void *s_start(struct seq_file *m, loff_t *pos) 373static void *s_start(struct seq_file *m, loff_t *pos)
334{ 374{
375 struct ftrace_event_call *call = NULL;
376 loff_t l;
377
335 mutex_lock(&event_mutex); 378 mutex_lock(&event_mutex);
336 if (*pos == 0) 379
337 m->private = ftrace_events.next; 380 m->private = ftrace_events.next;
338 return s_next(m, NULL, pos); 381 for (l = 0; l <= *pos; ) {
382 call = s_next(m, NULL, &l);
383 if (!call)
384 break;
385 }
386 return call;
339} 387}
340 388
341static int t_show(struct seq_file *m, void *v) 389static int t_show(struct seq_file *m, void *v)
@@ -360,7 +408,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
360 const struct seq_operations *seq_ops; 408 const struct seq_operations *seq_ops;
361 409
362 if ((file->f_mode & FMODE_WRITE) && 410 if ((file->f_mode & FMODE_WRITE) &&
363 !(file->f_flags & O_APPEND)) 411 (file->f_flags & O_TRUNC))
364 ftrace_clear_events(); 412 ftrace_clear_events();
365 413
366 seq_ops = inode->i_private; 414 seq_ops = inode->i_private;
@@ -558,7 +606,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
558 trace_seq_printf(s, "format:\n"); 606 trace_seq_printf(s, "format:\n");
559 trace_write_header(s); 607 trace_write_header(s);
560 608
561 r = call->show_format(s); 609 r = call->show_format(call, s);
562 if (!r) { 610 if (!r) {
563 /* 611 /*
564 * ug! The format output is bigger than a PAGE!! 612 * ug! The format output is bigger than a PAGE!!
@@ -833,8 +881,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
833 881
834 /* First see if we did not already create this dir */ 882 /* First see if we did not already create this dir */
835 list_for_each_entry(system, &event_subsystems, list) { 883 list_for_each_entry(system, &event_subsystems, list) {
836 if (strcmp(system->name, name) == 0) 884 if (strcmp(system->name, name) == 0) {
885 system->nr_events++;
837 return system->entry; 886 return system->entry;
887 }
838 } 888 }
839 889
840 /* need to create new entry */ 890 /* need to create new entry */
@@ -853,6 +903,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
853 return d_events; 903 return d_events;
854 } 904 }
855 905
906 system->nr_events = 1;
856 system->name = kstrdup(name, GFP_KERNEL); 907 system->name = kstrdup(name, GFP_KERNEL);
857 if (!system->name) { 908 if (!system->name) {
858 debugfs_remove(system->entry); 909 debugfs_remove(system->entry);
@@ -904,15 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
904 if (strcmp(call->system, TRACE_SYSTEM) != 0) 955 if (strcmp(call->system, TRACE_SYSTEM) != 0)
905 d_events = event_subsystem_dir(call->system, d_events); 956 d_events = event_subsystem_dir(call->system, d_events);
906 957
907 if (call->raw_init) {
908 ret = call->raw_init();
909 if (ret < 0) {
910 pr_warning("Could not initialize trace point"
911 " events/%s\n", call->name);
912 return ret;
913 }
914 }
915
916 call->dir = debugfs_create_dir(call->name, d_events); 958 call->dir = debugfs_create_dir(call->name, d_events);
917 if (!call->dir) { 959 if (!call->dir) {
918 pr_warning("Could not create debugfs " 960 pr_warning("Could not create debugfs "
@@ -924,12 +966,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
924 entry = trace_create_file("enable", 0644, call->dir, call, 966 entry = trace_create_file("enable", 0644, call->dir, call,
925 enable); 967 enable);
926 968
927 if (call->id) 969 if (call->id && call->profile_enable)
928 entry = trace_create_file("id", 0444, call->dir, call, 970 entry = trace_create_file("id", 0444, call->dir, call,
929 id); 971 id);
930 972
931 if (call->define_fields) { 973 if (call->define_fields) {
932 ret = call->define_fields(); 974 ret = call->define_fields(call);
933 if (ret < 0) { 975 if (ret < 0) {
934 pr_warning("Could not initialize trace point" 976 pr_warning("Could not initialize trace point"
935 " events/%s\n", call->name); 977 " events/%s\n", call->name);
@@ -971,6 +1013,32 @@ struct ftrace_module_file_ops {
971 struct file_operations filter; 1013 struct file_operations filter;
972}; 1014};
973 1015
1016static void remove_subsystem_dir(const char *name)
1017{
1018 struct event_subsystem *system;
1019
1020 if (strcmp(name, TRACE_SYSTEM) == 0)
1021 return;
1022
1023 list_for_each_entry(system, &event_subsystems, list) {
1024 if (strcmp(system->name, name) == 0) {
1025 if (!--system->nr_events) {
1026 struct event_filter *filter = system->filter;
1027
1028 debugfs_remove_recursive(system->entry);
1029 list_del(&system->list);
1030 if (filter) {
1031 kfree(filter->filter_string);
1032 kfree(filter);
1033 }
1034 kfree(system->name);
1035 kfree(system);
1036 }
1037 break;
1038 }
1039 }
1040}
1041
974static struct ftrace_module_file_ops * 1042static struct ftrace_module_file_ops *
975trace_create_file_ops(struct module *mod) 1043trace_create_file_ops(struct module *mod)
976{ 1044{
@@ -1011,6 +1079,7 @@ static void trace_module_add_events(struct module *mod)
1011 struct ftrace_module_file_ops *file_ops = NULL; 1079 struct ftrace_module_file_ops *file_ops = NULL;
1012 struct ftrace_event_call *call, *start, *end; 1080 struct ftrace_event_call *call, *start, *end;
1013 struct dentry *d_events; 1081 struct dentry *d_events;
1082 int ret;
1014 1083
1015 start = mod->trace_events; 1084 start = mod->trace_events;
1016 end = mod->trace_events + mod->num_trace_events; 1085 end = mod->trace_events + mod->num_trace_events;
@@ -1026,7 +1095,15 @@ static void trace_module_add_events(struct module *mod)
1026 /* The linker may leave blanks */ 1095 /* The linker may leave blanks */
1027 if (!call->name) 1096 if (!call->name)
1028 continue; 1097 continue;
1029 1098 if (call->raw_init) {
1099 ret = call->raw_init();
1100 if (ret < 0) {
1101 if (ret != -ENOSYS)
1102 pr_warning("Could not initialize trace "
1103 "point events/%s\n", call->name);
1104 continue;
1105 }
1106 }
1030 /* 1107 /*
1031 * This module has events, create file ops for this module 1108 * This module has events, create file ops for this module
1032 * if not already done. 1109 * if not already done.
@@ -1061,6 +1138,7 @@ static void trace_module_remove_events(struct module *mod)
1061 list_del(&call->list); 1138 list_del(&call->list);
1062 trace_destroy_fields(call); 1139 trace_destroy_fields(call);
1063 destroy_preds(call); 1140 destroy_preds(call);
1141 remove_subsystem_dir(call->system);
1064 } 1142 }
1065 } 1143 }
1066 1144
@@ -1117,6 +1195,18 @@ struct notifier_block trace_module_nb = {
1117extern struct ftrace_event_call __start_ftrace_events[]; 1195extern struct ftrace_event_call __start_ftrace_events[];
1118extern struct ftrace_event_call __stop_ftrace_events[]; 1196extern struct ftrace_event_call __stop_ftrace_events[];
1119 1197
1198static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1199
1200static __init int setup_trace_event(char *str)
1201{
1202 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1203 ring_buffer_expanded = 1;
1204 tracing_selftest_disabled = 1;
1205
1206 return 1;
1207}
1208__setup("trace_event=", setup_trace_event);
1209
1120static __init int event_trace_init(void) 1210static __init int event_trace_init(void)
1121{ 1211{
1122 struct ftrace_event_call *call; 1212 struct ftrace_event_call *call;
@@ -1124,6 +1214,8 @@ static __init int event_trace_init(void)
1124 struct dentry *entry; 1214 struct dentry *entry;
1125 struct dentry *d_events; 1215 struct dentry *d_events;
1126 int ret; 1216 int ret;
1217 char *buf = bootup_event_buf;
1218 char *token;
1127 1219
1128 d_tracer = tracing_init_dentry(); 1220 d_tracer = tracing_init_dentry();
1129 if (!d_tracer) 1221 if (!d_tracer)
@@ -1163,12 +1255,34 @@ static __init int event_trace_init(void)
1163 /* The linker may leave blanks */ 1255 /* The linker may leave blanks */
1164 if (!call->name) 1256 if (!call->name)
1165 continue; 1257 continue;
1258 if (call->raw_init) {
1259 ret = call->raw_init();
1260 if (ret < 0) {
1261 if (ret != -ENOSYS)
1262 pr_warning("Could not initialize trace "
1263 "point events/%s\n", call->name);
1264 continue;
1265 }
1266 }
1166 list_add(&call->list, &ftrace_events); 1267 list_add(&call->list, &ftrace_events);
1167 event_create_dir(call, d_events, &ftrace_event_id_fops, 1268 event_create_dir(call, d_events, &ftrace_event_id_fops,
1168 &ftrace_enable_fops, &ftrace_event_filter_fops, 1269 &ftrace_enable_fops, &ftrace_event_filter_fops,
1169 &ftrace_event_format_fops); 1270 &ftrace_event_format_fops);
1170 } 1271 }
1171 1272
1273 while (true) {
1274 token = strsep(&buf, ",");
1275
1276 if (!token)
1277 break;
1278 if (!*token)
1279 continue;
1280
1281 ret = ftrace_set_clr_event(token, 1);
1282 if (ret)
1283 pr_warning("Failed to enable trace event: %s\n", token);
1284 }
1285
1172 ret = register_module_notifier(&trace_module_nb); 1286 ret = register_module_notifier(&trace_module_nb);
1173 if (ret) 1287 if (ret)
1174 pr_warning("Failed to register trace events module notifier\n"); 1288 pr_warning("Failed to register trace events module notifier\n");
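
The new trace_event= boot parameter is copied verbatim at early boot by setup_trace_event() and only split up here, once the event directories exist: strsep() walks the comma-separated list, empty tokens are skipped, and each remaining name is handed to ftrace_set_clr_event(). The same tokenising loop in a self-contained user-space form (the sample string is made up):

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* made-up boot argument, as in trace_event=sched:sched_switch,,irq_handler_entry */
	char bootup_buf[] = "sched:sched_switch,,irq_handler_entry";
	char *buf = bootup_buf;
	char *token;

	while (1) {
		token = strsep(&buf, ",");

		if (!token)		/* list exhausted */
			break;
		if (!*token)		/* skip empty entries such as ",," */
			continue;

		printf("would enable trace event: %s\n", token);
	}
	return 0;
}
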
@@ -1324,6 +1438,7 @@ static void
1324function_test_events_call(unsigned long ip, unsigned long parent_ip) 1438function_test_events_call(unsigned long ip, unsigned long parent_ip)
1325{ 1439{
1326 struct ring_buffer_event *event; 1440 struct ring_buffer_event *event;
1441 struct ring_buffer *buffer;
1327 struct ftrace_entry *entry; 1442 struct ftrace_entry *entry;
1328 unsigned long flags; 1443 unsigned long flags;
1329 long disabled; 1444 long disabled;
@@ -1341,7 +1456,8 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1341 1456
1342 local_save_flags(flags); 1457 local_save_flags(flags);
1343 1458
1344 event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), 1459 event = trace_current_buffer_lock_reserve(&buffer,
1460 TRACE_FN, sizeof(*entry),
1345 flags, pc); 1461 flags, pc);
1346 if (!event) 1462 if (!event)
1347 goto out; 1463 goto out;
@@ -1349,7 +1465,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1349 entry->ip = ip; 1465 entry->ip = ip;
1350 entry->parent_ip = parent_ip; 1466 entry->parent_ip = parent_ip;
1351 1467
1352 trace_nowake_buffer_unlock_commit(event, flags, pc); 1468 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1353 1469
1354 out: 1470 out:
1355 atomic_dec(&per_cpu(test_event_disable, cpu)); 1471 atomic_dec(&per_cpu(test_event_disable, cpu));
@@ -1376,10 +1492,10 @@ static __init void event_trace_self_test_with_function(void)
1376 1492
1377static __init int event_trace_self_tests_init(void) 1493static __init int event_trace_self_tests_init(void)
1378{ 1494{
1379 1495 if (!tracing_selftest_disabled) {
1380 event_trace_self_tests(); 1496 event_trace_self_tests();
1381 1497 event_trace_self_test_with_function();
1382 event_trace_self_test_with_function(); 1498 }
1383 1499
1384 return 0; 1500 return 0;
1385} 1501}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index db6e54bdb596..93660fbbf629 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,8 +27,6 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30static DEFINE_MUTEX(filter_mutex);
31
32enum filter_op_ids 30enum filter_op_ids
33{ 31{
34 OP_OR, 32 OP_OR,
@@ -165,6 +163,20 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
165 return match; 163 return match;
166} 164}
167 165
166/* Filter predicate for char * pointers */
167static int filter_pred_pchar(struct filter_pred *pred, void *event,
168 int val1, int val2)
169{
170 char **addr = (char **)(event + pred->offset);
171 int cmp, match;
172
173 cmp = strncmp(*addr, pred->str_val, pred->str_len);
174
175 match = (!cmp) ^ pred->not;
176
177 return match;
178}
179
168/* 180/*
169 * Filter predicate for dynamic sized arrays of characters. 181 * Filter predicate for dynamic sized arrays of characters.
170 * These are implemented through a list of strings at the end 182 * These are implemented through a list of strings at the end
@@ -178,11 +190,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
178static int filter_pred_strloc(struct filter_pred *pred, void *event, 190static int filter_pred_strloc(struct filter_pred *pred, void *event,
179 int val1, int val2) 191 int val1, int val2)
180{ 192{
181 int str_loc = *(int *)(event + pred->offset); 193 u32 str_item = *(u32 *)(event + pred->offset);
194 int str_loc = str_item & 0xffff;
195 int str_len = str_item >> 16;
182 char *addr = (char *)(event + str_loc); 196 char *addr = (char *)(event + str_loc);
183 int cmp, match; 197 int cmp, match;
184 198
185 cmp = strncmp(addr, pred->str_val, pred->str_len); 199 cmp = strncmp(addr, pred->str_val, str_len);
186 200
187 match = (!cmp) ^ pred->not; 201 match = (!cmp) ^ pred->not;
188 202
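
The updated filter_pred_strloc() reflects the new encoding of dynamically sized strings: a single u32 in the event record now carries the string's offset in its lower 16 bits and its length in the upper 16 bits, so the comparison uses the length stored with the event rather than the predicate's own string length. A small sketch of packing and unpacking that field (the helper names here are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* pack: offset in the low 16 bits, length in the high 16 bits */
static uint32_t pack_data_loc(uint16_t offset, uint16_t len)
{
	return ((uint32_t)len << 16) | offset;
}

static void unpack_data_loc(uint32_t item, uint16_t *offset, uint16_t *len)
{
	*offset = item & 0xffff;
	*len = item >> 16;
}

int main(void)
{
	uint16_t offset, len;
	uint32_t item = pack_data_loc(24, 7);	/* string lives 24 bytes in, 7 bytes long */

	unpack_data_loc(item, &offset, &len);
	printf("item=0x%08x offset=%u len=%u\n", item, offset, len);
	return 0;
}
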
@@ -294,12 +308,12 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
294{ 308{
295 struct event_filter *filter = call->filter; 309 struct event_filter *filter = call->filter;
296 310
297 mutex_lock(&filter_mutex); 311 mutex_lock(&event_mutex);
298 if (filter->filter_string) 312 if (filter && filter->filter_string)
299 trace_seq_printf(s, "%s\n", filter->filter_string); 313 trace_seq_printf(s, "%s\n", filter->filter_string);
300 else 314 else
301 trace_seq_printf(s, "none\n"); 315 trace_seq_printf(s, "none\n");
302 mutex_unlock(&filter_mutex); 316 mutex_unlock(&event_mutex);
303} 317}
304 318
305void print_subsystem_event_filter(struct event_subsystem *system, 319void print_subsystem_event_filter(struct event_subsystem *system,
@@ -307,12 +321,12 @@ void print_subsystem_event_filter(struct event_subsystem *system,
307{ 321{
308 struct event_filter *filter = system->filter; 322 struct event_filter *filter = system->filter;
309 323
310 mutex_lock(&filter_mutex); 324 mutex_lock(&event_mutex);
311 if (filter->filter_string) 325 if (filter && filter->filter_string)
312 trace_seq_printf(s, "%s\n", filter->filter_string); 326 trace_seq_printf(s, "%s\n", filter->filter_string);
313 else 327 else
314 trace_seq_printf(s, "none\n"); 328 trace_seq_printf(s, "none\n");
315 mutex_unlock(&filter_mutex); 329 mutex_unlock(&event_mutex);
316} 330}
317 331
318static struct ftrace_event_field * 332static struct ftrace_event_field *
@@ -376,26 +390,32 @@ void destroy_preds(struct ftrace_event_call *call)
376 struct event_filter *filter = call->filter; 390 struct event_filter *filter = call->filter;
377 int i; 391 int i;
378 392
393 if (!filter)
394 return;
395
379 for (i = 0; i < MAX_FILTER_PRED; i++) { 396 for (i = 0; i < MAX_FILTER_PRED; i++) {
380 if (filter->preds[i]) 397 if (filter->preds[i])
381 filter_free_pred(filter->preds[i]); 398 filter_free_pred(filter->preds[i]);
382 } 399 }
383 kfree(filter->preds); 400 kfree(filter->preds);
401 kfree(filter->filter_string);
384 kfree(filter); 402 kfree(filter);
385 call->filter = NULL; 403 call->filter = NULL;
386} 404}
387 405
388int init_preds(struct ftrace_event_call *call) 406static int init_preds(struct ftrace_event_call *call)
389{ 407{
390 struct event_filter *filter; 408 struct event_filter *filter;
391 struct filter_pred *pred; 409 struct filter_pred *pred;
392 int i; 410 int i;
393 411
412 if (call->filter)
413 return 0;
414
394 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); 415 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
395 if (!call->filter) 416 if (!call->filter)
396 return -ENOMEM; 417 return -ENOMEM;
397 418
398 call->filter_active = 0;
399 filter->n_preds = 0; 419 filter->n_preds = 0;
400 420
401 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); 421 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
@@ -417,33 +437,56 @@ oom:
417 437
418 return -ENOMEM; 438 return -ENOMEM;
419} 439}
420EXPORT_SYMBOL_GPL(init_preds);
421 440
422static void filter_free_subsystem_preds(struct event_subsystem *system) 441static int init_subsystem_preds(struct event_subsystem *system)
423{ 442{
424 struct event_filter *filter = system->filter;
425 struct ftrace_event_call *call; 443 struct ftrace_event_call *call;
426 int i; 444 int err;
427 445
428 if (filter->n_preds) { 446 list_for_each_entry(call, &ftrace_events, list) {
429 for (i = 0; i < filter->n_preds; i++) 447 if (!call->define_fields)
430 filter_free_pred(filter->preds[i]); 448 continue;
431 kfree(filter->preds); 449
432 filter->preds = NULL; 450 if (strcmp(call->system, system->name) != 0)
433 filter->n_preds = 0; 451 continue;
452
453 err = init_preds(call);
454 if (err)
455 return err;
434 } 456 }
435 457
436 mutex_lock(&event_mutex); 458 return 0;
459}
460
461enum {
462 FILTER_DISABLE_ALL,
463 FILTER_INIT_NO_RESET,
464 FILTER_SKIP_NO_RESET,
465};
466
467static void filter_free_subsystem_preds(struct event_subsystem *system,
468 int flag)
469{
470 struct ftrace_event_call *call;
471
437 list_for_each_entry(call, &ftrace_events, list) { 472 list_for_each_entry(call, &ftrace_events, list) {
438 if (!call->define_fields) 473 if (!call->define_fields)
439 continue; 474 continue;
440 475
441 if (!strcmp(call->system, system->name)) { 476 if (strcmp(call->system, system->name) != 0)
442 filter_disable_preds(call); 477 continue;
443 remove_filter_string(call->filter); 478
479 if (flag == FILTER_INIT_NO_RESET) {
480 call->filter->no_reset = false;
481 continue;
444 } 482 }
483
484 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
485 continue;
486
487 filter_disable_preds(call);
488 remove_filter_string(call->filter);
445 } 489 }
446 mutex_unlock(&event_mutex);
447} 490}
448 491
449static int filter_add_pred_fn(struct filter_parse_state *ps, 492static int filter_add_pred_fn(struct filter_parse_state *ps,
@@ -471,12 +514,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
471 return 0; 514 return 0;
472} 515}
473 516
474enum { 517int filter_assign_type(const char *type)
475 FILTER_STATIC_STRING = 1,
476 FILTER_DYN_STRING
477};
478
479static int is_string_field(const char *type)
480{ 518{
481 if (strstr(type, "__data_loc") && strstr(type, "char")) 519 if (strstr(type, "__data_loc") && strstr(type, "char"))
482 return FILTER_DYN_STRING; 520 return FILTER_DYN_STRING;
@@ -484,12 +522,19 @@ static int is_string_field(const char *type)
484 if (strchr(type, '[') && strstr(type, "char")) 522 if (strchr(type, '[') && strstr(type, "char"))
485 return FILTER_STATIC_STRING; 523 return FILTER_STATIC_STRING;
486 524
487 return 0; 525 return FILTER_OTHER;
526}
527
528static bool is_string_field(struct ftrace_event_field *field)
529{
530 return field->filter_type == FILTER_DYN_STRING ||
531 field->filter_type == FILTER_STATIC_STRING ||
532 field->filter_type == FILTER_PTR_STRING;
488} 533}
489 534
490static int is_legal_op(struct ftrace_event_field *field, int op) 535static int is_legal_op(struct ftrace_event_field *field, int op)
491{ 536{
492 if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) 537 if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
493 return 0; 538 return 0;
494 539
495 return 1; 540 return 1;
@@ -540,21 +585,24 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
540 585
541static int filter_add_pred(struct filter_parse_state *ps, 586static int filter_add_pred(struct filter_parse_state *ps,
542 struct ftrace_event_call *call, 587 struct ftrace_event_call *call,
543 struct filter_pred *pred) 588 struct filter_pred *pred,
589 bool dry_run)
544{ 590{
545 struct ftrace_event_field *field; 591 struct ftrace_event_field *field;
546 filter_pred_fn_t fn; 592 filter_pred_fn_t fn;
547 unsigned long long val; 593 unsigned long long val;
548 int string_type; 594 int ret;
549 595
550 pred->fn = filter_pred_none; 596 pred->fn = filter_pred_none;
551 597
552 if (pred->op == OP_AND) { 598 if (pred->op == OP_AND) {
553 pred->pop_n = 2; 599 pred->pop_n = 2;
554 return filter_add_pred_fn(ps, call, pred, filter_pred_and); 600 fn = filter_pred_and;
601 goto add_pred_fn;
555 } else if (pred->op == OP_OR) { 602 } else if (pred->op == OP_OR) {
556 pred->pop_n = 2; 603 pred->pop_n = 2;
557 return filter_add_pred_fn(ps, call, pred, filter_pred_or); 604 fn = filter_pred_or;
605 goto add_pred_fn;
558 } 606 }
559 607
560 field = find_event_field(call, pred->field_name); 608 field = find_event_field(call, pred->field_name);
@@ -570,62 +618,55 @@ static int filter_add_pred(struct filter_parse_state *ps,
570 return -EINVAL; 618 return -EINVAL;
571 } 619 }
572 620
573 string_type = is_string_field(field->type); 621 if (is_string_field(field)) {
574 if (string_type) { 622 pred->str_len = field->size;
575 if (string_type == FILTER_STATIC_STRING) 623
624 if (field->filter_type == FILTER_STATIC_STRING)
576 fn = filter_pred_string; 625 fn = filter_pred_string;
577 else 626 else if (field->filter_type == FILTER_DYN_STRING)
578 fn = filter_pred_strloc; 627 fn = filter_pred_strloc;
579 pred->str_len = field->size; 628 else {
580 if (pred->op == OP_NE) 629 fn = filter_pred_pchar;
581 pred->not = 1; 630 pred->str_len = strlen(pred->str_val);
582 return filter_add_pred_fn(ps, call, pred, fn); 631 }
583 } else { 632 } else {
584 if (strict_strtoull(pred->str_val, 0, &val)) { 633 if (field->is_signed)
634 ret = strict_strtoll(pred->str_val, 0, &val);
635 else
636 ret = strict_strtoull(pred->str_val, 0, &val);
637 if (ret) {
585 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 638 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
586 return -EINVAL; 639 return -EINVAL;
587 } 640 }
588 pred->val = val; 641 pred->val = val;
589 }
590 642
591 fn = select_comparison_fn(pred->op, field->size, field->is_signed); 643 fn = select_comparison_fn(pred->op, field->size,
592 if (!fn) { 644 field->is_signed);
593 parse_error(ps, FILT_ERR_INVALID_OP, 0); 645 if (!fn) {
594 return -EINVAL; 646 parse_error(ps, FILT_ERR_INVALID_OP, 0);
647 return -EINVAL;
648 }
595 } 649 }
596 650
597 if (pred->op == OP_NE) 651 if (pred->op == OP_NE)
598 pred->not = 1; 652 pred->not = 1;
599 653
600 return filter_add_pred_fn(ps, call, pred, fn); 654add_pred_fn:
655 if (!dry_run)
656 return filter_add_pred_fn(ps, call, pred, fn);
657 return 0;
601} 658}
602 659
603static int filter_add_subsystem_pred(struct filter_parse_state *ps, 660static int filter_add_subsystem_pred(struct filter_parse_state *ps,
604 struct event_subsystem *system, 661 struct event_subsystem *system,
605 struct filter_pred *pred, 662 struct filter_pred *pred,
606 char *filter_string) 663 char *filter_string,
664 bool dry_run)
607{ 665{
608 struct event_filter *filter = system->filter;
609 struct ftrace_event_call *call; 666 struct ftrace_event_call *call;
610 int err = 0; 667 int err = 0;
668 bool fail = true;
611 669
612 if (!filter->preds) {
613 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
614 GFP_KERNEL);
615
616 if (!filter->preds)
617 return -ENOMEM;
618 }
619
620 if (filter->n_preds == MAX_FILTER_PRED) {
621 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
622 return -ENOSPC;
623 }
624
625 filter->preds[filter->n_preds] = pred;
626 filter->n_preds++;
627
628 mutex_lock(&event_mutex);
629 list_for_each_entry(call, &ftrace_events, list) { 670 list_for_each_entry(call, &ftrace_events, list) {
630 671
631 if (!call->define_fields) 672 if (!call->define_fields)
@@ -634,18 +675,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
634 if (strcmp(call->system, system->name)) 675 if (strcmp(call->system, system->name))
635 continue; 676 continue;
636 677
637 err = filter_add_pred(ps, call, pred); 678 if (call->filter->no_reset)
638 if (err) { 679 continue;
639 mutex_unlock(&event_mutex); 680
640 filter_free_subsystem_preds(system); 681 err = filter_add_pred(ps, call, pred, dry_run);
641 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 682 if (err)
642 goto out; 683 call->filter->no_reset = true;
643 } 684 else
644 replace_filter_string(call->filter, filter_string); 685 fail = false;
686
687 if (!dry_run)
688 replace_filter_string(call->filter, filter_string);
645 } 689 }
646 mutex_unlock(&event_mutex); 690
647out: 691 if (fail) {
648 return err; 692 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
693 return err;
694 }
695 return 0;
649} 696}
650 697
651static void parse_init(struct filter_parse_state *ps, 698static void parse_init(struct filter_parse_state *ps,
@@ -1004,12 +1051,14 @@ static int check_preds(struct filter_parse_state *ps)
1004static int replace_preds(struct event_subsystem *system, 1051static int replace_preds(struct event_subsystem *system,
1005 struct ftrace_event_call *call, 1052 struct ftrace_event_call *call,
1006 struct filter_parse_state *ps, 1053 struct filter_parse_state *ps,
1007 char *filter_string) 1054 char *filter_string,
1055 bool dry_run)
1008{ 1056{
1009 char *operand1 = NULL, *operand2 = NULL; 1057 char *operand1 = NULL, *operand2 = NULL;
1010 struct filter_pred *pred; 1058 struct filter_pred *pred;
1011 struct postfix_elt *elt; 1059 struct postfix_elt *elt;
1012 int err; 1060 int err;
1061 int n_preds = 0;
1013 1062
1014 err = check_preds(ps); 1063 err = check_preds(ps);
1015 if (err) 1064 if (err)
@@ -1028,19 +1077,14 @@ static int replace_preds(struct event_subsystem *system,
1028 continue; 1077 continue;
1029 } 1078 }
1030 1079
1080 if (n_preds++ == MAX_FILTER_PRED) {
1081 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1082 return -ENOSPC;
1083 }
1084
1031 if (elt->op == OP_AND || elt->op == OP_OR) { 1085 if (elt->op == OP_AND || elt->op == OP_OR) {
1032 pred = create_logical_pred(elt->op); 1086 pred = create_logical_pred(elt->op);
1033 if (call) { 1087 goto add_pred;
1034 err = filter_add_pred(ps, call, pred);
1035 filter_free_pred(pred);
1036 } else
1037 err = filter_add_subsystem_pred(ps, system,
1038 pred, filter_string);
1039 if (err)
1040 return err;
1041
1042 operand1 = operand2 = NULL;
1043 continue;
1044 } 1088 }
1045 1089
1046 if (!operand1 || !operand2) { 1090 if (!operand1 || !operand2) {
@@ -1049,12 +1093,15 @@ static int replace_preds(struct event_subsystem *system,
1049 } 1093 }
1050 1094
1051 pred = create_pred(elt->op, operand1, operand2); 1095 pred = create_pred(elt->op, operand1, operand2);
1052 if (call) { 1096add_pred:
1053 err = filter_add_pred(ps, call, pred); 1097 if (!pred)
1054 filter_free_pred(pred); 1098 return -ENOMEM;
1055 } else 1099 if (call)
1100 err = filter_add_pred(ps, call, pred, false);
1101 else
1056 err = filter_add_subsystem_pred(ps, system, pred, 1102 err = filter_add_subsystem_pred(ps, system, pred,
1057 filter_string); 1103 filter_string, dry_run);
1104 filter_free_pred(pred);
1058 if (err) 1105 if (err)
1059 return err; 1106 return err;
1060 1107
@@ -1070,12 +1117,16 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1070 1117
1071 struct filter_parse_state *ps; 1118 struct filter_parse_state *ps;
1072 1119
1073 mutex_lock(&filter_mutex); 1120 mutex_lock(&event_mutex);
1121
1122 err = init_preds(call);
1123 if (err)
1124 goto out_unlock;
1074 1125
1075 if (!strcmp(strstrip(filter_string), "0")) { 1126 if (!strcmp(strstrip(filter_string), "0")) {
1076 filter_disable_preds(call); 1127 filter_disable_preds(call);
1077 remove_filter_string(call->filter); 1128 remove_filter_string(call->filter);
1078 mutex_unlock(&filter_mutex); 1129 mutex_unlock(&event_mutex);
1079 return 0; 1130 return 0;
1080 } 1131 }
1081 1132
@@ -1094,7 +1145,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1094 goto out; 1145 goto out;
1095 } 1146 }
1096 1147
1097 err = replace_preds(NULL, call, ps, filter_string); 1148 err = replace_preds(NULL, call, ps, filter_string, false);
1098 if (err) 1149 if (err)
1099 append_filter_err(ps, call->filter); 1150 append_filter_err(ps, call->filter);
1100 1151
@@ -1103,7 +1154,7 @@ out:
1103 postfix_clear(ps); 1154 postfix_clear(ps);
1104 kfree(ps); 1155 kfree(ps);
1105out_unlock: 1156out_unlock:
1106 mutex_unlock(&filter_mutex); 1157 mutex_unlock(&event_mutex);
1107 1158
1108 return err; 1159 return err;
1109} 1160}
@@ -1115,12 +1166,16 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1115 1166
1116 struct filter_parse_state *ps; 1167 struct filter_parse_state *ps;
1117 1168
1118 mutex_lock(&filter_mutex); 1169 mutex_lock(&event_mutex);
1170
1171 err = init_subsystem_preds(system);
1172 if (err)
1173 goto out_unlock;
1119 1174
1120 if (!strcmp(strstrip(filter_string), "0")) { 1175 if (!strcmp(strstrip(filter_string), "0")) {
1121 filter_free_subsystem_preds(system); 1176 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
1122 remove_filter_string(system->filter); 1177 remove_filter_string(system->filter);
1123 mutex_unlock(&filter_mutex); 1178 mutex_unlock(&event_mutex);
1124 return 0; 1179 return 0;
1125 } 1180 }
1126 1181
@@ -1129,7 +1184,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1129 if (!ps) 1184 if (!ps)
1130 goto out_unlock; 1185 goto out_unlock;
1131 1186
1132 filter_free_subsystem_preds(system);
1133 replace_filter_string(system->filter, filter_string); 1187 replace_filter_string(system->filter, filter_string);
1134 1188
1135 parse_init(ps, filter_ops, filter_string); 1189 parse_init(ps, filter_ops, filter_string);
@@ -1139,16 +1193,30 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1139 goto out; 1193 goto out;
1140 } 1194 }
1141 1195
1142 err = replace_preds(system, NULL, ps, filter_string); 1196 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
1143 if (err) 1197
1198	/* check which events this filter can be applied to */
1199 err = replace_preds(system, NULL, ps, filter_string, true);
1200 if (err) {
1144 append_filter_err(ps, system->filter); 1201 append_filter_err(ps, system->filter);
1202 goto out;
1203 }
1204
1205 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
1206
1207 /* really apply the filter to the events */
1208 err = replace_preds(system, NULL, ps, filter_string, false);
1209 if (err) {
1210 append_filter_err(ps, system->filter);
1211		filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
1212 }
1145 1213
1146out: 1214out:
1147 filter_opstack_clear(ps); 1215 filter_opstack_clear(ps);
1148 postfix_clear(ps); 1216 postfix_clear(ps);
1149 kfree(ps); 1217 kfree(ps);
1150out_unlock: 1218out_unlock:
1151 mutex_unlock(&filter_mutex); 1219 mutex_unlock(&event_mutex);
1152 1220
1153 return err; 1221 return err;
1154} 1222}
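
The filter_pred_strloc() change above reads a dynamic string through a packed __data_loc word: the low 16 bits give the string's offset inside the event record and the high 16 bits its recorded length, and the compare then uses that recorded length instead of the filter string's own length. A minimal userspace sketch of the same decoding, with illustrative names only (this is not kernel API):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

static int match_dyn_string(const char *event, size_t loc_offset,
			    const char *wanted, int negate)
{
	uint32_t item;

	memcpy(&item, event + loc_offset, sizeof(item));

	uint32_t str_loc = item & 0xffff;	/* offset of the string data */
	uint32_t str_len = item >> 16;		/* length recorded with it   */
	const char *addr = event + str_loc;

	int cmp = strncmp(addr, wanted, str_len);

	return (!cmp) ^ negate;			/* same (!cmp) ^ pred->not idiom */
}

int main(void)
{
	char event[32] = { 0 };
	const char *payload = "sched";
	uint32_t len = (uint32_t)strlen(payload) + 1;	/* include the NUL        */
	uint32_t item = (len << 16) | 4;		/* data lives at offset 4 */

	memcpy(event, &item, sizeof(item));
	memcpy(event + 4, payload, len);

	printf("\"sched\" matches: %d\n", match_dyn_string(event, 0, "sched", 0));
	printf("\"block\" matches: %d\n", match_dyn_string(event, 0, "block", 0));
	return 0;
}

Comparing against the length stored in the event, as the hunk above switches to, is what keeps the match bounded to exactly the recorded string.
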
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d06cf898dc86..df1bf6e48bb9 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -60,7 +60,8 @@ extern void __bad_type_size(void);
60#undef TRACE_EVENT_FORMAT 60#undef TRACE_EVENT_FORMAT
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
62static int \ 62static int \
63ftrace_format_##call(struct trace_seq *s) \ 63ftrace_format_##call(struct ftrace_event_call *unused, \
64 struct trace_seq *s) \
64{ \ 65{ \
65 struct args field; \ 66 struct args field; \
66 int ret; \ 67 int ret; \
@@ -76,7 +77,8 @@ ftrace_format_##call(struct trace_seq *s) \
76#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 77#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
77 tpfmt) \ 78 tpfmt) \
78static int \ 79static int \
79ftrace_format_##call(struct trace_seq *s) \ 80ftrace_format_##call(struct ftrace_event_call *unused, \
81 struct trace_seq *s) \
80{ \ 82{ \
81 struct args field; \ 83 struct args field; \
82 int ret; \ 84 int ret; \
@@ -117,7 +119,7 @@ ftrace_format_##call(struct trace_seq *s) \
117 119
118#undef TRACE_EVENT_FORMAT 120#undef TRACE_EVENT_FORMAT
119#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 121#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
120int ftrace_define_fields_##call(void); \ 122int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \
121static int ftrace_raw_init_event_##call(void); \ 123static int ftrace_raw_init_event_##call(void); \
122 \ 124 \
123struct ftrace_event_call __used \ 125struct ftrace_event_call __used \
@@ -133,7 +135,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
133static int ftrace_raw_init_event_##call(void) \ 135static int ftrace_raw_init_event_##call(void) \
134{ \ 136{ \
135 INIT_LIST_HEAD(&event_##call.fields); \ 137 INIT_LIST_HEAD(&event_##call.fields); \
136 init_preds(&event_##call); \
137 return 0; \ 138 return 0; \
138} \ 139} \
139 140
@@ -156,7 +157,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
156#define TRACE_FIELD(type, item, assign) \ 157#define TRACE_FIELD(type, item, assign) \
157 ret = trace_define_field(event_call, #type, #item, \ 158 ret = trace_define_field(event_call, #type, #item, \
158 offsetof(typeof(field), item), \ 159 offsetof(typeof(field), item), \
159 sizeof(field.item), is_signed_type(type)); \ 160 sizeof(field.item), \
161 is_signed_type(type), FILTER_OTHER); \
160 if (ret) \ 162 if (ret) \
161 return ret; 163 return ret;
162 164
@@ -164,7 +166,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
164#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ 166#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 167 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
166 offsetof(typeof(field), item), \ 168 offsetof(typeof(field), item), \
167 sizeof(field.item), 0); \ 169 sizeof(field.item), 0, FILTER_OTHER); \
168 if (ret) \ 170 if (ret) \
169 return ret; 171 return ret;
170 172
@@ -172,7 +174,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
172#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 174#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
173 ret = trace_define_field(event_call, #type, #item, \ 175 ret = trace_define_field(event_call, #type, #item, \
174 offsetof(typeof(field), item), \ 176 offsetof(typeof(field), item), \
175 sizeof(field.item), is_signed); \ 177 sizeof(field.item), is_signed, \
178 FILTER_OTHER); \
176 if (ret) \ 179 if (ret) \
177 return ret; 180 return ret;
178 181
@@ -182,17 +185,14 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
182#undef TRACE_EVENT_FORMAT 185#undef TRACE_EVENT_FORMAT
183#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 186#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
184int \ 187int \
185ftrace_define_fields_##call(void) \ 188ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
186{ \ 189{ \
187 struct ftrace_event_call *event_call = &event_##call; \
188 struct args field; \ 190 struct args field; \
189 int ret; \ 191 int ret; \
190 \ 192 \
191 __common_field(unsigned char, type, 0); \ 193 ret = trace_define_common_fields(event_call); \
192 __common_field(unsigned char, flags, 0); \ 194 if (ret) \
193 __common_field(unsigned char, preempt_count, 0); \ 195 return ret; \
194 __common_field(int, pid, 1); \
195 __common_field(int, tgid, 1); \
196 \ 196 \
197 tstruct; \ 197 tstruct; \
198 \ 198 \
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c9a0b7df44ff..5b01b94518fc 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -193,9 +193,11 @@ static void tracing_start_function_trace(void)
193static void tracing_stop_function_trace(void) 193static void tracing_stop_function_trace(void)
194{ 194{
195 ftrace_function_enabled = 0; 195 ftrace_function_enabled = 0;
196 /* OK if they are not registered */ 196
197 unregister_ftrace_function(&trace_stack_ops); 197 if (func_flags.val & TRACE_FUNC_OPT_STACK)
198 unregister_ftrace_function(&trace_ops); 198 unregister_ftrace_function(&trace_stack_ops);
199 else
200 unregister_ftrace_function(&trace_ops);
199} 201}
200 202
201static int func_set_flag(u32 old_flags, u32 bit, int set) 203static int func_set_flag(u32 old_flags, u32 bit, int set)
@@ -286,11 +288,9 @@ static int
286ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
287 struct ftrace_probe_ops *ops, void *data) 289 struct ftrace_probe_ops *ops, void *data)
288{ 290{
289 char str[KSYM_SYMBOL_LEN];
290 long count = (long)data; 291 long count = (long)data;
291 292
292 kallsyms_lookup(ip, NULL, NULL, NULL, str); 293 seq_printf(m, "%pf:", (void *)ip);
293 seq_printf(m, "%s:", str);
294 294
295 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
296 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
@@ -300,8 +300,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
300 if (count == -1) 300 if (count == -1)
301 seq_printf(m, ":unlimited\n"); 301 seq_printf(m, ":unlimited\n");
302 else 302 else
303 seq_printf(m, ":count=%ld", count); 303 seq_printf(m, ":count=%ld\n", count);
304 seq_putc(m, '\n');
305 304
306 return 0; 305 return 0;
307} 306}
@@ -362,7 +361,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
362 out_reg: 361 out_reg:
363 ret = register_ftrace_function_probe(glob, ops, count); 362 ret = register_ftrace_function_probe(glob, ops, count);
364 363
365 return ret; 364 return ret < 0 ? ret : 0;
366} 365}
367 366
368static struct ftrace_func_command ftrace_traceon_cmd = { 367static struct ftrace_func_command ftrace_traceon_cmd = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8b592418d8b2..b3749a2c3132 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -52,12 +52,13 @@ static struct tracer_flags tracer_flags = {
52 .opts = trace_opts 52 .opts = trace_opts
53}; 53};
54 54
55/* pid on the last trace processed */ 55static struct trace_array *graph_array;
56 56
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
59int 59int
60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) 60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
61 unsigned long frame_pointer)
61{ 62{
62 unsigned long long calltime; 63 unsigned long long calltime;
63 int index; 64 int index;
@@ -85,6 +86,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
85 current->ret_stack[index].func = func; 86 current->ret_stack[index].func = func;
86 current->ret_stack[index].calltime = calltime; 87 current->ret_stack[index].calltime = calltime;
87 current->ret_stack[index].subtime = 0; 88 current->ret_stack[index].subtime = 0;
89 current->ret_stack[index].fp = frame_pointer;
88 *depth = index; 90 *depth = index;
89 91
90 return 0; 92 return 0;
@@ -92,7 +94,8 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
92 94
93/* Retrieve a function return address to the trace stack on thread info.*/ 95/* Retrieve a function return address to the trace stack on thread info.*/
94static void 96static void
95ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) 97ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
98 unsigned long frame_pointer)
96{ 99{
97 int index; 100 int index;
98 101
@@ -106,6 +109,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
106 return; 109 return;
107 } 110 }
108 111
112#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
113 /*
114 * The arch may choose to record the frame pointer used
115 * and check it here to make sure that it is what we expect it
116	 * to be. If gcc does not set the placeholder of the return
117 * address in the frame pointer, and does a copy instead, then
118 * the function graph trace will fail. This test detects this
119 * case.
120 *
121 * Currently, x86_32 with optimize for size (-Os) makes the latest
122 * gcc do the above.
123 */
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n",
128 current->ret_stack[index].fp,
129 frame_pointer,
130 (void *)current->ret_stack[index].func,
131 current->ret_stack[index].ret);
132 *ret = (unsigned long)panic;
133 return;
134 }
135#endif
136
109 *ret = current->ret_stack[index].ret; 137 *ret = current->ret_stack[index].ret;
110 trace->func = current->ret_stack[index].func; 138 trace->func = current->ret_stack[index].func;
111 trace->calltime = current->ret_stack[index].calltime; 139 trace->calltime = current->ret_stack[index].calltime;
@@ -117,12 +145,12 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
117 * Send the trace to the ring-buffer. 145 * Send the trace to the ring-buffer.
118 * @return the original return address. 146 * @return the original return address.
119 */ 147 */
120unsigned long ftrace_return_to_handler(void) 148unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
121{ 149{
122 struct ftrace_graph_ret trace; 150 struct ftrace_graph_ret trace;
123 unsigned long ret; 151 unsigned long ret;
124 152
125 ftrace_pop_return_trace(&trace, &ret); 153 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
126 trace.rettime = trace_clock_local(); 154 trace.rettime = trace_clock_local();
127 ftrace_graph_return(&trace); 155 ftrace_graph_return(&trace);
128 barrier(); 156 barrier();
@@ -138,10 +166,123 @@ unsigned long ftrace_return_to_handler(void)
138 return ret; 166 return ret;
139} 167}
140 168
169static int __trace_graph_entry(struct trace_array *tr,
170 struct ftrace_graph_ent *trace,
171 unsigned long flags,
172 int pc)
173{
174 struct ftrace_event_call *call = &event_funcgraph_entry;
175 struct ring_buffer_event *event;
176 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry;
178
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
180 return 0;
181
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
183 sizeof(*entry), flags, pc);
184 if (!event)
185 return 0;
186 entry = ring_buffer_event_data(event);
187 entry->graph_ent = *trace;
188 if (!filter_current_check_discard(buffer, call, entry, event))
189 ring_buffer_unlock_commit(buffer, event);
190
191 return 1;
192}
193
194int trace_graph_entry(struct ftrace_graph_ent *trace)
195{
196 struct trace_array *tr = graph_array;
197 struct trace_array_cpu *data;
198 unsigned long flags;
199 long disabled;
200 int ret;
201 int cpu;
202 int pc;
203
204 if (unlikely(!tr))
205 return 0;
206
207 if (!ftrace_trace_task(current))
208 return 0;
209
210 if (!ftrace_graph_addr(trace->func))
211 return 0;
212
213 local_irq_save(flags);
214 cpu = raw_smp_processor_id();
215 data = tr->data[cpu];
216 disabled = atomic_inc_return(&data->disabled);
217 if (likely(disabled == 1)) {
218 pc = preempt_count();
219 ret = __trace_graph_entry(tr, trace, flags, pc);
220 } else {
221 ret = 0;
222 }
223 /* Only do the atomic if it is not already set */
224 if (!test_tsk_trace_graph(current))
225 set_tsk_trace_graph(current);
226
227 atomic_dec(&data->disabled);
228 local_irq_restore(flags);
229
230 return ret;
231}
232
233static void __trace_graph_return(struct trace_array *tr,
234 struct ftrace_graph_ret *trace,
235 unsigned long flags,
236 int pc)
237{
238 struct ftrace_event_call *call = &event_funcgraph_exit;
239 struct ring_buffer_event *event;
240 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry;
242
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
244 return;
245
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
247 sizeof(*entry), flags, pc);
248 if (!event)
249 return;
250 entry = ring_buffer_event_data(event);
251 entry->ret = *trace;
252 if (!filter_current_check_discard(buffer, call, entry, event))
253 ring_buffer_unlock_commit(buffer, event);
254}
255
256void trace_graph_return(struct ftrace_graph_ret *trace)
257{
258 struct trace_array *tr = graph_array;
259 struct trace_array_cpu *data;
260 unsigned long flags;
261 long disabled;
262 int cpu;
263 int pc;
264
265 local_irq_save(flags);
266 cpu = raw_smp_processor_id();
267 data = tr->data[cpu];
268 disabled = atomic_inc_return(&data->disabled);
269 if (likely(disabled == 1)) {
270 pc = preempt_count();
271 __trace_graph_return(tr, trace, flags, pc);
272 }
273 if (!trace->depth)
274 clear_tsk_trace_graph(current);
275 atomic_dec(&data->disabled);
276 local_irq_restore(flags);
277}
278
141static int graph_trace_init(struct trace_array *tr) 279static int graph_trace_init(struct trace_array *tr)
142{ 280{
143 int ret = register_ftrace_graph(&trace_graph_return, 281 int ret;
144 &trace_graph_entry); 282
283 graph_array = tr;
284 ret = register_ftrace_graph(&trace_graph_return,
285 &trace_graph_entry);
145 if (ret) 286 if (ret)
146 return ret; 287 return ret;
147 tracing_start_cmdline_record(); 288 tracing_start_cmdline_record();
@@ -149,49 +290,30 @@ static int graph_trace_init(struct trace_array *tr)
149 return 0; 290 return 0;
150} 291}
151 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296}
297
152static void graph_trace_reset(struct trace_array *tr) 298static void graph_trace_reset(struct trace_array *tr)
153{ 299{
154 tracing_stop_cmdline_record(); 300 tracing_stop_cmdline_record();
155 unregister_ftrace_graph(); 301 unregister_ftrace_graph();
156} 302}
157 303
158static inline int log10_cpu(int nb) 304static int max_bytes_for_cpu;
159{
160 if (nb / 100)
161 return 3;
162 if (nb / 10)
163 return 2;
164 return 1;
165}
166 305
167static enum print_line_t 306static enum print_line_t
168print_graph_cpu(struct trace_seq *s, int cpu) 307print_graph_cpu(struct trace_seq *s, int cpu)
169{ 308{
170 int i;
171 int ret; 309 int ret;
172 int log10_this = log10_cpu(cpu);
173 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
174
175 310
176 /* 311 /*
177 * Start with a space character - to make it stand out 312 * Start with a space character - to make it stand out
178 * to the right a bit when trace output is pasted into 313 * to the right a bit when trace output is pasted into
179 * email: 314 * email:
180 */ 315 */
181 ret = trace_seq_printf(s, " "); 316 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
182
183 /*
184 * Tricky - we space the CPU field according to the max
185 * number of online CPUs. On a 2-cpu system it would take
186 * a maximum of 1 digit - on a 128 cpu system it would
187 * take up to 3 digits:
188 */
189 for (i = 0; i < log10_all - log10_this; i++) {
190 ret = trace_seq_printf(s, " ");
191 if (!ret)
192 return TRACE_TYPE_PARTIAL_LINE;
193 }
194 ret = trace_seq_printf(s, "%d) ", cpu);
195 if (!ret) 317 if (!ret)
196 return TRACE_TYPE_PARTIAL_LINE; 318 return TRACE_TYPE_PARTIAL_LINE;
197 319
@@ -537,11 +659,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
537 return TRACE_TYPE_PARTIAL_LINE; 659 return TRACE_TYPE_PARTIAL_LINE;
538 } 660 }
539 661
540 ret = seq_print_ip_sym(s, call->func, 0); 662 ret = trace_seq_printf(s, "%pf();\n", (void *)call->func);
541 if (!ret)
542 return TRACE_TYPE_PARTIAL_LINE;
543
544 ret = trace_seq_printf(s, "();\n");
545 if (!ret) 663 if (!ret)
546 return TRACE_TYPE_PARTIAL_LINE; 664 return TRACE_TYPE_PARTIAL_LINE;
547 665
@@ -584,11 +702,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
584 return TRACE_TYPE_PARTIAL_LINE; 702 return TRACE_TYPE_PARTIAL_LINE;
585 } 703 }
586 704
587 ret = seq_print_ip_sym(s, call->func, 0); 705 ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func);
588 if (!ret)
589 return TRACE_TYPE_PARTIAL_LINE;
590
591 ret = trace_seq_printf(s, "() {\n");
592 if (!ret) 706 if (!ret)
593 return TRACE_TYPE_PARTIAL_LINE; 707 return TRACE_TYPE_PARTIAL_LINE;
594 708
@@ -815,9 +929,16 @@ print_graph_function(struct trace_iterator *iter)
815 929
816 switch (entry->type) { 930 switch (entry->type) {
817 case TRACE_GRAPH_ENT: { 931 case TRACE_GRAPH_ENT: {
818 struct ftrace_graph_ent_entry *field; 932 /*
933 * print_graph_entry() may consume the current event,
934 * thus @field may become invalid, so we need to save it.
935 * sizeof(struct ftrace_graph_ent_entry) is very small,
936	 * so it can safely be saved on the stack.
937 */
938 struct ftrace_graph_ent_entry *field, saved;
819 trace_assign_type(field, entry); 939 trace_assign_type(field, entry);
820 return print_graph_entry(field, s, iter); 940 saved = *field;
941 return print_graph_entry(&saved, s, iter);
821 } 942 }
822 case TRACE_GRAPH_RET: { 943 case TRACE_GRAPH_RET: {
823 struct ftrace_graph_ret_entry *field; 944 struct ftrace_graph_ret_entry *field;
@@ -899,6 +1020,8 @@ static struct tracer graph_trace __read_mostly = {
899 1020
900static __init int init_graph_trace(void) 1021static __init int init_graph_trace(void)
901{ 1022{
1023 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1024
902 return register_tracer(&graph_trace); 1025 return register_tracer(&graph_trace);
903} 1026}
904 1027
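
print_graph_cpu() above drops the hand-rolled log10_cpu() padding in favour of a width computed once in init_graph_trace(): snprintf(NULL, 0, "%d", nr_cpu_ids - 1) returns how many characters the largest CPU number needs, and "%*d" then pads every CPU field to that width. A standalone demonstration of the trick (nr_cpus here is an assumed stand-in for nr_cpu_ids):

#include <stdio.h>

int main(void)
{
	int nr_cpus = 128;					/* assumed CPU count */
	int width = snprintf(NULL, 0, "%d", nr_cpus - 1);	/* 3 for "127"       */

	for (int cpu = 0; cpu < nr_cpus; cpu += 37)
		printf(" %*d) some trace line\n", width, cpu);
	return 0;
}
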
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b923d13e2fad..5555b75a0d12 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -178,7 +178,6 @@ out_unlock:
178out: 178out:
179 data->critical_sequence = max_sequence; 179 data->critical_sequence = max_sequence;
180 data->preempt_timestamp = ftrace_now(cpu); 180 data->preempt_timestamp = ftrace_now(cpu);
181 tracing_reset(tr, cpu);
182 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 181 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
183} 182}
184 183
@@ -208,7 +207,6 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
208 data->critical_sequence = max_sequence; 207 data->critical_sequence = max_sequence;
209 data->preempt_timestamp = ftrace_now(cpu); 208 data->preempt_timestamp = ftrace_now(cpu);
210 data->critical_start = parent_ip ? : ip; 209 data->critical_start = parent_ip ? : ip;
211 tracing_reset(tr, cpu);
212 210
213 local_save_flags(flags); 211 local_save_flags(flags);
214 212
@@ -379,6 +377,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
379 irqsoff_trace = tr; 377 irqsoff_trace = tr;
380 /* make sure that the tracer is visible */ 378 /* make sure that the tracer is visible */
381 smp_wmb(); 379 smp_wmb();
380 tracing_reset_online_cpus(tr);
382 start_irqsoff_tracer(tr); 381 start_irqsoff_tracer(tr);
383} 382}
384 383
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index d53b45ed0806..c4c9bbda53d3 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,11 +307,12 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ring_buffer *buffer = tr->buffer;
310 struct ring_buffer_event *event; 311 struct ring_buffer_event *event;
311 struct trace_mmiotrace_rw *entry; 312 struct trace_mmiotrace_rw *entry;
312 int pc = preempt_count(); 313 int pc = preempt_count();
313 314
314 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, 315 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
315 sizeof(*entry), 0, pc); 316 sizeof(*entry), 0, pc);
316 if (!event) { 317 if (!event) {
317 atomic_inc(&dropped_count); 318 atomic_inc(&dropped_count);
@@ -319,7 +320,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
319 } 320 }
320 entry = ring_buffer_event_data(event); 321 entry = ring_buffer_event_data(event);
321 entry->rw = *rw; 322 entry->rw = *rw;
322 trace_buffer_unlock_commit(tr, event, 0, pc); 323 trace_buffer_unlock_commit(buffer, event, 0, pc);
323} 324}
324 325
325void mmio_trace_rw(struct mmiotrace_rw *rw) 326void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -333,11 +334,12 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
333 struct trace_array_cpu *data, 334 struct trace_array_cpu *data,
334 struct mmiotrace_map *map) 335 struct mmiotrace_map *map)
335{ 336{
337 struct ring_buffer *buffer = tr->buffer;
336 struct ring_buffer_event *event; 338 struct ring_buffer_event *event;
337 struct trace_mmiotrace_map *entry; 339 struct trace_mmiotrace_map *entry;
338 int pc = preempt_count(); 340 int pc = preempt_count();
339 341
340 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, 342 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
341 sizeof(*entry), 0, pc); 343 sizeof(*entry), 0, pc);
342 if (!event) { 344 if (!event) {
343 atomic_inc(&dropped_count); 345 atomic_inc(&dropped_count);
@@ -345,7 +347,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
345 } 347 }
346 entry = ring_buffer_event_data(event); 348 entry = ring_buffer_event_data(event);
347 entry->map = *map; 349 entry->map = *map;
348 trace_buffer_unlock_commit(tr, event, 0, pc); 350 trace_buffer_unlock_commit(buffer, event, 0, pc);
349} 351}
350 352
351void mmio_trace_mapping(struct mmiotrace_map *map) 353void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7938f3ae93e3..e0c2545622e8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 29
30 s->buffer[len] = 0; 30 seq_write(m, s->buffer, len);
31 seq_puts(m, s->buffer);
32 31
33 trace_seq_init(s); 32 trace_seq_init(s);
34} 33}
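
trace_print_seq() above switches from NUL-terminating the buffer in place and calling seq_puts() to a counted seq_write() of len bytes. A small userspace analogue of why a counted write is the safer shape for a buffer that is not guaranteed to be a clean C string:

#include <stdio.h>

int main(void)
{
	char buf[] = { 'a', 'b', '\0', 'c', 'd', '\n' };
	size_t len = sizeof(buf);

	printf("%s", buf);		/* stops at the embedded NUL: prints "ab"     */
	fwrite(buf, 1, len, stdout);	/* writes all six bytes, no terminator needed */
	return 0;
}
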
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 8a30d9874cd4..fe1a00f1445a 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -38,6 +38,7 @@ static void probe_power_end(struct power_trace *it)
38{ 38{
39 struct ftrace_event_call *call = &event_power; 39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event; 40 struct ring_buffer_event *event;
41 struct ring_buffer *buffer;
41 struct trace_power *entry; 42 struct trace_power *entry;
42 struct trace_array_cpu *data; 43 struct trace_array_cpu *data;
43 struct trace_array *tr = power_trace; 44 struct trace_array *tr = power_trace;
@@ -45,18 +46,20 @@ static void probe_power_end(struct power_trace *it)
45 if (!trace_power_enabled) 46 if (!trace_power_enabled)
46 return; 47 return;
47 48
49 buffer = tr->buffer;
50
48 preempt_disable(); 51 preempt_disable();
49 it->end = ktime_get(); 52 it->end = ktime_get();
50 data = tr->data[smp_processor_id()]; 53 data = tr->data[smp_processor_id()];
51 54
52 event = trace_buffer_lock_reserve(tr, TRACE_POWER, 55 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
53 sizeof(*entry), 0, 0); 56 sizeof(*entry), 0, 0);
54 if (!event) 57 if (!event)
55 goto out; 58 goto out;
56 entry = ring_buffer_event_data(event); 59 entry = ring_buffer_event_data(event);
57 entry->state_data = *it; 60 entry->state_data = *it;
58 if (!filter_check_discard(call, entry, tr->buffer, event)) 61 if (!filter_check_discard(call, entry, buffer, event))
59 trace_buffer_unlock_commit(tr, event, 0, 0); 62 trace_buffer_unlock_commit(buffer, event, 0, 0);
60 out: 63 out:
61 preempt_enable(); 64 preempt_enable();
62} 65}
@@ -66,6 +69,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
66{ 69{
67 struct ftrace_event_call *call = &event_power; 70 struct ftrace_event_call *call = &event_power;
68 struct ring_buffer_event *event; 71 struct ring_buffer_event *event;
72 struct ring_buffer *buffer;
69 struct trace_power *entry; 73 struct trace_power *entry;
70 struct trace_array_cpu *data; 74 struct trace_array_cpu *data;
71 struct trace_array *tr = power_trace; 75 struct trace_array *tr = power_trace;
@@ -73,6 +77,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
73 if (!trace_power_enabled) 77 if (!trace_power_enabled)
74 return; 78 return;
75 79
80 buffer = tr->buffer;
81
76 memset(it, 0, sizeof(struct power_trace)); 82 memset(it, 0, sizeof(struct power_trace));
77 it->state = level; 83 it->state = level;
78 it->type = type; 84 it->type = type;
@@ -81,14 +87,14 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
81 it->end = it->stamp; 87 it->end = it->stamp;
82 data = tr->data[smp_processor_id()]; 88 data = tr->data[smp_processor_id()];
83 89
84 event = trace_buffer_lock_reserve(tr, TRACE_POWER, 90 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
85 sizeof(*entry), 0, 0); 91 sizeof(*entry), 0, 0);
86 if (!event) 92 if (!event)
87 goto out; 93 goto out;
88 entry = ring_buffer_event_data(event); 94 entry = ring_buffer_event_data(event);
89 entry->state_data = *it; 95 entry->state_data = *it;
90 if (!filter_check_discard(call, entry, tr->buffer, event)) 96 if (!filter_check_discard(call, entry, buffer, event))
91 trace_buffer_unlock_commit(tr, event, 0, 0); 97 trace_buffer_unlock_commit(buffer, event, 0, 0);
92 out: 98 out:
93 preempt_enable(); 99 preempt_enable();
94} 100}
@@ -144,14 +150,12 @@ static void power_trace_reset(struct trace_array *tr)
144 150
145static int power_trace_init(struct trace_array *tr) 151static int power_trace_init(struct trace_array *tr)
146{ 152{
147 int cpu;
148 power_trace = tr; 153 power_trace = tr;
149 154
150 trace_power_enabled = 1; 155 trace_power_enabled = 1;
151 tracing_power_register(); 156 tracing_power_register();
152 157
153 for_each_cpu(cpu, cpu_possible_mask) 158 tracing_reset_online_cpus(tr);
154 tracing_reset(tr, cpu);
155 return 0; 159 return 0;
156} 160}
157 161
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 9bece9687b62..687699d365ae 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
155EXPORT_SYMBOL_GPL(__ftrace_vprintk); 155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156 156
157static void * 157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos) 158t_start(struct seq_file *m, loff_t *pos)
159{ 159{
160 const char **fmt = m->private; 160 const char **fmt = __start___trace_bprintk_fmt + *pos;
161 const char **next = fmt;
162
163 (*pos)++;
164 161
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) 162 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL; 163 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt; 164 return fmt;
172} 165}
173 166
174static void *t_start(struct seq_file *m, loff_t *pos) 167static void *t_next(struct seq_file *m, void *v, loff_t *pos)
175{ 168{
176 return t_next(m, NULL, pos); 169 (*pos)++;
170 return t_start(m, pos);
177} 171}
178 172
179static int t_show(struct seq_file *m, void *v) 173static int t_show(struct seq_file *m, void *v)
@@ -182,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)
182 const char *str = *fmt; 176 const char *str = *fmt;
183 int i; 177 int i;
184 178
185 seq_printf(m, "0x%lx : \"", (unsigned long)fmt); 179 seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
186 180
187 /* 181 /*
188 * Tabs and new lines need to be converted. 182 * Tabs and new lines need to be converted.
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
224static int 218static int
225ftrace_formats_open(struct inode *inode, struct file *file) 219ftrace_formats_open(struct inode *inode, struct file *file)
226{ 220{
227 int ret; 221 return seq_open(file, &show_format_seq_ops);
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236} 222}
237 223
238static const struct file_operations ftrace_formats_fops = { 224static const struct file_operations ftrace_formats_fops = {
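
The trace_printk.c rework above flips the seq_file iterator around: t_start() maps *pos directly to an entry in the __trace_bprintk_fmt section (or NULL past the end), and t_next() just advances *pos and calls t_start() again, instead of t_start() looping over t_next() from a private cursor. The same shape, modelled over a plain array in userspace (illustrative names, not the seq_file API):

#include <stdio.h>
#include <stddef.h>

static const char *formats[] = { "fmt one", "fmt two", "fmt three" };
#define NFORMATS (sizeof(formats) / sizeof(formats[0]))

static const char **fmt_start(long long *pos)
{
	if ((size_t)*pos >= NFORMATS)
		return NULL;		/* past the end: stop iteration */
	return &formats[*pos];
}

static const char **fmt_next(long long *pos)
{
	(*pos)++;			/* advance and reuse start()    */
	return fmt_start(pos);
}

int main(void)
{
	long long pos = 0;

	for (const char **p = fmt_start(&pos); p; p = fmt_next(&pos))
		printf("%lld: \"%s\"\n", pos, *p);
	return 0;
}
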
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a98106dd979c..5fca0f51fde4 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,6 +20,35 @@ static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped; 21static int sched_stopped;
22 22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51
23static void 52static void
24probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(struct rq *__rq, struct task_struct *prev,
25 struct task_struct *next) 54 struct task_struct *next)
@@ -49,6 +78,36 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
49 local_irq_restore(flags); 78 local_irq_restore(flags);
50} 79}
51 80
81void
82tracing_sched_wakeup_trace(struct trace_array *tr,
83 struct task_struct *wakee,
84 struct task_struct *curr,
85 unsigned long flags, int pc)
86{
87 struct ftrace_event_call *call = &event_wakeup;
88 struct ring_buffer_event *event;
89 struct ctx_switch_entry *entry;
90 struct ring_buffer *buffer = tr->buffer;
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
93 sizeof(*entry), flags, pc);
94 if (!event)
95 return;
96 entry = ring_buffer_event_data(event);
97 entry->prev_pid = curr->pid;
98 entry->prev_prio = curr->prio;
99 entry->prev_state = curr->state;
100 entry->next_pid = wakee->pid;
101 entry->next_prio = wakee->prio;
102 entry->next_state = wakee->state;
103 entry->next_cpu = task_cpu(wakee);
104
105 if (!filter_check_discard(call, entry, buffer, event))
106 ring_buffer_unlock_commit(buffer, event);
107 ftrace_trace_stack(tr->buffer, flags, 6, pc);
108 ftrace_trace_userstack(tr->buffer, flags, pc);
109}
110
52static void 111static void
53probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
54{ 113{
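
tracing_sched_switch_trace() above follows the reserve/fill/commit shape of the new ring-buffer API: reserve room on tr->buffer, fill the ctx_switch_entry in place, then commit it only if the event filter does not discard it. A toy userspace analogue of that control flow, where a fixed array stands in for the ring buffer and every name is made up:

#include <stdio.h>
#include <stdbool.h>

struct ctx_switch {
	int prev_pid, next_pid;
};

#define NSLOTS 8
static struct ctx_switch slots[NSLOTS];
static int reserved, committed;

static struct ctx_switch *buffer_reserve(void)
{
	if (reserved == NSLOTS)
		return NULL;			/* buffer full: drop the event */
	return &slots[reserved++];
}

static void buffer_commit(struct ctx_switch *e)
{
	committed++;				/* entry becomes visible       */
	(void)e;
}

static bool filter_discard(const struct ctx_switch *e)
{
	return e->next_pid == 0;		/* pretend filter: skip idle   */
}

static void trace_switch(int prev_pid, int next_pid)
{
	struct ctx_switch *entry = buffer_reserve();

	if (!entry)
		return;
	entry->prev_pid = prev_pid;		/* fill fields in place        */
	entry->next_pid = next_pid;
	if (!filter_discard(entry))
		buffer_commit(entry);		/* commit only if not filtered */
}

int main(void)
{
	trace_switch(100, 0);			/* filtered out                */
	trace_switch(100, 200);			/* committed                   */
	printf("reserved=%d committed=%d\n", reserved, committed);
	return 0;
}
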
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index eacb27225173..ad69f105a7c6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -186,11 +186,6 @@ out:
186 186
187static void __wakeup_reset(struct trace_array *tr) 187static void __wakeup_reset(struct trace_array *tr)
188{ 188{
189 int cpu;
190
191 for_each_possible_cpu(cpu)
192 tracing_reset(tr, cpu);
193
194 wakeup_cpu = -1; 189 wakeup_cpu = -1;
195 wakeup_prio = -1; 190 wakeup_prio = -1;
196 191
@@ -204,6 +199,8 @@ static void wakeup_reset(struct trace_array *tr)
204{ 199{
205 unsigned long flags; 200 unsigned long flags;
206 201
202 tracing_reset_online_cpus(tr);
203
207 local_irq_save(flags); 204 local_irq_save(flags);
208 __raw_spin_lock(&wakeup_lock); 205 __raw_spin_lock(&wakeup_lock);
209 __wakeup_reset(tr); 206 __wakeup_reset(tr);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 71f2edb0fd84..7179c12e4f0f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -289,6 +289,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
289 * to detect and recover from possible hangs 289 * to detect and recover from possible hangs
290 */ 290 */
291 tracing_reset_online_cpus(tr); 291 tracing_reset_online_cpus(tr);
292 set_graph_array(tr);
292 ret = register_ftrace_graph(&trace_graph_return, 293 ret = register_ftrace_graph(&trace_graph_return,
293 &trace_graph_entry_watchdog); 294 &trace_graph_entry_watchdog);
294 if (ret) { 295 if (ret) {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2d7aebd71dbd..0f6facb050a1 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = {
186}; 186};
187 187
188static void * 188static void *
189t_next(struct seq_file *m, void *v, loff_t *pos) 189__next(struct seq_file *m, loff_t *pos)
190{ 190{
191 long i; 191 long n = *pos - 1;
192
193 (*pos)++;
194
195 if (v == SEQ_START_TOKEN)
196 i = 0;
197 else {
198 i = *(long *)v;
199 i++;
200 }
201 192
202 if (i >= max_stack_trace.nr_entries || 193 if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
203 stack_dump_trace[i] == ULONG_MAX)
204 return NULL; 194 return NULL;
205 195
206 m->private = (void *)i; 196 m->private = (void *)n;
207
208 return &m->private; 197 return &m->private;
209} 198}
210 199
211static void *t_start(struct seq_file *m, loff_t *pos) 200static void *
201t_next(struct seq_file *m, void *v, loff_t *pos)
212{ 202{
213 void *t = SEQ_START_TOKEN; 203 (*pos)++;
214 loff_t l = 0; 204 return __next(m, pos);
205}
215 206
207static void *t_start(struct seq_file *m, loff_t *pos)
208{
216 local_irq_disable(); 209 local_irq_disable();
217 __raw_spin_lock(&max_stack_lock); 210 __raw_spin_lock(&max_stack_lock);
218 211
219 if (*pos == 0) 212 if (*pos == 0)
220 return SEQ_START_TOKEN; 213 return SEQ_START_TOKEN;
221 214
222 for (; t && l < *pos; t = t_next(m, t, &l)) 215 return __next(m, pos);
223 ;
224
225 return t;
226} 216}
227 217
228static void t_stop(struct seq_file *m, void *p) 218static void t_stop(struct seq_file *m, void *p)
@@ -234,15 +224,8 @@ static void t_stop(struct seq_file *m, void *p)
234static int trace_lookup_stack(struct seq_file *m, long i) 224static int trace_lookup_stack(struct seq_file *m, long i)
235{ 225{
236 unsigned long addr = stack_dump_trace[i]; 226 unsigned long addr = stack_dump_trace[i];
237#ifdef CONFIG_KALLSYMS
238 char str[KSYM_SYMBOL_LEN];
239 227
240 sprint_symbol(str, addr); 228 return seq_printf(m, "%pF\n", (void *)addr);
241
242 return seq_printf(m, "%s\n", str);
243#else
244 return seq_printf(m, "%p\n", (void*)addr);
245#endif
246} 229}
247 230
248static void print_disabled(struct seq_file *m) 231static void print_disabled(struct seq_file *m)
@@ -301,17 +284,14 @@ static const struct seq_operations stack_trace_seq_ops = {
301 284
302static int stack_trace_open(struct inode *inode, struct file *file) 285static int stack_trace_open(struct inode *inode, struct file *file)
303{ 286{
304 int ret; 287 return seq_open(file, &stack_trace_seq_ops);
305
306 ret = seq_open(file, &stack_trace_seq_ops);
307
308 return ret;
309} 288}
310 289
311static const struct file_operations stack_trace_fops = { 290static const struct file_operations stack_trace_fops = {
312 .open = stack_trace_open, 291 .open = stack_trace_open,
313 .read = seq_read, 292 .read = seq_read,
314 .llseek = seq_lseek, 293 .llseek = seq_lseek,
294 .release = seq_release,
315}; 295};
316 296
317int 297int
@@ -326,10 +306,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
326 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 306 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
327 307
328 if (ret || !write || 308 if (ret || !write ||
329 (last_stack_tracer_enabled == stack_tracer_enabled)) 309 (last_stack_tracer_enabled == !!stack_tracer_enabled))
330 goto out; 310 goto out;
331 311
332 last_stack_tracer_enabled = stack_tracer_enabled; 312 last_stack_tracer_enabled = !!stack_tracer_enabled;
333 313
334 if (stack_tracer_enabled) 314 if (stack_tracer_enabled)
335 register_ftrace_function(&trace_ops); 315 register_ftrace_function(&trace_ops);
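
The stack_trace_sysctl() hunk above compares last_stack_tracer_enabled, stored as 0 or 1, against whatever integer the user wrote through the sysctl, so both sides are collapsed to 0 or 1 with "!!" before the comparison. A short demonstration of why the raw compare misfires:

#include <stdio.h>

int main(void)
{
	int last_enabled = 1;	/* previously enabled, stored as 0/1      */
	int new_value = 5;	/* user wrote "5", which still means "on"  */

	printf("raw compare:        %s\n",
	       last_enabled == new_value ? "unchanged" : "changed (wrong)");
	printf("normalised compare: %s\n",
	       last_enabled == !!new_value ? "unchanged (right)" : "changed");
	return 0;
}
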
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index c00643733f4c..a4bb239eb987 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -49,7 +49,8 @@ static struct dentry *stat_dir;
49 * but it will at least advance closer to the next one 49 * but it will at least advance closer to the next one
50 * to be released. 50 * to be released.
51 */ 51 */
52static struct rb_node *release_next(struct rb_node *node) 52static struct rb_node *release_next(struct tracer_stat *ts,
53 struct rb_node *node)
53{ 54{
54 struct stat_node *snode; 55 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node); 56 struct rb_node *parent = rb_parent(node);
@@ -67,26 +68,35 @@ static struct rb_node *release_next(struct rb_node *node)
67 parent->rb_right = NULL; 68 parent->rb_right = NULL;
68 69
69 snode = container_of(node, struct stat_node, node); 70 snode = container_of(node, struct stat_node, node);
71 if (ts->stat_release)
72 ts->stat_release(snode->stat);
70 kfree(snode); 73 kfree(snode);
71 74
72 return parent; 75 return parent;
73 } 76 }
74} 77}
75 78
76static void reset_stat_session(struct stat_session *session) 79static void __reset_stat_session(struct stat_session *session)
77{ 80{
78 struct rb_node *node = session->stat_root.rb_node; 81 struct rb_node *node = session->stat_root.rb_node;
79 82
80 while (node) 83 while (node)
81 node = release_next(node); 84 node = release_next(session->ts, node);
82 85
83 session->stat_root = RB_ROOT; 86 session->stat_root = RB_ROOT;
84} 87}
85 88
89static void reset_stat_session(struct stat_session *session)
90{
91 mutex_lock(&session->stat_mutex);
92 __reset_stat_session(session);
93 mutex_unlock(&session->stat_mutex);
94}
95
86static void destroy_session(struct stat_session *session) 96static void destroy_session(struct stat_session *session)
87{ 97{
88 debugfs_remove(session->file); 98 debugfs_remove(session->file);
89 reset_stat_session(session); 99 __reset_stat_session(session);
90 mutex_destroy(&session->stat_mutex); 100 mutex_destroy(&session->stat_mutex);
91 kfree(session); 101 kfree(session);
92} 102}
@@ -150,7 +160,7 @@ static int stat_seq_init(struct stat_session *session)
150 int i; 160 int i;
151 161
152 mutex_lock(&session->stat_mutex); 162 mutex_lock(&session->stat_mutex);
153 reset_stat_session(session); 163 __reset_stat_session(session);
154 164
155 if (!ts->stat_cmp) 165 if (!ts->stat_cmp)
156 ts->stat_cmp = dummy_cmp; 166 ts->stat_cmp = dummy_cmp;
@@ -183,7 +193,7 @@ exit:
183 return ret; 193 return ret;
184 194
185exit_free_rbtree: 195exit_free_rbtree:
186 reset_stat_session(session); 196 __reset_stat_session(session);
187 mutex_unlock(&session->stat_mutex); 197 mutex_unlock(&session->stat_mutex);
188 return ret; 198 return ret;
189} 199}
@@ -193,23 +203,23 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
193{ 203{
194 struct stat_session *session = s->private; 204 struct stat_session *session = s->private;
195 struct rb_node *node; 205 struct rb_node *node;
206 int n = *pos;
196 int i; 207 int i;
197 208
198 /* Prevent from tracer switch or rbtree modification */ 209 /* Prevent from tracer switch or rbtree modification */
199 mutex_lock(&session->stat_mutex); 210 mutex_lock(&session->stat_mutex);
200 211
201 /* If we are in the beginning of the file, print the headers */ 212 /* If we are in the beginning of the file, print the headers */
202 if (!*pos && session->ts->stat_headers) { 213 if (session->ts->stat_headers) {
203 (*pos)++; 214 if (n == 0)
204 return SEQ_START_TOKEN; 215 return SEQ_START_TOKEN;
216 n--;
205 } 217 }
206 218
207 node = rb_first(&session->stat_root); 219 node = rb_first(&session->stat_root);
208 for (i = 0; node && i < *pos; i++) 220 for (i = 0; node && i < n; i++)
209 node = rb_next(node); 221 node = rb_next(node);
210 222
211 (*pos)++;
212
213 return node; 223 return node;
214} 224}
215 225
@@ -254,16 +264,21 @@ static const struct seq_operations trace_stat_seq_ops = {
254static int tracing_stat_open(struct inode *inode, struct file *file) 264static int tracing_stat_open(struct inode *inode, struct file *file)
255{ 265{
256 int ret; 266 int ret;
257 267 struct seq_file *m;
258 struct stat_session *session = inode->i_private; 268 struct stat_session *session = inode->i_private;
259 269
270 ret = stat_seq_init(session);
271 if (ret)
272 return ret;
273
260 ret = seq_open(file, &trace_stat_seq_ops); 274 ret = seq_open(file, &trace_stat_seq_ops);
261 if (!ret) { 275 if (ret) {
262 struct seq_file *m = file->private_data; 276 reset_stat_session(session);
263 m->private = session; 277 return ret;
264 ret = stat_seq_init(session);
265 } 278 }
266 279
280 m = file->private_data;
281 m->private = session;
267 return ret; 282 return ret;
268} 283}
269 284
@@ -274,11 +289,9 @@ static int tracing_stat_release(struct inode *i, struct file *f)
274{ 289{
275 struct stat_session *session = i->i_private; 290 struct stat_session *session = i->i_private;
276 291
277 mutex_lock(&session->stat_mutex);
278 reset_stat_session(session); 292 reset_stat_session(session);
279 mutex_unlock(&session->stat_mutex);
280 293
281 return 0; 294 return seq_release(i, f);
282} 295}
283 296
284static const struct file_operations tracing_stat_fops = { 297static const struct file_operations tracing_stat_fops = {
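
The trace_stat.c change above splits reset_stat_session() into an unlocked __reset_stat_session() for callers that already hold stat_mutex and a locking wrapper for everyone else. A minimal user-space sketch of that convention is below; the names (cache_lock, cache_reset) are illustrative, not from the kernel.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static int cache_entries;

/* Caller must hold cache_lock (the kernel marks this with a "__" prefix). */
static void __cache_reset(void)
{
        cache_entries = 0;
}

/* Convenience wrapper for callers that do not hold the lock. */
static void cache_reset(void)
{
        pthread_mutex_lock(&cache_lock);
        __cache_reset();
        pthread_mutex_unlock(&cache_lock);
}

int main(void)
{
        pthread_mutex_lock(&cache_lock);
        cache_entries = 42;
        __cache_reset();        /* already locked: use the raw helper */
        pthread_mutex_unlock(&cache_lock);

        cache_reset();          /* unlocked context: use the wrapper */
        printf("entries: %d\n", cache_entries);
        return 0;
}

The same split is what lets destroy_session() and stat_seq_init() call the raw helper while tracing_stat_release() keeps the locked wrapper.
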
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index f3546a2cd826..8f03914b9a6a 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -18,6 +18,8 @@ struct tracer_stat {
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
19 /* Print a stat entry */ 19 /* Print a stat entry */
20 int (*stat_show)(struct seq_file *s, void *p); 20 int (*stat_show)(struct seq_file *s, void *p);
21 /* Release an entry */
22 void (*stat_release)(void *stat);
21 /* Print the headers of your stat entries */ 23 /* Print the headers of your stat entries */
22 int (*stat_headers)(struct seq_file *s); 24 int (*stat_headers)(struct seq_file *s);
23}; 25};
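
The new stat_release() member is an optional hook: release_next() only invokes it when the tracer supplies one, right before the rbtree node is freed. A compilable user-space sketch of an ops table with an optional release callback; struct stat_ops and its helpers are invented for illustration.

#include <stdio.h>
#include <stdlib.h>

struct stat_ops {
        const char *name;
        int  (*show)(void *entry);
        void (*release)(void *entry);   /* optional: may be NULL */
};

static void drop_entry(const struct stat_ops *ops, void *entry)
{
        /* Mirror release_next(): only call the hook if the provider set one. */
        if (ops->release)
                ops->release(entry);
        free(entry);
}

static int show_int(void *entry)
{
        return printf("%d\n", *(int *)entry);
}

static void release_int(void *entry)
{
        printf("releasing %d\n", *(int *)entry);
}

int main(void)
{
        struct stat_ops ops = { "demo", show_int, release_int };
        int *e = malloc(sizeof(*e));

        if (!e)
                return 1;
        *e = 7;
        ops.show(e);
        drop_entry(&ops, e);
        return 0;
}
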
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e579645ac86..8712ce3c6a0e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,30 +1,18 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h>
2#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/ftrace.h>
5#include <linux/perf_counter.h>
3#include <asm/syscall.h> 6#include <asm/syscall.h>
4 7
5#include "trace_output.h" 8#include "trace_output.h"
6#include "trace.h" 9#include "trace.h"
7 10
8/* Keep a counter of the syscall tracing users */
9static int refcount;
10
11/* Prevent from races on thread flags toggling */
12static DEFINE_MUTEX(syscall_trace_lock); 11static DEFINE_MUTEX(syscall_trace_lock);
13 12static int sys_refcount_enter;
14/* Option to display the parameters types */ 13static int sys_refcount_exit;
15enum { 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16 TRACE_SYSCALLS_OPT_TYPES = 0x1, 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17};
18
19static struct tracer_opt syscalls_opts[] = {
20 { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
21 { }
22};
23
24static struct tracer_flags syscalls_flags = {
25 .val = 0, /* By default: no parameters types */
26 .opts = syscalls_opts
27};
28 16
29enum print_line_t 17enum print_line_t
30print_syscall_enter(struct trace_iterator *iter, int flags) 18print_syscall_enter(struct trace_iterator *iter, int flags)
@@ -35,35 +23,46 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
35 struct syscall_metadata *entry; 23 struct syscall_metadata *entry;
36 int i, ret, syscall; 24 int i, ret, syscall;
37 25
38 trace_assign_type(trace, ent); 26 trace = (typeof(trace))ent;
39
40 syscall = trace->nr; 27 syscall = trace->nr;
41
42 entry = syscall_nr_to_meta(syscall); 28 entry = syscall_nr_to_meta(syscall);
29
43 if (!entry) 30 if (!entry)
44 goto end; 31 goto end;
45 32
33 if (entry->enter_id != ent->type) {
34 WARN_ON_ONCE(1);
35 goto end;
36 }
37
46 ret = trace_seq_printf(s, "%s(", entry->name); 38 ret = trace_seq_printf(s, "%s(", entry->name);
47 if (!ret) 39 if (!ret)
48 return TRACE_TYPE_PARTIAL_LINE; 40 return TRACE_TYPE_PARTIAL_LINE;
49 41
50 for (i = 0; i < entry->nb_args; i++) { 42 for (i = 0; i < entry->nb_args; i++) {
51 /* parameter types */ 43 /* parameter types */
52 if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { 44 if (trace_flags & TRACE_ITER_VERBOSE) {
53 ret = trace_seq_printf(s, "%s ", entry->types[i]); 45 ret = trace_seq_printf(s, "%s ", entry->types[i]);
54 if (!ret) 46 if (!ret)
55 return TRACE_TYPE_PARTIAL_LINE; 47 return TRACE_TYPE_PARTIAL_LINE;
56 } 48 }
57 /* parameter values */ 49 /* parameter values */
58 ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], 50 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
59 trace->args[i], 51 trace->args[i],
60 i == entry->nb_args - 1 ? ")" : ","); 52 i == entry->nb_args - 1 ? "" : ", ");
61 if (!ret) 53 if (!ret)
62 return TRACE_TYPE_PARTIAL_LINE; 54 return TRACE_TYPE_PARTIAL_LINE;
63 } 55 }
64 56
57 ret = trace_seq_putc(s, ')');
58 if (!ret)
59 return TRACE_TYPE_PARTIAL_LINE;
60
65end: 61end:
66 trace_seq_printf(s, "\n"); 62 ret = trace_seq_putc(s, '\n');
63 if (!ret)
64 return TRACE_TYPE_PARTIAL_LINE;
65
67 return TRACE_TYPE_HANDLED; 66 return TRACE_TYPE_HANDLED;
68} 67}
69 68
@@ -77,16 +76,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
77 struct syscall_metadata *entry; 76 struct syscall_metadata *entry;
78 int ret; 77 int ret;
79 78
80 trace_assign_type(trace, ent); 79 trace = (typeof(trace))ent;
81
82 syscall = trace->nr; 80 syscall = trace->nr;
83
84 entry = syscall_nr_to_meta(syscall); 81 entry = syscall_nr_to_meta(syscall);
82
85 if (!entry) { 83 if (!entry) {
86 trace_seq_printf(s, "\n"); 84 trace_seq_printf(s, "\n");
87 return TRACE_TYPE_HANDLED; 85 return TRACE_TYPE_HANDLED;
88 } 86 }
89 87
88 if (entry->exit_id != ent->type) {
89 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED;
91 }
92
90 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 93 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
91 trace->ret); 94 trace->ret);
92 if (!ret) 95 if (!ret)
@@ -95,62 +98,140 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
95 return TRACE_TYPE_HANDLED; 98 return TRACE_TYPE_HANDLED;
96} 99}
97 100
98void start_ftrace_syscalls(void) 101extern char *__bad_type_size(void);
102
103#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name)
107
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
99{ 109{
100 unsigned long flags; 110 int i;
101 struct task_struct *g, *t; 111 int nr;
112 int ret;
113 struct syscall_metadata *entry;
114 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args);
102 116
103 mutex_lock(&syscall_trace_lock); 117 nr = syscall_name_to_nr(call->data);
118 entry = syscall_nr_to_meta(nr);
104 119
105 /* Don't enable the flag on the tasks twice */ 120 if (!entry)
106 if (++refcount != 1) 121 return 0;
107 goto unlock;
108 122
109 arch_init_ftrace_syscalls(); 123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
110 read_lock_irqsave(&tasklist_lock, flags); 124 SYSCALL_FIELD(int, nr));
125 if (!ret)
126 return 0;
111 127
112 do_each_thread(g, t) { 128 for (i = 0; i < entry->nb_args; i++) {
113 set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 129 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
114 } while_each_thread(g, t); 130 entry->args[i]);
131 if (!ret)
132 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
134 sizeof(unsigned long));
135 if (!ret)
136 return 0;
137 offset += sizeof(unsigned long);
138 }
115 139
116 read_unlock_irqrestore(&tasklist_lock, flags); 140 trace_seq_puts(s, "\nprint fmt: \"");
141 for (i = 0; i < entry->nb_args; i++) {
142 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
143 sizeof(unsigned long),
144 i == entry->nb_args - 1 ? "" : ", ");
145 if (!ret)
146 return 0;
147 }
148 trace_seq_putc(s, '"');
117 149
118unlock: 150 for (i = 0; i < entry->nb_args; i++) {
119 mutex_unlock(&syscall_trace_lock); 151 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
152 entry->args[i]);
153 if (!ret)
154 return 0;
155 }
156
157 return trace_seq_putc(s, '\n');
120} 158}
121 159
122void stop_ftrace_syscalls(void) 160int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
123{ 161{
124 unsigned long flags; 162 int ret;
125 struct task_struct *g, *t; 163 struct syscall_trace_exit trace;
126 164
127 mutex_lock(&syscall_trace_lock); 165 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
168 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret));
170 if (!ret)
171 return 0;
128 172
129 /* There are perhaps still some users */ 173 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
130 if (--refcount) 174}
131 goto unlock;
132 175
133 read_lock_irqsave(&tasklist_lock, flags); 176int syscall_enter_define_fields(struct ftrace_event_call *call)
177{
178 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta;
180 int ret;
181 int nr;
182 int i;
183 int offset = offsetof(typeof(trace), args);
184
185 nr = syscall_name_to_nr(call->data);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call);
192 if (ret)
193 return ret;
194
195 for (i = 0; i < meta->nb_args; i++) {
196 ret = trace_define_field(call, meta->types[i],
197 meta->args[i], offset,
198 sizeof(unsigned long), 0,
199 FILTER_OTHER);
200 offset += sizeof(unsigned long);
201 }
134 202
135 do_each_thread(g, t) { 203 return ret;
136 clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 204}
137 } while_each_thread(g, t);
138 205
139 read_unlock_irqrestore(&tasklist_lock, flags); 206int syscall_exit_define_fields(struct ftrace_event_call *call)
207{
208 struct syscall_trace_exit trace;
209 int ret;
140 210
141unlock: 211 ret = trace_define_common_fields(call);
142 mutex_unlock(&syscall_trace_lock); 212 if (ret)
213 return ret;
214
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0,
216 FILTER_OTHER);
217
218 return ret;
143} 219}
144 220
145void ftrace_syscall_enter(struct pt_regs *regs) 221void ftrace_syscall_enter(struct pt_regs *regs, long id)
146{ 222{
147 struct syscall_trace_enter *entry; 223 struct syscall_trace_enter *entry;
148 struct syscall_metadata *sys_data; 224 struct syscall_metadata *sys_data;
149 struct ring_buffer_event *event; 225 struct ring_buffer_event *event;
226 struct ring_buffer *buffer;
150 int size; 227 int size;
151 int syscall_nr; 228 int syscall_nr;
152 229
153 syscall_nr = syscall_get_nr(current, regs); 230 syscall_nr = syscall_get_nr(current, regs);
231 if (syscall_nr < 0)
232 return;
233 if (!test_bit(syscall_nr, enabled_enter_syscalls))
234 return;
154 235
155 sys_data = syscall_nr_to_meta(syscall_nr); 236 sys_data = syscall_nr_to_meta(syscall_nr);
156 if (!sys_data) 237 if (!sys_data)
@@ -158,8 +239,8 @@ void ftrace_syscall_enter(struct pt_regs *regs)
158 239
159 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
160 241
161 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, 242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
162 0, 0); 243 size, 0, 0);
163 if (!event) 244 if (!event)
164 return; 245 return;
165 246
@@ -167,24 +248,30 @@ void ftrace_syscall_enter(struct pt_regs *regs)
167 entry->nr = syscall_nr; 248 entry->nr = syscall_nr;
168 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 249 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
169 250
170 trace_current_buffer_unlock_commit(event, 0, 0); 251 if (!filter_current_check_discard(buffer, sys_data->enter_event,
171 trace_wake_up(); 252 entry, event))
253 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
172} 254}
173 255
174void ftrace_syscall_exit(struct pt_regs *regs) 256void ftrace_syscall_exit(struct pt_regs *regs, long ret)
175{ 257{
176 struct syscall_trace_exit *entry; 258 struct syscall_trace_exit *entry;
177 struct syscall_metadata *sys_data; 259 struct syscall_metadata *sys_data;
178 struct ring_buffer_event *event; 260 struct ring_buffer_event *event;
261 struct ring_buffer *buffer;
179 int syscall_nr; 262 int syscall_nr;
180 263
181 syscall_nr = syscall_get_nr(current, regs); 264 syscall_nr = syscall_get_nr(current, regs);
265 if (syscall_nr < 0)
266 return;
267 if (!test_bit(syscall_nr, enabled_exit_syscalls))
268 return;
182 269
183 sys_data = syscall_nr_to_meta(syscall_nr); 270 sys_data = syscall_nr_to_meta(syscall_nr);
184 if (!sys_data) 271 if (!sys_data)
185 return; 272 return;
186 273
187 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, 274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
188 sizeof(*entry), 0, 0); 275 sizeof(*entry), 0, 0);
189 if (!event) 276 if (!event)
190 return; 277 return;
@@ -193,58 +280,244 @@ void ftrace_syscall_exit(struct pt_regs *regs)
193 entry->nr = syscall_nr; 280 entry->nr = syscall_nr;
194 entry->ret = syscall_get_return_value(current, regs); 281 entry->ret = syscall_get_return_value(current, regs);
195 282
196 trace_current_buffer_unlock_commit(event, 0, 0); 283 if (!filter_current_check_discard(buffer, sys_data->exit_event,
197 trace_wake_up(); 284 entry, event))
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
198} 286}
199 287
200static int init_syscall_tracer(struct trace_array *tr) 288int reg_event_syscall_enter(void *ptr)
201{ 289{
202 start_ftrace_syscalls(); 290 int ret = 0;
291 int num;
292 char *name;
293
294 name = (char *)ptr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock);
299 if (!sys_refcount_enter)
300 ret = register_trace_sys_enter(ftrace_syscall_enter);
301 if (ret) {
302 pr_info("event trace: Could not activate"
303 "syscall entry trace point");
304 } else {
305 set_bit(num, enabled_enter_syscalls);
306 sys_refcount_enter++;
307 }
308 mutex_unlock(&syscall_trace_lock);
309 return ret;
310}
311
312void unreg_event_syscall_enter(void *ptr)
313{
314 int num;
315 char *name;
203 316
204 return 0; 317 name = (char *)ptr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls)
320 return;
321 mutex_lock(&syscall_trace_lock);
322 sys_refcount_enter--;
323 clear_bit(num, enabled_enter_syscalls);
324 if (!sys_refcount_enter)
325 unregister_trace_sys_enter(ftrace_syscall_enter);
326 mutex_unlock(&syscall_trace_lock);
205} 327}
206 328
207static void reset_syscall_tracer(struct trace_array *tr) 329int reg_event_syscall_exit(void *ptr)
208{ 330{
209 stop_ftrace_syscalls(); 331 int ret = 0;
210 tracing_reset_online_cpus(tr); 332 int num;
333 char *name;
334
335 name = (char *)ptr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock);
340 if (!sys_refcount_exit)
341 ret = register_trace_sys_exit(ftrace_syscall_exit);
342 if (ret) {
343 pr_info("event trace: Could not activate"
344 "syscall exit trace point");
345 } else {
346 set_bit(num, enabled_exit_syscalls);
347 sys_refcount_exit++;
348 }
349 mutex_unlock(&syscall_trace_lock);
350 return ret;
211} 351}
212 352
213static struct trace_event syscall_enter_event = { 353void unreg_event_syscall_exit(void *ptr)
214 .type = TRACE_SYSCALL_ENTER, 354{
215 .trace = print_syscall_enter, 355 int num;
216}; 356 char *name;
357
358 name = (char *)ptr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls)
361 return;
362 mutex_lock(&syscall_trace_lock);
363 sys_refcount_exit--;
364 clear_bit(num, enabled_exit_syscalls);
365 if (!sys_refcount_exit)
366 unregister_trace_sys_exit(ftrace_syscall_exit);
367 mutex_unlock(&syscall_trace_lock);
368}
217 369
218static struct trace_event syscall_exit_event = { 370struct trace_event event_syscall_enter = {
219 .type = TRACE_SYSCALL_EXIT, 371 .trace = print_syscall_enter,
220 .trace = print_syscall_exit,
221}; 372};
222 373
223static struct tracer syscall_tracer __read_mostly = { 374struct trace_event event_syscall_exit = {
224 .name = "syscall", 375 .trace = print_syscall_exit,
225 .init = init_syscall_tracer,
226 .reset = reset_syscall_tracer,
227 .flags = &syscalls_flags,
228}; 376};
229 377
230__init int register_ftrace_syscalls(void) 378#ifdef CONFIG_EVENT_PROFILE
379
380static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
381static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
382static int sys_prof_refcount_enter;
383static int sys_prof_refcount_exit;
384
385static void prof_syscall_enter(struct pt_regs *regs, long id)
231{ 386{
232 int ret; 387 struct syscall_trace_enter *rec;
388 struct syscall_metadata *sys_data;
389 int syscall_nr;
390 int size;
233 391
234 ret = register_ftrace_event(&syscall_enter_event); 392 syscall_nr = syscall_get_nr(current, regs);
235 if (!ret) { 393 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
236 printk(KERN_WARNING "event %d failed to register\n", 394 return;
237 syscall_enter_event.type); 395
238 WARN_ON_ONCE(1); 396 sys_data = syscall_nr_to_meta(syscall_nr);
397 if (!sys_data)
398 return;
399
400 /* get the size after alignment with the u32 buffer size field */
401 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
402 size = ALIGN(size + sizeof(u32), sizeof(u64));
403 size -= sizeof(u32);
404
405 do {
406 char raw_data[size];
407
408 /* zero the dead bytes from align to not leak stack to user */
409 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
410
411 rec = (struct syscall_trace_enter *) raw_data;
412 tracing_generic_entry_update(&rec->ent, 0, 0);
413 rec->ent.type = sys_data->enter_id;
414 rec->nr = syscall_nr;
415 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
416 (unsigned long *)&rec->args);
417 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
418 } while(0);
419}
420
421int reg_prof_syscall_enter(char *name)
422{
423 int ret = 0;
424 int num;
425
426 num = syscall_name_to_nr(name);
427 if (num < 0 || num >= NR_syscalls)
428 return -ENOSYS;
429
430 mutex_lock(&syscall_trace_lock);
431 if (!sys_prof_refcount_enter)
432 ret = register_trace_sys_enter(prof_syscall_enter);
433 if (ret) {
434 pr_info("event trace: Could not activate"
435 "syscall entry trace point");
436 } else {
437 set_bit(num, enabled_prof_enter_syscalls);
438 sys_prof_refcount_enter++;
239 } 439 }
440 mutex_unlock(&syscall_trace_lock);
441 return ret;
442}
240 443
241 ret = register_ftrace_event(&syscall_exit_event); 444void unreg_prof_syscall_enter(char *name)
242 if (!ret) { 445{
243 printk(KERN_WARNING "event %d failed to register\n", 446 int num;
244 syscall_exit_event.type); 447
245 WARN_ON_ONCE(1); 448 num = syscall_name_to_nr(name);
449 if (num < 0 || num >= NR_syscalls)
450 return;
451
452 mutex_lock(&syscall_trace_lock);
453 sys_prof_refcount_enter--;
454 clear_bit(num, enabled_prof_enter_syscalls);
455 if (!sys_prof_refcount_enter)
456 unregister_trace_sys_enter(prof_syscall_enter);
457 mutex_unlock(&syscall_trace_lock);
458}
459
460static void prof_syscall_exit(struct pt_regs *regs, long ret)
461{
462 struct syscall_metadata *sys_data;
463 struct syscall_trace_exit rec;
464 int syscall_nr;
465
466 syscall_nr = syscall_get_nr(current, regs);
467 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
468 return;
469
470 sys_data = syscall_nr_to_meta(syscall_nr);
471 if (!sys_data)
472 return;
473
474 tracing_generic_entry_update(&rec.ent, 0, 0);
475 rec.ent.type = sys_data->exit_id;
476 rec.nr = syscall_nr;
477 rec.ret = syscall_get_return_value(current, regs);
478
479 perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
480}
481
482int reg_prof_syscall_exit(char *name)
483{
484 int ret = 0;
485 int num;
486
487 num = syscall_name_to_nr(name);
488 if (num < 0 || num >= NR_syscalls)
489 return -ENOSYS;
490
491 mutex_lock(&syscall_trace_lock);
492 if (!sys_prof_refcount_exit)
493 ret = register_trace_sys_exit(prof_syscall_exit);
494 if (ret) {
495 pr_info("event trace: Could not activate"
496 "syscall entry trace point");
497 } else {
498 set_bit(num, enabled_prof_exit_syscalls);
499 sys_prof_refcount_exit++;
246 } 500 }
501 mutex_unlock(&syscall_trace_lock);
502 return ret;
503}
247 504
248 return register_tracer(&syscall_tracer); 505void unreg_prof_syscall_exit(char *name)
506{
507 int num;
508
509 num = syscall_name_to_nr(name);
510 if (num < 0 || num >= NR_syscalls)
511 return;
512
513 mutex_lock(&syscall_trace_lock);
514 sys_prof_refcount_exit--;
515 clear_bit(num, enabled_prof_exit_syscalls);
516 if (!sys_prof_refcount_exit)
517 unregister_trace_sys_exit(prof_syscall_exit);
518 mutex_unlock(&syscall_trace_lock);
249} 519}
250device_initcall(register_ftrace_syscalls); 520
521#endif
522
523
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 97fcea4acce1..40cafb07dffd 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/kref.h>
12#include "trace_stat.h" 13#include "trace_stat.h"
13#include "trace.h" 14#include "trace.h"
14 15
@@ -16,6 +17,7 @@
16/* A cpu workqueue thread */ 17/* A cpu workqueue thread */
17struct cpu_workqueue_stats { 18struct cpu_workqueue_stats {
18 struct list_head list; 19 struct list_head list;
20 struct kref kref;
19 int cpu; 21 int cpu;
20 pid_t pid; 22 pid_t pid;
21/* Can be inserted from interrupt or user context, need to be atomic */ 23/* Can be inserted from interrupt or user context, need to be atomic */
@@ -39,6 +41,11 @@ struct workqueue_global_stats {
39static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); 41static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
40#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) 42#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
41 43
44static void cpu_workqueue_stat_free(struct kref *kref)
45{
46 kfree(container_of(kref, struct cpu_workqueue_stats, kref));
47}
48
42/* Insertion of a work */ 49/* Insertion of a work */
43static void 50static void
44probe_workqueue_insertion(struct task_struct *wq_thread, 51probe_workqueue_insertion(struct task_struct *wq_thread,
@@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
96 return; 103 return;
97 } 104 }
98 INIT_LIST_HEAD(&cws->list); 105 INIT_LIST_HEAD(&cws->list);
106 kref_init(&cws->kref);
99 cws->cpu = cpu; 107 cws->cpu = cpu;
100
101 cws->pid = wq_thread->pid; 108 cws->pid = wq_thread->pid;
102 109
103 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 110 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
118 list) { 125 list) {
119 if (node->pid == wq_thread->pid) { 126 if (node->pid == wq_thread->pid) {
120 list_del(&node->list); 127 list_del(&node->list);
121 kfree(node); 128 kref_put(&node->kref, cpu_workqueue_stat_free);
122 goto found; 129 goto found;
123 } 130 }
124 } 131 }
@@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
137 144
138 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 145 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
139 146
140 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) 147 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
141 ret = list_entry(workqueue_cpu_stat(cpu)->list.next, 148 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
142 struct cpu_workqueue_stats, list); 149 struct cpu_workqueue_stats, list);
150 kref_get(&ret->kref);
151 }
143 152
144 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 153 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
145 154
@@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace)
162static void *workqueue_stat_next(void *prev, int idx) 171static void *workqueue_stat_next(void *prev, int idx)
163{ 172{
164 struct cpu_workqueue_stats *prev_cws = prev; 173 struct cpu_workqueue_stats *prev_cws = prev;
174 struct cpu_workqueue_stats *ret;
165 int cpu = prev_cws->cpu; 175 int cpu = prev_cws->cpu;
166 unsigned long flags; 176 unsigned long flags;
167 void *ret = NULL;
168 177
169 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 178 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
170 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { 179 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
@@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx)
175 return NULL; 184 return NULL;
176 } while (!(ret = workqueue_stat_start_cpu(cpu))); 185 } while (!(ret = workqueue_stat_start_cpu(cpu)));
177 return ret; 186 return ret;
187 } else {
188 ret = list_entry(prev_cws->list.next,
189 struct cpu_workqueue_stats, list);
190 kref_get(&ret->kref);
178 } 191 }
179 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 192 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
180 193
181 return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, 194 return ret;
182 list);
183} 195}
184 196
185static int workqueue_stat_show(struct seq_file *s, void *p) 197static int workqueue_stat_show(struct seq_file *s, void *p)
@@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
203 return 0; 215 return 0;
204} 216}
205 217
218static void workqueue_stat_release(void *stat)
219{
220 struct cpu_workqueue_stats *node = stat;
221
222 kref_put(&node->kref, cpu_workqueue_stat_free);
223}
224
206static int workqueue_stat_headers(struct seq_file *s) 225static int workqueue_stat_headers(struct seq_file *s)
207{ 226{
208 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); 227 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
@@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = {
215 .stat_start = workqueue_stat_start, 234 .stat_start = workqueue_stat_start,
216 .stat_next = workqueue_stat_next, 235 .stat_next = workqueue_stat_next,
217 .stat_show = workqueue_stat_show, 236 .stat_show = workqueue_stat_show,
237 .stat_release = workqueue_stat_release,
218 .stat_headers = workqueue_stat_headers 238 .stat_headers = workqueue_stat_headers
219}; 239};
220 240
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1ef5d3a601c7..9489a0a9b1be 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,6 +24,7 @@
24#include <linux/tracepoint.h> 24#include <linux/tracepoint.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h>
27 28
28extern struct tracepoint __start___tracepoints[]; 29extern struct tracepoint __start___tracepoints[];
29extern struct tracepoint __stop___tracepoints[]; 30extern struct tracepoint __stop___tracepoints[];
@@ -242,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
242{ 243{
243 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 244 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
244 245
246 if (elem->regfunc && !elem->state && active)
247 elem->regfunc();
248 else if (elem->unregfunc && elem->state && !active)
249 elem->unregfunc();
250
245 /* 251 /*
246 * rcu_assign_pointer has a smp_wmb() which makes sure that the new 252 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
247 * probe callbacks array is consistent before setting a pointer to it. 253 * probe callbacks array is consistent before setting a pointer to it.
@@ -261,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
261 */ 267 */
262static void disable_tracepoint(struct tracepoint *elem) 268static void disable_tracepoint(struct tracepoint *elem)
263{ 269{
270 if (elem->unregfunc && elem->state)
271 elem->unregfunc();
272
264 elem->state = 0; 273 elem->state = 0;
265 rcu_assign_pointer(elem->funcs, NULL); 274 rcu_assign_pointer(elem->funcs, NULL);
266} 275}
@@ -554,9 +563,6 @@ int tracepoint_module_notify(struct notifier_block *self,
554 563
555 switch (val) { 564 switch (val) {
556 case MODULE_STATE_COMING: 565 case MODULE_STATE_COMING:
557 tracepoint_update_probe_range(mod->tracepoints,
558 mod->tracepoints + mod->num_tracepoints);
559 break;
560 case MODULE_STATE_GOING: 566 case MODULE_STATE_GOING:
561 tracepoint_update_probe_range(mod->tracepoints, 567 tracepoint_update_probe_range(mod->tracepoints,
562 mod->tracepoints + mod->num_tracepoints); 568 mod->tracepoints + mod->num_tracepoints);
@@ -577,3 +583,41 @@ static int init_tracepoints(void)
577__initcall(init_tracepoints); 583__initcall(init_tracepoints);
578 584
579#endif /* CONFIG_MODULES */ 585#endif /* CONFIG_MODULES */
586
587#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
588
589/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
590static int sys_tracepoint_refcount;
591
592void syscall_regfunc(void)
593{
594 unsigned long flags;
595 struct task_struct *g, *t;
596
597 if (!sys_tracepoint_refcount) {
598 read_lock_irqsave(&tasklist_lock, flags);
599 do_each_thread(g, t) {
600 /* Skip kernel threads. */
601 if (t->mm)
602 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
603 } while_each_thread(g, t);
604 read_unlock_irqrestore(&tasklist_lock, flags);
605 }
606 sys_tracepoint_refcount++;
607}
608
609void syscall_unregfunc(void)
610{
611 unsigned long flags;
612 struct task_struct *g, *t;
613
614 sys_tracepoint_refcount--;
615 if (!sys_tracepoint_refcount) {
616 read_lock_irqsave(&tasklist_lock, flags);
617 do_each_thread(g, t) {
618 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
619 } while_each_thread(g, t);
620 read_unlock_irqrestore(&tasklist_lock, flags);
621 }
622}
623#endif
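
In tracepoint.c, set_tracepoint() and disable_tracepoint() now invoke the optional regfunc()/unregfunc() hooks exactly on the inactive-to-active and active-to-inactive edges, which is what lets syscall_regfunc()/syscall_unregfunc() flip TIF_SYSCALL_TRACEPOINT only around the first and last user. A small edge-triggered-hook sketch; struct event and the handlers are illustrative.

#include <stdbool.h>
#include <stdio.h>

struct event {
        bool state;
        void (*regfunc)(void);      /* optional */
        void (*unregfunc)(void);    /* optional */
};

static void event_set_active(struct event *e, bool active)
{
        /* Fire the hooks only on an actual edge, as set_tracepoint() does. */
        if (e->regfunc && !e->state && active)
                e->regfunc();
        else if (e->unregfunc && e->state && !active)
                e->unregfunc();
        e->state = active;
}

static void on_first_user(void)  { puts("setup: flag all tasks"); }
static void on_last_user(void)   { puts("teardown: clear all tasks"); }

int main(void)
{
        struct event e = { false, on_first_user, on_last_user };

        event_set_active(&e, true);     /* prints setup */
        event_set_active(&e, true);     /* no edge, no hook */
        event_set_active(&e, false);    /* prints teardown */
        return 0;
}
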
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 815237a55af8..8a82b4b8ea52 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,16 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17 17
18static struct uts_namespace *create_uts_ns(void)
19{
20 struct uts_namespace *uts_ns;
21
22 uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
23 if (uts_ns)
24 kref_init(&uts_ns->kref);
25 return uts_ns;
26}
27
18/* 28/*
19 * Clone a new ns copying an original utsname, setting refcount to 1 29 * Clone a new ns copying an original utsname, setting refcount to 1
20 * @old_ns: namespace to clone 30 * @old_ns: namespace to clone
@@ -24,14 +34,13 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
24{ 34{
25 struct uts_namespace *ns; 35 struct uts_namespace *ns;
26 36
27 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); 37 ns = create_uts_ns();
28 if (!ns) 38 if (!ns)
29 return ERR_PTR(-ENOMEM); 39 return ERR_PTR(-ENOMEM);
30 40
31 down_read(&uts_sem); 41 down_read(&uts_sem);
32 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
33 up_read(&uts_sem); 43 up_read(&uts_sem);
34 kref_init(&ns->kref);
35 return ns; 44 return ns;
36} 45}
37 46
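
create_uts_ns() now pairs the allocation with kref_init(), so every caller starts from an object whose refcount is already valid. A brief user-space sketch of that factory-helper shape; struct session and create_session() are made-up names.

#include <stdio.h>
#include <stdlib.h>

struct session {
        int refcount;
        char name[16];
};

/* One place that both allocates and sets up the refcount. */
static struct session *create_session(void)
{
        struct session *s = malloc(sizeof(*s));

        if (s)
                s->refcount = 1;
        return s;
}

int main(void)
{
        struct session *s = create_session();

        if (!s)
                return 1;
        printf("refcount = %d\n", s->refcount);
        free(s);
        return 0;
}
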
diff --git a/kernel/wait.c b/kernel/wait.c
index ea7c3b4275cf..c4bd3d825f35 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void init_waitqueue_head(wait_queue_head_t *q) 13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key);
16 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
17} 18}
18 19
19EXPORT_SYMBOL(init_waitqueue_head); 20EXPORT_SYMBOL(__init_waitqueue_head);
20 21
21void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 22void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
22{ 23{