Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 4
-rw-r--r--  kernel/acct.c | 6
-rw-r--r--  kernel/audit.c | 146
-rw-r--r--  kernel/audit.h | 43
-rw-r--r--  kernel/audit_tree.c | 66
-rw-r--r--  kernel/audit_watch.c | 543
-rw-r--r--  kernel/auditfilter.c | 518
-rw-r--r--  kernel/auditsc.c | 33
-rw-r--r--  kernel/cgroup.c | 14
-rw-r--r--  kernel/cpu.c | 13
-rw-r--r--  kernel/exit.c | 294
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/futex.c | 46
-rw-r--r--  kernel/gcov/Kconfig | 48
-rw-r--r--  kernel/gcov/Makefile | 3
-rw-r--r--  kernel/gcov/base.c | 148
-rw-r--r--  kernel/gcov/fs.c | 673
-rw-r--r--  kernel/gcov/gcc_3_4.c | 447
-rw-r--r--  kernel/gcov/gcov.h | 128
-rw-r--r--  kernel/hrtimer.c | 112
-rw-r--r--  kernel/irq/manage.c | 2
-rw-r--r--  kernel/kmod.c | 1
-rw-r--r--  kernel/kprobes.c | 6
-rw-r--r--  kernel/kthread.c | 80
-rw-r--r--  kernel/module.c | 22
-rw-r--r--  kernel/nsproxy.c | 19
-rw-r--r--  kernel/perf_counter.c | 632
-rw-r--r--  kernel/pid.c | 17
-rw-r--r--  kernel/pid_namespace.c | 24
-rw-r--r--  kernel/power/user.c | 1
-rw-r--r--  kernel/ptrace.c | 163
-rw-r--r--  kernel/rcutree.c | 3
-rw-r--r--  kernel/res_counter.c | 12
-rw-r--r--  kernel/resource.c | 2
-rw-r--r--  kernel/sched.c | 33
-rw-r--r--  kernel/sched_cpupri.c | 2
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 3
-rw-r--r--  kernel/signal.c | 12
-rw-r--r--  kernel/softirq.c | 1
-rw-r--r--  kernel/sysctl.c | 17
-rw-r--r--  kernel/time/clockevents.c | 11
-rw-r--r--  kernel/time/tick-sched.c | 12
-rw-r--r--  kernel/time/timer_stats.c | 16
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/trace/Kconfig | 14
-rw-r--r--  kernel/trace/blktrace.c | 1
-rw-r--r--  kernel/trace/ftrace.c | 72
-rw-r--r--  kernel/trace/kmemtrace.c | 2
-rw-r--r--  kernel/trace/ring_buffer.c | 322
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 45
-rw-r--r--  kernel/trace/trace.c | 32
-rw-r--r--  kernel/trace/trace.h | 7
-rw-r--r--  kernel/trace/trace_event_types.h | 3
-rw-r--r--  kernel/trace/trace_events.c | 28
-rw-r--r--  kernel/trace/trace_events_filter.c | 37
-rw-r--r--  kernel/trace/trace_functions.c | 11
-rw-r--r--  kernel/trace/trace_functions_graph.c | 36
-rw-r--r--  kernel/trace/trace_output.c | 3
-rw-r--r--  kernel/trace/trace_printk.c | 26
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/trace/trace_stat.c | 6
-rw-r--r--  kernel/utsname.c | 13
63 files changed, 3496 insertions, 1552 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 9df4501cb921..2093a691f1c2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,8 +69,9 @@ obj-$(CONFIG_IKCONFIG) += configs.o
69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
74obj-$(CONFIG_GCOV_KERNEL) += gcov/
74obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
75obj-$(CONFIG_KPROBES) += kprobes.o 76obj-$(CONFIG_KPROBES) += kprobes.o
76obj-$(CONFIG_KGDB) += kgdb.o 77obj-$(CONFIG_KGDB) += kgdb.o
@@ -95,6 +96,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
95obj-$(CONFIG_FUNCTION_TRACER) += trace/ 96obj-$(CONFIG_FUNCTION_TRACER) += trace/
96obj-$(CONFIG_TRACING) += trace/ 97obj-$(CONFIG_TRACING) += trace/
97obj-$(CONFIG_X86_DS) += trace/ 98obj-$(CONFIG_X86_DS) += trace/
99obj-$(CONFIG_RING_BUFFER) += trace/
98obj-$(CONFIG_SMP) += sched_cpupri.o 100obj-$(CONFIG_SMP) += sched_cpupri.o
99obj-$(CONFIG_SLOW_WORK) += slow-work.o 101obj-$(CONFIG_SLOW_WORK) += slow-work.o
100obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 102obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa31564162..9f3391090b3e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
215static int acct_on(char *name) 215static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt;
218 int error; 219 int error;
219 struct pid_namespace *ns; 220 struct pid_namespace *ns;
220 struct bsd_acct_struct *acct = NULL; 221 struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
256 acct = NULL; 257 acct = NULL;
257 } 258 }
258 259
259 mnt_pin(file->f_path.mnt); 260 mnt = file->f_path.mnt;
261 mnt_pin(mnt);
260 acct_file_reopen(ns->bacct, file, ns); 262 acct_file_reopen(ns->bacct, file, ns);
261 spin_unlock(&acct_lock); 263 spin_unlock(&acct_lock);
262 264
263 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ 265 mntput(mnt); /* it's pinned, now give up active reference */
264 kfree(acct); 266 kfree(acct);
265 267
266 return 0; 268 return 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba9..defc2e6f1e3b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
115/* The netlink socket. */ 115/* The netlink socket. */
116static struct sock *audit_sock; 116static struct sock *audit_sock;
117 117
118/* Inotify handle. */
119struct inotify_handle *audit_ih;
120
121/* Hash for inode-based rules */ 118/* Hash for inode-based rules */
122struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 119struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
123 120
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
136static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 133static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
137 134
138/* Serialize requests from userspace. */ 135/* Serialize requests from userspace. */
139static DEFINE_MUTEX(audit_cmd_mutex); 136DEFINE_MUTEX(audit_cmd_mutex);
140 137
141/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 138/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
142 * audit records. Since printk uses a 1024 byte buffer, this buffer 139 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
375 kfree_skb(skb); 372 kfree_skb(skb);
376} 373}
377 374
375/*
376 * For one reason or another this nlh isn't getting delivered to the userspace
377 * audit daemon, just send it to printk.
378 */
379static void audit_printk_skb(struct sk_buff *skb)
380{
381 struct nlmsghdr *nlh = nlmsg_hdr(skb);
382 char *data = NLMSG_DATA(nlh);
383
384 if (nlh->nlmsg_type != AUDIT_EOE) {
385 if (printk_ratelimit())
386 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
387 else
388 audit_log_lost("printk limit exceeded\n");
389 }
390
391 audit_hold_skb(skb);
392}
393
378static void kauditd_send_skb(struct sk_buff *skb) 394static void kauditd_send_skb(struct sk_buff *skb)
379{ 395{
380 int err; 396 int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
427 if (skb) { 443 if (skb) {
428 if (audit_pid) 444 if (audit_pid)
429 kauditd_send_skb(skb); 445 kauditd_send_skb(skb);
430 else { 446 else
431 if (printk_ratelimit()) 447 audit_printk_skb(skb);
432 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
433 else
434 audit_log_lost("printk limit exceeded\n");
435
436 audit_hold_skb(skb);
437 }
438 } else { 448 } else {
439 DECLARE_WAITQUEUE(wait, current); 449 DECLARE_WAITQUEUE(wait, current);
440 set_current_state(TASK_INTERRUPTIBLE); 450 set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
495 return 0; 505 return 0;
496} 506}
497 507
498#ifdef CONFIG_AUDIT_TREE
499static int prune_tree_thread(void *unused)
500{
501 mutex_lock(&audit_cmd_mutex);
502 audit_prune_trees();
503 mutex_unlock(&audit_cmd_mutex);
504 return 0;
505}
506
507void audit_schedule_prune(void)
508{
509 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
510}
511#endif
512
513struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
514 int multi, void *payload, int size) 509 int multi, void *payload, int size)
515{ 510{
516 struct sk_buff *skb; 511 struct sk_buff *skb;
517 struct nlmsghdr *nlh; 512 struct nlmsghdr *nlh;
518 int len = NLMSG_SPACE(size);
519 void *data; 513 void *data;
520 int flags = multi ? NLM_F_MULTI : 0; 514 int flags = multi ? NLM_F_MULTI : 0;
521 int t = done ? NLMSG_DONE : type; 515 int t = done ? NLMSG_DONE : type;
522 516
523 skb = alloc_skb(len, GFP_KERNEL); 517 skb = nlmsg_new(size, GFP_KERNEL);
524 if (!skb) 518 if (!skb)
525 return NULL; 519 return NULL;
526 520
527 nlh = NLMSG_PUT(skb, pid, seq, t, size); 521 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
528 nlh->nlmsg_flags = flags; 522 data = NLMSG_DATA(nlh);
529 data = NLMSG_DATA(nlh);
530 memcpy(data, payload, size); 523 memcpy(data, payload, size);
531 return skb; 524 return skb;
532 525
533nlmsg_failure: /* Used by NLMSG_PUT */ 526nlmsg_failure: /* Used by NLMSG_NEW */
534 if (skb) 527 if (skb)
535 kfree_skb(skb); 528 kfree_skb(skb);
536 return NULL; 529 return NULL;
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
926} 919}
927 920
928/* 921/*
929 * Get message from skb (based on rtnetlink_rcv_skb). Each message is 922 * Get message from skb. Each message is processed by audit_receive_msg.
930 * processed by audit_receive_msg. Malformed skbs with wrong length are 923 * Malformed skbs with wrong length are discarded silently.
931 * discarded silently.
932 */ 924 */
933static void audit_receive_skb(struct sk_buff *skb) 925static void audit_receive_skb(struct sk_buff *skb)
934{ 926{
935 int err; 927 struct nlmsghdr *nlh;
936 struct nlmsghdr *nlh; 928 /*
937 u32 rlen; 929 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
930 * if the nlmsg_len was not aligned
931 */
932 int len;
933 int err;
938 934
939 while (skb->len >= NLMSG_SPACE(0)) { 935 nlh = nlmsg_hdr(skb);
940 nlh = nlmsg_hdr(skb); 936 len = skb->len;
941 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) 937
942 return; 938 while (NLMSG_OK(nlh, len)) {
943 rlen = NLMSG_ALIGN(nlh->nlmsg_len); 939 err = audit_receive_msg(skb, nlh);
944 if (rlen > skb->len) 940 /* if err or if this message says it wants a response */
945 rlen = skb->len; 941 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
946 if ((err = audit_receive_msg(skb, nlh))) {
947 netlink_ack(skb, nlh, err); 942 netlink_ack(skb, nlh, err);
948 } else if (nlh->nlmsg_flags & NLM_F_ACK) 943
949 netlink_ack(skb, nlh, 0); 944 nlh = NLMSG_NEXT(nlh, len);
950 skb_pull(skb, rlen);
951 } 945 }
952} 946}
953 947
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
959 mutex_unlock(&audit_cmd_mutex); 953 mutex_unlock(&audit_cmd_mutex);
960} 954}
961 955
962#ifdef CONFIG_AUDITSYSCALL
963static const struct inotify_operations audit_inotify_ops = {
964 .handle_event = audit_handle_ievent,
965 .destroy_watch = audit_free_parent,
966};
967#endif
968
969/* Initialize audit support at boot time. */ 956/* Initialize audit support at boot time. */
970static int __init audit_init(void) 957static int __init audit_init(void)
971{ 958{
@@ -991,12 +978,6 @@ static int __init audit_init(void)
991 978
992 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 979 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
993 980
994#ifdef CONFIG_AUDITSYSCALL
995 audit_ih = inotify_init(&audit_inotify_ops);
996 if (IS_ERR(audit_ih))
997 audit_panic("cannot initialize inotify handle");
998#endif
999
1000 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) 981 for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
1001 INIT_LIST_HEAD(&audit_inode_hash[i]); 982 INIT_LIST_HEAD(&audit_inode_hash[i]);
1002 983
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1070 goto err; 1051 goto err;
1071 } 1052 }
1072 1053
1073 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
1074 if (!ab->skb)
1075 goto err;
1076
1077 ab->ctx = ctx; 1054 ab->ctx = ctx;
1078 ab->gfp_mask = gfp_mask; 1055 ab->gfp_mask = gfp_mask;
1079 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); 1056
1080 nlh->nlmsg_type = type; 1057 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1081 nlh->nlmsg_flags = 0; 1058 if (!ab->skb)
1082 nlh->nlmsg_pid = 0; 1059 goto nlmsg_failure;
1083 nlh->nlmsg_seq = 0; 1060
1061 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
1062
1084 return ab; 1063 return ab;
1064
1065nlmsg_failure: /* Used by NLMSG_NEW */
1066 kfree_skb(ab->skb);
1067 ab->skb = NULL;
1085err: 1068err:
1086 audit_buffer_free(ab); 1069 audit_buffer_free(ab);
1087 return NULL; 1070 return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1452 kfree(pathname); 1435 kfree(pathname);
1453} 1436}
1454 1437
1438void audit_log_key(struct audit_buffer *ab, char *key)
1439{
1440 audit_log_format(ab, " key=");
1441 if (key)
1442 audit_log_untrustedstring(ab, key);
1443 else
1444 audit_log_format(ab, "(null)");
1445}
1446
1455/** 1447/**
1456 * audit_log_end - end one audit record 1448 * audit_log_end - end one audit record
1457 * @ab: the audit_buffer 1449 * @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
1475 skb_queue_tail(&audit_skb_queue, ab->skb); 1467 skb_queue_tail(&audit_skb_queue, ab->skb);
1476 wake_up_interruptible(&kauditd_wait); 1468 wake_up_interruptible(&kauditd_wait);
1477 } else { 1469 } else {
1478 if (nlh->nlmsg_type != AUDIT_EOE) { 1470 audit_printk_skb(ab->skb);
1479 if (printk_ratelimit()) {
1480 printk(KERN_NOTICE "type=%d %s\n",
1481 nlh->nlmsg_type,
1482 ab->skb->data + NLMSG_SPACE(0));
1483 } else
1484 audit_log_lost("printk limit exceeded\n");
1485 }
1486 audit_hold_skb(ab->skb);
1487 } 1471 }
1488 ab->skb = NULL; 1472 ab->skb = NULL;
1489 } 1473 }
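The rewritten audit_receive_skb() above drops the hand-rolled length bookkeeping in favour of the standard NLMSG_OK()/NLMSG_NEXT() walk, which is why the remaining length has to be a signed int. Below is a minimal userspace sketch of that walk, not kernel code; the buffer layout, message types and payloads are invented for illustration and only the standard <linux/netlink.h> macros are assumed.

/* Build two fake netlink messages in a buffer and walk them the way the
 * patched audit_receive_skb() does. Compile with: gcc -std=c11 nlwalk.c */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>

int main(void)
{
	_Alignas(struct nlmsghdr) char buf[256];
	int len = 0;	/* must be signed: NLMSG_NEXT may drive it below 0 */
	const char *payloads[] = { "first", "second" };

	/* Pack two messages back to back, each NLMSG_ALIGN'ed. */
	for (int i = 0; i < 2; i++) {
		struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + len);
		int plen = strlen(payloads[i]) + 1;

		nlh->nlmsg_len = NLMSG_LENGTH(plen);
		nlh->nlmsg_type = 1000 + i;	/* arbitrary demo types */
		nlh->nlmsg_flags = 0;
		memcpy(NLMSG_DATA(nlh), payloads[i], plen);
		len += NLMSG_ALIGN(nlh->nlmsg_len);
	}

	/* The walk: NLMSG_OK validates the header against the remaining
	 * length, NLMSG_NEXT advances and decrements it. */
	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
	while (NLMSG_OK(nlh, len)) {
		printf("type=%d payload=%s\n", nlh->nlmsg_type,
		       (char *)NLMSG_DATA(nlh));
		nlh = NLMSG_NEXT(nlh, len);
	}
	return 0;
}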
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661b..208687be4f30 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
53}; 53};
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_parent; 56struct audit_watch;
57
58struct audit_watch {
59 atomic_t count; /* reference count */
60 char *path; /* insertion path */
61 dev_t dev; /* associated superblock device */
62 unsigned long ino; /* associated inode number */
63 struct audit_parent *parent; /* associated parent */
64 struct list_head wlist; /* entry in parent->watches list */
65 struct list_head rules; /* associated rules */
66};
67
68struct audit_tree; 57struct audit_tree;
69struct audit_chunk; 58struct audit_chunk;
70 59
@@ -108,19 +97,28 @@ struct audit_netlink_list {
108 97
109int audit_send_list(void *); 98int audit_send_list(void *);
110 99
111struct inotify_watch;
112/* Inotify handle */
113extern struct inotify_handle *audit_ih;
114
115extern void audit_free_parent(struct inotify_watch *);
116extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
117 const char *, struct inode *);
118extern int selinux_audit_rule_update(void); 100extern int selinux_audit_rule_update(void);
119 101
120extern struct mutex audit_filter_mutex; 102extern struct mutex audit_filter_mutex;
121extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
122extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
123 105
106/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch);
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule);
113extern void audit_remove_watch(struct audit_watch *watch);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch);
118
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
120 struct audit_watch *watch);
121
124#ifdef CONFIG_AUDIT_TREE 122#ifdef CONFIG_AUDIT_TREE
125extern struct audit_chunk *audit_tree_lookup(const struct inode *); 123extern struct audit_chunk *audit_tree_lookup(const struct inode *);
126extern void audit_put_chunk(struct audit_chunk *); 124extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
130extern int audit_remove_tree_rule(struct audit_krule *); 128extern int audit_remove_tree_rule(struct audit_krule *);
131extern void audit_trim_trees(void); 129extern void audit_trim_trees(void);
132extern int audit_tag_tree(char *old, char *new); 130extern int audit_tag_tree(char *old, char *new);
133extern void audit_schedule_prune(void);
134extern void audit_prune_trees(void);
135extern const char *audit_tree_path(struct audit_tree *); 131extern const char *audit_tree_path(struct audit_tree *);
136extern void audit_put_tree(struct audit_tree *); 132extern void audit_put_tree(struct audit_tree *);
133extern void audit_kill_trees(struct list_head *);
137#else 134#else
138#define audit_remove_tree_rule(rule) BUG() 135#define audit_remove_tree_rule(rule) BUG()
139#define audit_add_tree_rule(rule) -EINVAL 136#define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
142#define audit_put_tree(tree) (void)0 139#define audit_put_tree(tree) (void)0
143#define audit_tag_tree(old, new) -EINVAL 140#define audit_tag_tree(old, new) -EINVAL
144#define audit_tree_path(rule) "" /* never called */ 141#define audit_tree_path(rule) "" /* never called */
142#define audit_kill_trees(list) BUG()
145#endif 143#endif
146 144
147extern char *audit_unpack_string(void **, size_t *, size_t); 145extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
160 return 0; 158 return 0;
161} 159}
162extern void audit_filter_inodes(struct task_struct *, struct audit_context *); 160extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
161extern struct list_head *audit_killed_trees(void);
163#else 162#else
164#define audit_signal_info(s,t) AUDIT_DISABLED 163#define audit_signal_info(s,t) AUDIT_DISABLED
165#define audit_filter_inodes(t,c) AUDIT_DISABLED 164#define audit_filter_inodes(t,c) AUDIT_DISABLED
166#endif 165#endif
166
167extern struct mutex audit_cmd_mutex;
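The audit.h hunk above makes struct audit_watch opaque: other files now see only a forward declaration plus accessors such as audit_watch_path() and audit_watch_inode(), while the full layout moves into kernel/audit_watch.c. A hypothetical single-file userspace sketch of that opaque-handle pattern follows; all names are invented and are not the kernel API.

/* Opaque handle: callers get a forward declaration and accessors only. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* --- what would live in the shared header --- */
struct watch;				/* layout hidden from callers */
struct watch *watch_create(const char *path);
const char *watch_path(const struct watch *w);
unsigned long watch_inode(const struct watch *w);
void watch_destroy(struct watch *w);

/* --- what would live in the one .c file that owns the layout --- */
struct watch {
	char *path;
	unsigned long ino;
};

struct watch *watch_create(const char *path)
{
	struct watch *w = calloc(1, sizeof(*w));
	if (!w)
		return NULL;
	w->path = strdup(path);
	w->ino = (unsigned long)-1;	/* unknown until resolved */
	return w;
}

const char *watch_path(const struct watch *w) { return w->path; }
unsigned long watch_inode(const struct watch *w) { return w->ino; }

void watch_destroy(struct watch *w)
{
	if (w) {
		free(w->path);
		free(w);
	}
}

int main(void)
{
	struct watch *w = watch_create("/etc/passwd");
	printf("path=%s ino=%lu\n", watch_path(w), watch_inode(w));
	watch_destroy(w);
	return 0;
}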
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1f6396d76687..2451dc6f3282 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h>
5 6
6struct audit_tree; 7struct audit_tree;
7struct audit_chunk; 8struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
441 if (rule->tree) { 442 if (rule->tree) {
442 /* not a half-baked one */ 443 /* not a half-baked one */
443 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 444 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
444 audit_log_format(ab, "op=remove rule dir="); 445 audit_log_format(ab, "op=");
446 audit_log_string(ab, "remove rule");
447 audit_log_format(ab, " dir=");
445 audit_log_untrustedstring(ab, rule->tree->pathname); 448 audit_log_untrustedstring(ab, rule->tree->pathname);
446 if (rule->filterkey) { 449 audit_log_key(ab, rule->filterkey);
447 audit_log_format(ab, " key=");
448 audit_log_untrustedstring(ab, rule->filterkey);
449 } else
450 audit_log_format(ab, " key=(null)");
451 audit_log_format(ab, " list=%d res=1", rule->listnr); 450 audit_log_format(ab, " list=%d res=1", rule->listnr);
452 audit_log_end(ab); 451 audit_log_end(ab);
453 rule->tree = NULL; 452 rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
519 } 518 }
520} 519}
521 520
521static void audit_schedule_prune(void);
522
522/* called with audit_filter_mutex */ 523/* called with audit_filter_mutex */
523int audit_remove_tree_rule(struct audit_krule *rule) 524int audit_remove_tree_rule(struct audit_krule *rule)
524{ 525{
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
824 825
825/* 826/*
826 * That gets run when evict_chunk() ends up needing to kill audit_tree. 827 * That gets run when evict_chunk() ends up needing to kill audit_tree.
827 * Runs from a separate thread, with audit_cmd_mutex held. 828 * Runs from a separate thread.
828 */ 829 */
829void audit_prune_trees(void) 830static int prune_tree_thread(void *unused)
830{ 831{
832 mutex_lock(&audit_cmd_mutex);
831 mutex_lock(&audit_filter_mutex); 833 mutex_lock(&audit_filter_mutex);
832 834
833 while (!list_empty(&prune_list)) { 835 while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
844 } 846 }
845 847
846 mutex_unlock(&audit_filter_mutex); 848 mutex_unlock(&audit_filter_mutex);
849 mutex_unlock(&audit_cmd_mutex);
850 return 0;
851}
852
853static void audit_schedule_prune(void)
854{
855 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
856}
857
858/*
859 * ... and that one is done if evict_chunk() decides to delay until the end
860 * of syscall. Runs synchronously.
861 */
862void audit_kill_trees(struct list_head *list)
863{
864 mutex_lock(&audit_cmd_mutex);
865 mutex_lock(&audit_filter_mutex);
866
867 while (!list_empty(list)) {
868 struct audit_tree *victim;
869
870 victim = list_entry(list->next, struct audit_tree, list);
871 kill_rules(victim);
872 list_del_init(&victim->list);
873
874 mutex_unlock(&audit_filter_mutex);
875
876 prune_one(victim);
877
878 mutex_lock(&audit_filter_mutex);
879 }
880
881 mutex_unlock(&audit_filter_mutex);
882 mutex_unlock(&audit_cmd_mutex);
847} 883}
848 884
849/* 885/*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
854static void evict_chunk(struct audit_chunk *chunk) 890static void evict_chunk(struct audit_chunk *chunk)
855{ 891{
856 struct audit_tree *owner; 892 struct audit_tree *owner;
893 struct list_head *postponed = audit_killed_trees();
894 int need_prune = 0;
857 int n; 895 int n;
858 896
859 if (chunk->dead) 897 if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
869 owner->root = NULL; 907 owner->root = NULL;
870 list_del_init(&owner->same_root); 908 list_del_init(&owner->same_root);
871 spin_unlock(&hash_lock); 909 spin_unlock(&hash_lock);
872 kill_rules(owner); 910 if (!postponed) {
873 list_move(&owner->list, &prune_list); 911 kill_rules(owner);
874 audit_schedule_prune(); 912 list_move(&owner->list, &prune_list);
913 need_prune = 1;
914 } else {
915 list_move(&owner->list, postponed);
916 }
875 spin_lock(&hash_lock); 917 spin_lock(&hash_lock);
876 } 918 }
877 list_del_rcu(&chunk->hash); 919 list_del_rcu(&chunk->hash);
878 for (n = 0; n < chunk->count; n++) 920 for (n = 0; n < chunk->count; n++)
879 list_del_init(&chunk->owners[n].list); 921 list_del_init(&chunk->owners[n].list);
880 spin_unlock(&hash_lock); 922 spin_unlock(&hash_lock);
923 if (need_prune)
924 audit_schedule_prune();
881 mutex_unlock(&audit_filter_mutex); 925 mutex_unlock(&audit_filter_mutex);
882} 926}
883 927
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 000000000000..0e96dbc60ea9
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
1/* audit_watch.c -- watching inodes
2 *
3 * Copyright 2003-2009 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/mutex.h>
26#include <linux/fs.h>
27#include <linux/namei.h>
28#include <linux/netlink.h>
29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h>
32#include "audit.h"
33
34/*
35 * Reference counting:
36 *
37 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
38 * event. Each audit_watch holds a reference to its associated parent.
39 *
40 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
41 * audit_remove_watch(). Additionally, an audit_watch may exist
42 * temporarily to assist in searching existing filter data. Each
43 * audit_krule holds a reference to its associated watch.
44 */
45
46struct audit_watch {
47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */
50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */
53 struct list_head rules; /* associated rules */
54};
55
56struct audit_parent {
57 struct list_head ilist; /* entry in inotify registration list */
58 struct list_head watches; /* associated watches */
59 struct inotify_watch wdata; /* inotify watch data */
60 unsigned flags; /* status flags */
61};
62
63/* Inotify handle. */
64struct inotify_handle *audit_ih;
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Inotify events we care about. */
78#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
79
80static void audit_free_parent(struct inotify_watch *i_watch)
81{
82 struct audit_parent *parent;
83
84 parent = container_of(i_watch, struct audit_parent, wdata);
85 WARN_ON(!list_empty(&parent->watches));
86 kfree(parent);
87}
88
89void audit_get_watch(struct audit_watch *watch)
90{
91 atomic_inc(&watch->count);
92}
93
94void audit_put_watch(struct audit_watch *watch)
95{
96 if (atomic_dec_and_test(&watch->count)) {
97 WARN_ON(watch->parent);
98 WARN_ON(!list_empty(&watch->rules));
99 kfree(watch->path);
100 kfree(watch);
101 }
102}
103
104void audit_remove_watch(struct audit_watch *watch)
105{
106 list_del(&watch->wlist);
107 put_inotify_watch(&watch->parent->wdata);
108 watch->parent = NULL;
109 audit_put_watch(watch); /* match initial get */
110}
111
112char *audit_watch_path(struct audit_watch *watch)
113{
114 return watch->path;
115}
116
117struct list_head *audit_watch_rules(struct audit_watch *watch)
118{
119 return &watch->rules;
120}
121
122unsigned long audit_watch_inode(struct audit_watch *watch)
123{
124 return watch->ino;
125}
126
127dev_t audit_watch_dev(struct audit_watch *watch)
128{
129 return watch->dev;
130}
131
132/* Initialize a parent watch entry. */
133static struct audit_parent *audit_init_parent(struct nameidata *ndp)
134{
135 struct audit_parent *parent;
136 s32 wd;
137
138 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
139 if (unlikely(!parent))
140 return ERR_PTR(-ENOMEM);
141
142 INIT_LIST_HEAD(&parent->watches);
143 parent->flags = 0;
144
145 inotify_init_watch(&parent->wdata);
146 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
147 get_inotify_watch(&parent->wdata);
148 wd = inotify_add_watch(audit_ih, &parent->wdata,
149 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
150 if (wd < 0) {
151 audit_free_parent(&parent->wdata);
152 return ERR_PTR(wd);
153 }
154
155 return parent;
156}
157
158/* Initialize a watch entry. */
159static struct audit_watch *audit_init_watch(char *path)
160{
161 struct audit_watch *watch;
162
163 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
164 if (unlikely(!watch))
165 return ERR_PTR(-ENOMEM);
166
167 INIT_LIST_HEAD(&watch->rules);
168 atomic_set(&watch->count, 1);
169 watch->path = path;
170 watch->dev = (dev_t)-1;
171 watch->ino = (unsigned long)-1;
172
173 return watch;
174}
175
176/* Translate a watch string to kernel respresentation. */
177int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
178{
179 struct audit_watch *watch;
180
181 if (!audit_ih)
182 return -EOPNOTSUPP;
183
184 if (path[0] != '/' || path[len-1] == '/' ||
185 krule->listnr != AUDIT_FILTER_EXIT ||
186 op != Audit_equal ||
187 krule->inode_f || krule->watch || krule->tree)
188 return -EINVAL;
189
190 watch = audit_init_watch(path);
191 if (IS_ERR(watch))
192 return PTR_ERR(watch);
193
194 audit_get_watch(watch);
195 krule->watch = watch;
196
197 return 0;
198}
199
200/* Duplicate the given audit watch. The new watch's rules list is initialized
201 * to an empty list and wlist is undefined. */
202static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
203{
204 char *path;
205 struct audit_watch *new;
206
207 path = kstrdup(old->path, GFP_KERNEL);
208 if (unlikely(!path))
209 return ERR_PTR(-ENOMEM);
210
211 new = audit_init_watch(path);
212 if (IS_ERR(new)) {
213 kfree(path);
214 goto out;
215 }
216
217 new->dev = old->dev;
218 new->ino = old->ino;
219 get_inotify_watch(&old->parent->wdata);
220 new->parent = old->parent;
221
222out:
223 return new;
224}
225
226static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
227{
228 if (audit_enabled) {
229 struct audit_buffer *ab;
230 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
231 audit_log_format(ab, "auid=%u ses=%u op=",
232 audit_get_loginuid(current),
233 audit_get_sessionid(current));
234 audit_log_string(ab, op);
235 audit_log_format(ab, " path=");
236 audit_log_untrustedstring(ab, w->path);
237 audit_log_key(ab, r->filterkey);
238 audit_log_format(ab, " list=%d res=1", r->listnr);
239 audit_log_end(ab);
240 }
241}
242
243/* Update inode info in audit rules based on filesystem event. */
244static void audit_update_watch(struct audit_parent *parent,
245 const char *dname, dev_t dev,
246 unsigned long ino, unsigned invalidating)
247{
248 struct audit_watch *owatch, *nwatch, *nextw;
249 struct audit_krule *r, *nextr;
250 struct audit_entry *oentry, *nentry;
251
252 mutex_lock(&audit_filter_mutex);
253 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
254 if (audit_compare_dname_path(dname, owatch->path, NULL))
255 continue;
256
257 /* If the update involves invalidating rules, do the inode-based
258 * filtering now, so we don't omit records. */
259 if (invalidating && current->audit_context)
260 audit_filter_inodes(current, current->audit_context);
261
262 nwatch = audit_dupe_watch(owatch);
263 if (IS_ERR(nwatch)) {
264 mutex_unlock(&audit_filter_mutex);
265 audit_panic("error updating watch, skipping");
266 return;
267 }
268 nwatch->dev = dev;
269 nwatch->ino = ino;
270
271 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
272
273 oentry = container_of(r, struct audit_entry, rule);
274 list_del(&oentry->rule.rlist);
275 list_del_rcu(&oentry->list);
276
277 nentry = audit_dupe_rule(&oentry->rule, nwatch);
278 if (IS_ERR(nentry)) {
279 list_del(&oentry->rule.list);
280 audit_panic("error updating watch, removing");
281 } else {
282 int h = audit_hash_ino((u32)ino);
283 list_add(&nentry->rule.rlist, &nwatch->rules);
284 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
285 list_replace(&oentry->rule.list,
286 &nentry->rule.list);
287 }
288
289 audit_watch_log_rule_change(r, owatch, "updated rules");
290
291 call_rcu(&oentry->rcu, audit_free_rule_rcu);
292 }
293
294 audit_remove_watch(owatch);
295 goto add_watch_to_parent; /* event applies to a single watch */
296 }
297 mutex_unlock(&audit_filter_mutex);
298 return;
299
300add_watch_to_parent:
301 list_add(&nwatch->wlist, &parent->watches);
302 mutex_unlock(&audit_filter_mutex);
303 return;
304}
305
306/* Remove all watches & rules associated with a parent that is going away. */
307static void audit_remove_parent_watches(struct audit_parent *parent)
308{
309 struct audit_watch *w, *nextw;
310 struct audit_krule *r, *nextr;
311 struct audit_entry *e;
312
313 mutex_lock(&audit_filter_mutex);
314 parent->flags |= AUDIT_PARENT_INVALID;
315 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
316 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
317 e = container_of(r, struct audit_entry, rule);
318 audit_watch_log_rule_change(r, w, "remove rule");
319 list_del(&r->rlist);
320 list_del(&r->list);
321 list_del_rcu(&e->list);
322 call_rcu(&e->rcu, audit_free_rule_rcu);
323 }
324 audit_remove_watch(w);
325 }
326 mutex_unlock(&audit_filter_mutex);
327}
328
329/* Unregister inotify watches for parents on in_list.
330 * Generates an IN_IGNORED event. */
331void audit_inotify_unregister(struct list_head *in_list)
332{
333 struct audit_parent *p, *n;
334
335 list_for_each_entry_safe(p, n, in_list, ilist) {
336 list_del(&p->ilist);
337 inotify_rm_watch(audit_ih, &p->wdata);
338 /* the unpin matching the pin in audit_do_del_rule() */
339 unpin_inotify_watch(&p->wdata);
340 }
341}
342
343/* Get path information necessary for adding watches. */
344static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
345{
346 struct nameidata *ndparent, *ndwatch;
347 int err;
348
349 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
350 if (unlikely(!ndparent))
351 return -ENOMEM;
352
353 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
354 if (unlikely(!ndwatch)) {
355 kfree(ndparent);
356 return -ENOMEM;
357 }
358
359 err = path_lookup(path, LOOKUP_PARENT, ndparent);
360 if (err) {
361 kfree(ndparent);
362 kfree(ndwatch);
363 return err;
364 }
365
366 err = path_lookup(path, 0, ndwatch);
367 if (err) {
368 kfree(ndwatch);
369 ndwatch = NULL;
370 }
371
372 *ndp = ndparent;
373 *ndw = ndwatch;
374
375 return 0;
376}
377
378/* Release resources used for watch path information. */
379static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
380{
381 if (ndp) {
382 path_put(&ndp->path);
383 kfree(ndp);
384 }
385 if (ndw) {
386 path_put(&ndw->path);
387 kfree(ndw);
388 }
389}
390
391/* Associate the given rule with an existing parent inotify_watch.
392 * Caller must hold audit_filter_mutex. */
393static void audit_add_to_parent(struct audit_krule *krule,
394 struct audit_parent *parent)
395{
396 struct audit_watch *w, *watch = krule->watch;
397 int watch_found = 0;
398
399 list_for_each_entry(w, &parent->watches, wlist) {
400 if (strcmp(watch->path, w->path))
401 continue;
402
403 watch_found = 1;
404
405 /* put krule's and initial refs to temporary watch */
406 audit_put_watch(watch);
407 audit_put_watch(watch);
408
409 audit_get_watch(w);
410 krule->watch = watch = w;
411 break;
412 }
413
414 if (!watch_found) {
415 get_inotify_watch(&parent->wdata);
416 watch->parent = parent;
417
418 list_add(&watch->wlist, &parent->watches);
419 }
420 list_add(&krule->rlist, &watch->rules);
421}
422
423/* Find a matching watch entry, or add this one.
424 * Caller must hold audit_filter_mutex. */
425int audit_add_watch(struct audit_krule *krule)
426{
427 struct audit_watch *watch = krule->watch;
428 struct inotify_watch *i_watch;
429 struct audit_parent *parent;
430 struct nameidata *ndp = NULL, *ndw = NULL;
431 int ret = 0;
432
433 mutex_unlock(&audit_filter_mutex);
434
435 /* Avoid calling path_lookup under audit_filter_mutex. */
436 ret = audit_get_nd(watch->path, &ndp, &ndw);
437 if (ret) {
438 /* caller expects mutex locked */
439 mutex_lock(&audit_filter_mutex);
440 goto error;
441 }
442
443 /* update watch filter fields */
444 if (ndw) {
445 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
446 watch->ino = ndw->path.dentry->d_inode->i_ino;
447 }
448
449 /* The audit_filter_mutex must not be held during inotify calls because
450 * we hold it during inotify event callback processing. If an existing
451 * inotify watch is found, inotify_find_watch() grabs a reference before
452 * returning.
453 */
454 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
455 &i_watch) < 0) {
456 parent = audit_init_parent(ndp);
457 if (IS_ERR(parent)) {
458 /* caller expects mutex locked */
459 mutex_lock(&audit_filter_mutex);
460 ret = PTR_ERR(parent);
461 goto error;
462 }
463 } else
464 parent = container_of(i_watch, struct audit_parent, wdata);
465
466 mutex_lock(&audit_filter_mutex);
467
468 /* parent was moved before we took audit_filter_mutex */
469 if (parent->flags & AUDIT_PARENT_INVALID)
470 ret = -ENOENT;
471 else
472 audit_add_to_parent(krule, parent);
473
474 /* match get in audit_init_parent or inotify_find_watch */
475 put_inotify_watch(&parent->wdata);
476
477error:
478 audit_put_nd(ndp, ndw); /* NULL args OK */
479 return ret;
480
481}
482
483void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
484{
485 struct audit_watch *watch = krule->watch;
486 struct audit_parent *parent = watch->parent;
487
488 list_del(&krule->rlist);
489
490 if (list_empty(&watch->rules)) {
491 audit_remove_watch(watch);
492
493 if (list_empty(&parent->watches)) {
494 /* Put parent on the inotify un-registration
495 * list. Grab a reference before releasing
496 * audit_filter_mutex, to be released in
497 * audit_inotify_unregister().
498 * If filesystem is going away, just leave
499 * the sucker alone, eviction will take
500 * care of it. */
501 if (pin_inotify_watch(&parent->wdata))
502 list_add(&parent->ilist, list);
503 }
504 }
505}
506
507/* Update watch data in audit rules based on inotify events. */
508static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
509 u32 cookie, const char *dname, struct inode *inode)
510{
511 struct audit_parent *parent;
512
513 parent = container_of(i_watch, struct audit_parent, wdata);
514
515 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
516 audit_update_watch(parent, dname, inode->i_sb->s_dev,
517 inode->i_ino, 0);
518 else if (mask & (IN_DELETE|IN_MOVED_FROM))
519 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
520 /* inotify automatically removes the watch and sends IN_IGNORED */
521 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
522 audit_remove_parent_watches(parent);
523 /* inotify does not remove the watch, so remove it manually */
524 else if(mask & IN_MOVE_SELF) {
525 audit_remove_parent_watches(parent);
526 inotify_remove_watch_locked(audit_ih, i_watch);
527 } else if (mask & IN_IGNORED)
528 put_inotify_watch(i_watch);
529}
530
531static const struct inotify_operations audit_inotify_ops = {
532 .handle_event = audit_handle_ievent,
533 .destroy_watch = audit_free_parent,
534};
535
536static int __init audit_watch_init(void)
537{
538 audit_ih = inotify_init(&audit_inotify_ops);
539 if (IS_ERR(audit_ih))
540 audit_panic("cannot initialize inotify handle");
541 return 0;
542}
543subsys_initcall(audit_watch_init);
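The new audit_watch.c above keeps each watch alive through a plain get/put reference count: audit_get_watch() does atomic_inc(), audit_put_watch() does atomic_dec_and_test() and frees on the final put. A minimal userspace sketch of the same scheme using C11 atomics follows; the names are invented and this is not the kernel API.

/* get/put reference counting: the last put frees the object. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct watch {
	atomic_int count;
	char *path;
};

static struct watch *watch_new(const char *path)
{
	struct watch *w = calloc(1, sizeof(*w));
	if (!w)
		return NULL;
	atomic_init(&w->count, 1);	/* initial reference */
	w->path = strdup(path);
	return w;
}

static void watch_get(struct watch *w)
{
	atomic_fetch_add(&w->count, 1);
}

static void watch_put(struct watch *w)
{
	/* fetch_sub returns the old value, so 1 means "last reference" */
	if (atomic_fetch_sub(&w->count, 1) == 1) {
		free(w->path);
		free(w);
	}
}

int main(void)
{
	struct watch *w = watch_new("/var/log/audit");
	watch_get(w);	/* e.g. a rule takes a reference */
	watch_put(w);	/* rule drops it */
	watch_put(w);	/* initial reference gone: freed here */
	return 0;
}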
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a02..a70604047f3c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h> 30#include <linux/security.h>
32#include "audit.h" 31#include "audit.h"
33 32
@@ -44,36 +43,6 @@
44 * be written directly provided audit_filter_mutex is held. 43 * be written directly provided audit_filter_mutex is held.
45 */ 44 */
46 45
47/*
48 * Reference counting:
49 *
50 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
51 * event. Each audit_watch holds a reference to its associated parent.
52 *
53 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
54 * audit_remove_watch(). Additionally, an audit_watch may exist
55 * temporarily to assist in searching existing filter data. Each
56 * audit_krule holds a reference to its associated watch.
57 */
58
59struct audit_parent {
60 struct list_head ilist; /* entry in inotify registration list */
61 struct list_head watches; /* associated watches */
62 struct inotify_watch wdata; /* inotify watch data */
63 unsigned flags; /* status flags */
64};
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Audit filter lists, defined in <linux/audit.h> */ 46/* Audit filter lists, defined in <linux/audit.h> */
78struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { 47struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
79 LIST_HEAD_INIT(audit_filter_list[0]), 48 LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
97 66
98DEFINE_MUTEX(audit_filter_mutex); 67DEFINE_MUTEX(audit_filter_mutex);
99 68
100/* Inotify events we care about. */
101#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
102
103void audit_free_parent(struct inotify_watch *i_watch)
104{
105 struct audit_parent *parent;
106
107 parent = container_of(i_watch, struct audit_parent, wdata);
108 WARN_ON(!list_empty(&parent->watches));
109 kfree(parent);
110}
111
112static inline void audit_get_watch(struct audit_watch *watch)
113{
114 atomic_inc(&watch->count);
115}
116
117static void audit_put_watch(struct audit_watch *watch)
118{
119 if (atomic_dec_and_test(&watch->count)) {
120 WARN_ON(watch->parent);
121 WARN_ON(!list_empty(&watch->rules));
122 kfree(watch->path);
123 kfree(watch);
124 }
125}
126
127static void audit_remove_watch(struct audit_watch *watch)
128{
129 list_del(&watch->wlist);
130 put_inotify_watch(&watch->parent->wdata);
131 watch->parent = NULL;
132 audit_put_watch(watch); /* match initial get */
133}
134
135static inline void audit_free_rule(struct audit_entry *e) 69static inline void audit_free_rule(struct audit_entry *e)
136{ 70{
137 int i; 71 int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
156 audit_free_rule(e); 90 audit_free_rule(e);
157} 91}
158 92
159/* Initialize a parent watch entry. */
160static struct audit_parent *audit_init_parent(struct nameidata *ndp)
161{
162 struct audit_parent *parent;
163 s32 wd;
164
165 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
166 if (unlikely(!parent))
167 return ERR_PTR(-ENOMEM);
168
169 INIT_LIST_HEAD(&parent->watches);
170 parent->flags = 0;
171
172 inotify_init_watch(&parent->wdata);
173 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
174 get_inotify_watch(&parent->wdata);
175 wd = inotify_add_watch(audit_ih, &parent->wdata,
176 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
177 if (wd < 0) {
178 audit_free_parent(&parent->wdata);
179 return ERR_PTR(wd);
180 }
181
182 return parent;
183}
184
185/* Initialize a watch entry. */
186static struct audit_watch *audit_init_watch(char *path)
187{
188 struct audit_watch *watch;
189
190 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
191 if (unlikely(!watch))
192 return ERR_PTR(-ENOMEM);
193
194 INIT_LIST_HEAD(&watch->rules);
195 atomic_set(&watch->count, 1);
196 watch->path = path;
197 watch->dev = (dev_t)-1;
198 watch->ino = (unsigned long)-1;
199
200 return watch;
201}
202
203/* Initialize an audit filterlist entry. */ 93/* Initialize an audit filterlist entry. */
204static inline struct audit_entry *audit_init_entry(u32 field_count) 94static inline struct audit_entry *audit_init_entry(u32 field_count)
205{ 95{
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
260 return 0; 150 return 0;
261} 151}
262 152
263/* Translate a watch string to kernel respresentation. */
264static int audit_to_watch(struct audit_krule *krule, char *path, int len,
265 u32 op)
266{
267 struct audit_watch *watch;
268
269 if (!audit_ih)
270 return -EOPNOTSUPP;
271
272 if (path[0] != '/' || path[len-1] == '/' ||
273 krule->listnr != AUDIT_FILTER_EXIT ||
274 op != Audit_equal ||
275 krule->inode_f || krule->watch || krule->tree)
276 return -EINVAL;
277
278 watch = audit_init_watch(path);
279 if (IS_ERR(watch))
280 return PTR_ERR(watch);
281
282 audit_get_watch(watch);
283 krule->watch = watch;
284
285 return 0;
286}
287
288static __u32 *classes[AUDIT_SYSCALL_CLASSES]; 153static __u32 *classes[AUDIT_SYSCALL_CLASSES];
289 154
290int __init audit_register_class(int class, unsigned *list) 155int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
766 break; 631 break;
767 case AUDIT_WATCH: 632 case AUDIT_WATCH:
768 data->buflen += data->values[i] = 633 data->buflen += data->values[i] =
769 audit_pack_string(&bufp, krule->watch->path); 634 audit_pack_string(&bufp,
635 audit_watch_path(krule->watch));
770 break; 636 break;
771 case AUDIT_DIR: 637 case AUDIT_DIR:
772 data->buflen += data->values[i] = 638 data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
818 return 1; 684 return 1;
819 break; 685 break;
820 case AUDIT_WATCH: 686 case AUDIT_WATCH:
821 if (strcmp(a->watch->path, b->watch->path)) 687 if (strcmp(audit_watch_path(a->watch),
688 audit_watch_path(b->watch)))
822 return 1; 689 return 1;
823 break; 690 break;
824 case AUDIT_DIR: 691 case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
844 return 0; 711 return 0;
845} 712}
846 713
847/* Duplicate the given audit watch. The new watch's rules list is initialized
848 * to an empty list and wlist is undefined. */
849static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
850{
851 char *path;
852 struct audit_watch *new;
853
854 path = kstrdup(old->path, GFP_KERNEL);
855 if (unlikely(!path))
856 return ERR_PTR(-ENOMEM);
857
858 new = audit_init_watch(path);
859 if (IS_ERR(new)) {
860 kfree(path);
861 goto out;
862 }
863
864 new->dev = old->dev;
865 new->ino = old->ino;
866 get_inotify_watch(&old->parent->wdata);
867 new->parent = old->parent;
868
869out:
870 return new;
871}
872
873/* Duplicate LSM field information. The lsm_rule is opaque, so must be 714/* Duplicate LSM field information. The lsm_rule is opaque, so must be
874 * re-initialized. */ 715 * re-initialized. */
875static inline int audit_dupe_lsm_field(struct audit_field *df, 716static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
904 * rule with the new rule in the filterlist, then free the old rule. 745 * rule with the new rule in the filterlist, then free the old rule.
905 * The rlist element is undefined; list manipulations are handled apart from 746 * The rlist element is undefined; list manipulations are handled apart from
906 * the initial copy. */ 747 * the initial copy. */
907static struct audit_entry *audit_dupe_rule(struct audit_krule *old, 748struct audit_entry *audit_dupe_rule(struct audit_krule *old,
908 struct audit_watch *watch) 749 struct audit_watch *watch)
909{ 750{
910 u32 fcount = old->field_count; 751 u32 fcount = old->field_count;
911 struct audit_entry *entry; 752 struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
977 return entry; 818 return entry;
978} 819}
979 820
980/* Update inode info in audit rules based on filesystem event. */
981static void audit_update_watch(struct audit_parent *parent,
982 const char *dname, dev_t dev,
983 unsigned long ino, unsigned invalidating)
984{
985 struct audit_watch *owatch, *nwatch, *nextw;
986 struct audit_krule *r, *nextr;
987 struct audit_entry *oentry, *nentry;
988
989 mutex_lock(&audit_filter_mutex);
990 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
991 if (audit_compare_dname_path(dname, owatch->path, NULL))
992 continue;
993
994 /* If the update involves invalidating rules, do the inode-based
995 * filtering now, so we don't omit records. */
996 if (invalidating && current->audit_context)
997 audit_filter_inodes(current, current->audit_context);
998
999 nwatch = audit_dupe_watch(owatch);
1000 if (IS_ERR(nwatch)) {
1001 mutex_unlock(&audit_filter_mutex);
1002 audit_panic("error updating watch, skipping");
1003 return;
1004 }
1005 nwatch->dev = dev;
1006 nwatch->ino = ino;
1007
1008 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
1009
1010 oentry = container_of(r, struct audit_entry, rule);
1011 list_del(&oentry->rule.rlist);
1012 list_del_rcu(&oentry->list);
1013
1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1017 audit_panic("error updating watch, removing");
1018 } else {
1019 int h = audit_hash_ino((u32)ino);
1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1024 }
1025
1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
1027 }
1028
1029 if (audit_enabled) {
1030 struct audit_buffer *ab;
1031 ab = audit_log_start(NULL, GFP_NOFS,
1032 AUDIT_CONFIG_CHANGE);
1033 audit_log_format(ab, "auid=%u ses=%u",
1034 audit_get_loginuid(current),
1035 audit_get_sessionid(current));
1036 audit_log_format(ab,
1037 " op=updated rules specifying path=");
1038 audit_log_untrustedstring(ab, owatch->path);
1039 audit_log_format(ab, " with dev=%u ino=%lu\n",
1040 dev, ino);
1041 audit_log_format(ab, " list=%d res=1", r->listnr);
1042 audit_log_end(ab);
1043 }
1044 audit_remove_watch(owatch);
1045 goto add_watch_to_parent; /* event applies to a single watch */
1046 }
1047 mutex_unlock(&audit_filter_mutex);
1048 return;
1049
1050add_watch_to_parent:
1051 list_add(&nwatch->wlist, &parent->watches);
1052 mutex_unlock(&audit_filter_mutex);
1053 return;
1054}
1055
1056/* Remove all watches & rules associated with a parent that is going away. */
1057static void audit_remove_parent_watches(struct audit_parent *parent)
1058{
1059 struct audit_watch *w, *nextw;
1060 struct audit_krule *r, *nextr;
1061 struct audit_entry *e;
1062
1063 mutex_lock(&audit_filter_mutex);
1064 parent->flags |= AUDIT_PARENT_INVALID;
1065 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
1066 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
1067 e = container_of(r, struct audit_entry, rule);
1068 if (audit_enabled) {
1069 struct audit_buffer *ab;
1070 ab = audit_log_start(NULL, GFP_NOFS,
1071 AUDIT_CONFIG_CHANGE);
1072 audit_log_format(ab, "auid=%u ses=%u",
1073 audit_get_loginuid(current),
1074 audit_get_sessionid(current));
1075 audit_log_format(ab, " op=remove rule path=");
1076 audit_log_untrustedstring(ab, w->path);
1077 if (r->filterkey) {
1078 audit_log_format(ab, " key=");
1079 audit_log_untrustedstring(ab,
1080 r->filterkey);
1081 } else
1082 audit_log_format(ab, " key=(null)");
1083 audit_log_format(ab, " list=%d res=1",
1084 r->listnr);
1085 audit_log_end(ab);
1086 }
1087 list_del(&r->rlist);
1088 list_del(&r->list);
1089 list_del_rcu(&e->list);
1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1091 }
1092 audit_remove_watch(w);
1093 }
1094 mutex_unlock(&audit_filter_mutex);
1095}
1096
1097/* Unregister inotify watches for parents on in_list.
1098 * Generates an IN_IGNORED event. */
1099static void audit_inotify_unregister(struct list_head *in_list)
1100{
1101 struct audit_parent *p, *n;
1102
1103 list_for_each_entry_safe(p, n, in_list, ilist) {
1104 list_del(&p->ilist);
1105 inotify_rm_watch(audit_ih, &p->wdata);
1106 /* the unpin matching the pin in audit_do_del_rule() */
1107 unpin_inotify_watch(&p->wdata);
1108 }
1109}
1110
1111/* Find an existing audit rule. 821/* Find an existing audit rule.
1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 822 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1113static struct audit_entry *audit_find_rule(struct audit_entry *entry, 823static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
1145 return found; 855 return found;
1146} 856}
1147 857
1148/* Get path information necessary for adding watches. */
1149static int audit_get_nd(char *path, struct nameidata **ndp,
1150 struct nameidata **ndw)
1151{
1152 struct nameidata *ndparent, *ndwatch;
1153 int err;
1154
1155 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
1156 if (unlikely(!ndparent))
1157 return -ENOMEM;
1158
1159 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
1160 if (unlikely(!ndwatch)) {
1161 kfree(ndparent);
1162 return -ENOMEM;
1163 }
1164
1165 err = path_lookup(path, LOOKUP_PARENT, ndparent);
1166 if (err) {
1167 kfree(ndparent);
1168 kfree(ndwatch);
1169 return err;
1170 }
1171
1172 err = path_lookup(path, 0, ndwatch);
1173 if (err) {
1174 kfree(ndwatch);
1175 ndwatch = NULL;
1176 }
1177
1178 *ndp = ndparent;
1179 *ndw = ndwatch;
1180
1181 return 0;
1182}
1183
1184/* Release resources used for watch path information. */
1185static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
1186{
1187 if (ndp) {
1188 path_put(&ndp->path);
1189 kfree(ndp);
1190 }
1191 if (ndw) {
1192 path_put(&ndw->path);
1193 kfree(ndw);
1194 }
1195}
1196
1197/* Associate the given rule with an existing parent inotify_watch.
1198 * Caller must hold audit_filter_mutex. */
1199static void audit_add_to_parent(struct audit_krule *krule,
1200 struct audit_parent *parent)
1201{
1202 struct audit_watch *w, *watch = krule->watch;
1203 int watch_found = 0;
1204
1205 list_for_each_entry(w, &parent->watches, wlist) {
1206 if (strcmp(watch->path, w->path))
1207 continue;
1208
1209 watch_found = 1;
1210
1211 /* put krule's and initial refs to temporary watch */
1212 audit_put_watch(watch);
1213 audit_put_watch(watch);
1214
1215 audit_get_watch(w);
1216 krule->watch = watch = w;
1217 break;
1218 }
1219
1220 if (!watch_found) {
1221 get_inotify_watch(&parent->wdata);
1222 watch->parent = parent;
1223
1224 list_add(&watch->wlist, &parent->watches);
1225 }
1226 list_add(&krule->rlist, &watch->rules);
1227}
1228
1229/* Find a matching watch entry, or add this one.
1230 * Caller must hold audit_filter_mutex. */
1231static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1232 struct nameidata *ndw)
1233{
1234 struct audit_watch *watch = krule->watch;
1235 struct inotify_watch *i_watch;
1236 struct audit_parent *parent;
1237 int ret = 0;
1238
1239 /* update watch filter fields */
1240 if (ndw) {
1241 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
1242 watch->ino = ndw->path.dentry->d_inode->i_ino;
1243 }
1244
1245 /* The audit_filter_mutex must not be held during inotify calls because
1246 * we hold it during inotify event callback processing. If an existing
1247 * inotify watch is found, inotify_find_watch() grabs a reference before
1248 * returning.
1249 */
1250 mutex_unlock(&audit_filter_mutex);
1251
1252 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
1253 &i_watch) < 0) {
1254 parent = audit_init_parent(ndp);
1255 if (IS_ERR(parent)) {
1256 /* caller expects mutex locked */
1257 mutex_lock(&audit_filter_mutex);
1258 return PTR_ERR(parent);
1259 }
1260 } else
1261 parent = container_of(i_watch, struct audit_parent, wdata);
1262
1263 mutex_lock(&audit_filter_mutex);
1264
1265 /* parent was moved before we took audit_filter_mutex */
1266 if (parent->flags & AUDIT_PARENT_INVALID)
1267 ret = -ENOENT;
1268 else
1269 audit_add_to_parent(krule, parent);
1270
1271 /* match get in audit_init_parent or inotify_find_watch */
1272 put_inotify_watch(&parent->wdata);
1273 return ret;
1274}
1275
1276static u64 prio_low = ~0ULL/2; 858static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1; 859static u64 prio_high = ~0ULL/2 - 1;
1278 860
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
1282 struct audit_entry *e; 864 struct audit_entry *e;
1283 struct audit_watch *watch = entry->rule.watch; 865 struct audit_watch *watch = entry->rule.watch;
1284 struct audit_tree *tree = entry->rule.tree; 866 struct audit_tree *tree = entry->rule.tree;
1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list; 867 struct list_head *list;
1287 int h, err; 868 int h, err;
1288#ifdef CONFIG_AUDITSYSCALL 869#ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
1296 877
1297 mutex_lock(&audit_filter_mutex); 878 mutex_lock(&audit_filter_mutex);
1298 e = audit_find_rule(entry, &list); 879 e = audit_find_rule(entry, &list);
1299 mutex_unlock(&audit_filter_mutex);
1300 if (e) { 880 if (e) {
881 mutex_unlock(&audit_filter_mutex);
1301 err = -EEXIST; 882 err = -EEXIST;
1302 /* normally audit_add_tree_rule() will free it on failure */ 883 /* normally audit_add_tree_rule() will free it on failure */
1303 if (tree) 884 if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
1305 goto error; 886 goto error;
1306 } 887 }
1307 888
1308 /* Avoid calling path_lookup under audit_filter_mutex. */
1309 if (watch) {
1310 err = audit_get_nd(watch->path, &ndp, &ndw);
1311 if (err)
1312 goto error;
1313 }
1314
1315 mutex_lock(&audit_filter_mutex);
1316 if (watch) { 889 if (watch) {
1317 /* audit_filter_mutex is dropped and re-taken during this call */ 890 /* audit_filter_mutex is dropped and re-taken during this call */
1318 err = audit_add_watch(&entry->rule, ndp, ndw); 891 err = audit_add_watch(&entry->rule);
1319 if (err) { 892 if (err) {
1320 mutex_unlock(&audit_filter_mutex); 893 mutex_unlock(&audit_filter_mutex);
1321 goto error; 894 goto error;
1322 } 895 }
1323 h = audit_hash_ino((u32)watch->ino); 896 /* entry->rule.watch may have changed during audit_add_watch() */
897 watch = entry->rule.watch;
898 h = audit_hash_ino((u32)audit_watch_inode(watch));
1324 list = &audit_inode_hash[h]; 899 list = &audit_inode_hash[h];
1325 } 900 }
1326 if (tree) { 901 if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
1358#endif 933#endif
1359 mutex_unlock(&audit_filter_mutex); 934 mutex_unlock(&audit_filter_mutex);
1360 935
1361 audit_put_nd(ndp, ndw); /* NULL args OK */
1362 return 0; 936 return 0;
1363 937
1364error: 938error:
1365 audit_put_nd(ndp, ndw); /* NULL args OK */
1366 if (watch) 939 if (watch)
1367 audit_put_watch(watch); /* tmp watch, matches initial get */ 940 audit_put_watch(watch); /* tmp watch, matches initial get */
1368 return err; 941 return err;
@@ -1372,7 +945,7 @@ error:
1372static inline int audit_del_rule(struct audit_entry *entry) 945static inline int audit_del_rule(struct audit_entry *entry)
1373{ 946{
1374 struct audit_entry *e; 947 struct audit_entry *e;
1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 948 struct audit_watch *watch = entry->rule.watch;
1376 struct audit_tree *tree = entry->rule.tree; 949 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list; 950 struct list_head *list;
1378 LIST_HEAD(inotify_list); 951 LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1394 goto out; 967 goto out;
1395 } 968 }
1396 969
1397 watch = e->rule.watch; 970 if (e->rule.watch)
1398 if (watch) { 971 audit_remove_watch_rule(&e->rule, &inotify_list);
1399 struct audit_parent *parent = watch->parent;
1400
1401 list_del(&e->rule.rlist);
1402
1403 if (list_empty(&watch->rules)) {
1404 audit_remove_watch(watch);
1405
1406 if (list_empty(&parent->watches)) {
1407 /* Put parent on the inotify un-registration
1408 * list. Grab a reference before releasing
1409 * audit_filter_mutex, to be released in
1410 * audit_inotify_unregister().
1411 * If filesystem is going away, just leave
1412 * the sucker alone, eviction will take
1413 * care of it.
1414 */
1415 if (pin_inotify_watch(&parent->wdata))
1416 list_add(&parent->ilist, &inotify_list);
1417 }
1418 }
1419 }
1420 972
1421 if (e->rule.tree) 973 if (e->rule.tree)
1422 audit_remove_tree_rule(&e->rule); 974 audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1438 audit_inotify_unregister(&inotify_list); 990 audit_inotify_unregister(&inotify_list);
1439 991
1440out: 992out:
1441 if (tmp_watch) 993 if (watch)
1442 audit_put_watch(tmp_watch); /* match initial get */ 994 audit_put_watch(watch); /* match initial get */
1443 if (tree) 995 if (tree)
1444 audit_put_tree(tree); /* that's the temporary one */ 996 audit_put_tree(tree); /* that's the temporary one */
1445 997
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1527 security_release_secctx(ctx, len); 1079 security_release_secctx(ctx, len);
1528 } 1080 }
1529 } 1081 }
1530 audit_log_format(ab, " op=%s rule key=", action); 1082 audit_log_format(ab, " op=");
1531 if (rule->filterkey) 1083 audit_log_string(ab, action);
1532 audit_log_untrustedstring(ab, rule->filterkey); 1084 audit_log_key(ab, rule->filterkey);
1533 else
1534 audit_log_format(ab, "(null)");
1535 audit_log_format(ab, " list=%d res=%d", rule->listnr, res); 1085 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1536 audit_log_end(ab); 1086 audit_log_end(ab);
1537} 1087}
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1595 return PTR_ERR(entry); 1145 return PTR_ERR(entry);
1596 1146
1597 err = audit_add_rule(entry); 1147 err = audit_add_rule(entry);
1598 audit_log_rule_change(loginuid, sessionid, sid, "add", 1148 audit_log_rule_change(loginuid, sessionid, sid, "add rule",
1599 &entry->rule, !err); 1149 &entry->rule, !err);
1600 1150
1601 if (err) 1151 if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1611 return PTR_ERR(entry); 1161 return PTR_ERR(entry);
1612 1162
1613 err = audit_del_rule(entry); 1163 err = audit_del_rule(entry);
1614 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1164 audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
1615 &entry->rule, !err); 1165 &entry->rule, !err);
1616 1166
1617 audit_free_rule(entry); 1167 audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
1793 list_del(&r->list); 1343 list_del(&r->list);
1794 } else { 1344 } else {
1795 if (watch) { 1345 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules); 1346 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1797 list_del(&r->rlist); 1347 list_del(&r->rlist);
1798 } else if (tree) 1348 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist); 1349 list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
1829 1379
1830 return err; 1380 return err;
1831} 1381}
1832
1833/* Update watch data in audit rules based on inotify events. */
1834void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1835 u32 cookie, const char *dname, struct inode *inode)
1836{
1837 struct audit_parent *parent;
1838
1839 parent = container_of(i_watch, struct audit_parent, wdata);
1840
1841 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1842 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1843 inode->i_ino, 0);
1844 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1845 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1846 /* inotify automatically removes the watch and sends IN_IGNORED */
1847 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1848 audit_remove_parent_watches(parent);
1849 /* inotify does not remove the watch, so remove it manually */
1850 else if(mask & IN_MOVE_SELF) {
1851 audit_remove_parent_watches(parent);
1852 inotify_remove_watch_locked(audit_ih, i_watch);
1853 } else if (mask & IN_IGNORED)
1854 put_inotify_watch(i_watch);
1855}
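
The handler above dispatches on the standard inotify event masks: IN_CREATE/IN_MOVED_TO mean a watched name appeared, IN_DELETE/IN_MOVED_FROM that it vanished, IN_DELETE_SELF/IN_UNMOUNT/IN_MOVE_SELF that the watched directory itself went away, and IN_IGNORED that the kernel already dropped the watch. As a rough userspace illustration of those same masks (this uses the ordinary inotify(7) syscall API, not the in-kernel interface the audit code uses; the watched path is an arbitrary assumption):

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));
	ssize_t len;
	int fd = inotify_init();

	if (fd < 0) {
		perror("inotify_init");
		return 1;
	}
	/* "/tmp/watched" is an arbitrary example directory. */
	if (inotify_add_watch(fd, "/tmp/watched",
			      IN_CREATE | IN_MOVED_TO | IN_DELETE |
			      IN_MOVED_FROM | IN_DELETE_SELF | IN_MOVE_SELF) < 0) {
		perror("inotify_add_watch");
		return 1;
	}
	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		char *p = buf;

		while (p < buf + len) {
			struct inotify_event *ev = (struct inotify_event *)p;

			if (ev->mask & (IN_CREATE | IN_MOVED_TO))
				printf("name appeared: %s\n", ev->len ? ev->name : "");
			else if (ev->mask & (IN_DELETE | IN_MOVED_FROM))
				printf("name vanished: %s\n", ev->len ? ev->name : "");
			else if (ev->mask & (IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT))
				printf("watched directory itself is gone\n");
			else if (ev->mask & IN_IGNORED)
				printf("kernel dropped the watch\n");
			p += sizeof(*ev) + ev->len;
		}
	}
	close(fd);
	return 0;
}
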
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f414..68d3c6a0ecd6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count; 201 int tree_count;
202 struct list_head killed_trees;
202 203
203 int type; 204 int type;
204 union { 205 union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
548 } 549 }
549 break; 550 break;
550 case AUDIT_WATCH: 551 case AUDIT_WATCH:
551 if (name && rule->watch->ino != (unsigned long)-1) 552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
552 result = (name->dev == rule->watch->dev && 553 result = (name->dev == audit_watch_dev(rule->watch) &&
553 name->ino == rule->watch->ino); 554 name->ino == audit_watch_inode(rule->watch));
554 break; 555 break;
555 case AUDIT_DIR: 556 case AUDIT_DIR:
556 if (ctx) 557 if (ctx)
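
The AUDIT_WATCH case above matches a recorded filename against the watch by device and inode number. Those are the same two values stat(2) reports to userspace; a minimal, purely illustrative lookup (the default path is an arbitrary choice):

#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/etc/passwd";
	struct stat st;

	if (stat(path, &st)) {
		perror("stat");
		return 1;
	}
	/* The (dev, ino) pair is what an audit watch rule is matched against. */
	printf("%s: dev=%llu ino=%llu\n", path,
	       (unsigned long long)st.st_dev, (unsigned long long)st.st_ino);
	return 0;
}
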
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
853 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) 854 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
854 return NULL; 855 return NULL;
855 audit_zero_context(context, state); 856 audit_zero_context(context, state);
857 INIT_LIST_HEAD(&context->killed_trees);
856 return context; 858 return context;
857} 859}
858 860
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1024{ 1026{
1025 char arg_num_len_buf[12]; 1027 char arg_num_len_buf[12];
1026 const char __user *tmp_p = p; 1028 const char __user *tmp_p = p;
1027 /* how many digits are in arg_num? 3 is the length of " a=" */ 1029 /* how many digits are in arg_num? 5 is the length of ' a=""' */
1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1030 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
1029 size_t len, len_left, to_send; 1031 size_t len, len_left, to_send;
1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1032 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
1031 unsigned int i, has_cntl = 0, too_long = 0; 1033 unsigned int i, has_cntl = 0, too_long = 0;
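
The corrected comment above counts 5 bytes for the literal characters that wrap every execve argument in the record (the leading space, 'a', '=', and the two double quotes) on top of the digits of the argument index. A small userspace check of that arithmetic (the helper name is invented for illustration):

#include <stdio.h>
#include <string.h>

/* Invented helper: bytes of record overhead for execve argument arg_num. */
static size_t field_overhead(int arg_num)
{
	char num[12];

	/* digits of the index plus the five literal bytes: ' ', 'a', '=', '"', '"' */
	return (size_t)snprintf(num, sizeof(num), "%d", arg_num) + 5;
}

int main(void)
{
	printf("a0:  %zu bytes\n", field_overhead(0));    /* 6 */
	printf("a12: %zu bytes\n", field_overhead(12));   /* 7 */
	printf("cross-check: %zu\n", strlen(" a0=\"\"")); /* 6 */
	return 0;
}
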
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1137 if (has_cntl) 1139 if (has_cntl)
1138 audit_log_n_hex(*ab, buf, to_send); 1140 audit_log_n_hex(*ab, buf, to_send);
1139 else 1141 else
1140 audit_log_format(*ab, "\"%s\"", buf); 1142 audit_log_string(*ab, buf);
1141 1143
1142 p += to_send; 1144 p += to_send;
1143 len_left -= to_send; 1145 len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1372 1374
1373 1375
1374 audit_log_task_info(ab, tsk); 1376 audit_log_task_info(ab, tsk);
1375 if (context->filterkey) { 1377 audit_log_key(ab, context->filterkey);
1376 audit_log_format(ab, " key=");
1377 audit_log_untrustedstring(ab, context->filterkey);
1378 } else
1379 audit_log_format(ab, " key=(null)");
1380 audit_log_end(ab); 1378 audit_log_end(ab);
1381 1379
1382 for (aux = context->aux; aux; aux = aux->next) { 1380 for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
1549 /* that can happen only if we are called from do_exit() */ 1547 /* that can happen only if we are called from do_exit() */
1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1548 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1551 audit_log_exit(context, tsk); 1549 audit_log_exit(context, tsk);
1550 if (!list_empty(&context->killed_trees))
1551 audit_kill_trees(&context->killed_trees);
1552 1552
1553 audit_free_context(context); 1553 audit_free_context(context);
1554} 1554}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
1692 context->in_syscall = 0; 1692 context->in_syscall = 0;
1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1694 1694
1695 if (!list_empty(&context->killed_trees))
1696 audit_kill_trees(&context->killed_trees);
1697
1695 if (context->previous) { 1698 if (context->previous) {
1696 struct audit_context *new_context = context->previous; 1699 struct audit_context *new_context = context->previous;
1697 context->previous = NULL; 1700 context->previous = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
2525 audit_log_format(ab, " sig=%ld", signr); 2528 audit_log_format(ab, " sig=%ld", signr);
2526 audit_log_end(ab); 2529 audit_log_end(ab);
2527} 2530}
2531
2532struct list_head *audit_killed_trees(void)
2533{
2534 struct audit_context *ctx = current->audit_context;
2535 if (likely(!ctx || !ctx->in_syscall))
2536 return NULL;
2537 return &ctx->killed_trees;
2538}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3fb789f6df94..3737a682cdf5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -843,6 +843,11 @@ static int parse_cgroupfs_options(char *data,
843 struct cgroup_sb_opts *opts) 843 struct cgroup_sb_opts *opts)
844{ 844{
845 char *token, *o = data ?: "all"; 845 char *token, *o = data ?: "all";
846 unsigned long mask = (unsigned long)-1;
847
848#ifdef CONFIG_CPUSETS
849 mask = ~(1UL << cpuset_subsys_id);
850#endif
846 851
847 opts->subsys_bits = 0; 852 opts->subsys_bits = 0;
848 opts->flags = 0; 853 opts->flags = 0;
@@ -887,6 +892,15 @@ static int parse_cgroupfs_options(char *data,
887 } 892 }
888 } 893 }
889 894
895 /*
896 * Option noprefix was introduced just for backward compatibility
897 * with the old cpuset, so we allow noprefix only if mounting just
898 * the cpuset subsystem.
899 */
900 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
901 (opts->subsys_bits & mask))
902 return -EINVAL;
903
890 /* We can't have an empty hierarchy */ 904 /* We can't have an empty hierarchy */
891 if (!opts->subsys_bits) 905 if (!opts->subsys_bits)
892 return -EINVAL; 906 return -EINVAL;
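
The new test above makes noprefix valid only when the cpuset subsystem is mounted on its own. In cgroup (v1) terms that corresponds to the mount data string; a hedged sketch of the accepted and rejected combinations (the mount points are arbitrary, must already exist, and the program needs root):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Accepted: noprefix together with only the cpuset subsystem. */
	if (mount("cgroup", "/mnt/cpuset", "cgroup", 0, "cpuset,noprefix"))
		perror("mount cpuset,noprefix");

	/* Rejected with EINVAL by the check above: noprefix plus a second subsystem. */
	if (mount("cgroup", "/mnt/mixed", "cgroup", 0, "cpuset,cpu,noprefix"))
		perror("mount cpuset,cpu,noprefix");

	return 0;
}
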
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8d..8ce10043e4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
34 * an ongoing cpu hotplug operation. 34 * an ongoing cpu hotplug operation.
35 */ 35 */
36 int refcount; 36 int refcount;
37} cpu_hotplug; 37} cpu_hotplug = {
38 38 .active_writer = NULL,
39void __init cpu_hotplug_init(void) 39 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
40{ 40 .refcount = 0,
41 cpu_hotplug.active_writer = NULL; 41};
42 mutex_init(&cpu_hotplug.lock);
43 cpu_hotplug.refcount = 0;
44}
45 42
46#ifdef CONFIG_HOTPLUG_CPU 43#ifdef CONFIG_HOTPLUG_CPU
47 44
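
The change above replaces a runtime cpu_hotplug_init() with a static initializer, so the lock is valid before any initcall runs. A userspace analog of the same pattern using pthreads (illustrative only; PTHREAD_MUTEX_INITIALIZER plays the role of __MUTEX_INITIALIZER; build with -pthread):

#include <pthread.h>
#include <stdio.h>

/* Statically initialized, so it is usable from the very first reference --
 * the analog of dropping cpu_hotplug_init() above. */
static struct {
	pthread_t *active_writer;
	pthread_mutex_t lock;
	int refcount;
} hotplug = {
	.active_writer = NULL,
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.refcount = 0,
};

int main(void)
{
	pthread_mutex_lock(&hotplug.lock);
	hotplug.refcount++;
	pthread_mutex_unlock(&hotplug.lock);
	printf("refcount=%d\n", hotplug.refcount);
	return 0;
}
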
diff --git a/kernel/exit.c b/kernel/exit.c
index b6c90b5ef509..869dc221733e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h> 15#include <linux/iocontext.h>
17#include <linux/key.h> 16#include <linux/key.h>
18#include <linux/security.h> 17#include <linux/security.h>
@@ -375,9 +374,8 @@ static void set_special_pids(struct pid *pid)
375} 374}
376 375
377/* 376/*
378 * Let kernel threads use this to say that they 377 * Let kernel threads use this to say that they allow a certain signal.
379 * allow a certain signal (since daemonize() will 378 * Must not be used if kthread was cloned with CLONE_SIGHAND.
380 * have disabled all of them by default).
381 */ 379 */
382int allow_signal(int sig) 380int allow_signal(int sig)
383{ 381{
@@ -385,14 +383,14 @@ int allow_signal(int sig)
385 return -EINVAL; 383 return -EINVAL;
386 384
387 spin_lock_irq(&current->sighand->siglock); 385 spin_lock_irq(&current->sighand->siglock);
386 /* This is only needed for daemonize()'ed kthreads */
388 sigdelset(&current->blocked, sig); 387 sigdelset(&current->blocked, sig);
389 if (!current->mm) { 388 /*
390 /* Kernel threads handle their own signals. 389 * Kernel threads handle their own signals. Let the signal code
391 Let the signal code know it'll be handled, so 390 * know it'll be handled, so that they don't get converted to
392 that they don't get converted to SIGKILL or 391 * SIGKILL or just silently dropped.
393 just silently dropped */ 392 */
394 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; 393 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
395 }
396 recalc_sigpending(); 394 recalc_sigpending();
397 spin_unlock_irq(&current->sighand->siglock); 395 spin_unlock_irq(&current->sighand->siglock);
398 return 0; 396 return 0;
@@ -591,7 +589,7 @@ retry:
591 /* 589 /*
592 * Search in the siblings 590 * Search in the siblings
593 */ 591 */
594 list_for_each_entry(c, &p->parent->children, sibling) { 592 list_for_each_entry(c, &p->real_parent->children, sibling) {
595 if (c->mm == mm) 593 if (c->mm == mm)
596 goto assign_new_owner; 594 goto assign_new_owner;
597 } 595 }
@@ -758,7 +756,7 @@ static void reparent_thread(struct task_struct *father, struct task_struct *p,
758 p->exit_signal = SIGCHLD; 756 p->exit_signal = SIGCHLD;
759 757
760 /* If it has exited notify the new parent about this child's death. */ 758 /* If it has exited notify the new parent about this child's death. */
761 if (!p->ptrace && 759 if (!task_ptrace(p) &&
762 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 760 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
763 do_notify_parent(p, p->exit_signal); 761 do_notify_parent(p, p->exit_signal);
764 if (task_detached(p)) { 762 if (task_detached(p)) {
@@ -783,7 +781,7 @@ static void forget_original_parent(struct task_struct *father)
783 list_for_each_entry_safe(p, n, &father->children, sibling) { 781 list_for_each_entry_safe(p, n, &father->children, sibling) {
784 p->real_parent = reaper; 782 p->real_parent = reaper;
785 if (p->parent == father) { 783 if (p->parent == father) {
786 BUG_ON(p->ptrace); 784 BUG_ON(task_ptrace(p));
787 p->parent = p->real_parent; 785 p->parent = p->real_parent;
788 } 786 }
789 reparent_thread(father, p, &dead_children); 787 reparent_thread(father, p, &dead_children);
@@ -1081,6 +1079,18 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
1081 return 0; 1079 return 0;
1082} 1080}
1083 1081
1082struct wait_opts {
1083 enum pid_type wo_type;
1084 int wo_flags;
1085 struct pid *wo_pid;
1086
1087 struct siginfo __user *wo_info;
1088 int __user *wo_stat;
1089 struct rusage __user *wo_rusage;
1090
1091 int notask_error;
1092};
1093
1084static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1094static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1085{ 1095{
1086 struct pid *pid = NULL; 1096 struct pid *pid = NULL;
@@ -1091,13 +1101,12 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1091 return pid; 1101 return pid;
1092} 1102}
1093 1103
1094static int eligible_child(enum pid_type type, struct pid *pid, int options, 1104static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1095 struct task_struct *p)
1096{ 1105{
1097 int err; 1106 int err;
1098 1107
1099 if (type < PIDTYPE_MAX) { 1108 if (wo->wo_type < PIDTYPE_MAX) {
1100 if (task_pid_type(p, type) != pid) 1109 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1101 return 0; 1110 return 0;
1102 } 1111 }
1103 1112
@@ -1106,8 +1115,8 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1106 * set; otherwise, wait for non-clone children *only*. (Note: 1115 * set; otherwise, wait for non-clone children *only*. (Note:
1107 * A "clone" child here is one that reports to its parent 1116 * A "clone" child here is one that reports to its parent
1108 * using a signal other than SIGCHLD.) */ 1117 * using a signal other than SIGCHLD.) */
1109 if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) 1118 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1110 && !(options & __WALL)) 1119 && !(wo->wo_flags & __WALL))
1111 return 0; 1120 return 0;
1112 1121
1113 err = security_task_wait(p); 1122 err = security_task_wait(p);
@@ -1117,14 +1126,15 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1117 return 1; 1126 return 1;
1118} 1127}
1119 1128
1120static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, 1129static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1121 int why, int status, 1130 pid_t pid, uid_t uid, int why, int status)
1122 struct siginfo __user *infop,
1123 struct rusage __user *rusagep)
1124{ 1131{
1125 int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; 1132 struct siginfo __user *infop;
1133 int retval = wo->wo_rusage
1134 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1126 1135
1127 put_task_struct(p); 1136 put_task_struct(p);
1137 infop = wo->wo_info;
1128 if (!retval) 1138 if (!retval)
1129 retval = put_user(SIGCHLD, &infop->si_signo); 1139 retval = put_user(SIGCHLD, &infop->si_signo);
1130 if (!retval) 1140 if (!retval)
@@ -1148,19 +1158,18 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
1148 * the lock and this task is uninteresting. If we return nonzero, we have 1158 * the lock and this task is uninteresting. If we return nonzero, we have
1149 * released the lock and the system call should return. 1159 * released the lock and the system call should return.
1150 */ 1160 */
1151static int wait_task_zombie(struct task_struct *p, int options, 1161static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1152 struct siginfo __user *infop,
1153 int __user *stat_addr, struct rusage __user *ru)
1154{ 1162{
1155 unsigned long state; 1163 unsigned long state;
1156 int retval, status, traced; 1164 int retval, status, traced;
1157 pid_t pid = task_pid_vnr(p); 1165 pid_t pid = task_pid_vnr(p);
1158 uid_t uid = __task_cred(p)->uid; 1166 uid_t uid = __task_cred(p)->uid;
1167 struct siginfo __user *infop;
1159 1168
1160 if (!likely(options & WEXITED)) 1169 if (!likely(wo->wo_flags & WEXITED))
1161 return 0; 1170 return 0;
1162 1171
1163 if (unlikely(options & WNOWAIT)) { 1172 if (unlikely(wo->wo_flags & WNOWAIT)) {
1164 int exit_code = p->exit_code; 1173 int exit_code = p->exit_code;
1165 int why, status; 1174 int why, status;
1166 1175
@@ -1173,8 +1182,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1173 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; 1182 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1174 status = exit_code & 0x7f; 1183 status = exit_code & 0x7f;
1175 } 1184 }
1176 return wait_noreap_copyout(p, pid, uid, why, 1185 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1177 status, infop, ru);
1178 } 1186 }
1179 1187
1180 /* 1188 /*
@@ -1188,11 +1196,13 @@ static int wait_task_zombie(struct task_struct *p, int options,
1188 } 1196 }
1189 1197
1190 traced = ptrace_reparented(p); 1198 traced = ptrace_reparented(p);
1191 1199 /*
1192 if (likely(!traced)) { 1200 * It can be ptraced but not reparented, check
1201 * !task_detached() to filter out sub-threads.
1202 */
1203 if (likely(!traced) && likely(!task_detached(p))) {
1193 struct signal_struct *psig; 1204 struct signal_struct *psig;
1194 struct signal_struct *sig; 1205 struct signal_struct *sig;
1195 struct task_cputime cputime;
1196 1206
1197 /* 1207 /*
1198 * The resource counters for the group leader are in its 1208 * The resource counters for the group leader are in its
@@ -1205,26 +1215,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
1205 * p->signal fields, because they are only touched by 1215 * p->signal fields, because they are only touched by
1206 * __exit_signal, which runs with tasklist_lock 1216 * __exit_signal, which runs with tasklist_lock
1207 * write-locked anyway, and so is excluded here. We do 1217 * write-locked anyway, and so is excluded here. We do
1208 * need to protect the access to p->parent->signal fields, 1218 * need to protect the access to parent->signal fields,
1209 * as other threads in the parent group can be right 1219 * as other threads in the parent group can be right
1210 * here reaping other children at the same time. 1220 * here reaping other children at the same time.
1211 *
1212 * We use thread_group_cputime() to get times for the thread
1213 * group, which consolidates times for all threads in the
1214 * group including the group leader.
1215 */ 1221 */
1216 thread_group_cputime(p, &cputime); 1222 spin_lock_irq(&p->real_parent->sighand->siglock);
1217 spin_lock_irq(&p->parent->sighand->siglock); 1223 psig = p->real_parent->signal;
1218 psig = p->parent->signal;
1219 sig = p->signal; 1224 sig = p->signal;
1220 psig->cutime = 1225 psig->cutime =
1221 cputime_add(psig->cutime, 1226 cputime_add(psig->cutime,
1222 cputime_add(cputime.utime, 1227 cputime_add(p->utime,
1223 sig->cutime)); 1228 cputime_add(sig->utime,
1229 sig->cutime)));
1224 psig->cstime = 1230 psig->cstime =
1225 cputime_add(psig->cstime, 1231 cputime_add(psig->cstime,
1226 cputime_add(cputime.stime, 1232 cputime_add(p->stime,
1227 sig->cstime)); 1233 cputime_add(sig->stime,
1234 sig->cstime)));
1228 psig->cgtime = 1235 psig->cgtime =
1229 cputime_add(psig->cgtime, 1236 cputime_add(psig->cgtime,
1230 cputime_add(p->gtime, 1237 cputime_add(p->gtime,
@@ -1246,7 +1253,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1246 sig->oublock + sig->coublock; 1253 sig->oublock + sig->coublock;
1247 task_io_accounting_add(&psig->ioac, &p->ioac); 1254 task_io_accounting_add(&psig->ioac, &p->ioac);
1248 task_io_accounting_add(&psig->ioac, &sig->ioac); 1255 task_io_accounting_add(&psig->ioac, &sig->ioac);
1249 spin_unlock_irq(&p->parent->sighand->siglock); 1256 spin_unlock_irq(&p->real_parent->sighand->siglock);
1250 } 1257 }
1251 1258
1252 /* 1259 /*
@@ -1255,11 +1262,14 @@ static int wait_task_zombie(struct task_struct *p, int options,
1255 */ 1262 */
1256 read_unlock(&tasklist_lock); 1263 read_unlock(&tasklist_lock);
1257 1264
1258 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1265 retval = wo->wo_rusage
1266 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1259 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1267 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1260 ? p->signal->group_exit_code : p->exit_code; 1268 ? p->signal->group_exit_code : p->exit_code;
1261 if (!retval && stat_addr) 1269 if (!retval && wo->wo_stat)
1262 retval = put_user(status, stat_addr); 1270 retval = put_user(status, wo->wo_stat);
1271
1272 infop = wo->wo_info;
1263 if (!retval && infop) 1273 if (!retval && infop)
1264 retval = put_user(SIGCHLD, &infop->si_signo); 1274 retval = put_user(SIGCHLD, &infop->si_signo);
1265 if (!retval && infop) 1275 if (!retval && infop)
@@ -1327,15 +1337,18 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1327 * the lock and this task is uninteresting. If we return nonzero, we have 1337 * the lock and this task is uninteresting. If we return nonzero, we have
1328 * released the lock and the system call should return. 1338 * released the lock and the system call should return.
1329 */ 1339 */
1330static int wait_task_stopped(int ptrace, struct task_struct *p, 1340static int wait_task_stopped(struct wait_opts *wo,
1331 int options, struct siginfo __user *infop, 1341 int ptrace, struct task_struct *p)
1332 int __user *stat_addr, struct rusage __user *ru)
1333{ 1342{
1343 struct siginfo __user *infop;
1334 int retval, exit_code, *p_code, why; 1344 int retval, exit_code, *p_code, why;
1335 uid_t uid = 0; /* unneeded, required by compiler */ 1345 uid_t uid = 0; /* unneeded, required by compiler */
1336 pid_t pid; 1346 pid_t pid;
1337 1347
1338 if (!(options & WUNTRACED)) 1348 /*
1349 * Traditionally we see ptrace'd stopped tasks regardless of options.
1350 */
1351 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1339 return 0; 1352 return 0;
1340 1353
1341 exit_code = 0; 1354 exit_code = 0;
@@ -1349,7 +1362,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1349 if (!exit_code) 1362 if (!exit_code)
1350 goto unlock_sig; 1363 goto unlock_sig;
1351 1364
1352 if (!unlikely(options & WNOWAIT)) 1365 if (!unlikely(wo->wo_flags & WNOWAIT))
1353 *p_code = 0; 1366 *p_code = 0;
1354 1367
1355 /* don't need the RCU readlock here as we're holding a spinlock */ 1368 /* don't need the RCU readlock here as we're holding a spinlock */
@@ -1371,14 +1384,15 @@ unlock_sig:
1371 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1384 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1372 read_unlock(&tasklist_lock); 1385 read_unlock(&tasklist_lock);
1373 1386
1374 if (unlikely(options & WNOWAIT)) 1387 if (unlikely(wo->wo_flags & WNOWAIT))
1375 return wait_noreap_copyout(p, pid, uid, 1388 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1376 why, exit_code,
1377 infop, ru);
1378 1389
1379 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1390 retval = wo->wo_rusage
1380 if (!retval && stat_addr) 1391 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1381 retval = put_user((exit_code << 8) | 0x7f, stat_addr); 1392 if (!retval && wo->wo_stat)
1393 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1394
1395 infop = wo->wo_info;
1382 if (!retval && infop) 1396 if (!retval && infop)
1383 retval = put_user(SIGCHLD, &infop->si_signo); 1397 retval = put_user(SIGCHLD, &infop->si_signo);
1384 if (!retval && infop) 1398 if (!retval && infop)
@@ -1405,15 +1419,13 @@ unlock_sig:
1405 * the lock and this task is uninteresting. If we return nonzero, we have 1419 * the lock and this task is uninteresting. If we return nonzero, we have
1406 * released the lock and the system call should return. 1420 * released the lock and the system call should return.
1407 */ 1421 */
1408static int wait_task_continued(struct task_struct *p, int options, 1422static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1409 struct siginfo __user *infop,
1410 int __user *stat_addr, struct rusage __user *ru)
1411{ 1423{
1412 int retval; 1424 int retval;
1413 pid_t pid; 1425 pid_t pid;
1414 uid_t uid; 1426 uid_t uid;
1415 1427
1416 if (!unlikely(options & WCONTINUED)) 1428 if (!unlikely(wo->wo_flags & WCONTINUED))
1417 return 0; 1429 return 0;
1418 1430
1419 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1431 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
@@ -1425,7 +1437,7 @@ static int wait_task_continued(struct task_struct *p, int options,
1425 spin_unlock_irq(&p->sighand->siglock); 1437 spin_unlock_irq(&p->sighand->siglock);
1426 return 0; 1438 return 0;
1427 } 1439 }
1428 if (!unlikely(options & WNOWAIT)) 1440 if (!unlikely(wo->wo_flags & WNOWAIT))
1429 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1441 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1430 uid = __task_cred(p)->uid; 1442 uid = __task_cred(p)->uid;
1431 spin_unlock_irq(&p->sighand->siglock); 1443 spin_unlock_irq(&p->sighand->siglock);
@@ -1434,17 +1446,17 @@ static int wait_task_continued(struct task_struct *p, int options,
1434 get_task_struct(p); 1446 get_task_struct(p);
1435 read_unlock(&tasklist_lock); 1447 read_unlock(&tasklist_lock);
1436 1448
1437 if (!infop) { 1449 if (!wo->wo_info) {
1438 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1450 retval = wo->wo_rusage
1451 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1439 put_task_struct(p); 1452 put_task_struct(p);
1440 if (!retval && stat_addr) 1453 if (!retval && wo->wo_stat)
1441 retval = put_user(0xffff, stat_addr); 1454 retval = put_user(0xffff, wo->wo_stat);
1442 if (!retval) 1455 if (!retval)
1443 retval = pid; 1456 retval = pid;
1444 } else { 1457 } else {
1445 retval = wait_noreap_copyout(p, pid, uid, 1458 retval = wait_noreap_copyout(wo, p, pid, uid,
1446 CLD_CONTINUED, SIGCONT, 1459 CLD_CONTINUED, SIGCONT);
1447 infop, ru);
1448 BUG_ON(retval == 0); 1460 BUG_ON(retval == 0);
1449 } 1461 }
1450 1462
@@ -1454,19 +1466,16 @@ static int wait_task_continued(struct task_struct *p, int options,
1454/* 1466/*
1455 * Consider @p for a wait by @parent. 1467 * Consider @p for a wait by @parent.
1456 * 1468 *
1457 * -ECHILD should be in *@notask_error before the first call. 1469 * -ECHILD should be in ->notask_error before the first call.
1458 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1470 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1459 * Returns zero if the search for a child should continue; 1471 * Returns zero if the search for a child should continue;
1460 * then *@notask_error is 0 if @p is an eligible child, 1472 * then ->notask_error is 0 if @p is an eligible child,
1461 * or another error from security_task_wait(), or still -ECHILD. 1473 * or another error from security_task_wait(), or still -ECHILD.
1462 */ 1474 */
1463static int wait_consider_task(struct task_struct *parent, int ptrace, 1475static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
1464 struct task_struct *p, int *notask_error, 1476 int ptrace, struct task_struct *p)
1465 enum pid_type type, struct pid *pid, int options,
1466 struct siginfo __user *infop,
1467 int __user *stat_addr, struct rusage __user *ru)
1468{ 1477{
1469 int ret = eligible_child(type, pid, options, p); 1478 int ret = eligible_child(wo, p);
1470 if (!ret) 1479 if (!ret)
1471 return ret; 1480 return ret;
1472 1481
@@ -1478,17 +1487,17 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1478 * to look for security policy problems, rather 1487 * to look for security policy problems, rather
1479 * than for mysterious wait bugs. 1488 * than for mysterious wait bugs.
1480 */ 1489 */
1481 if (*notask_error) 1490 if (wo->notask_error)
1482 *notask_error = ret; 1491 wo->notask_error = ret;
1483 return 0; 1492 return 0;
1484 } 1493 }
1485 1494
1486 if (likely(!ptrace) && unlikely(p->ptrace)) { 1495 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1487 /* 1496 /*
1488 * This child is hidden by ptrace. 1497 * This child is hidden by ptrace.
1489 * We aren't allowed to see it now, but eventually we will. 1498 * We aren't allowed to see it now, but eventually we will.
1490 */ 1499 */
1491 *notask_error = 0; 1500 wo->notask_error = 0;
1492 return 0; 1501 return 0;
1493 } 1502 }
1494 1503
@@ -1499,34 +1508,30 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1499 * We don't reap group leaders with subthreads. 1508 * We don't reap group leaders with subthreads.
1500 */ 1509 */
1501 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1510 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
1502 return wait_task_zombie(p, options, infop, stat_addr, ru); 1511 return wait_task_zombie(wo, p);
1503 1512
1504 /* 1513 /*
1505 * It's stopped or running now, so it might 1514 * It's stopped or running now, so it might
1506 * later continue, exit, or stop again. 1515 * later continue, exit, or stop again.
1507 */ 1516 */
1508 *notask_error = 0; 1517 wo->notask_error = 0;
1509 1518
1510 if (task_stopped_code(p, ptrace)) 1519 if (task_stopped_code(p, ptrace))
1511 return wait_task_stopped(ptrace, p, options, 1520 return wait_task_stopped(wo, ptrace, p);
1512 infop, stat_addr, ru);
1513 1521
1514 return wait_task_continued(p, options, infop, stat_addr, ru); 1522 return wait_task_continued(wo, p);
1515} 1523}
1516 1524
1517/* 1525/*
1518 * Do the work of do_wait() for one thread in the group, @tsk. 1526 * Do the work of do_wait() for one thread in the group, @tsk.
1519 * 1527 *
1520 * -ECHILD should be in *@notask_error before the first call. 1528 * -ECHILD should be in ->notask_error before the first call.
1521 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1529 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1522 * Returns zero if the search for a child should continue; then 1530 * Returns zero if the search for a child should continue; then
1523 * *@notask_error is 0 if there were any eligible children, 1531 * ->notask_error is 0 if there were any eligible children,
1524 * or another error from security_task_wait(), or still -ECHILD. 1532 * or another error from security_task_wait(), or still -ECHILD.
1525 */ 1533 */
1526static int do_wait_thread(struct task_struct *tsk, int *notask_error, 1534static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1527 enum pid_type type, struct pid *pid, int options,
1528 struct siginfo __user *infop, int __user *stat_addr,
1529 struct rusage __user *ru)
1530{ 1535{
1531 struct task_struct *p; 1536 struct task_struct *p;
1532 1537
@@ -1535,9 +1540,7 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1535 * Do not consider detached threads. 1540 * Do not consider detached threads.
1536 */ 1541 */
1537 if (!task_detached(p)) { 1542 if (!task_detached(p)) {
1538 int ret = wait_consider_task(tsk, 0, p, notask_error, 1543 int ret = wait_consider_task(wo, tsk, 0, p);
1539 type, pid, options,
1540 infop, stat_addr, ru);
1541 if (ret) 1544 if (ret)
1542 return ret; 1545 return ret;
1543 } 1546 }
@@ -1546,22 +1549,12 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1546 return 0; 1549 return 0;
1547} 1550}
1548 1551
1549static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, 1552static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1550 enum pid_type type, struct pid *pid, int options,
1551 struct siginfo __user *infop, int __user *stat_addr,
1552 struct rusage __user *ru)
1553{ 1553{
1554 struct task_struct *p; 1554 struct task_struct *p;
1555 1555
1556 /*
1557 * Traditionally we see ptrace'd stopped tasks regardless of options.
1558 */
1559 options |= WUNTRACED;
1560
1561 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1556 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1562 int ret = wait_consider_task(tsk, 1, p, notask_error, 1557 int ret = wait_consider_task(wo, tsk, 1, p);
1563 type, pid, options,
1564 infop, stat_addr, ru);
1565 if (ret) 1558 if (ret)
1566 return ret; 1559 return ret;
1567 } 1560 }
@@ -1569,65 +1562,59 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
1569 return 0; 1562 return 0;
1570} 1563}
1571 1564
1572static long do_wait(enum pid_type type, struct pid *pid, int options, 1565static long do_wait(struct wait_opts *wo)
1573 struct siginfo __user *infop, int __user *stat_addr,
1574 struct rusage __user *ru)
1575{ 1566{
1576 DECLARE_WAITQUEUE(wait, current); 1567 DECLARE_WAITQUEUE(wait, current);
1577 struct task_struct *tsk; 1568 struct task_struct *tsk;
1578 int retval; 1569 int retval;
1579 1570
1580 trace_sched_process_wait(pid); 1571 trace_sched_process_wait(wo->wo_pid);
1581 1572
1582 add_wait_queue(&current->signal->wait_chldexit,&wait); 1573 add_wait_queue(&current->signal->wait_chldexit,&wait);
1583repeat: 1574repeat:
1584 /* 1575 /*
1585 * If there is nothing that can match our critiera just get out. 1576 * If there is nothing that can match our critiera just get out.
1586 * We will clear @retval to zero if we see any child that might later 1577 * We will clear ->notask_error to zero if we see any child that
1587 * match our criteria, even if we are not able to reap it yet. 1578 * might later match our criteria, even if we are not able to reap
1579 * it yet.
1588 */ 1580 */
1589 retval = -ECHILD; 1581 wo->notask_error = -ECHILD;
1590 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) 1582 if ((wo->wo_type < PIDTYPE_MAX) &&
1591 goto end; 1583 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1584 goto notask;
1592 1585
1593 current->state = TASK_INTERRUPTIBLE; 1586 set_current_state(TASK_INTERRUPTIBLE);
1594 read_lock(&tasklist_lock); 1587 read_lock(&tasklist_lock);
1595 tsk = current; 1588 tsk = current;
1596 do { 1589 do {
1597 int tsk_result = do_wait_thread(tsk, &retval, 1590 retval = do_wait_thread(wo, tsk);
1598 type, pid, options, 1591 if (retval)
1599 infop, stat_addr, ru); 1592 goto end;
1600 if (!tsk_result) 1593
1601 tsk_result = ptrace_do_wait(tsk, &retval, 1594 retval = ptrace_do_wait(wo, tsk);
1602 type, pid, options, 1595 if (retval)
1603 infop, stat_addr, ru);
1604 if (tsk_result) {
1605 /*
1606 * tasklist_lock is unlocked and we have a final result.
1607 */
1608 retval = tsk_result;
1609 goto end; 1596 goto end;
1610 }
1611 1597
1612 if (options & __WNOTHREAD) 1598 if (wo->wo_flags & __WNOTHREAD)
1613 break; 1599 break;
1614 tsk = next_thread(tsk); 1600 } while_each_thread(current, tsk);
1615 BUG_ON(tsk->signal != current->signal);
1616 } while (tsk != current);
1617 read_unlock(&tasklist_lock); 1601 read_unlock(&tasklist_lock);
1618 1602
1619 if (!retval && !(options & WNOHANG)) { 1603notask:
1604 retval = wo->notask_error;
1605 if (!retval && !(wo->wo_flags & WNOHANG)) {
1620 retval = -ERESTARTSYS; 1606 retval = -ERESTARTSYS;
1621 if (!signal_pending(current)) { 1607 if (!signal_pending(current)) {
1622 schedule(); 1608 schedule();
1623 goto repeat; 1609 goto repeat;
1624 } 1610 }
1625 } 1611 }
1626
1627end: 1612end:
1628 current->state = TASK_RUNNING; 1613 __set_current_state(TASK_RUNNING);
1629 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1614 remove_wait_queue(&current->signal->wait_chldexit,&wait);
1630 if (infop) { 1615 if (wo->wo_info) {
1616 struct siginfo __user *infop = wo->wo_info;
1617
1631 if (retval > 0) 1618 if (retval > 0)
1632 retval = 0; 1619 retval = 0;
1633 else { 1620 else {
@@ -1656,6 +1643,7 @@ end:
1656SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1643SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1657 infop, int, options, struct rusage __user *, ru) 1644 infop, int, options, struct rusage __user *, ru)
1658{ 1645{
1646 struct wait_opts wo;
1659 struct pid *pid = NULL; 1647 struct pid *pid = NULL;
1660 enum pid_type type; 1648 enum pid_type type;
1661 long ret; 1649 long ret;
@@ -1685,7 +1673,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1685 1673
1686 if (type < PIDTYPE_MAX) 1674 if (type < PIDTYPE_MAX)
1687 pid = find_get_pid(upid); 1675 pid = find_get_pid(upid);
1688 ret = do_wait(type, pid, options, infop, NULL, ru); 1676
1677 wo.wo_type = type;
1678 wo.wo_pid = pid;
1679 wo.wo_flags = options;
1680 wo.wo_info = infop;
1681 wo.wo_stat = NULL;
1682 wo.wo_rusage = ru;
1683 ret = do_wait(&wo);
1689 put_pid(pid); 1684 put_pid(pid);
1690 1685
1691 /* avoid REGPARM breakage on x86: */ 1686 /* avoid REGPARM breakage on x86: */
@@ -1696,6 +1691,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1696SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1691SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1697 int, options, struct rusage __user *, ru) 1692 int, options, struct rusage __user *, ru)
1698{ 1693{
1694 struct wait_opts wo;
1699 struct pid *pid = NULL; 1695 struct pid *pid = NULL;
1700 enum pid_type type; 1696 enum pid_type type;
1701 long ret; 1697 long ret;
@@ -1717,7 +1713,13 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1717 pid = find_get_pid(upid); 1713 pid = find_get_pid(upid);
1718 } 1714 }
1719 1715
1720 ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); 1716 wo.wo_type = type;
1717 wo.wo_pid = pid;
1718 wo.wo_flags = options | WEXITED;
1719 wo.wo_info = NULL;
1720 wo.wo_stat = stat_addr;
1721 wo.wo_rusage = ru;
1722 ret = do_wait(&wo);
1721 put_pid(pid); 1723 put_pid(pid);
1722 1724
1723 /* avoid REGPARM breakage on x86: */ 1725 /* avoid REGPARM breakage on x86: */
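
The wait_opts refactor above is internal bookkeeping: waitid(2) and wait4(2) keep their existing userspace contract, the kernel simply carries their arguments in one structure. For reference, a minimal caller exercising both entry points (sketch; error handling trimmed):

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/resource.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0)
		_exit(42);		/* child */
	if (pid < 0) {
		perror("fork");
		return 1;
	}

	siginfo_t info;
	struct rusage ru;
	int status;

	/* waitid(): results come back through siginfo (wo_info); WNOWAIT
	 * leaves the child reapable for the wait4() call below. */
	if (waitid(P_PID, pid, &info, WEXITED | WNOWAIT) == 0)
		printf("waitid: pid=%d status=%d\n", info.si_pid, info.si_status);

	/* wait4(): results come back through the status word and rusage
	 * (wo_stat / wo_rusage in the new structure). */
	if (wait4(pid, &status, 0, &ru) == pid)
		printf("wait4: exit=%d maxrss=%ld\n",
		       WEXITSTATUS(status), ru.ru_maxrss);
	return 0;
}
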
diff --git a/kernel/fork.c b/kernel/fork.c
index be022c200da6..bd2959228871 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/mnt_namespace.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/mempolicy.h> 21#include <linux/mempolicy.h>
23#include <linux/sem.h> 22#include <linux/sem.h>
@@ -1029,7 +1028,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1029 p->vfork_done = NULL; 1028 p->vfork_done = NULL;
1030 spin_lock_init(&p->alloc_lock); 1029 spin_lock_init(&p->alloc_lock);
1031 1030
1032 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1033 init_sigpending(&p->pending); 1031 init_sigpending(&p->pending);
1034 1032
1035 p->utime = cputime_zero; 1033 p->utime = cputime_zero;
diff --git a/kernel/futex.c b/kernel/futex.c
index 80b5ce716596..0672ff88f159 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -247,6 +247,7 @@ again:
247 if (err < 0) 247 if (err < 0)
248 return err; 248 return err;
249 249
250 page = compound_head(page);
250 lock_page(page); 251 lock_page(page);
251 if (!page->mapping) { 252 if (!page->mapping) {
252 unlock_page(page); 253 unlock_page(page);
@@ -284,6 +285,25 @@ void put_futex_key(int fshared, union futex_key *key)
284 drop_futex_key_refs(key); 285 drop_futex_key_refs(key);
285} 286}
286 287
288/*
289 * fault_in_user_writeable - fault in user address and verify RW access
290 * @uaddr: pointer to faulting user space address
291 *
292 * Slow path to fixup the fault we just took in the atomic write
293 * access to @uaddr.
294 *
295 * We have no generic implementation of a non-destructive write to the
296 * user address. We know that we faulted in the atomic pagefault
297 * disabled section so we can as well avoid the #PF overhead by
298 * calling get_user_pages() right away.
299 */
300static int fault_in_user_writeable(u32 __user *uaddr)
301{
302 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
303 1, 1, 0, NULL, NULL);
304 return ret < 0 ? ret : 0;
305}
306
287/** 307/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex 308 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in 309 * @hb: the hash bucket the futex_q's reside in
@@ -896,7 +916,6 @@ retry:
896retry_private: 916retry_private:
897 op_ret = futex_atomic_op_inuser(op, uaddr2); 917 op_ret = futex_atomic_op_inuser(op, uaddr2);
898 if (unlikely(op_ret < 0)) { 918 if (unlikely(op_ret < 0)) {
899 u32 dummy;
900 919
901 double_unlock_hb(hb1, hb2); 920 double_unlock_hb(hb1, hb2);
902 921
@@ -914,7 +933,7 @@ retry_private:
914 goto out_put_keys; 933 goto out_put_keys;
915 } 934 }
916 935
917 ret = get_user(dummy, uaddr2); 936 ret = fault_in_user_writeable(uaddr2);
918 if (ret) 937 if (ret)
919 goto out_put_keys; 938 goto out_put_keys;
920 939
@@ -1204,7 +1223,7 @@ retry_private:
1204 double_unlock_hb(hb1, hb2); 1223 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2); 1224 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1); 1225 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2); 1226 ret = fault_in_user_writeable(uaddr2);
1208 if (!ret) 1227 if (!ret)
1209 goto retry; 1228 goto retry;
1210 goto out; 1229 goto out;
@@ -1482,7 +1501,7 @@ retry:
1482handle_fault: 1501handle_fault:
1483 spin_unlock(q->lock_ptr); 1502 spin_unlock(q->lock_ptr);
1484 1503
1485 ret = get_user(uval, uaddr); 1504 ret = fault_in_user_writeable(uaddr);
1486 1505
1487 spin_lock(q->lock_ptr); 1506 spin_lock(q->lock_ptr);
1488 1507
@@ -1807,7 +1826,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1807{ 1826{
1808 struct hrtimer_sleeper timeout, *to = NULL; 1827 struct hrtimer_sleeper timeout, *to = NULL;
1809 struct futex_hash_bucket *hb; 1828 struct futex_hash_bucket *hb;
1810 u32 uval;
1811 struct futex_q q; 1829 struct futex_q q;
1812 int res, ret; 1830 int res, ret;
1813 1831
@@ -1909,16 +1927,9 @@ out:
1909 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1927 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1910 1928
1911uaddr_faulted: 1929uaddr_faulted:
1912 /*
1913 * We have to r/w *(int __user *)uaddr, and we have to modify it
1914 * atomically. Therefore, if we continue to fault after get_user()
1915 * below, we need to handle the fault ourselves, while still holding
1916 * the mmap_sem. This can occur if the uaddr is under contention as
1917 * we have to drop the mmap_sem in order to call get_user().
1918 */
1919 queue_unlock(&q, hb); 1930 queue_unlock(&q, hb);
1920 1931
1921 ret = get_user(uval, uaddr); 1932 ret = fault_in_user_writeable(uaddr);
1922 if (ret) 1933 if (ret)
1923 goto out_put_key; 1934 goto out_put_key;
1924 1935
@@ -2013,17 +2024,10 @@ out:
2013 return ret; 2024 return ret;
2014 2025
2015pi_faulted: 2026pi_faulted:
2016 /*
2017 * We have to r/w *(int __user *)uaddr, and we have to modify it
2018 * atomically. Therefore, if we continue to fault after get_user()
2019 * below, we need to handle the fault ourselves, while still holding
2020 * the mmap_sem. This can occur if the uaddr is under contention as
2021 * we have to drop the mmap_sem in order to call get_user().
2022 */
2023 spin_unlock(&hb->lock); 2027 spin_unlock(&hb->lock);
2024 put_futex_key(fshared, &key); 2028 put_futex_key(fshared, &key);
2025 2029
2026 ret = get_user(uval, uaddr); 2030 ret = fault_in_user_writeable(uaddr);
2027 if (!ret) 2031 if (!ret)
2028 goto retry; 2032 goto retry;
2029 2033
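
fault_in_user_writeable() exists because the futex word is ordinary user memory that the kernel must be able to write; faulting it in writable with get_user_pages() replaces the old read-only get_user() fixup. From userspace that word is just an aligned int handed to the raw futex syscall, as in this sketch (glibc provides no wrapper, so syscall(2) is used directly; build with -pthread):

#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static int futex_word;	/* this is the "u32 __user *uaddr" the kernel sees */

static void *waker(void *unused)
{
	sleep(1);
	__atomic_store_n(&futex_word, 1, __ATOMIC_SEQ_CST);
	syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);
	/* FUTEX_WAIT sleeps only while the word still equals 0. */
	while (__atomic_load_n(&futex_word, __ATOMIC_SEQ_CST) == 0)
		syscall(SYS_futex, &futex_word, FUTEX_WAIT, 0, NULL, NULL, 0);
	printf("woken, futex_word=%d\n", futex_word);
	pthread_join(t, NULL);
	return 0;
}
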
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
new file mode 100644
index 000000000000..22e9dcfaa3d3
--- /dev/null
+++ b/kernel/gcov/Kconfig
@@ -0,0 +1,48 @@
1menu "GCOV-based kernel profiling"
2
3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS && CONSTRUCTORS
6 default n
7 ---help---
8 This option enables gcov-based code profiling (e.g. for code coverage
9 measurements).
10
11 If unsure, say N.
12
13 Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
14 for the entire kernel. To enable profiling for specific files or
15 directories, add a line similar to the following to the respective
16 Makefile:
17
18 For a single file (e.g. main.o):
19 GCOV_PROFILE_main.o := y
20
21 For all files in one directory:
22 GCOV_PROFILE := y
23
24 To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
25 is specified, use:
26
27 GCOV_PROFILE_main.o := n
28 and:
29 GCOV_PROFILE := n
30
31 Note that the debugfs filesystem has to be mounted to access
32 profiling data.
33
34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL
37 depends on S390 || X86
38 default n
39 ---help---
40	  This option activates profiling for the entire kernel.
41
42 If unsure, say N.
43
44 Note that a kernel compiled with profiling flags will be significantly
45 larger and run slower. Also be sure to exclude files from profiling
46 which are not linked to the kernel image to prevent linker errors.
47
48endmenu
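
The help text above wires gcc's coverage instrumentation into the kernel build; the mechanism is the same one used for ordinary userspace coverage. A purely illustrative userspace counterpart, with the usual gcc/gcov commands noted in the comments (file names are arbitrary):

/* cover.c -- an ordinary userspace coverage run:
 *   gcc -fprofile-arcs -ftest-coverage cover.c -o cover   (emits cover.gcno)
 *   ./cover 7                                             (writes cover.gcda)
 *   gcov cover.c                                          (annotated cover.c.gcov)
 */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	int n = argc > 1 ? atoi(argv[1]) : 0;

	if (n % 2)			/* gcov reports how often each arm ran */
		printf("%d is odd\n", n);
	else
		printf("%d is even\n", n);
	return 0;
}
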
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
new file mode 100644
index 000000000000..3f761001d517
--- /dev/null
+++ b/kernel/gcov/Makefile
@@ -0,0 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
new file mode 100644
index 000000000000..9b22d03cc581
--- /dev/null
+++ b/kernel/gcov/base.c
@@ -0,0 +1,148 @@
1/*
2 * This code maintains a list of active profiling data structures.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 */
15
16#define pr_fmt(fmt) "gcov: " fmt
17
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/mutex.h>
21#include "gcov.h"
22
23static struct gcov_info *gcov_info_head;
24static int gcov_events_enabled;
25static DEFINE_MUTEX(gcov_lock);
26
27/*
28 * __gcov_init is called by gcc-generated constructor code for each object
29 * file compiled with -fprofile-arcs.
30 */
31void __gcov_init(struct gcov_info *info)
32{
33 static unsigned int gcov_version;
34
35 mutex_lock(&gcov_lock);
36 if (gcov_version == 0) {
37 gcov_version = info->version;
38 /*
39 * Printing gcc's version magic may prove useful for debugging
40 * incompatibility reports.
41 */
42 pr_info("version magic: 0x%x\n", gcov_version);
43 }
44 /*
45 * Add new profiling data structure to list and inform event
46 * listener.
47 */
48 info->next = gcov_info_head;
49 gcov_info_head = info;
50 if (gcov_events_enabled)
51 gcov_event(GCOV_ADD, info);
52 mutex_unlock(&gcov_lock);
53}
54EXPORT_SYMBOL(__gcov_init);
55
56/*
57 * These functions may be referenced by gcc-generated profiling code but serve
58 * no function for kernel profiling.
59 */
60void __gcov_flush(void)
61{
62 /* Unused. */
63}
64EXPORT_SYMBOL(__gcov_flush);
65
66void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
67{
68 /* Unused. */
69}
70EXPORT_SYMBOL(__gcov_merge_add);
71
72void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
73{
74 /* Unused. */
75}
76EXPORT_SYMBOL(__gcov_merge_single);
77
78void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
79{
80 /* Unused. */
81}
82EXPORT_SYMBOL(__gcov_merge_delta);
83
84/**
85 * gcov_enable_events - enable event reporting through gcov_event()
86 *
87 * Turn on reporting of profiling data load/unload-events through the
88 * gcov_event() callback. Also replay all previous events once. This function
89 * is needed because some events are potentially generated too early for the
90 * callback implementation to handle them initially.
91 */
92void gcov_enable_events(void)
93{
94 struct gcov_info *info;
95
96 mutex_lock(&gcov_lock);
97 gcov_events_enabled = 1;
98 /* Perform event callback for previously registered entries. */
99 for (info = gcov_info_head; info; info = info->next)
100 gcov_event(GCOV_ADD, info);
101 mutex_unlock(&gcov_lock);
102}
103
104#ifdef CONFIG_MODULES
105static inline int within(void *addr, void *start, unsigned long size)
106{
107 return ((addr >= start) && (addr < start + size));
108}
109
110/* Update list and generate events when modules are unloaded. */
111static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
112 void *data)
113{
114 struct module *mod = data;
115 struct gcov_info *info;
116 struct gcov_info *prev;
117
118 if (event != MODULE_STATE_GOING)
119 return NOTIFY_OK;
120 mutex_lock(&gcov_lock);
121 prev = NULL;
122 /* Remove entries located in module from linked list. */
123 for (info = gcov_info_head; info; info = info->next) {
124 if (within(info, mod->module_core, mod->core_size)) {
125 if (prev)
126 prev->next = info->next;
127 else
128 gcov_info_head = info->next;
129 if (gcov_events_enabled)
130 gcov_event(GCOV_REMOVE, info);
131 } else
132 prev = info;
133 }
134 mutex_unlock(&gcov_lock);
135
136 return NOTIFY_OK;
137}
138
139static struct notifier_block gcov_nb = {
140 .notifier_call = gcov_module_notifier,
141};
142
143static int __init gcov_init(void)
144{
145 return register_module_notifier(&gcov_nb);
146}
147device_initcall(gcov_init);
148#endif /* CONFIG_MODULES */
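
__gcov_init() above is invoked from constructors that gcc emits for every object compiled with -fprofile-arcs, which is why GCOV_KERNEL depends on CONSTRUCTORS. A userspace sketch of that registration pattern, with hand-written constructor functions standing in for the compiler-generated ones (all names here are invented):

#include <stdio.h>

struct fake_info {
	const char *filename;
	struct fake_info *next;
};

static struct fake_info *info_head;		/* analog of gcov_info_head */

static void register_info(struct fake_info *info)	/* analog of __gcov_init() */
{
	info->next = info_head;
	info_head = info;
}

static struct fake_info a = { "a.c", NULL };
static struct fake_info b = { "b.c", NULL };

/* gcc runs these before main(), much as a CONFIG_CONSTRUCTORS kernel invokes
 * the compiler-generated constructors that call __gcov_init() per object. */
__attribute__((constructor)) static void ctor_a(void) { register_info(&a); }
__attribute__((constructor)) static void ctor_b(void) { register_info(&b); }

int main(void)
{
	struct fake_info *i;

	for (i = info_head; i; i = i->next)
		printf("registered: %s\n", i->filename);
	return 0;
}
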
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
new file mode 100644
index 000000000000..ef3c3f88a7a3
--- /dev/null
+++ b/kernel/gcov/fs.c
@@ -0,0 +1,673 @@
1/*
2 * This code exports profiling data as debugfs files to userspace.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 * Yi CDL Yang
15 */
16
17#define pr_fmt(fmt) "gcov: " fmt
18
19#include <linux/init.h>
20#include <linux/module.h>
21#include <linux/debugfs.h>
22#include <linux/fs.h>
23#include <linux/list.h>
24#include <linux/string.h>
25#include <linux/slab.h>
26#include <linux/mutex.h>
27#include <linux/seq_file.h>
28#include "gcov.h"
29
30/**
31 * struct gcov_node - represents a debugfs entry
32 * @list: list head for child node list
33 * @children: child nodes
34 * @all: list head for list of all nodes
35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory
37 * @ghost: when an object file containing profiling data is unloaded we keep a
38 * copy of the profiling data here to allow collecting coverage data
39 * for cleanup code. Such a node is called a "ghost".
40 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links
42 * @name: data file basename
43 *
44 * struct gcov_node represents an entity within the gcov/ subdirectory
45 * of debugfs. There are directory and data file nodes. The latter represent
46 * the actual synthesized data file plus any associated symbolic links which
47 * are needed by the gcov tool to work correctly.
48 */
49struct gcov_node {
50 struct list_head list;
51 struct list_head children;
52 struct list_head all;
53 struct gcov_node *parent;
54 struct gcov_info *info;
55 struct gcov_info *ghost;
56 struct dentry *dentry;
57 struct dentry **links;
58 char name[0];
59};
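For illustration only (not part of the patch): assuming an object tree of /tmp/linux-out and debugfs mounted at /sys/kernel/debug, the hierarchy built from these nodes looks roughly like this, with one data file node plus its helper links per instrumented object file:

    /sys/kernel/debug/gcov/reset                                  global reset file
    /sys/kernel/debug/gcov/tmp/linux-out/kernel/spinlock.gcda     data file node
    /sys/kernel/debug/gcov/tmp/linux-out/kernel/spinlock.gcno     symbolic link into the build tree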
60
61static const char objtree[] = OBJTREE;
62static const char srctree[] = SRCTREE;
63static struct gcov_node root_node;
64static struct dentry *reset_dentry;
65static LIST_HEAD(all_head);
66static DEFINE_MUTEX(node_lock);
67
68/* If non-zero, keep copies of profiling data for unloaded modules. */
69static int gcov_persist = 1;
70
71static int __init gcov_persist_setup(char *str)
72{
73 unsigned long val;
74
75 if (strict_strtoul(str, 0, &val)) {
76 pr_warning("invalid gcov_persist parameter '%s'\n", str);
77 return 0;
78 }
79 gcov_persist = val;
80 pr_info("setting gcov_persist to %d\n", gcov_persist);
81
82 return 1;
83}
84__setup("gcov_persist=", gcov_persist_setup);
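For example, booting with gcov_persist=0 on the kernel command line turns this off, so the coverage data of a module is dropped as soon as the module is unloaded instead of being kept as a "ghost" copy.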
85
86/*
87 * seq_file.start() implementation for gcov data files. Note that the
88 * gcov_iterator interface is designed to be more restrictive than seq_file
89 * (no start from arbitrary position, etc.), to simplify the iterator
90 * implementation.
91 */
92static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
93{
94 loff_t i;
95
96 gcov_iter_start(seq->private);
97 for (i = 0; i < *pos; i++) {
98 if (gcov_iter_next(seq->private))
99 return NULL;
100 }
101 return seq->private;
102}
103
104/* seq_file.next() implementation for gcov data files. */
105static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
106{
107 struct gcov_iterator *iter = data;
108
109 if (gcov_iter_next(iter))
110 return NULL;
111 (*pos)++;
112
113 return iter;
114}
115
116/* seq_file.show() implementation for gcov data files. */
117static int gcov_seq_show(struct seq_file *seq, void *data)
118{
119 struct gcov_iterator *iter = data;
120
121 if (gcov_iter_write(iter, seq))
122 return -EINVAL;
123 return 0;
124}
125
126static void gcov_seq_stop(struct seq_file *seq, void *data)
127{
128 /* Unused. */
129}
130
131static const struct seq_operations gcov_seq_ops = {
132 .start = gcov_seq_start,
133 .next = gcov_seq_next,
134 .show = gcov_seq_show,
135 .stop = gcov_seq_stop,
136};
137
138/*
139 * Return the profiling data set for a given node. This can either be the
140 * original profiling data structure or a duplicate (also called "ghost")
141 * in case the associated object file has been unloaded.
142 */
143static struct gcov_info *get_node_info(struct gcov_node *node)
144{
145 if (node->info)
146 return node->info;
147
148 return node->ghost;
149}
150
151/*
152 * open() implementation for gcov data files. Create a copy of the profiling
153 * data set and initialize the iterator and seq_file interface.
154 */
155static int gcov_seq_open(struct inode *inode, struct file *file)
156{
157 struct gcov_node *node = inode->i_private;
158 struct gcov_iterator *iter;
159 struct seq_file *seq;
160 struct gcov_info *info;
161 int rc = -ENOMEM;
162
163 mutex_lock(&node_lock);
164 /*
165 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access.
167 */
168 info = gcov_info_dup(get_node_info(node));
169 if (!info)
170 goto out_unlock;
171 iter = gcov_iter_new(info);
172 if (!iter)
173 goto err_free_info;
174 rc = seq_open(file, &gcov_seq_ops);
175 if (rc)
176 goto err_free_iter_info;
177 seq = file->private_data;
178 seq->private = iter;
179out_unlock:
180 mutex_unlock(&node_lock);
181 return rc;
182
183err_free_iter_info:
184 gcov_iter_free(iter);
185err_free_info:
186 gcov_info_free(info);
187 goto out_unlock;
188}
189
190/*
191 * release() implementation for gcov data files. Release resources allocated
192 * by open().
193 */
194static int gcov_seq_release(struct inode *inode, struct file *file)
195{
196 struct gcov_iterator *iter;
197 struct gcov_info *info;
198 struct seq_file *seq;
199
200 seq = file->private_data;
201 iter = seq->private;
202 info = gcov_iter_get_info(iter);
203 gcov_iter_free(iter);
204 gcov_info_free(info);
205 seq_release(inode, file);
206
207 return 0;
208}
209
210/*
211 * Find a node by the associated data file name. Needs to be called with
212 * node_lock held.
213 */
214static struct gcov_node *get_node_by_name(const char *name)
215{
216 struct gcov_node *node;
217 struct gcov_info *info;
218
219 list_for_each_entry(node, &all_head, all) {
220 info = get_node_info(node);
221 if (info && (strcmp(info->filename, name) == 0))
222 return node;
223 }
224
225 return NULL;
226}
227
228static void remove_node(struct gcov_node *node);
229
230/*
231 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is
233 * a "ghost" node), remove the debug fs node as well.
234 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos)
237{
238 struct seq_file *seq;
239 struct gcov_info *info;
240 struct gcov_node *node;
241
242 seq = file->private_data;
243 info = gcov_iter_get_info(seq->private);
244 mutex_lock(&node_lock);
245 node = get_node_by_name(info->filename);
246 if (node) {
247 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost)
249 remove_node(node);
250 else
251 gcov_info_reset(node->info);
252 }
253 /* Reset counts for open file. */
254 gcov_info_reset(info);
255 mutex_unlock(&node_lock);
256
257 return len;
258}
259
260/*
261 * Given a string <path> representing a file path of format:
262 * path/to/file.gcda
263 * construct and return a new string:
264 * <dir/>path/to/file.<ext>
265 */
266static char *link_target(const char *dir, const char *path, const char *ext)
267{
268 char *target;
269 char *old_ext;
270 char *copy;
271
272 copy = kstrdup(path, GFP_KERNEL);
273 if (!copy)
274 return NULL;
275 old_ext = strrchr(copy, '.');
276 if (old_ext)
277 *old_ext = '\0';
278 if (dir)
279 target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
280 else
281 target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
282 kfree(copy);
283
284 return target;
285}
286
287/*
288 * Construct a string representing the symbolic link target for the given
289 * gcov data file name and link type. Depending on the link type and the
290 * location of the data file, the link target points either into a
291 * subdirectory of srctree or objtree, or to an external location.
292 */
293static char *get_link_target(const char *filename, const struct gcov_link *ext)
294{
295 const char *rel;
296 char *result;
297
298 if (strncmp(filename, objtree, strlen(objtree)) == 0) {
299 rel = filename + strlen(objtree) + 1;
300 if (ext->dir == SRC_TREE)
301 result = link_target(srctree, rel, ext->ext);
302 else
303 result = link_target(objtree, rel, ext->ext);
304 } else {
305 /* External compilation. */
306 result = link_target(NULL, filename, ext->ext);
307 }
308
309 return result;
310}
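Worked example (paths are hypothetical): with objtree "/tmp/linux-out" and srctree "/usr/src/linux", the data file "/tmp/linux-out/kernel/gcov/fs.gcda" yields "/usr/src/linux/kernel/gcov/fs.c" for a {SRC_TREE, "c"} link entry and "/tmp/linux-out/kernel/gcov/fs.gcno" for an {OBJ_TREE, "gcno"} entry; a data file compiled outside the tree simply gets its extension swapped in place.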
311
312#define SKEW_PREFIX ".tmp_"
313
314/*
315 * For a filename .tmp_filename.ext return filename.ext. Needed to compensate
316 * for filename skewing caused by the mod-versioning mechanism.
317 */
318static const char *deskew(const char *basename)
319{
320 if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
321 return basename + sizeof(SKEW_PREFIX) - 1;
322 return basename;
323}
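For instance, when mod-versioning is enabled an object may first be built as .tmp_usbcore.o (name hypothetical), so its profiling data is registered as .tmp_usbcore.gcda; deskew() strips the prefix so the debugfs entry and its links appear as usbcore.gcda, the name the gcov tool actually looks for.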
324
325/*
326 * Create links to additional files (usually .c and .gcno files) which the
327 * gcov tool expects to find in the same directory as the gcov data file.
328 */
329static void add_links(struct gcov_node *node, struct dentry *parent)
330{
331 char *basename;
332 char *target;
333 int num;
334 int i;
335
336 for (num = 0; gcov_link[num].ext; num++)
337 /* Nothing. */;
338 node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
339 if (!node->links)
340 return;
341 for (i = 0; i < num; i++) {
342 target = get_link_target(get_node_info(node)->filename,
343 &gcov_link[i]);
344 if (!target)
345 goto out_err;
346 basename = strrchr(target, '/');
347 if (!basename)
348 goto out_err;
349 basename++;
350 node->links[i] = debugfs_create_symlink(deskew(basename),
351 parent, target);
352 if (!node->links[i])
353 goto out_err;
354 kfree(target);
355 }
356
357 return;
358out_err:
359 kfree(target);
360 while (i-- > 0)
361 debugfs_remove(node->links[i]);
362 kfree(node->links);
363 node->links = NULL;
364}
365
366static const struct file_operations gcov_data_fops = {
367 .open = gcov_seq_open,
368 .release = gcov_seq_release,
369 .read = seq_read,
370 .llseek = seq_lseek,
371 .write = gcov_seq_write,
372};
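A minimal userspace sketch (illustrative only, not part of the patch) of how such a data file is consumed; the debugfs path and file names are assumptions, and debugfs is assumed to be mounted at /sys/kernel/debug:

    #include <stdio.h>

    int main(void)
    {
            /* Copy one synthesized .gcda file out of debugfs. */
            const char *src =
                    "/sys/kernel/debug/gcov/tmp/linux-out/kernel/gcov/fs.gcda";
            FILE *in = fopen(src, "rb");
            FILE *out = fopen("fs.gcda", "wb");
            char buf[4096];
            size_t n;

            if (!in || !out) {
                    perror("fopen");
                    return 1;
            }
            while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
                    fwrite(buf, 1, n, out);
            fclose(in);
            fclose(out);
            return 0;
    }

Because gcov_seq_open() duplicates the counters at open time, the copy read this way is a consistent snapshot even while the kernel keeps updating the live data.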
373
374/* Basic initialization of a new node. */
375static void init_node(struct gcov_node *node, struct gcov_info *info,
376 const char *name, struct gcov_node *parent)
377{
378 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all);
381 node->info = info;
382 node->parent = parent;
383 if (name)
384 strcpy(node->name, name);
385}
386
387/*
388 * Create a new node and associated debugfs entry. Needs to be called with
389 * node_lock held.
390 */
391static struct gcov_node *new_node(struct gcov_node *parent,
392 struct gcov_info *info, const char *name)
393{
394 struct gcov_node *node;
395
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) {
398 pr_warning("out of memory\n");
399 return NULL;
400 }
401 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */
403 if (info) {
404 node->dentry = debugfs_create_file(deskew(node->name), 0600,
405 parent->dentry, node, &gcov_data_fops);
406 } else
407 node->dentry = debugfs_create_dir(node->name, parent->dentry);
408 if (!node->dentry) {
409 pr_warning("could not create file\n");
410 kfree(node);
411 return NULL;
412 }
413 if (info)
414 add_links(node, parent->dentry);
415 list_add(&node->list, &parent->children);
416 list_add(&node->all, &all_head);
417
418 return node;
419}
420
421/* Remove symbolic links associated with node. */
422static void remove_links(struct gcov_node *node)
423{
424 int i;
425
426 if (!node->links)
427 return;
428 for (i = 0; gcov_link[i].ext; i++)
429 debugfs_remove(node->links[i]);
430 kfree(node->links);
431 node->links = NULL;
432}
433
434/*
435 * Remove node from all lists and debugfs and release associated resources.
436 * Needs to be called with node_lock held.
437 */
438static void release_node(struct gcov_node *node)
439{
440 list_del(&node->list);
441 list_del(&node->all);
442 debugfs_remove(node->dentry);
443 remove_links(node);
444 if (node->ghost)
445 gcov_info_free(node->ghost);
446 kfree(node);
447}
448
449/* Release node and empty parents. Needs to be called with node_lock held. */
450static void remove_node(struct gcov_node *node)
451{
452 struct gcov_node *parent;
453
454 while ((node != &root_node) && list_empty(&node->children)) {
455 parent = node->parent;
456 release_node(node);
457 node = parent;
458 }
459}
460
461/*
462 * Find child node with given basename. Needs to be called with node_lock
463 * held.
464 */
465static struct gcov_node *get_child_by_name(struct gcov_node *parent,
466 const char *name)
467{
468 struct gcov_node *node;
469
470 list_for_each_entry(node, &parent->children, list) {
471 if (strcmp(node->name, name) == 0)
472 return node;
473 }
474
475 return NULL;
476}
477
478/*
479 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes.
481 */
482static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos)
484{
485 struct gcov_node *node;
486
487 mutex_lock(&node_lock);
488restart:
489 list_for_each_entry(node, &all_head, all) {
490 if (node->info)
491 gcov_info_reset(node->info);
492 else if (list_empty(&node->children)) {
493 remove_node(node);
494 /* Several nodes may have gone - restart loop. */
495 goto restart;
496 }
497 }
498 mutex_unlock(&node_lock);
499
500 return len;
501}
502
503/* read() implementation for reset file. Unused. */
504static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
505 loff_t *pos)
506{
507 /* Allow read operation so that a recursive copy won't fail. */
508 return 0;
509}
510
511static const struct file_operations gcov_reset_fops = {
512 .write = reset_write,
513 .read = reset_read,
514};
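For completeness, a small sketch (again illustrative, with an assumed debugfs mount point) that triggers this reset from userspace; the written bytes are ignored, any write clears all counters and drops ghost nodes:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/kernel/debug/gcov/reset", O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, "1", 1) != 1)   /* content is irrelevant, the write itself resets */
                    perror("write");
            close(fd);
            return 0;
    }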
515
516/*
517 * Create a node for a given profiling data set and add it to all lists and
518 * debugfs. Needs to be called with node_lock held.
519 */
520static void add_node(struct gcov_info *info)
521{
522 char *filename;
523 char *curr;
524 char *next;
525 struct gcov_node *parent;
526 struct gcov_node *node;
527
528 filename = kstrdup(info->filename, GFP_KERNEL);
529 if (!filename)
530 return;
531 parent = &root_node;
532 /* Create directory nodes along the path. */
533 for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
534 if (curr == next)
535 continue;
536 *next = 0;
537 if (strcmp(curr, ".") == 0)
538 continue;
539 if (strcmp(curr, "..") == 0) {
540 if (!parent->parent)
541 goto err_remove;
542 parent = parent->parent;
543 continue;
544 }
545 node = get_child_by_name(parent, curr);
546 if (!node) {
547 node = new_node(parent, NULL, curr);
548 if (!node)
549 goto err_remove;
550 }
551 parent = node;
552 }
553 /* Create file node. */
554 node = new_node(parent, info, curr);
555 if (!node)
556 goto err_remove;
557out:
558 kfree(filename);
559 return;
560
561err_remove:
562 remove_node(parent);
563 goto out;
564}
565
566/*
567 * The profiling data set associated with this node is being unloaded. Store a
568 * copy of the profiling data and turn this node into a "ghost".
569 */
570static int ghost_node(struct gcov_node *node)
571{
572 node->ghost = gcov_info_dup(node->info);
573 if (!node->ghost) {
574 pr_warning("could not save data for '%s' (out of memory)\n",
575 node->info->filename);
576 return -ENOMEM;
577 }
578 node->info = NULL;
579
580 return 0;
581}
582
583/*
584 * Profiling data for this node has been loaded again. Add profiling data
585 * from previous instantiation and turn this node into a regular node.
586 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info)
588{
589 if (gcov_info_is_compatible(node->ghost, info))
590 gcov_info_add(info, node->ghost);
591 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n",
593 info->filename);
594 }
595 gcov_info_free(node->ghost);
596 node->ghost = NULL;
597 node->info = info;
598}
599
600/*
601 * Callback to create/remove profiling files when code compiled with
602 * -fprofile-arcs is loaded/unloaded.
603 */
604void gcov_event(enum gcov_action action, struct gcov_info *info)
605{
606 struct gcov_node *node;
607
608 mutex_lock(&node_lock);
609 node = get_node_by_name(info->filename);
610 switch (action) {
611 case GCOV_ADD:
612 /* Add new node or revive ghost. */
613 if (!node) {
614 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break;
624 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */
626 if (!node) {
627 pr_warning("could not remove '%s' (not found)\n",
628 info->filename);
629 break;
630 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break;
637 }
638 mutex_unlock(&node_lock);
639}
640
641/* Create debugfs entries. */
642static __init int gcov_fs_init(void)
643{
644 int rc = -EIO;
645
646 init_node(&root_node, NULL, NULL, NULL);
647 /*
648 * /sys/kernel/debug/gcov will be parent for the reset control file
649 * and all profiling files.
650 */
651 root_node.dentry = debugfs_create_dir("gcov", NULL);
652 if (!root_node.dentry)
653 goto err_remove;
654 /*
655 * Create reset file which resets all profiling counts when written
656 * to.
657 */
658 reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
659 NULL, &gcov_reset_fops);
660 if (!reset_dentry)
661 goto err_remove;
662 /* Replay previous events to get our fs hierarchy up-to-date. */
663 gcov_enable_events();
664 return 0;
665
666err_remove:
667 pr_err("init failed\n");
668 if (root_node.dentry)
669 debugfs_remove(root_node.dentry);
670
671 return rc;
672}
673device_initcall(gcov_fs_init);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
new file mode 100644
index 000000000000..ae5bb4260033
--- /dev/null
+++ b/kernel/gcov/gcc_3_4.c
@@ -0,0 +1,447 @@
1/*
2 * This code provides functions to handle gcc's profiling data format
3 * introduced with gcc 3.4. Future versions of gcc may change the gcov
4 * format (as happened before), so all format-specific information needs
5 * to be kept modular and easily exchangeable.
6 *
7 * This file is based on gcc-internal definitions. Functions and data
8 * structures are defined to be compatible with gcc counterparts.
9 * For a better understanding, refer to gcc source: gcc/gcov-io.h.
10 *
11 * Copyright IBM Corp. 2009
12 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 *
14 * Uses gcc-internal data definitions.
15 */
16
17#include <linux/errno.h>
18#include <linux/slab.h>
19#include <linux/string.h>
20#include <linux/seq_file.h>
21#include <linux/vmalloc.h>
22#include "gcov.h"
23
24/* Symbolic links to be created for each profiling data file. */
25const struct gcov_link gcov_link[] = {
26 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
27 { 0, NULL},
28};
29
30/*
31 * Determine whether a counter is active. Based on gcc magic. Doesn't change
32 * at run-time.
33 */
34static int counter_active(struct gcov_info *info, unsigned int type)
35{
36 return (1 << type) & info->ctr_mask;
37}
38
39/* Determine number of active counters. Based on gcc magic. */
40static unsigned int num_counter_active(struct gcov_info *info)
41{
42 unsigned int i;
43 unsigned int result = 0;
44
45 for (i = 0; i < GCOV_COUNTERS; i++) {
46 if (counter_active(info, i))
47 result++;
48 }
49 return result;
50}
51
52/**
53 * gcov_info_reset - reset profiling data to zero
54 * @info: profiling data set
55 */
56void gcov_info_reset(struct gcov_info *info)
57{
58 unsigned int active = num_counter_active(info);
59 unsigned int i;
60
61 for (i = 0; i < active; i++) {
62 memset(info->counts[i].values, 0,
63 info->counts[i].num * sizeof(gcov_type));
64 }
65}
66
67/**
68 * gcov_info_is_compatible - check if profiling data can be added
69 * @info1: first profiling data set
70 * @info2: second profiling data set
71 *
72 * Returns non-zero if profiling data can be added, zero otherwise.
73 */
74int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
75{
76 return (info1->stamp == info2->stamp);
77}
78
79/**
80 * gcov_info_add - add up profiling data
81 * @dest: profiling data set to which data is added
82 * @source: profiling data set which is added
83 *
84 * Adds profiling counts of @source to @dest.
85 */
86void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
87{
88 unsigned int i;
89 unsigned int j;
90
91 for (i = 0; i < num_counter_active(dest); i++) {
92 for (j = 0; j < dest->counts[i].num; j++) {
93 dest->counts[i].values[j] +=
94 source->counts[i].values[j];
95 }
96 }
97}
98
99/* Get size of function info entry. Based on gcc magic. */
100static size_t get_fn_size(struct gcov_info *info)
101{
102 size_t size;
103
104 size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
105 sizeof(unsigned int);
106 if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
107 size = ALIGN(size, __alignof__(struct gcov_fn_info));
108 return size;
109}
110
111/* Get address of function info entry. Based on gcc magic. */
112static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
113{
114 return (struct gcov_fn_info *)
115 ((char *) info->functions + fn * get_fn_size(info));
116}
117
118/**
119 * gcov_info_dup - duplicate profiling data set
120 * @info: profiling data set to duplicate
121 *
122 * Return newly allocated duplicate on success, %NULL on error.
123 */
124struct gcov_info *gcov_info_dup(struct gcov_info *info)
125{
126 struct gcov_info *dup;
127 unsigned int i;
128 unsigned int active;
129
130 /* Duplicate gcov_info. */
131 active = num_counter_active(info);
132 dup = kzalloc(sizeof(struct gcov_info) +
133 sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
134 if (!dup)
135 return NULL;
136 dup->version = info->version;
137 dup->stamp = info->stamp;
138 dup->n_functions = info->n_functions;
139 dup->ctr_mask = info->ctr_mask;
140 /* Duplicate filename. */
141 dup->filename = kstrdup(info->filename, GFP_KERNEL);
142 if (!dup->filename)
143 goto err_free;
144 /* Duplicate table of functions. */
145 dup->functions = kmemdup(info->functions, info->n_functions *
146 get_fn_size(info), GFP_KERNEL);
147 if (!dup->functions)
148 goto err_free;
149 /* Duplicate counter arrays. */
150 for (i = 0; i < active ; i++) {
151 struct gcov_ctr_info *ctr = &info->counts[i];
152 size_t size = ctr->num * sizeof(gcov_type);
153
154 dup->counts[i].num = ctr->num;
155 dup->counts[i].merge = ctr->merge;
156 dup->counts[i].values = vmalloc(size);
157 if (!dup->counts[i].values)
158 goto err_free;
159 memcpy(dup->counts[i].values, ctr->values, size);
160 }
161 return dup;
162
163err_free:
164 gcov_info_free(dup);
165 return NULL;
166}
167
168/**
169 * gcov_info_free - release memory for profiling data set duplicate
170 * @info: profiling data set duplicate to free
171 */
172void gcov_info_free(struct gcov_info *info)
173{
174 unsigned int active = num_counter_active(info);
175 unsigned int i;
176
177 for (i = 0; i < active ; i++)
178 vfree(info->counts[i].values);
179 kfree(info->functions);
180 kfree(info->filename);
181 kfree(info);
182}
183
184/**
185 * struct type_info - iterator helper array
186 * @ctr_type: counter type
187 * @offset: index of the first value of the current function for this type
188 *
189 * This array is needed to convert the in-memory data format into the in-file
190 * data format:
191 *
192 * In-memory:
193 * for each counter type
194 * for each function
195 * values
196 *
197 * In-file:
198 * for each function
199 * for each counter type
200 * values
201 *
202 * See gcc source gcc/gcov-io.h for more information on data organization.
203 */
204struct type_info {
205 int ctr_type;
206 unsigned int offset;
207};
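To make the reordering concrete, here is a small standalone C sketch (illustrative only, with made-up numbers) that prints the in-file order for two functions and two active counter types, using the same per-type offset bookkeeping that struct type_info provides:

    #include <stdio.h>

    #define N_FUNCS 2
    #define N_TYPES 2
    #define N_VALS  2       /* counter values per function and type, for simplicity */

    int main(void)
    {
            /* In-memory layout: one flat array per counter type, functions back to back. */
            long counts[N_TYPES][N_FUNCS * N_VALS] = {
                    { 10, 11, 12, 13 },     /* type 0: fn0 values, then fn1 values */
                    { 20, 21, 22, 23 },     /* type 1: fn0 values, then fn1 values */
            };
            unsigned int offset[N_TYPES] = { 0, 0 };
            unsigned int fn, type, i;

            /* In-file layout: per function, then per counter type. */
            for (fn = 0; fn < N_FUNCS; fn++) {
                    for (type = 0; type < N_TYPES; type++) {
                            for (i = 0; i < N_VALS; i++)
                                    printf("fn%u type%u value %ld\n", fn, type,
                                           counts[type][offset[type] + i]);
                            offset[type] += N_VALS;
                    }
            }
            return 0;
    }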
208
209/**
210 * struct gcov_iterator - specifies current file position in logical records
211 * @info: associated profiling data
212 * @record: record type
213 * @function: function number
214 * @type: counter type
215 * @count: index into values array
216 * @num_types: number of counter types
217 * @type_info: helper array to get values-array offset for current function
218 */
219struct gcov_iterator {
220 struct gcov_info *info;
221
222 int record;
223 unsigned int function;
224 unsigned int type;
225 unsigned int count;
226
227 int num_types;
228 struct type_info type_info[0];
229};
230
231static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
232{
233 return get_fn_info(iter->info, iter->function);
234}
235
236static struct type_info *get_type(struct gcov_iterator *iter)
237{
238 return &iter->type_info[iter->type];
239}
240
241/**
242 * gcov_iter_new - allocate and initialize profiling data iterator
243 * @info: profiling data set to be iterated
244 *
245 * Return file iterator on success, %NULL otherwise.
246 */
247struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
248{
249 struct gcov_iterator *iter;
250
251 iter = kzalloc(sizeof(struct gcov_iterator) +
252 num_counter_active(info) * sizeof(struct type_info),
253 GFP_KERNEL);
254 if (iter)
255 iter->info = info;
256
257 return iter;
258}
259
260/**
261 * gcov_iter_free - release memory for iterator
262 * @iter: file iterator to free
263 */
264void gcov_iter_free(struct gcov_iterator *iter)
265{
266 kfree(iter);
267}
268
269/**
270 * gcov_iter_get_info - return profiling data set for given file iterator
271 * @iter: file iterator
272 */
273struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
274{
275 return iter->info;
276}
277
278/**
279 * gcov_iter_start - reset file iterator to starting position
280 * @iter: file iterator
281 */
282void gcov_iter_start(struct gcov_iterator *iter)
283{
284 int i;
285
286 iter->record = 0;
287 iter->function = 0;
288 iter->type = 0;
289 iter->count = 0;
290 iter->num_types = 0;
291 for (i = 0; i < GCOV_COUNTERS; i++) {
292 if (counter_active(iter->info, i)) {
293 iter->type_info[iter->num_types].ctr_type = i;
294 iter->type_info[iter->num_types++].offset = 0;
295 }
296 }
297}
298
299/* Mapping of logical record number to actual file content. */
300#define RECORD_FILE_MAGIC 0
301#define RECORD_GCOV_VERSION 1
302#define RECORD_TIME_STAMP 2
303#define RECORD_FUNCTION_TAG 3
304#define RECORD_FUNCTON_TAG_LEN 4
305#define RECORD_FUNCTION_IDENT 5
306#define RECORD_FUNCTION_CHECK 6
307#define RECORD_COUNT_TAG 7
308#define RECORD_COUNT_LEN 8
309#define RECORD_COUNT 9
310
311/**
312 * gcov_iter_next - advance file iterator to next logical record
313 * @iter: file iterator
314 *
315 * Return zero if new position is valid, non-zero if iterator has reached end.
316 */
317int gcov_iter_next(struct gcov_iterator *iter)
318{
319 switch (iter->record) {
320 case RECORD_FILE_MAGIC:
321 case RECORD_GCOV_VERSION:
322 case RECORD_FUNCTION_TAG:
323 case RECORD_FUNCTON_TAG_LEN:
324 case RECORD_FUNCTION_IDENT:
325 case RECORD_COUNT_TAG:
326 /* Advance to next record */
327 iter->record++;
328 break;
329 case RECORD_COUNT:
330 /* Advance to next count */
331 iter->count++;
332 /* fall through */
333 case RECORD_COUNT_LEN:
334 if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
335 iter->record = 9;
336 break;
337 }
338 /* Advance to next counter type */
339 get_type(iter)->offset += iter->count;
340 iter->count = 0;
341 iter->type++;
342 /* fall through */
343 case RECORD_FUNCTION_CHECK:
344 if (iter->type < iter->num_types) {
345 iter->record = 7;
346 break;
347 }
348 /* Advance to next function */
349 iter->type = 0;
350 iter->function++;
351 /* fall through */
352 case RECORD_TIME_STAMP:
353 if (iter->function < iter->info->n_functions)
354 iter->record = 3;
355 else
356 iter->record = -1;
357 break;
358 }
359 /* Check for EOF. */
360 if (iter->record == -1)
361 return -EINVAL;
362 else
363 return 0;
364}
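Worked example: for a data set with one function and a single active counter type holding two values, the iterator visits FILE_MAGIC, GCOV_VERSION and TIME_STAMP once, then FUNCTION_TAG, the function tag length, FUNCTION_IDENT and FUNCTION_CHECK, then COUNT_TAG, COUNT_LEN and two COUNT records, after which the next call reaches record -1 and reports EOF.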
365
366/**
367 * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
368 * @seq: seq_file handle
369 * @v: value to be stored
370 *
371 * Number format defined by gcc: numbers are recorded in the 32 bit
372 * unsigned binary form of the endianness of the machine generating the
373 * file.
374 */
375static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
376{
377 return seq_write(seq, &v, sizeof(v));
378}
379
380/**
381 * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
382 * @seq: seq_file handle
383 * @v: value to be stored
384 *
385 * Number format defined by gcc: numbers are recorded in the 32 bit
386 * unsigned binary form of the endianness of the machine generating the
387 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
388 * first.
389 */
390static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
391{
392 u32 data[2];
393
394 data[0] = (v & 0xffffffffUL);
395 data[1] = (v >> 32);
396 return seq_write(seq, data, sizeof(data));
397}
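For example, v = 0x0000000100000002 is emitted as the 32-bit word 0x00000002 followed by 0x00000001.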
398
399/**
400 * gcov_iter_write - write data for current pos to seq_file
401 * @iter: file iterator
402 * @seq: seq_file handle
403 *
404 * Return zero on success, non-zero otherwise.
405 */
406int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
407{
408 int rc = -EINVAL;
409
410 switch (iter->record) {
411 case RECORD_FILE_MAGIC:
412 rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
413 break;
414 case RECORD_GCOV_VERSION:
415 rc = seq_write_gcov_u32(seq, iter->info->version);
416 break;
417 case RECORD_TIME_STAMP:
418 rc = seq_write_gcov_u32(seq, iter->info->stamp);
419 break;
420 case RECORD_FUNCTION_TAG:
421 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
422 break;
423 case RECORD_FUNCTON_TAG_LEN:
424 rc = seq_write_gcov_u32(seq, 2);
425 break;
426 case RECORD_FUNCTION_IDENT:
427 rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
428 break;
429 case RECORD_FUNCTION_CHECK:
430 rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
431 break;
432 case RECORD_COUNT_TAG:
433 rc = seq_write_gcov_u32(seq,
434 GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
435 break;
436 case RECORD_COUNT_LEN:
437 rc = seq_write_gcov_u32(seq,
438 get_func(iter)->n_ctrs[iter->type] * 2);
439 break;
440 case RECORD_COUNT:
441 rc = seq_write_gcov_u64(seq,
442 iter->info->counts[iter->type].
443 values[iter->count + get_type(iter)->offset]);
444 break;
445 }
446 return rc;
447}
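As a quick sanity check of the stream produced above, the following userspace sketch (illustrative only; the file name is an assumption) prints the first three 32-bit records, i.e. magic, version and stamp, of a .gcda file copied out of debugfs:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t rec[3];        /* GCOV_DATA_MAGIC, version, stamp */
            FILE *f = fopen("fs.gcda", "rb");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            if (fread(rec, sizeof(rec[0]), 3, f) == 3)
                    printf("magic 0x%08" PRIx32 " version 0x%08" PRIx32
                           " stamp 0x%08" PRIx32 "\n", rec[0], rec[1], rec[2]);
            fclose(f);
            return 0;
    }

On a kernel built with this support, the magic word should read back as 0x67636461 (GCOV_DATA_MAGIC) when the reader runs on a machine of the same endianness.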
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
new file mode 100644
index 000000000000..060073ebf7a6
--- /dev/null
+++ b/kernel/gcov/gcov.h
@@ -0,0 +1,128 @@
1/*
2 * Profiling infrastructure declarations.
3 *
4 * This file is based on gcc-internal definitions. Data structures are
5 * defined to be compatible with gcc counterparts. For a better
6 * understanding, refer to gcc source: gcc/gcov-io.h.
7 *
8 * Copyright IBM Corp. 2009
9 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
10 *
11 * Uses gcc-internal data definitions.
12 */
13
14#ifndef GCOV_H
15#define GCOV_H GCOV_H
16
17#include <linux/types.h>
18
19/*
20 * Profiling data types used for gcc 3.4 and above - these are defined by
21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible.
23 */
24#define GCOV_COUNTERS 5
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
28#define GCOV_TAG_FOR_COUNTER(count) \
29 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
30
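For example, GCOV_TAG_FOR_COUNTER(0) is 0x01a10000 and GCOV_TAG_FOR_COUNTER(1) is 0x01a10000 + (1 << 17) = 0x01a30000, matching the per-counter-type tags gcc writes into .gcda files.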
31#if BITS_PER_LONG >= 64
32typedef long gcov_type;
33#else
34typedef long long gcov_type;
35#endif
36
37/**
38 * struct gcov_fn_info - profiling meta data per function
39 * @ident: object file-unique function identifier
40 * @checksum: function checksum
41 * @n_ctrs: number of values per counter type belonging to this function
42 *
43 * This data is generated by gcc during compilation and doesn't change
44 * at run-time.
45 */
46struct gcov_fn_info {
47 unsigned int ident;
48 unsigned int checksum;
49 unsigned int n_ctrs[0];
50};
51
52/**
53 * struct gcov_ctr_info - profiling data per counter type
54 * @num: number of counter values for this type
55 * @values: array of counter values for this type
56 * @merge: merge function for counter values of this type (unused)
57 *
58 * This data is generated by gcc during compilation and doesn't change
59 * at run-time with the exception of the values array.
60 */
61struct gcov_ctr_info {
62 unsigned int num;
63 gcov_type *values;
64 void (*merge)(gcov_type *, unsigned int);
65};
66
67/**
68 * struct gcov_info - profiling data per object file
69 * @version: gcov version magic indicating the gcc version used for compilation
70 * @next: list head for a singly-linked list
71 * @stamp: time stamp
72 * @filename: name of the associated gcov data file
73 * @n_functions: number of instrumented functions
74 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active
76 * @counts: counter data per counter type
77 *
78 * This data is generated by gcc during compilation and doesn't change
79 * at run-time with the exception of the next pointer.
80 */
81struct gcov_info {
82 unsigned int version;
83 struct gcov_info *next;
84 unsigned int stamp;
85 const char *filename;
86 unsigned int n_functions;
87 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask;
89 struct gcov_ctr_info counts[0];
90};
91
92/* Base interface. */
93enum gcov_action {
94 GCOV_ADD,
95 GCOV_REMOVE,
96};
97
98void gcov_event(enum gcov_action action, struct gcov_info *info);
99void gcov_enable_events(void);
100
101/* Iterator control. */
102struct seq_file;
103struct gcov_iterator;
104
105struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
106void gcov_iter_free(struct gcov_iterator *iter);
107void gcov_iter_start(struct gcov_iterator *iter);
108int gcov_iter_next(struct gcov_iterator *iter);
109int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
110struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
111
112/* gcov_info control. */
113void gcov_info_reset(struct gcov_info *info);
114int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
115void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
116struct gcov_info *gcov_info_dup(struct gcov_info *info);
117void gcov_info_free(struct gcov_info *info);
118
119struct gcov_link {
120 enum {
121 OBJ_TREE,
122 SRC_TREE,
123 } dir;
124 const char *ext;
125};
126extern const struct gcov_link gcov_link[];
127
128#endif /* GCOV_H */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b675a67c9ac3..49da79ab8486 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
191 } 191 }
192} 192}
193 193
194
195/*
196 * Get the preferred target CPU for NOHZ
197 */
198static int hrtimer_get_target(int this_cpu, int pinned)
199{
200#ifdef CONFIG_NO_HZ
201 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
202 int preferred_cpu = get_nohz_load_balancer();
203
204 if (preferred_cpu >= 0)
205 return preferred_cpu;
206 }
207#endif
208 return this_cpu;
209}
210
211/*
212 * With HIGHRES=y we do not migrate the timer when it is expiring
213 * before the next event on the target cpu because we cannot reprogram
214 * the target cpu hardware and we would cause it to fire late.
215 *
216 * Called with cpu_base->lock of target cpu held.
217 */
218static int
219hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
220{
221#ifdef CONFIG_HIGH_RES_TIMERS
222 ktime_t expires;
223
224 if (!new_base->cpu_base->hres_active)
225 return 0;
226
227 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
228 return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
229#else
230 return 0;
231#endif
232}
233
194/* 234/*
195 * Switch the timer base to the current CPU when possible. 235 * Switch the timer base to the current CPU when possible.
196 */ 236 */
@@ -200,16 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
200{ 240{
201 struct hrtimer_clock_base *new_base; 241 struct hrtimer_clock_base *new_base;
202 struct hrtimer_cpu_base *new_cpu_base; 242 struct hrtimer_cpu_base *new_cpu_base;
203 int cpu, preferred_cpu = -1; 243 int this_cpu = smp_processor_id();
204 244 int cpu = hrtimer_get_target(this_cpu, pinned);
205 cpu = smp_processor_id();
206#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
207 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
208 preferred_cpu = get_nohz_load_balancer();
209 if (preferred_cpu >= 0)
210 cpu = preferred_cpu;
211 }
212#endif
213 245
214again: 246again:
215 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 247 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -217,7 +249,7 @@ again:
217 249
218 if (base != new_base) { 250 if (base != new_base) {
219 /* 251 /*
220 * We are trying to schedule the timer on the local CPU. 252 * We are trying to move timer to new_base.
221 * However we can't change timer's base while it is running, 253 * However we can't change timer's base while it is running,
222 * so we keep it on the same CPU. No hassle vs. reprogramming 254 * so we keep it on the same CPU. No hassle vs. reprogramming
223 * the event source in the high resolution case. The softirq 255 * the event source in the high resolution case. The softirq
@@ -233,38 +265,12 @@ again:
233 spin_unlock(&base->cpu_base->lock); 265 spin_unlock(&base->cpu_base->lock);
234 spin_lock(&new_base->cpu_base->lock); 266 spin_lock(&new_base->cpu_base->lock);
235 267
236 /* Optimized away for NOHZ=n SMP=n */ 268 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
237 if (cpu == preferred_cpu) { 269 cpu = this_cpu;
238 /* Calculate clock monotonic expiry time */ 270 spin_unlock(&new_base->cpu_base->lock);
239#ifdef CONFIG_HIGH_RES_TIMERS 271 spin_lock(&base->cpu_base->lock);
240 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), 272 timer->base = base;
241 new_base->offset); 273 goto again;
242#else
243 ktime_t expires = hrtimer_get_expires(timer);
244#endif
245
246 /*
247 * Get the next event on target cpu from the
248 * clock events layer.
249 * This covers the highres=off nohz=on case as well.
250 */
251 ktime_t next = clockevents_get_next_event(cpu);
252
253 ktime_t delta = ktime_sub(expires, next);
254
255 /*
256 * We do not migrate the timer when it is expiring
257 * before the next event on the target cpu because
258 * we cannot reprogram the target cpu hardware and
259 * we would cause it to fire late.
260 */
261 if (delta.tv64 < 0) {
262 cpu = smp_processor_id();
263 spin_unlock(&new_base->cpu_base->lock);
264 spin_lock(&base->cpu_base->lock);
265 timer->base = base;
266 goto again;
267 }
268 } 274 }
269 timer->base = new_base; 275 timer->base = new_base;
270 } 276 }
@@ -380,6 +386,8 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
380 return res; 386 return res;
381} 387}
382 388
389EXPORT_SYMBOL_GPL(ktime_add_safe);
390
383#ifdef CONFIG_DEBUG_OBJECTS_TIMERS 391#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
384 392
385static struct debug_obj_descr hrtimer_debug_descr; 393static struct debug_obj_descr hrtimer_debug_descr;
@@ -1274,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1274 1282
1275 expires_next.tv64 = KTIME_MAX; 1283 expires_next.tv64 = KTIME_MAX;
1276 1284
1285 spin_lock(&cpu_base->lock);
1286 /*
1287 * We set expires_next to KTIME_MAX here with cpu_base->lock
1288 * held to prevent that a timer is enqueued in our queue via
1289 * the migration code. This does not affect enqueueing of
1290 * timers which run their callback and need to be requeued on
1291 * this CPU.
1292 */
1293 cpu_base->expires_next.tv64 = KTIME_MAX;
1294
1277 base = cpu_base->clock_base; 1295 base = cpu_base->clock_base;
1278 1296
1279 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1297 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1280 ktime_t basenow; 1298 ktime_t basenow;
1281 struct rb_node *node; 1299 struct rb_node *node;
1282 1300
1283 spin_lock(&cpu_base->lock);
1284
1285 basenow = ktime_add(now, base->offset); 1301 basenow = ktime_add(now, base->offset);
1286 1302
1287 while ((node = base->first)) { 1303 while ((node = base->first)) {
@@ -1314,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1314 1330
1315 __run_hrtimer(timer); 1331 __run_hrtimer(timer);
1316 } 1332 }
1317 spin_unlock(&cpu_base->lock);
1318 base++; 1333 base++;
1319 } 1334 }
1320 1335
1336 /*
1337 * Store the new expiry value so the migration code can verify
1338 * against it.
1339 */
1321 cpu_base->expires_next = expires_next; 1340 cpu_base->expires_next = expires_next;
1341 spin_unlock(&cpu_base->lock);
1322 1342
1323 /* Reprogramming necessary ? */ 1343 /* Reprogramming necessary ? */
1324 if (expires_next.tv64 != KTIME_MAX) { 1344 if (expires_next.tv64 != KTIME_MAX) {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index aaf5c9d05770..50da67672901 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -856,7 +856,7 @@ EXPORT_SYMBOL(free_irq);
856 * still called in hard interrupt context and has to check 856 * still called in hard interrupt context and has to check
857 * whether the interrupt originates from the device. If yes it 857 * whether the interrupt originates from the device. If yes it
858 * needs to disable the interrupt on the device and return 858 * needs to disable the interrupt on the device and return
859 * IRQ_THREAD_WAKE which will wake up the handler thread and run 859 * IRQ_WAKE_THREAD which will wake up the handler thread and run
860 * @thread_fn. This split handler design is necessary to support 860 * @thread_fn. This split handler design is necessary to support
861 * shared interrupts. 861 * shared interrupts.
862 * 862 *
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bfc..385c31a1bdbf 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 27#include <linux/completion.h>
29#include <linux/file.h> 28#include <linux/file.h>
30#include <linux/fdtable.h> 29#include <linux/fdtable.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c0fa54b276d9..16b5739c516a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void)
237{ 237{
238 struct kprobe_insn_page *kip; 238 struct kprobe_insn_page *kip;
239 struct hlist_node *pos, *next; 239 struct hlist_node *pos, *next;
240 int safety;
241 240
242 /* Ensure no-one is preempted on the garbages */ 241 if (check_safety())
243 mutex_unlock(&kprobe_insn_mutex); 242 if (check_safety())
244 safety = check_safety();
245 mutex_lock(&kprobe_insn_mutex);
246 if (safety != 0)
247 return -EAGAIN; 243 return -EAGAIN;
248 244
249 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 245 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7fa441333529..9b1a7de26979 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,7 +27,6 @@ struct kthread_create_info
27 /* Information passed to kthread() from kthreadd. */ 27 /* Information passed to kthread() from kthreadd. */
28 int (*threadfn)(void *data); 28 int (*threadfn)(void *data);
29 void *data; 29 void *data;
30 struct completion started;
31 30
32 /* Result passed back to kthread_create() from kthreadd. */ 31 /* Result passed back to kthread_create() from kthreadd. */
33 struct task_struct *result; 32 struct task_struct *result;
@@ -36,17 +35,13 @@ struct kthread_create_info
36 struct list_head list; 35 struct list_head list;
37}; 36};
38 37
39struct kthread_stop_info 38struct kthread {
40{ 39 int should_stop;
41 struct task_struct *k; 40 struct completion exited;
42 int err;
43 struct completion done;
44}; 41};
45 42
46/* Thread stopping is done by setthing this var: lock serializes 43#define to_kthread(tsk) \
47 * multiple kthread_stop calls. */ 44 container_of((tsk)->vfork_done, struct kthread, exited)
48static DEFINE_MUTEX(kthread_stop_lock);
49static struct kthread_stop_info kthread_stop_info;
50 45
51/** 46/**
52 * kthread_should_stop - should this kthread return now? 47 * kthread_should_stop - should this kthread return now?
@@ -57,36 +52,35 @@ static struct kthread_stop_info kthread_stop_info;
57 */ 52 */
58int kthread_should_stop(void) 53int kthread_should_stop(void)
59{ 54{
60 return (kthread_stop_info.k == current); 55 return to_kthread(current)->should_stop;
61} 56}
62EXPORT_SYMBOL(kthread_should_stop); 57EXPORT_SYMBOL(kthread_should_stop);
63 58
64static int kthread(void *_create) 59static int kthread(void *_create)
65{ 60{
61 /* Copy data: it's on kthread's stack */
66 struct kthread_create_info *create = _create; 62 struct kthread_create_info *create = _create;
67 int (*threadfn)(void *data); 63 int (*threadfn)(void *data) = create->threadfn;
68 void *data; 64 void *data = create->data;
69 int ret = -EINTR; 65 struct kthread self;
66 int ret;
70 67
71 /* Copy data: it's on kthread's stack */ 68 self.should_stop = 0;
72 threadfn = create->threadfn; 69 init_completion(&self.exited);
73 data = create->data; 70 current->vfork_done = &self.exited;
74 71
75 /* OK, tell user we're spawned, wait for stop or wakeup */ 72 /* OK, tell user we're spawned, wait for stop or wakeup */
76 __set_current_state(TASK_UNINTERRUPTIBLE); 73 __set_current_state(TASK_UNINTERRUPTIBLE);
77 create->result = current; 74 create->result = current;
78 complete(&create->started); 75 complete(&create->done);
79 schedule(); 76 schedule();
80 77
81 if (!kthread_should_stop()) 78 ret = -EINTR;
79 if (!self.should_stop)
82 ret = threadfn(data); 80 ret = threadfn(data);
83 81
84 /* It might have exited on its own, w/o kthread_stop. Check. */ 82 /* we can't just return, we must preserve "self" on stack */
85 if (kthread_should_stop()) { 83 do_exit(ret);
86 kthread_stop_info.err = ret;
87 complete(&kthread_stop_info.done);
88 }
89 return 0;
90} 84}
91 85
92static void create_kthread(struct kthread_create_info *create) 86static void create_kthread(struct kthread_create_info *create)
@@ -95,11 +89,10 @@ static void create_kthread(struct kthread_create_info *create)
95 89
96 /* We want our own signal handler (we take no signals by default). */ 90 /* We want our own signal handler (we take no signals by default). */
97 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 91 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
98 if (pid < 0) 92 if (pid < 0) {
99 create->result = ERR_PTR(pid); 93 create->result = ERR_PTR(pid);
100 else 94 complete(&create->done);
101 wait_for_completion(&create->started); 95 }
102 complete(&create->done);
103} 96}
104 97
105/** 98/**
@@ -130,7 +123,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
130 123
131 create.threadfn = threadfn; 124 create.threadfn = threadfn;
132 create.data = data; 125 create.data = data;
133 init_completion(&create.started);
134 init_completion(&create.done); 126 init_completion(&create.done);
135 127
136 spin_lock(&kthread_create_lock); 128 spin_lock(&kthread_create_lock);
@@ -198,30 +190,22 @@ EXPORT_SYMBOL(kthread_bind);
198 */ 190 */
199int kthread_stop(struct task_struct *k) 191int kthread_stop(struct task_struct *k)
200{ 192{
193 struct kthread *kthread;
201 int ret; 194 int ret;
202 195
203 mutex_lock(&kthread_stop_lock);
204
205 /* It could exit after stop_info.k set, but before wake_up_process. */
206 get_task_struct(k);
207
208 trace_sched_kthread_stop(k); 196 trace_sched_kthread_stop(k);
197 get_task_struct(k);
209 198
210 /* Must init completion *before* thread sees kthread_stop_info.k */ 199 kthread = to_kthread(k);
211 init_completion(&kthread_stop_info.done); 200 barrier(); /* it might have exited */
212 smp_wmb(); 201 if (k->vfork_done != NULL) {
202 kthread->should_stop = 1;
203 wake_up_process(k);
204 wait_for_completion(&kthread->exited);
205 }
206 ret = k->exit_code;
213 207
214 /* Now set kthread_should_stop() to true, and wake it up. */
215 kthread_stop_info.k = k;
216 wake_up_process(k);
217 put_task_struct(k); 208 put_task_struct(k);
218
219 /* Once it dies, reset stop ptr, gather result and we're done. */
220 wait_for_completion(&kthread_stop_info.done);
221 kthread_stop_info.k = NULL;
222 ret = kthread_stop_info.err;
223 mutex_unlock(&kthread_stop_lock);
224
225 trace_sched_kthread_stop_ret(ret); 209 trace_sched_kthread_stop_ret(ret);
226 210
227 return ret; 211 return ret;
diff --git a/kernel/module.c b/kernel/module.c
index 215aaab09e91..0a049837008e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2216,6 +2216,10 @@ static noinline struct module *load_module(void __user *umod,
2216 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, 2216 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2217 "__kcrctab_unused_gpl"); 2217 "__kcrctab_unused_gpl");
2218#endif 2218#endif
2219#ifdef CONFIG_CONSTRUCTORS
2220 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2221 sizeof(*mod->ctors), &mod->num_ctors);
2222#endif
2219 2223
2220#ifdef CONFIG_MARKERS 2224#ifdef CONFIG_MARKERS
2221 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", 2225 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
@@ -2389,6 +2393,17 @@ static noinline struct module *load_module(void __user *umod,
2389 goto free_hdr; 2393 goto free_hdr;
2390} 2394}
2391 2395
2396/* Call module constructors. */
2397static void do_mod_ctors(struct module *mod)
2398{
2399#ifdef CONFIG_CONSTRUCTORS
2400 unsigned long i;
2401
2402 for (i = 0; i < mod->num_ctors; i++)
2403 mod->ctors[i]();
2404#endif
2405}
2406
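Note: the immediate user of these constructors in this series is gcov; objects built with -fprofile-arcs register their struct gcov_info from a gcc-generated constructor, so running .ctors here makes a module's coverage data visible before its init routine executes.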
2392/* This is where the real work happens */ 2407/* This is where the real work happens */
2393SYSCALL_DEFINE3(init_module, void __user *, umod, 2408SYSCALL_DEFINE3(init_module, void __user *, umod,
2394 unsigned long, len, const char __user *, uargs) 2409 unsigned long, len, const char __user *, uargs)
@@ -2417,6 +2432,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2417 blocking_notifier_call_chain(&module_notify_list, 2432 blocking_notifier_call_chain(&module_notify_list,
2418 MODULE_STATE_COMING, mod); 2433 MODULE_STATE_COMING, mod);
2419 2434
2435 do_mod_ctors(mod);
2420 /* Start the module */ 2436 /* Start the module */
2421 if (mod->init != NULL) 2437 if (mod->init != NULL)
2422 ret = do_one_initcall(mod->init); 2438 ret = do_one_initcall(mod->init);
@@ -2435,9 +2451,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2435 return ret; 2451 return ret;
2436 } 2452 }
2437 if (ret > 0) { 2453 if (ret > 0) {
2438 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " 2454 printk(KERN_WARNING
2439 "it should follow 0/-E convention\n" 2455"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2440 KERN_WARNING "%s: loading module anyway...\n", 2456"%s: loading module anyway...\n",
2441 __func__, mod->name, ret, 2457 __func__, mod->name, ret,
2442 __func__); 2458 __func__);
2443 dump_stack(); 2459 dump_stack();
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 63598dca2d0c..09b4ff9711b2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,14 @@ static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
28 28
29/* 29static inline struct nsproxy *create_nsproxy(void)
30 * creates a copy of "orig" with refcount 1.
31 */
32static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
33{ 30{
34 struct nsproxy *ns; 31 struct nsproxy *nsproxy;
35 32
36 ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); 33 nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
37 if (ns) { 34 if (nsproxy)
38 memcpy(ns, orig, sizeof(struct nsproxy)); 35 atomic_set(&nsproxy->count, 1);
39 atomic_set(&ns->count, 1); 36 return nsproxy;
40 }
41 return ns;
42} 37}
43 38
44/* 39/*
@@ -52,7 +47,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
52 struct nsproxy *new_nsp; 47 struct nsproxy *new_nsp;
53 int err; 48 int err;
54 49
55 new_nsp = clone_nsproxy(tsk->nsproxy); 50 new_nsp = create_nsproxy();
56 if (!new_nsp) 51 if (!new_nsp)
57 return ERR_PTR(-ENOMEM); 52 return ERR_PTR(-ENOMEM);
58 53
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 29b685f551aa..a641eb753b8c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -124,7 +124,7 @@ void perf_enable(void)
124 124
125static void get_ctx(struct perf_counter_context *ctx) 125static void get_ctx(struct perf_counter_context *ctx)
126{ 126{
127 atomic_inc(&ctx->refcount); 127 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
128} 128}
129 129
130static void free_ctx(struct rcu_head *head) 130static void free_ctx(struct rcu_head *head)
@@ -175,6 +175,11 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
175 spin_unlock_irqrestore(&ctx->lock, *flags); 175 spin_unlock_irqrestore(&ctx->lock, *flags);
176 goto retry; 176 goto retry;
177 } 177 }
178
179 if (!atomic_inc_not_zero(&ctx->refcount)) {
180 spin_unlock_irqrestore(&ctx->lock, *flags);
181 ctx = NULL;
182 }
178 } 183 }
179 rcu_read_unlock(); 184 rcu_read_unlock();
180 return ctx; 185 return ctx;
@@ -193,7 +198,6 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta
193 ctx = perf_lock_task_context(task, &flags); 198 ctx = perf_lock_task_context(task, &flags);
194 if (ctx) { 199 if (ctx) {
195 ++ctx->pin_count; 200 ++ctx->pin_count;
196 get_ctx(ctx);
197 spin_unlock_irqrestore(&ctx->lock, flags); 201 spin_unlock_irqrestore(&ctx->lock, flags);
198 } 202 }
199 return ctx; 203 return ctx;
@@ -232,6 +236,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
232 236
233 list_add_rcu(&counter->event_entry, &ctx->event_list); 237 list_add_rcu(&counter->event_entry, &ctx->event_list);
234 ctx->nr_counters++; 238 ctx->nr_counters++;
239 if (counter->attr.inherit_stat)
240 ctx->nr_stat++;
235} 241}
236 242
237/* 243/*
@@ -246,6 +252,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
246 if (list_empty(&counter->list_entry)) 252 if (list_empty(&counter->list_entry))
247 return; 253 return;
248 ctx->nr_counters--; 254 ctx->nr_counters--;
255 if (counter->attr.inherit_stat)
256 ctx->nr_stat--;
249 257
250 list_del_init(&counter->list_entry); 258 list_del_init(&counter->list_entry);
251 list_del_rcu(&counter->event_entry); 259 list_del_rcu(&counter->event_entry);
@@ -1002,6 +1010,81 @@ static int context_equiv(struct perf_counter_context *ctx1,
1002 && !ctx1->pin_count && !ctx2->pin_count; 1010 && !ctx1->pin_count && !ctx2->pin_count;
1003} 1011}
1004 1012
1013static void __perf_counter_read(void *counter);
1014
1015static void __perf_counter_sync_stat(struct perf_counter *counter,
1016 struct perf_counter *next_counter)
1017{
1018 u64 value;
1019
1020 if (!counter->attr.inherit_stat)
1021 return;
1022
1023 /*
1024 * Update the counter value, we cannot use perf_counter_read()
1025 * because we're in the middle of a context switch and have IRQs
1026 * disabled, which upsets smp_call_function_single(), however
1027 * we know the counter must be on the current CPU, therefore we
1028 * don't need to use it.
1029 */
1030 switch (counter->state) {
1031 case PERF_COUNTER_STATE_ACTIVE:
1032 __perf_counter_read(counter);
1033 break;
1034
1035 case PERF_COUNTER_STATE_INACTIVE:
1036 update_counter_times(counter);
1037 break;
1038
1039 default:
1040 break;
1041 }
1042
1043 /*
1044 * In order to keep per-task stats reliable we need to flip the counter
1045 * values when we flip the contexts.
1046 */
1047 value = atomic64_read(&next_counter->count);
1048 value = atomic64_xchg(&counter->count, value);
1049 atomic64_set(&next_counter->count, value);
1050
1051 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1052 swap(counter->total_time_running, next_counter->total_time_running);
1053
1054 /*
1055 * Since we swizzled the values, update the user visible data too.
1056 */
1057 perf_counter_update_userpage(counter);
1058 perf_counter_update_userpage(next_counter);
1059}
1060
1061#define list_next_entry(pos, member) \
1062 list_entry(pos->member.next, typeof(*pos), member)
1063
1064static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1065 struct perf_counter_context *next_ctx)
1066{
1067 struct perf_counter *counter, *next_counter;
1068
1069 if (!ctx->nr_stat)
1070 return;
1071
1072 counter = list_first_entry(&ctx->event_list,
1073 struct perf_counter, event_entry);
1074
1075 next_counter = list_first_entry(&next_ctx->event_list,
1076 struct perf_counter, event_entry);
1077
1078 while (&counter->event_entry != &ctx->event_list &&
1079 &next_counter->event_entry != &next_ctx->event_list) {
1080
1081 __perf_counter_sync_stat(counter, next_counter);
1082
1083 counter = list_next_entry(counter, event_entry);
1084 next_counter = list_next_entry(next_counter, event_entry);
1085 }
1086}
1087
1005/* 1088/*
1006 * Called from scheduler to remove the counters of the current task, 1089 * Called from scheduler to remove the counters of the current task,
1007 * with interrupts disabled. 1090 * with interrupts disabled.
@@ -1057,6 +1140,8 @@ void perf_counter_task_sched_out(struct task_struct *task,
1057 ctx->task = next; 1140 ctx->task = next;
1058 next_ctx->task = task; 1141 next_ctx->task = task;
1059 do_switch = 0; 1142 do_switch = 0;
1143
1144 perf_counter_sync_stat(ctx, next_ctx);
1060 } 1145 }
1061 spin_unlock(&next_ctx->lock); 1146 spin_unlock(&next_ctx->lock);
1062 spin_unlock(&ctx->lock); 1147 spin_unlock(&ctx->lock);
@@ -1283,7 +1368,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1283 if (!interrupts) { 1368 if (!interrupts) {
1284 perf_disable(); 1369 perf_disable();
1285 counter->pmu->disable(counter); 1370 counter->pmu->disable(counter);
1286 atomic_set(&hwc->period_left, 0); 1371 atomic64_set(&hwc->period_left, 0);
1287 counter->pmu->enable(counter); 1372 counter->pmu->enable(counter);
1288 perf_enable(); 1373 perf_enable();
1289 } 1374 }
@@ -1344,9 +1429,56 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
1344} 1429}
1345 1430
1346/* 1431/*
1432 * Enable all of a task's counters that have been marked enable-on-exec.
1433 * This expects task == current.
1434 */
1435static void perf_counter_enable_on_exec(struct task_struct *task)
1436{
1437 struct perf_counter_context *ctx;
1438 struct perf_counter *counter;
1439 unsigned long flags;
1440 int enabled = 0;
1441
1442 local_irq_save(flags);
1443 ctx = task->perf_counter_ctxp;
1444 if (!ctx || !ctx->nr_counters)
1445 goto out;
1446
1447 __perf_counter_task_sched_out(ctx);
1448
1449 spin_lock(&ctx->lock);
1450
1451 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1452 if (!counter->attr.enable_on_exec)
1453 continue;
1454 counter->attr.enable_on_exec = 0;
1455 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1456 continue;
1457 counter->state = PERF_COUNTER_STATE_INACTIVE;
1458 counter->tstamp_enabled =
1459 ctx->time - counter->total_time_enabled;
1460 enabled = 1;
1461 }
1462
1463 /*
1464 * Unclone this context if we enabled any counter.
1465 */
1466 if (enabled && ctx->parent_ctx) {
1467 put_ctx(ctx->parent_ctx);
1468 ctx->parent_ctx = NULL;
1469 }
1470
1471 spin_unlock(&ctx->lock);
1472
1473 perf_counter_task_sched_in(task, smp_processor_id());
1474 out:
1475 local_irq_restore(flags);
1476}
1477
1478/*
1347 * Cross CPU call to read the hardware counter 1479 * Cross CPU call to read the hardware counter
1348 */ 1480 */
1349static void __read(void *info) 1481static void __perf_counter_read(void *info)
1350{ 1482{
1351 struct perf_counter *counter = info; 1483 struct perf_counter *counter = info;
1352 struct perf_counter_context *ctx = counter->ctx; 1484 struct perf_counter_context *ctx = counter->ctx;
@@ -1368,7 +1500,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
1368 */ 1500 */
1369 if (counter->state == PERF_COUNTER_STATE_ACTIVE) { 1501 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1370 smp_call_function_single(counter->oncpu, 1502 smp_call_function_single(counter->oncpu,
1371 __read, counter, 1); 1503 __perf_counter_read, counter, 1);
1372 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 1504 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1373 update_counter_times(counter); 1505 update_counter_times(counter);
1374 } 1506 }
@@ -1459,11 +1591,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1459 put_ctx(parent_ctx); 1591 put_ctx(parent_ctx);
1460 ctx->parent_ctx = NULL; /* no longer a clone */ 1592 ctx->parent_ctx = NULL; /* no longer a clone */
1461 } 1593 }
1462 /*
1463 * Get an extra reference before dropping the lock so that
1464 * this context won't get freed if the task exits.
1465 */
1466 get_ctx(ctx);
1467 spin_unlock_irqrestore(&ctx->lock, flags); 1594 spin_unlock_irqrestore(&ctx->lock, flags);
1468 } 1595 }
1469 1596
@@ -1509,11 +1636,13 @@ static void free_counter(struct perf_counter *counter)
1509{ 1636{
1510 perf_pending_sync(counter); 1637 perf_pending_sync(counter);
1511 1638
1512 atomic_dec(&nr_counters); 1639 if (!counter->parent) {
1513 if (counter->attr.mmap) 1640 atomic_dec(&nr_counters);
1514 atomic_dec(&nr_mmap_counters); 1641 if (counter->attr.mmap)
1515 if (counter->attr.comm) 1642 atomic_dec(&nr_mmap_counters);
1516 atomic_dec(&nr_comm_counters); 1643 if (counter->attr.comm)
1644 atomic_dec(&nr_comm_counters);
1645 }
1517 1646
1518 if (counter->destroy) 1647 if (counter->destroy)
1519 counter->destroy(counter); 1648 counter->destroy(counter);
@@ -1553,7 +1682,7 @@ static int perf_release(struct inode *inode, struct file *file)
1553static ssize_t 1682static ssize_t
1554perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) 1683perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1555{ 1684{
1556 u64 values[3]; 1685 u64 values[4];
1557 int n; 1686 int n;
1558 1687
1559 /* 1688 /*
@@ -1620,22 +1749,6 @@ static void perf_counter_reset(struct perf_counter *counter)
1620 perf_counter_update_userpage(counter); 1749 perf_counter_update_userpage(counter);
1621} 1750}
1622 1751
1623static void perf_counter_for_each_sibling(struct perf_counter *counter,
1624 void (*func)(struct perf_counter *))
1625{
1626 struct perf_counter_context *ctx = counter->ctx;
1627 struct perf_counter *sibling;
1628
1629 WARN_ON_ONCE(ctx->parent_ctx);
1630 mutex_lock(&ctx->mutex);
1631 counter = counter->group_leader;
1632
1633 func(counter);
1634 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1635 func(sibling);
1636 mutex_unlock(&ctx->mutex);
1637}
1638
1639/* 1752/*
1640 * Holding the top-level counter's child_mutex means that any 1753 * Holding the top-level counter's child_mutex means that any
1641 * descendant process that has inherited this counter will block 1754 * descendant process that has inherited this counter will block
@@ -1658,14 +1771,18 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
1658static void perf_counter_for_each(struct perf_counter *counter, 1771static void perf_counter_for_each(struct perf_counter *counter,
1659 void (*func)(struct perf_counter *)) 1772 void (*func)(struct perf_counter *))
1660{ 1773{
1661 struct perf_counter *child; 1774 struct perf_counter_context *ctx = counter->ctx;
1775 struct perf_counter *sibling;
1662 1776
1663 WARN_ON_ONCE(counter->ctx->parent_ctx); 1777 WARN_ON_ONCE(ctx->parent_ctx);
1664 mutex_lock(&counter->child_mutex); 1778 mutex_lock(&ctx->mutex);
1665 perf_counter_for_each_sibling(counter, func); 1779 counter = counter->group_leader;
1666 list_for_each_entry(child, &counter->child_list, child_list) 1780
1667 perf_counter_for_each_sibling(child, func); 1781 perf_counter_for_each_child(counter, func);
1668 mutex_unlock(&counter->child_mutex); 1782 func(counter);
1783 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1784 perf_counter_for_each_child(counter, func);
1785 mutex_unlock(&ctx->mutex);
1669} 1786}
1670 1787
1671static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) 1788static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
@@ -1764,6 +1881,14 @@ int perf_counter_task_disable(void)
1764 return 0; 1881 return 0;
1765} 1882}
1766 1883
1884static int perf_counter_index(struct perf_counter *counter)
1885{
1886 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1887 return 0;
1888
1889 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
1890}
1891
1767/* 1892/*
1768 * Callers need to ensure there can be no nesting of this function, otherwise 1893 * Callers need to ensure there can be no nesting of this function, otherwise
1769 * the seqlock logic goes bad. We can not serialize this because the arch 1894 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -1788,11 +1913,17 @@ void perf_counter_update_userpage(struct perf_counter *counter)
1788 preempt_disable(); 1913 preempt_disable();
1789 ++userpg->lock; 1914 ++userpg->lock;
1790 barrier(); 1915 barrier();
1791 userpg->index = counter->hw.idx; 1916 userpg->index = perf_counter_index(counter);
1792 userpg->offset = atomic64_read(&counter->count); 1917 userpg->offset = atomic64_read(&counter->count);
1793 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 1918 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1794 userpg->offset -= atomic64_read(&counter->hw.prev_count); 1919 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1795 1920
1921 userpg->time_enabled = counter->total_time_enabled +
1922 atomic64_read(&counter->child_total_time_enabled);
1923
1924 userpg->time_running = counter->total_time_running +
1925 atomic64_read(&counter->child_total_time_running);
1926
1796 barrier(); 1927 barrier();
1797 ++userpg->lock; 1928 ++userpg->lock;
1798 preempt_enable(); 1929 preempt_enable();
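The paired increments of ->lock with barriers in between form a simple seqcount: a userspace reader of the mmap'ed control page should retry whenever the lock word changes underneath it. A hedged sketch of such a reader; the struct below is a trimmed, hypothetical stand-in for the real control page, and barrier() is a GCC-style compiler barrier:

#include <stdint.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

struct mmap_page {                      /* illustrative subset of the fields */
        volatile uint32_t lock;
        volatile uint32_t index;
        volatile int64_t  offset;
        volatile uint64_t time_enabled;
        volatile uint64_t time_running;
};

static void read_self_monitor(const struct mmap_page *pg, uint32_t *index,
                              int64_t *offset, uint64_t *enabled,
                              uint64_t *running)
{
        uint32_t seq;

        do {
                seq = pg->lock;
                barrier();

                *index   = pg->index;
                *offset  = pg->offset;
                *enabled = pg->time_enabled;
                *running = pg->time_running;

                barrier();
        } while (pg->lock != seq);      /* writer was active: retry */
}

int main(void)
{
        static struct mmap_page page = { .index = 1, .offset = 42 };
        uint32_t index;
        int64_t offset;
        uint64_t enabled, running;

        read_self_monitor(&page, &index, &offset, &enabled, &running);
        return (index == 1 && offset == 42) ? 0 : 1;
}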
@@ -1806,6 +1937,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1806 struct perf_mmap_data *data; 1937 struct perf_mmap_data *data;
1807 int ret = VM_FAULT_SIGBUS; 1938 int ret = VM_FAULT_SIGBUS;
1808 1939
1940 if (vmf->flags & FAULT_FLAG_MKWRITE) {
1941 if (vmf->pgoff == 0)
1942 ret = 0;
1943 return ret;
1944 }
1945
1809 rcu_read_lock(); 1946 rcu_read_lock();
1810 data = rcu_dereference(counter->data); 1947 data = rcu_dereference(counter->data);
1811 if (!data) 1948 if (!data)
@@ -1819,9 +1956,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1819 if ((unsigned)nr > data->nr_pages) 1956 if ((unsigned)nr > data->nr_pages)
1820 goto unlock; 1957 goto unlock;
1821 1958
1959 if (vmf->flags & FAULT_FLAG_WRITE)
1960 goto unlock;
1961
1822 vmf->page = virt_to_page(data->data_pages[nr]); 1962 vmf->page = virt_to_page(data->data_pages[nr]);
1823 } 1963 }
1964
1824 get_page(vmf->page); 1965 get_page(vmf->page);
1966 vmf->page->mapping = vma->vm_file->f_mapping;
1967 vmf->page->index = vmf->pgoff;
1968
1825 ret = 0; 1969 ret = 0;
1826unlock: 1970unlock:
1827 rcu_read_unlock(); 1971 rcu_read_unlock();
@@ -1874,6 +2018,14 @@ fail:
1874 return -ENOMEM; 2018 return -ENOMEM;
1875} 2019}
1876 2020
2021static void perf_mmap_free_page(unsigned long addr)
2022{
2023 struct page *page = virt_to_page((void *)addr);
2024
2025 page->mapping = NULL;
2026 __free_page(page);
2027}
2028
1877static void __perf_mmap_data_free(struct rcu_head *rcu_head) 2029static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1878{ 2030{
1879 struct perf_mmap_data *data; 2031 struct perf_mmap_data *data;
@@ -1881,9 +2033,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1881 2033
1882 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2034 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
1883 2035
1884 free_page((unsigned long)data->user_page); 2036 perf_mmap_free_page((unsigned long)data->user_page);
1885 for (i = 0; i < data->nr_pages; i++) 2037 for (i = 0; i < data->nr_pages; i++)
1886 free_page((unsigned long)data->data_pages[i]); 2038 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2039
1887 kfree(data); 2040 kfree(data);
1888} 2041}
1889 2042
@@ -1920,9 +2073,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
1920} 2073}
1921 2074
1922static struct vm_operations_struct perf_mmap_vmops = { 2075static struct vm_operations_struct perf_mmap_vmops = {
1923 .open = perf_mmap_open, 2076 .open = perf_mmap_open,
1924 .close = perf_mmap_close, 2077 .close = perf_mmap_close,
1925 .fault = perf_mmap_fault, 2078 .fault = perf_mmap_fault,
2079 .page_mkwrite = perf_mmap_fault,
1926}; 2080};
1927 2081
1928static int perf_mmap(struct file *file, struct vm_area_struct *vma) 2082static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1936,7 +2090,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1936 long user_extra, extra; 2090 long user_extra, extra;
1937 int ret = 0; 2091 int ret = 0;
1938 2092
1939 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) 2093 if (!(vma->vm_flags & VM_SHARED))
1940 return -EINVAL; 2094 return -EINVAL;
1941 2095
1942 vma_size = vma->vm_end - vma->vm_start; 2096 vma_size = vma->vm_end - vma->vm_start;
@@ -1995,10 +2149,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1995 atomic_long_add(user_extra, &user->locked_vm); 2149 atomic_long_add(user_extra, &user->locked_vm);
1996 vma->vm_mm->locked_vm += extra; 2150 vma->vm_mm->locked_vm += extra;
1997 counter->data->nr_locked = extra; 2151 counter->data->nr_locked = extra;
2152 if (vma->vm_flags & VM_WRITE)
2153 counter->data->writable = 1;
2154
1998unlock: 2155unlock:
1999 mutex_unlock(&counter->mmap_mutex); 2156 mutex_unlock(&counter->mmap_mutex);
2000 2157
2001 vma->vm_flags &= ~VM_MAYWRITE;
2002 vma->vm_flags |= VM_RESERVED; 2158 vma->vm_flags |= VM_RESERVED;
2003 vma->vm_ops = &perf_mmap_vmops; 2159 vma->vm_ops = &perf_mmap_vmops;
2004 2160
@@ -2175,11 +2331,38 @@ struct perf_output_handle {
2175 unsigned long head; 2331 unsigned long head;
2176 unsigned long offset; 2332 unsigned long offset;
2177 int nmi; 2333 int nmi;
2178 int overflow; 2334 int sample;
2179 int locked; 2335 int locked;
2180 unsigned long flags; 2336 unsigned long flags;
2181}; 2337};
2182 2338
2339static bool perf_output_space(struct perf_mmap_data *data,
2340 unsigned int offset, unsigned int head)
2341{
2342 unsigned long tail;
2343 unsigned long mask;
2344
2345 if (!data->writable)
2346 return true;
2347
2348 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2349 /*
2350 * Userspace could choose to issue a mb() before updating the tail
2351 * pointer, so that all reads complete before the write is
2352 * issued.
2353 */
2354 tail = ACCESS_ONCE(data->user_page->data_tail);
2355 smp_rmb();
2356
2357 offset = (offset - tail) & mask;
2358 head = (head - tail) & mask;
2359
2360 if ((int)(head - offset) < 0)
2361 return false;
2362
2363 return true;
2364}
2365
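perf_output_space() works on free-running head/tail positions over a power-of-two buffer: both positions are re-based against the reader's tail with a mask, and the reservation is refused if the proposed head would run past the tail. The same arithmetic in stand-alone form (hypothetical helper; the example assumes a 4 KiB buffer):

#include <assert.h>

static int output_space(unsigned long size, unsigned long tail,
                        unsigned long offset, unsigned long head)
{
        unsigned long mask = size - 1;      /* size must be a power of two */

        offset = (offset - tail) & mask;    /* tail -> current head        */
        head   = (head   - tail) & mask;    /* tail -> proposed new head   */

        return (long)(head - offset) >= 0;  /* must not pass the tail      */
}

int main(void)
{
        /* 4 KiB buffer, reader at 100, writer at 4000: 196 bytes free. */
        assert(output_space(4096, 100, 4000, 4100));    /* 100 bytes fit    */
        assert(!output_space(4096, 100, 4000, 4300));   /* 300 bytes do not */
        return 0;
}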
2183static void perf_output_wakeup(struct perf_output_handle *handle) 2366static void perf_output_wakeup(struct perf_output_handle *handle)
2184{ 2367{
2185 atomic_set(&handle->data->poll, POLL_IN); 2368 atomic_set(&handle->data->poll, POLL_IN);
@@ -2270,12 +2453,57 @@ out:
2270 local_irq_restore(handle->flags); 2453 local_irq_restore(handle->flags);
2271} 2454}
2272 2455
2456static void perf_output_copy(struct perf_output_handle *handle,
2457 const void *buf, unsigned int len)
2458{
2459 unsigned int pages_mask;
2460 unsigned int offset;
2461 unsigned int size;
2462 void **pages;
2463
2464 offset = handle->offset;
2465 pages_mask = handle->data->nr_pages - 1;
2466 pages = handle->data->data_pages;
2467
2468 do {
2469 unsigned int page_offset;
2470 int nr;
2471
2472 nr = (offset >> PAGE_SHIFT) & pages_mask;
2473 page_offset = offset & (PAGE_SIZE - 1);
2474 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2475
2476 memcpy(pages[nr] + page_offset, buf, size);
2477
2478 len -= size;
2479 buf += size;
2480 offset += size;
2481 } while (len);
2482
2483 handle->offset = offset;
2484
2485 /*
2486 * Check we didn't copy past our reservation window, taking the
2487 * possible unsigned int wrap into account.
2488 */
2489 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2490}
2491
2492#define perf_output_put(handle, x) \
2493 perf_output_copy((handle), &(x), sizeof(x))
2494
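perf_output_copy() scatters a record across the power-of-two array of data pages, splitting each chunk at a page boundary and masking the page index so the running offset can simply keep increasing. A userspace model of the same loop; PAGE_SIZE and PAGE_SHIFT are pinned to 4 KiB here purely for the example:

#include <assert.h>
#include <string.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

/* Scatter `len` bytes starting at `offset` into nr_pages page buffers. */
static unsigned long paged_copy(void *pages[], unsigned long nr_pages,
                                unsigned long offset,
                                const void *buf, unsigned long len)
{
        unsigned long pages_mask = nr_pages - 1;

        while (len) {
                unsigned long nr   = (offset >> PAGE_SHIFT) & pages_mask;
                unsigned long poff = offset & (PAGE_SIZE - 1);
                unsigned long size = PAGE_SIZE - poff;

                if (size > len)
                        size = len;

                memcpy((char *)pages[nr] + poff, buf, size);

                len    -= size;
                buf     = (const char *)buf + size;
                offset += size;
        }

        return offset;          /* new write offset, like handle->offset */
}

int main(void)
{
        static char p0[PAGE_SIZE], p1[PAGE_SIZE];
        void *pages[] = { p0, p1 };
        char msg[] = "spans a page boundary";

        /* Start 10 bytes before the end of the first page. */
        unsigned long end = paged_copy(pages, 2, PAGE_SIZE - 10, msg, sizeof(msg));

        assert(end == PAGE_SIZE - 10 + sizeof(msg));
        assert(memcmp(p1, msg + 10, sizeof(msg) - 10) == 0);
        return 0;
}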
2273static int perf_output_begin(struct perf_output_handle *handle, 2495static int perf_output_begin(struct perf_output_handle *handle,
2274 struct perf_counter *counter, unsigned int size, 2496 struct perf_counter *counter, unsigned int size,
2275 int nmi, int overflow) 2497 int nmi, int sample)
2276{ 2498{
2277 struct perf_mmap_data *data; 2499 struct perf_mmap_data *data;
2278 unsigned int offset, head; 2500 unsigned int offset, head;
2501 int have_lost;
2502 struct {
2503 struct perf_event_header header;
2504 u64 id;
2505 u64 lost;
2506 } lost_event;
2279 2507
2280 /* 2508 /*
2281 * For inherited counters we send all the output towards the parent. 2509 * For inherited counters we send all the output towards the parent.
@@ -2288,19 +2516,25 @@ static int perf_output_begin(struct perf_output_handle *handle,
2288 if (!data) 2516 if (!data)
2289 goto out; 2517 goto out;
2290 2518
2291 handle->data = data; 2519 handle->data = data;
2292 handle->counter = counter; 2520 handle->counter = counter;
2293 handle->nmi = nmi; 2521 handle->nmi = nmi;
2294 handle->overflow = overflow; 2522 handle->sample = sample;
2295 2523
2296 if (!data->nr_pages) 2524 if (!data->nr_pages)
2297 goto fail; 2525 goto fail;
2298 2526
2527 have_lost = atomic_read(&data->lost);
2528 if (have_lost)
2529 size += sizeof(lost_event);
2530
2299 perf_output_lock(handle); 2531 perf_output_lock(handle);
2300 2532
2301 do { 2533 do {
2302 offset = head = atomic_long_read(&data->head); 2534 offset = head = atomic_long_read(&data->head);
2303 head += size; 2535 head += size;
2536 if (unlikely(!perf_output_space(data, offset, head)))
2537 goto fail;
2304 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 2538 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2305 2539
2306 handle->offset = offset; 2540 handle->offset = offset;
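The have_lost bookkeeping added here pairs with the fail path below: a writer that cannot reserve space only bumps data->lost and backs out, and the next writer that does get space prepends a PERF_EVENT_LOST record carrying the drained count, so the consumer can tell how many records were dropped. A toy model of just that accounting (the ring buffer itself is left out, and the names are illustrative):

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

static atomic_ulong lost;               /* dropped-record counter */

struct lost_record { uint64_t id, lost; };

static void reservation_failed(void)
{
        atomic_fetch_add(&lost, 1);
}

static int begin_output(uint64_t id, struct lost_record *rec)
{
        unsigned long have_lost = atomic_exchange(&lost, 0);

        if (!have_lost)
                return 0;               /* nothing to report */

        rec->id = id;
        rec->lost = have_lost;
        return 1;                       /* caller emits `rec` first */
}

int main(void)
{
        struct lost_record rec;

        reservation_failed();
        reservation_failed();
        assert(begin_output(42, &rec) == 1 && rec.lost == 2);
        assert(begin_output(42, &rec) == 0);
        return 0;
}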
@@ -2309,55 +2543,27 @@ static int perf_output_begin(struct perf_output_handle *handle,
2309 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) 2543 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2310 atomic_set(&data->wakeup, 1); 2544 atomic_set(&data->wakeup, 1);
2311 2545
2546 if (have_lost) {
2547 lost_event.header.type = PERF_EVENT_LOST;
2548 lost_event.header.misc = 0;
2549 lost_event.header.size = sizeof(lost_event);
2550 lost_event.id = counter->id;
2551 lost_event.lost = atomic_xchg(&data->lost, 0);
2552
2553 perf_output_put(handle, lost_event);
2554 }
2555
2312 return 0; 2556 return 0;
2313 2557
2314fail: 2558fail:
2315 perf_output_wakeup(handle); 2559 atomic_inc(&data->lost);
2560 perf_output_unlock(handle);
2316out: 2561out:
2317 rcu_read_unlock(); 2562 rcu_read_unlock();
2318 2563
2319 return -ENOSPC; 2564 return -ENOSPC;
2320} 2565}
2321 2566
2322static void perf_output_copy(struct perf_output_handle *handle,
2323 const void *buf, unsigned int len)
2324{
2325 unsigned int pages_mask;
2326 unsigned int offset;
2327 unsigned int size;
2328 void **pages;
2329
2330 offset = handle->offset;
2331 pages_mask = handle->data->nr_pages - 1;
2332 pages = handle->data->data_pages;
2333
2334 do {
2335 unsigned int page_offset;
2336 int nr;
2337
2338 nr = (offset >> PAGE_SHIFT) & pages_mask;
2339 page_offset = offset & (PAGE_SIZE - 1);
2340 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2341
2342 memcpy(pages[nr] + page_offset, buf, size);
2343
2344 len -= size;
2345 buf += size;
2346 offset += size;
2347 } while (len);
2348
2349 handle->offset = offset;
2350
2351 /*
2352 * Check we didn't copy past our reservation window, taking the
2353 * possible unsigned int wrap into account.
2354 */
2355 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2356}
2357
2358#define perf_output_put(handle, x) \
2359 perf_output_copy((handle), &(x), sizeof(x))
2360
2361static void perf_output_end(struct perf_output_handle *handle) 2567static void perf_output_end(struct perf_output_handle *handle)
2362{ 2568{
2363 struct perf_counter *counter = handle->counter; 2569 struct perf_counter *counter = handle->counter;
@@ -2365,7 +2571,7 @@ static void perf_output_end(struct perf_output_handle *handle)
2365 2571
2366 int wakeup_events = counter->attr.wakeup_events; 2572 int wakeup_events = counter->attr.wakeup_events;
2367 2573
2368 if (handle->overflow && wakeup_events) { 2574 if (handle->sample && wakeup_events) {
2369 int events = atomic_inc_return(&data->events); 2575 int events = atomic_inc_return(&data->events);
2370 if (events >= wakeup_events) { 2576 if (events >= wakeup_events) {
2371 atomic_sub(wakeup_events, &data->events); 2577 atomic_sub(wakeup_events, &data->events);
@@ -2421,15 +2627,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2421 u32 cpu, reserved; 2627 u32 cpu, reserved;
2422 } cpu_entry; 2628 } cpu_entry;
2423 2629
2424 header.type = 0; 2630 header.type = PERF_EVENT_SAMPLE;
2425 header.size = sizeof(header); 2631 header.size = sizeof(header);
2426 2632
2427 header.misc = PERF_EVENT_MISC_OVERFLOW; 2633 header.misc = 0;
2428 header.misc |= perf_misc_flags(data->regs); 2634 header.misc |= perf_misc_flags(data->regs);
2429 2635
2430 if (sample_type & PERF_SAMPLE_IP) { 2636 if (sample_type & PERF_SAMPLE_IP) {
2431 ip = perf_instruction_pointer(data->regs); 2637 ip = perf_instruction_pointer(data->regs);
2432 header.type |= PERF_SAMPLE_IP;
2433 header.size += sizeof(ip); 2638 header.size += sizeof(ip);
2434 } 2639 }
2435 2640
@@ -2438,7 +2643,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2438 tid_entry.pid = perf_counter_pid(counter, current); 2643 tid_entry.pid = perf_counter_pid(counter, current);
2439 tid_entry.tid = perf_counter_tid(counter, current); 2644 tid_entry.tid = perf_counter_tid(counter, current);
2440 2645
2441 header.type |= PERF_SAMPLE_TID;
2442 header.size += sizeof(tid_entry); 2646 header.size += sizeof(tid_entry);
2443 } 2647 }
2444 2648
@@ -2448,34 +2652,25 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2448 */ 2652 */
2449 time = sched_clock(); 2653 time = sched_clock();
2450 2654
2451 header.type |= PERF_SAMPLE_TIME;
2452 header.size += sizeof(u64); 2655 header.size += sizeof(u64);
2453 } 2656 }
2454 2657
2455 if (sample_type & PERF_SAMPLE_ADDR) { 2658 if (sample_type & PERF_SAMPLE_ADDR)
2456 header.type |= PERF_SAMPLE_ADDR;
2457 header.size += sizeof(u64); 2659 header.size += sizeof(u64);
2458 }
2459 2660
2460 if (sample_type & PERF_SAMPLE_ID) { 2661 if (sample_type & PERF_SAMPLE_ID)
2461 header.type |= PERF_SAMPLE_ID;
2462 header.size += sizeof(u64); 2662 header.size += sizeof(u64);
2463 }
2464 2663
2465 if (sample_type & PERF_SAMPLE_CPU) { 2664 if (sample_type & PERF_SAMPLE_CPU) {
2466 header.type |= PERF_SAMPLE_CPU;
2467 header.size += sizeof(cpu_entry); 2665 header.size += sizeof(cpu_entry);
2468 2666
2469 cpu_entry.cpu = raw_smp_processor_id(); 2667 cpu_entry.cpu = raw_smp_processor_id();
2470 } 2668 }
2471 2669
2472 if (sample_type & PERF_SAMPLE_PERIOD) { 2670 if (sample_type & PERF_SAMPLE_PERIOD)
2473 header.type |= PERF_SAMPLE_PERIOD;
2474 header.size += sizeof(u64); 2671 header.size += sizeof(u64);
2475 }
2476 2672
2477 if (sample_type & PERF_SAMPLE_GROUP) { 2673 if (sample_type & PERF_SAMPLE_GROUP) {
2478 header.type |= PERF_SAMPLE_GROUP;
2479 header.size += sizeof(u64) + 2674 header.size += sizeof(u64) +
2480 counter->nr_siblings * sizeof(group_entry); 2675 counter->nr_siblings * sizeof(group_entry);
2481 } 2676 }
@@ -2485,10 +2680,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2485 2680
2486 if (callchain) { 2681 if (callchain) {
2487 callchain_size = (1 + callchain->nr) * sizeof(u64); 2682 callchain_size = (1 + callchain->nr) * sizeof(u64);
2488
2489 header.type |= PERF_SAMPLE_CALLCHAIN;
2490 header.size += callchain_size; 2683 header.size += callchain_size;
2491 } 2684 } else
2685 header.size += sizeof(u64);
2492 } 2686 }
2493 2687
2494 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2688 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
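With header.type now always PERF_EVENT_SAMPLE and the per-field bits no longer OR'ed into it, a consumer reconstructs the record layout purely from the counter's attr.sample_type, accumulating field sizes exactly as the code above does. A sketch of that bookkeeping; the flag values below are placeholders rather than the real ABI bits, and the PERF_SAMPLE_GROUP case (u64 plus one entry per sibling) is omitted for brevity:

#include <stdint.h>
#include <stdio.h>

#define SAMPLE_IP        (1U << 0)      /* placeholder bits, not the ABI */
#define SAMPLE_TID       (1U << 1)
#define SAMPLE_TIME      (1U << 2)
#define SAMPLE_ADDR      (1U << 3)
#define SAMPLE_ID        (1U << 4)
#define SAMPLE_CPU       (1U << 5)
#define SAMPLE_PERIOD    (1U << 6)
#define SAMPLE_CALLCHAIN (1U << 7)

/* Expected size of one sample record for a given sample_type mask and
 * number of captured callchain entries (0 when no chain was recorded). */
static size_t sample_size(unsigned int sample_type, unsigned int nr_chain)
{
        size_t size = 8;            /* header: u32 type, u16 misc, u16 size */

        if (sample_type & SAMPLE_IP)        size += sizeof(uint64_t);
        if (sample_type & SAMPLE_TID)       size += sizeof(uint64_t); /* pid+tid */
        if (sample_type & SAMPLE_TIME)      size += sizeof(uint64_t);
        if (sample_type & SAMPLE_ADDR)      size += sizeof(uint64_t);
        if (sample_type & SAMPLE_ID)        size += sizeof(uint64_t);
        if (sample_type & SAMPLE_CPU)       size += sizeof(uint64_t); /* cpu+pad */
        if (sample_type & SAMPLE_PERIOD)    size += sizeof(uint64_t);
        if (sample_type & SAMPLE_CALLCHAIN) size += (1 + nr_chain) * sizeof(uint64_t);

        return size;
}

int main(void)
{
        printf("%zu\n", sample_size(SAMPLE_IP | SAMPLE_TID | SAMPLE_TIME, 0));
        return 0;
}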
@@ -2539,13 +2733,79 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2539 } 2733 }
2540 } 2734 }
2541 2735
2542 if (callchain) 2736 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2543 perf_output_copy(&handle, callchain, callchain_size); 2737 if (callchain)
2738 perf_output_copy(&handle, callchain, callchain_size);
2739 else {
2740 u64 nr = 0;
2741 perf_output_put(&handle, nr);
2742 }
2743 }
2544 2744
2545 perf_output_end(&handle); 2745 perf_output_end(&handle);
2546} 2746}
2547 2747
2548/* 2748/*
2749 * read event
2750 */
2751
2752struct perf_read_event {
2753 struct perf_event_header header;
2754
2755 u32 pid;
2756 u32 tid;
2757 u64 value;
2758 u64 format[3];
2759};
2760
2761static void
2762perf_counter_read_event(struct perf_counter *counter,
2763 struct task_struct *task)
2764{
2765 struct perf_output_handle handle;
2766 struct perf_read_event event = {
2767 .header = {
2768 .type = PERF_EVENT_READ,
2769 .misc = 0,
2770 .size = sizeof(event) - sizeof(event.format),
2771 },
2772 .pid = perf_counter_pid(counter, task),
2773 .tid = perf_counter_tid(counter, task),
2774 .value = atomic64_read(&counter->count),
2775 };
2776 int ret, i = 0;
2777
2778 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2779 event.header.size += sizeof(u64);
2780 event.format[i++] = counter->total_time_enabled;
2781 }
2782
2783 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2784 event.header.size += sizeof(u64);
2785 event.format[i++] = counter->total_time_running;
2786 }
2787
2788 if (counter->attr.read_format & PERF_FORMAT_ID) {
2789 u64 id;
2790
2791 event.header.size += sizeof(u64);
2792 if (counter->parent)
2793 id = counter->parent->id;
2794 else
2795 id = counter->id;
2796
2797 event.format[i++] = id;
2798 }
2799
2800 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
2801 if (ret)
2802 return;
2803
2804 perf_output_copy(&handle, &event, event.header.size);
2805 perf_output_end(&handle);
2806}
2807
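The read event is sized the same way: a fixed head plus up to three optional u64s chosen by read_format, with the header size starting at sizeof(event) - sizeof(event.format) and growing by one u64 per selected bit. A compact userspace rendering of that pattern (the FMT_* names are stand-ins for the PERF_FORMAT_* bits):

#include <assert.h>
#include <stdint.h>

#define FMT_TOTAL_TIME_ENABLED (1U << 0)        /* illustrative flags */
#define FMT_TOTAL_TIME_RUNNING (1U << 1)
#define FMT_ID                 (1U << 2)

struct read_event {
        uint16_t size;          /* bytes actually valid, head included */
        uint32_t pid, tid;
        uint64_t value;
        uint64_t format[3];     /* optional tail, filled on demand */
};

static void fill_read_event(struct read_event *ev, unsigned int read_format,
                            uint64_t enabled, uint64_t running, uint64_t id)
{
        int i = 0;

        ev->size = sizeof(*ev) - sizeof(ev->format);

        if (read_format & FMT_TOTAL_TIME_ENABLED) {
                ev->size += sizeof(uint64_t);
                ev->format[i++] = enabled;
        }
        if (read_format & FMT_TOTAL_TIME_RUNNING) {
                ev->size += sizeof(uint64_t);
                ev->format[i++] = running;
        }
        if (read_format & FMT_ID) {
                ev->size += sizeof(uint64_t);
                ev->format[i++] = id;
        }
}

int main(void)
{
        struct read_event ev = { .pid = 1, .tid = 1, .value = 1234 };

        fill_read_event(&ev, FMT_TOTAL_TIME_ENABLED | FMT_ID, 100, 0, 7);
        assert(ev.size == sizeof(ev) - sizeof(uint64_t)); /* one unused slot */
        return 0;
}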
2808/*
2549 * fork tracking 2809 * fork tracking
2550 */ 2810 */
2551 2811
@@ -2736,6 +2996,9 @@ void perf_counter_comm(struct task_struct *task)
2736{ 2996{
2737 struct perf_comm_event comm_event; 2997 struct perf_comm_event comm_event;
2738 2998
2999 if (task->perf_counter_ctxp)
3000 perf_counter_enable_on_exec(task);
3001
2739 if (!atomic_read(&nr_comm_counters)) 3002 if (!atomic_read(&nr_comm_counters))
2740 return; 3003 return;
2741 3004
@@ -2970,7 +3233,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
2970} 3233}
2971 3234
2972/* 3235/*
2973 * Generic counter overflow handling. 3236 * Generic counter overflow handling, sampling.
2974 */ 3237 */
2975 3238
2976int perf_counter_overflow(struct perf_counter *counter, int nmi, 3239int perf_counter_overflow(struct perf_counter *counter, int nmi,
@@ -3109,20 +3372,15 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3109} 3372}
3110 3373
3111static void perf_swcounter_overflow(struct perf_counter *counter, 3374static void perf_swcounter_overflow(struct perf_counter *counter,
3112 int nmi, struct pt_regs *regs, u64 addr) 3375 int nmi, struct perf_sample_data *data)
3113{ 3376{
3114 struct perf_sample_data data = { 3377 data->period = counter->hw.last_period;
3115 .regs = regs,
3116 .addr = addr,
3117 .period = counter->hw.last_period,
3118 };
3119 3378
3120 perf_swcounter_update(counter); 3379 perf_swcounter_update(counter);
3121 perf_swcounter_set_period(counter); 3380 perf_swcounter_set_period(counter);
3122 if (perf_counter_overflow(counter, nmi, &data)) 3381 if (perf_counter_overflow(counter, nmi, data))
3123 /* soft-disable the counter */ 3382 /* soft-disable the counter */
3124 ; 3383 ;
3125
3126} 3384}
3127 3385
3128static int perf_swcounter_is_counting(struct perf_counter *counter) 3386static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3187,18 +3445,18 @@ static int perf_swcounter_match(struct perf_counter *counter,
3187} 3445}
3188 3446
3189static void perf_swcounter_add(struct perf_counter *counter, u64 nr, 3447static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3190 int nmi, struct pt_regs *regs, u64 addr) 3448 int nmi, struct perf_sample_data *data)
3191{ 3449{
3192 int neg = atomic64_add_negative(nr, &counter->hw.count); 3450 int neg = atomic64_add_negative(nr, &counter->hw.count);
3193 3451
3194 if (counter->hw.sample_period && !neg && regs) 3452 if (counter->hw.sample_period && !neg && data->regs)
3195 perf_swcounter_overflow(counter, nmi, regs, addr); 3453 perf_swcounter_overflow(counter, nmi, data);
3196} 3454}
3197 3455
3198static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, 3456static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3199 enum perf_type_id type, u32 event, 3457 enum perf_type_id type,
3200 u64 nr, int nmi, struct pt_regs *regs, 3458 u32 event, u64 nr, int nmi,
3201 u64 addr) 3459 struct perf_sample_data *data)
3202{ 3460{
3203 struct perf_counter *counter; 3461 struct perf_counter *counter;
3204 3462
@@ -3207,8 +3465,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3207 3465
3208 rcu_read_lock(); 3466 rcu_read_lock();
3209 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3467 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3210 if (perf_swcounter_match(counter, type, event, regs)) 3468 if (perf_swcounter_match(counter, type, event, data->regs))
3211 perf_swcounter_add(counter, nr, nmi, regs, addr); 3469 perf_swcounter_add(counter, nr, nmi, data);
3212 } 3470 }
3213 rcu_read_unlock(); 3471 rcu_read_unlock();
3214} 3472}
@@ -3227,9 +3485,9 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3227 return &cpuctx->recursion[0]; 3485 return &cpuctx->recursion[0];
3228} 3486}
3229 3487
3230static void __perf_swcounter_event(enum perf_type_id type, u32 event, 3488static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3231 u64 nr, int nmi, struct pt_regs *regs, 3489 u64 nr, int nmi,
3232 u64 addr) 3490 struct perf_sample_data *data)
3233{ 3491{
3234 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3492 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3235 int *recursion = perf_swcounter_recursion_context(cpuctx); 3493 int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3242,7 +3500,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3242 barrier(); 3500 barrier();
3243 3501
3244 perf_swcounter_ctx_event(&cpuctx->ctx, type, event, 3502 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3245 nr, nmi, regs, addr); 3503 nr, nmi, data);
3246 rcu_read_lock(); 3504 rcu_read_lock();
3247 /* 3505 /*
3248 * doesn't really matter which of the child contexts the 3506 * doesn't really matter which of the child contexts the
@@ -3250,7 +3508,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3250 */ 3508 */
3251 ctx = rcu_dereference(current->perf_counter_ctxp); 3509 ctx = rcu_dereference(current->perf_counter_ctxp);
3252 if (ctx) 3510 if (ctx)
3253 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); 3511 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3254 rcu_read_unlock(); 3512 rcu_read_unlock();
3255 3513
3256 barrier(); 3514 barrier();
@@ -3260,10 +3518,15 @@ out:
3260 put_cpu_var(perf_cpu_context); 3518 put_cpu_var(perf_cpu_context);
3261} 3519}
3262 3520
3263void 3521void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3264perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) 3522 struct pt_regs *regs, u64 addr)
3265{ 3523{
3266 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); 3524 struct perf_sample_data data = {
3525 .regs = regs,
3526 .addr = addr,
3527 };
3528
3529 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3267} 3530}
3268 3531
3269static void perf_swcounter_read(struct perf_counter *counter) 3532static void perf_swcounter_read(struct perf_counter *counter)
@@ -3404,36 +3667,18 @@ static const struct pmu perf_ops_task_clock = {
3404 .read = task_clock_perf_counter_read, 3667 .read = task_clock_perf_counter_read,
3405}; 3668};
3406 3669
3407/*
3408 * Software counter: cpu migrations
3409 */
3410void perf_counter_task_migration(struct task_struct *task, int cpu)
3411{
3412 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3413 struct perf_counter_context *ctx;
3414
3415 perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
3416 PERF_COUNT_SW_CPU_MIGRATIONS,
3417 1, 1, NULL, 0);
3418
3419 ctx = perf_pin_task_context(task);
3420 if (ctx) {
3421 perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
3422 PERF_COUNT_SW_CPU_MIGRATIONS,
3423 1, 1, NULL, 0);
3424 perf_unpin_context(ctx);
3425 }
3426}
3427
3428#ifdef CONFIG_EVENT_PROFILE 3670#ifdef CONFIG_EVENT_PROFILE
3429void perf_tpcounter_event(int event_id) 3671void perf_tpcounter_event(int event_id)
3430{ 3672{
3431 struct pt_regs *regs = get_irq_regs(); 3673 struct perf_sample_data data = {
3674 .regs = get_irq_regs(),
3675 .addr = 0,
3676 };
3432 3677
3433 if (!regs) 3678 if (!data.regs)
3434 regs = task_pt_regs(current); 3679 data.regs = task_pt_regs(current);
3435 3680
3436 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); 3681 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
3437} 3682}
3438EXPORT_SYMBOL_GPL(perf_tpcounter_event); 3683EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3439 3684
@@ -3465,9 +3710,21 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3465} 3710}
3466#endif 3711#endif
3467 3712
3713atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3714
3715static void sw_perf_counter_destroy(struct perf_counter *counter)
3716{
3717 u64 event = counter->attr.config;
3718
3719 WARN_ON(counter->parent);
3720
3721 atomic_dec(&perf_swcounter_enabled[event]);
3722}
3723
3468static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) 3724static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3469{ 3725{
3470 const struct pmu *pmu = NULL; 3726 const struct pmu *pmu = NULL;
3727 u64 event = counter->attr.config;
3471 3728
3472 /* 3729 /*
3473 * Software counters (currently) can't in general distinguish 3730 * Software counters (currently) can't in general distinguish
@@ -3476,7 +3733,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3476 * to be kernel events, and page faults are never hypervisor 3733 * to be kernel events, and page faults are never hypervisor
3477 * events. 3734 * events.
3478 */ 3735 */
3479 switch (counter->attr.config) { 3736 switch (event) {
3480 case PERF_COUNT_SW_CPU_CLOCK: 3737 case PERF_COUNT_SW_CPU_CLOCK:
3481 pmu = &perf_ops_cpu_clock; 3738 pmu = &perf_ops_cpu_clock;
3482 3739
@@ -3497,6 +3754,10 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3497 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 3754 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3498 case PERF_COUNT_SW_CONTEXT_SWITCHES: 3755 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3499 case PERF_COUNT_SW_CPU_MIGRATIONS: 3756 case PERF_COUNT_SW_CPU_MIGRATIONS:
3757 if (!counter->parent) {
3758 atomic_inc(&perf_swcounter_enabled[event]);
3759 counter->destroy = sw_perf_counter_destroy;
3760 }
3500 pmu = &perf_ops_generic; 3761 pmu = &perf_ops_generic;
3501 break; 3762 break;
3502 } 3763 }
@@ -3512,6 +3773,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3512 int cpu, 3773 int cpu,
3513 struct perf_counter_context *ctx, 3774 struct perf_counter_context *ctx,
3514 struct perf_counter *group_leader, 3775 struct perf_counter *group_leader,
3776 struct perf_counter *parent_counter,
3515 gfp_t gfpflags) 3777 gfp_t gfpflags)
3516{ 3778{
3517 const struct pmu *pmu; 3779 const struct pmu *pmu;
@@ -3547,6 +3809,8 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3547 counter->ctx = ctx; 3809 counter->ctx = ctx;
3548 counter->oncpu = -1; 3810 counter->oncpu = -1;
3549 3811
3812 counter->parent = parent_counter;
3813
3550 counter->ns = get_pid_ns(current->nsproxy->pid_ns); 3814 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3551 counter->id = atomic64_inc_return(&perf_counter_id); 3815 counter->id = atomic64_inc_return(&perf_counter_id);
3552 3816
@@ -3604,11 +3868,13 @@ done:
3604 3868
3605 counter->pmu = pmu; 3869 counter->pmu = pmu;
3606 3870
3607 atomic_inc(&nr_counters); 3871 if (!counter->parent) {
3608 if (counter->attr.mmap) 3872 atomic_inc(&nr_counters);
3609 atomic_inc(&nr_mmap_counters); 3873 if (counter->attr.mmap)
3610 if (counter->attr.comm) 3874 atomic_inc(&nr_mmap_counters);
3611 atomic_inc(&nr_comm_counters); 3875 if (counter->attr.comm)
3876 atomic_inc(&nr_comm_counters);
3877 }
3612 3878
3613 return counter; 3879 return counter;
3614} 3880}
@@ -3771,7 +4037,7 @@ SYSCALL_DEFINE5(perf_counter_open,
3771 } 4037 }
3772 4038
3773 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4039 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3774 GFP_KERNEL); 4040 NULL, GFP_KERNEL);
3775 ret = PTR_ERR(counter); 4041 ret = PTR_ERR(counter);
3776 if (IS_ERR(counter)) 4042 if (IS_ERR(counter))
3777 goto err_put_context; 4043 goto err_put_context;
@@ -3837,7 +4103,8 @@ inherit_counter(struct perf_counter *parent_counter,
3837 4103
3838 child_counter = perf_counter_alloc(&parent_counter->attr, 4104 child_counter = perf_counter_alloc(&parent_counter->attr,
3839 parent_counter->cpu, child_ctx, 4105 parent_counter->cpu, child_ctx,
3840 group_leader, GFP_KERNEL); 4106 group_leader, parent_counter,
4107 GFP_KERNEL);
3841 if (IS_ERR(child_counter)) 4108 if (IS_ERR(child_counter))
3842 return child_counter; 4109 return child_counter;
3843 get_ctx(child_ctx); 4110 get_ctx(child_ctx);
@@ -3860,12 +4127,6 @@ inherit_counter(struct perf_counter *parent_counter,
3860 */ 4127 */
3861 add_counter_to_ctx(child_counter, child_ctx); 4128 add_counter_to_ctx(child_counter, child_ctx);
3862 4129
3863 child_counter->parent = parent_counter;
3864 /*
3865 * inherit into child's child as well:
3866 */
3867 child_counter->attr.inherit = 1;
3868
3869 /* 4130 /*
3870 * Get a reference to the parent filp - we will fput it 4131 * Get a reference to the parent filp - we will fput it
3871 * when the child counter exits. This is safe to do because 4132 * when the child counter exits. This is safe to do because
@@ -3909,10 +4170,14 @@ static int inherit_group(struct perf_counter *parent_counter,
3909} 4170}
3910 4171
3911static void sync_child_counter(struct perf_counter *child_counter, 4172static void sync_child_counter(struct perf_counter *child_counter,
3912 struct perf_counter *parent_counter) 4173 struct task_struct *child)
3913{ 4174{
4175 struct perf_counter *parent_counter = child_counter->parent;
3914 u64 child_val; 4176 u64 child_val;
3915 4177
4178 if (child_counter->attr.inherit_stat)
4179 perf_counter_read_event(child_counter, child);
4180
3916 child_val = atomic64_read(&child_counter->count); 4181 child_val = atomic64_read(&child_counter->count);
3917 4182
3918 /* 4183 /*
@@ -3941,7 +4206,8 @@ static void sync_child_counter(struct perf_counter *child_counter,
3941 4206
3942static void 4207static void
3943__perf_counter_exit_task(struct perf_counter *child_counter, 4208__perf_counter_exit_task(struct perf_counter *child_counter,
3944 struct perf_counter_context *child_ctx) 4209 struct perf_counter_context *child_ctx,
4210 struct task_struct *child)
3945{ 4211{
3946 struct perf_counter *parent_counter; 4212 struct perf_counter *parent_counter;
3947 4213
@@ -3955,7 +4221,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter,
3955 * counters need to be zapped - but otherwise linger. 4221 * counters need to be zapped - but otherwise linger.
3956 */ 4222 */
3957 if (parent_counter) { 4223 if (parent_counter) {
3958 sync_child_counter(child_counter, parent_counter); 4224 sync_child_counter(child_counter, child);
3959 free_counter(child_counter); 4225 free_counter(child_counter);
3960 } 4226 }
3961} 4227}
@@ -4017,7 +4283,7 @@ void perf_counter_exit_task(struct task_struct *child)
4017again: 4283again:
4018 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, 4284 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4019 list_entry) 4285 list_entry)
4020 __perf_counter_exit_task(child_counter, child_ctx); 4286 __perf_counter_exit_task(child_counter, child_ctx, child);
4021 4287
4022 /* 4288 /*
4023 * If the last counter was a group counter, it will have appended all 4289 * If the last counter was a group counter, it will have appended all
diff --git a/kernel/pid.c b/kernel/pid.c
index b2e5f78fd281..31310b5d3f50 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -378,26 +378,15 @@ EXPORT_SYMBOL(pid_task);
378/* 378/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 379 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
380 */ 380 */
381struct task_struct *find_task_by_pid_type_ns(int type, int nr, 381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382 struct pid_namespace *ns)
383{ 382{
384 return pid_task(find_pid_ns(nr, ns), type); 383 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
385} 384}
386 385
387EXPORT_SYMBOL(find_task_by_pid_type_ns);
388
389struct task_struct *find_task_by_vpid(pid_t vnr) 386struct task_struct *find_task_by_vpid(pid_t vnr)
390{ 387{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, 388 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
392 current->nsproxy->pid_ns);
393}
394EXPORT_SYMBOL(find_task_by_vpid);
395
396struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
397{
398 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
399} 389}
400EXPORT_SYMBOL(find_task_by_pid_ns);
401 390
402struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 391struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
403{ 392{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 2d1001b4858d..821722ae58a7 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -67,9 +67,10 @@ err_alloc:
67 return NULL; 67 return NULL;
68} 68}
69 69
70static struct pid_namespace *create_pid_namespace(unsigned int level) 70static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
71{ 71{
72 struct pid_namespace *ns; 72 struct pid_namespace *ns;
73 unsigned int level = parent_pid_ns->level + 1;
73 int i; 74 int i;
74 75
75 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 76 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
@@ -86,6 +87,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
86 87
87 kref_init(&ns->kref); 88 kref_init(&ns->kref);
88 ns->level = level; 89 ns->level = level;
90 ns->parent = get_pid_ns(parent_pid_ns);
89 91
90 set_bit(0, ns->pidmap[0].page); 92 set_bit(0, ns->pidmap[0].page);
91 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 93 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -114,25 +116,11 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
114 116
115struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 117struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
116{ 118{
117 struct pid_namespace *new_ns;
118
119 BUG_ON(!old_ns);
120 new_ns = get_pid_ns(old_ns);
121 if (!(flags & CLONE_NEWPID)) 119 if (!(flags & CLONE_NEWPID))
122 goto out; 120 return get_pid_ns(old_ns);
123
124 new_ns = ERR_PTR(-EINVAL);
125 if (flags & CLONE_THREAD) 121 if (flags & CLONE_THREAD)
126 goto out_put; 122 return ERR_PTR(-EINVAL);
127 123 return create_pid_namespace(old_ns);
128 new_ns = create_pid_namespace(old_ns->level + 1);
129 if (!IS_ERR(new_ns))
130 new_ns->parent = get_pid_ns(old_ns);
131
132out_put:
133 put_pid_ns(old_ns);
134out:
135 return new_ns;
136} 124}
137 125
138void free_pid_ns(struct kref *kref) 126void free_pid_ns(struct kref *kref)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ed97375daae9..bf0014d6a5f0 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,6 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f6d8b8cb5e34..082c320e4dbf 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -167,67 +167,82 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
167int ptrace_attach(struct task_struct *task) 167int ptrace_attach(struct task_struct *task)
168{ 168{
169 int retval; 169 int retval;
170 unsigned long flags;
171 170
172 audit_ptrace(task); 171 audit_ptrace(task);
173 172
174 retval = -EPERM; 173 retval = -EPERM;
174 if (unlikely(task->flags & PF_KTHREAD))
175 goto out;
175 if (same_thread_group(task, current)) 176 if (same_thread_group(task, current))
176 goto out; 177 goto out;
177 178
178 /* Protect the target's credential calculations against our 179 /*
180 * Protect exec's credential calculations against our interference;
179 * interference; SUID, SGID and LSM creds get determined differently 181 * interference; SUID, SGID and LSM creds get determined differently
180 * under ptrace. 182 * under ptrace.
181 */ 183 */
182 retval = mutex_lock_interruptible(&task->cred_guard_mutex); 184 retval = -ERESTARTNOINTR;
183 if (retval < 0) 185 if (mutex_lock_interruptible(&task->cred_guard_mutex))
184 goto out; 186 goto out;
185 187
186 retval = -EPERM;
187repeat:
188 /*
189 * Nasty, nasty.
190 *
191 * We want to hold both the task-lock and the
192 * tasklist_lock for writing at the same time.
193 * But that's against the rules (tasklist_lock
194 * is taken for reading by interrupts on other
195 * cpu's that may have task_lock).
196 */
197 task_lock(task); 188 task_lock(task);
198 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
199 task_unlock(task);
200 do {
201 cpu_relax();
202 } while (!write_can_lock(&tasklist_lock));
203 goto repeat;
204 }
205
206 if (!task->mm)
207 goto bad;
208 /* the same process cannot be attached many times */
209 if (task->ptrace & PT_PTRACED)
210 goto bad;
211 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); 189 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
190 task_unlock(task);
212 if (retval) 191 if (retval)
213 goto bad; 192 goto unlock_creds;
214 193
215 /* Go */ 194 write_lock_irq(&tasklist_lock);
216 task->ptrace |= PT_PTRACED; 195 retval = -EPERM;
196 if (unlikely(task->exit_state))
197 goto unlock_tasklist;
198 if (task->ptrace)
199 goto unlock_tasklist;
200
201 task->ptrace = PT_PTRACED;
217 if (capable(CAP_SYS_PTRACE)) 202 if (capable(CAP_SYS_PTRACE))
218 task->ptrace |= PT_PTRACE_CAP; 203 task->ptrace |= PT_PTRACE_CAP;
219 204
220 __ptrace_link(task, current); 205 __ptrace_link(task, current);
221
222 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 206 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
223bad: 207
224 write_unlock_irqrestore(&tasklist_lock, flags); 208 retval = 0;
225 task_unlock(task); 209unlock_tasklist:
210 write_unlock_irq(&tasklist_lock);
211unlock_creds:
226 mutex_unlock(&task->cred_guard_mutex); 212 mutex_unlock(&task->cred_guard_mutex);
227out: 213out:
228 return retval; 214 return retval;
229} 215}
230 216
217/**
218 * ptrace_traceme -- helper for PTRACE_TRACEME
219 *
220 * Performs checks and sets PT_PTRACED.
221 * Should be used by all ptrace implementations for PTRACE_TRACEME.
222 */
223int ptrace_traceme(void)
224{
225 int ret = -EPERM;
226
227 write_lock_irq(&tasklist_lock);
228 /* Are we already being traced? */
229 if (!current->ptrace) {
230 ret = security_ptrace_traceme(current->parent);
231 /*
232 * Check PF_EXITING to ensure ->real_parent has not passed
233 * exit_ptrace(). Otherwise we don't report the error but
234 * pretend ->real_parent untraces us right after return.
235 */
236 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
237 current->ptrace = PT_PTRACED;
238 __ptrace_link(current, current->real_parent);
239 }
240 }
241 write_unlock_irq(&tasklist_lock);
242
243 return ret;
244}
245
231/* 246/*
232 * Called with irqs disabled, returns true if childs should reap themselves. 247 * Called with irqs disabled, returns true if childs should reap themselves.
233 */ 248 */
@@ -409,37 +424,33 @@ static int ptrace_setoptions(struct task_struct *child, long data)
409 424
410static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 425static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
411{ 426{
427 unsigned long flags;
412 int error = -ESRCH; 428 int error = -ESRCH;
413 429
414 read_lock(&tasklist_lock); 430 if (lock_task_sighand(child, &flags)) {
415 if (likely(child->sighand != NULL)) {
416 error = -EINVAL; 431 error = -EINVAL;
417 spin_lock_irq(&child->sighand->siglock);
418 if (likely(child->last_siginfo != NULL)) { 432 if (likely(child->last_siginfo != NULL)) {
419 *info = *child->last_siginfo; 433 *info = *child->last_siginfo;
420 error = 0; 434 error = 0;
421 } 435 }
422 spin_unlock_irq(&child->sighand->siglock); 436 unlock_task_sighand(child, &flags);
423 } 437 }
424 read_unlock(&tasklist_lock);
425 return error; 438 return error;
426} 439}
427 440
428static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) 441static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
429{ 442{
443 unsigned long flags;
430 int error = -ESRCH; 444 int error = -ESRCH;
431 445
432 read_lock(&tasklist_lock); 446 if (lock_task_sighand(child, &flags)) {
433 if (likely(child->sighand != NULL)) {
434 error = -EINVAL; 447 error = -EINVAL;
435 spin_lock_irq(&child->sighand->siglock);
436 if (likely(child->last_siginfo != NULL)) { 448 if (likely(child->last_siginfo != NULL)) {
437 *child->last_siginfo = *info; 449 *child->last_siginfo = *info;
438 error = 0; 450 error = 0;
439 } 451 }
440 spin_unlock_irq(&child->sighand->siglock); 452 unlock_task_sighand(child, &flags);
441 } 453 }
442 read_unlock(&tasklist_lock);
443 return error; 454 return error;
444} 455}
445 456
@@ -566,72 +577,16 @@ int ptrace_request(struct task_struct *child, long request,
566 return ret; 577 return ret;
567} 578}
568 579
569/** 580static struct task_struct *ptrace_get_task_struct(pid_t pid)
570 * ptrace_traceme -- helper for PTRACE_TRACEME
571 *
572 * Performs checks and sets PT_PTRACED.
573 * Should be used by all ptrace implementations for PTRACE_TRACEME.
574 */
575int ptrace_traceme(void)
576{
577 int ret = -EPERM;
578
579 /*
580 * Are we already being traced?
581 */
582repeat:
583 task_lock(current);
584 if (!(current->ptrace & PT_PTRACED)) {
585 /*
586 * See ptrace_attach() comments about the locking here.
587 */
588 unsigned long flags;
589 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
590 task_unlock(current);
591 do {
592 cpu_relax();
593 } while (!write_can_lock(&tasklist_lock));
594 goto repeat;
595 }
596
597 ret = security_ptrace_traceme(current->parent);
598
599 /*
600 * Check PF_EXITING to ensure ->real_parent has not passed
601 * exit_ptrace(). Otherwise we don't report the error but
602 * pretend ->real_parent untraces us right after return.
603 */
604 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
605 current->ptrace |= PT_PTRACED;
606 __ptrace_link(current, current->real_parent);
607 }
608
609 write_unlock_irqrestore(&tasklist_lock, flags);
610 }
611 task_unlock(current);
612 return ret;
613}
614
615/**
616 * ptrace_get_task_struct -- grab a task struct reference for ptrace
617 * @pid: process id to grab a task_struct reference of
618 *
619 * This function is a helper for ptrace implementations. It checks
620 * permissions and then grabs a task struct for use of the actual
621 * ptrace implementation.
622 *
623 * Returns the task_struct for @pid or an ERR_PTR() on failure.
624 */
625struct task_struct *ptrace_get_task_struct(pid_t pid)
626{ 581{
627 struct task_struct *child; 582 struct task_struct *child;
628 583
629 read_lock(&tasklist_lock); 584 rcu_read_lock();
630 child = find_task_by_vpid(pid); 585 child = find_task_by_vpid(pid);
631 if (child) 586 if (child)
632 get_task_struct(child); 587 get_task_struct(child);
588 rcu_read_unlock();
633 589
634 read_unlock(&tasklist_lock);
635 if (!child) 590 if (!child)
636 return ERR_PTR(-ESRCH); 591 return ERR_PTR(-ESRCH);
637 return child; 592 return child;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0dccfbba6d26..7717b95c2027 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1533,7 +1533,7 @@ void __init __rcu_init(void)
1533 int j; 1533 int j;
1534 struct rcu_node *rnp; 1534 struct rcu_node *rnp;
1535 1535
1536 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); 1536 printk(KERN_INFO "Hierarchical RCU implementation.\n");
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
@@ -1546,7 +1546,6 @@ void __init __rcu_init(void)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); 1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */ 1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb); 1548 register_cpu_notifier(&rcu_nb);
1549 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1550} 1549}
1551 1550
1552module_param(blimit, int, 0); 1551module_param(blimit, int, 0);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bf8e7534c803..e1338f074314 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -18,7 +18,7 @@
18void res_counter_init(struct res_counter *counter, struct res_counter *parent) 18void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = (unsigned long long)LLONG_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
@@ -133,6 +133,16 @@ int res_counter_memparse_write_strategy(const char *buf,
133 unsigned long long *res) 133 unsigned long long *res)
134{ 134{
135 char *end; 135 char *end;
136
137 /* return RESOURCE_MAX(unlimited) if "-1" is specified */
138 if (*buf == '-') {
139 *res = simple_strtoull(buf + 1, &end, 10);
140 if (*res != 1 || *end != '\0')
141 return -EINVAL;
142 *res = RESOURCE_MAX;
143 return 0;
144 }
145
136 /* FIXME - make memparse() take const char* args */ 146 /* FIXME - make memparse() take const char* args */
137 *res = memparse((char *)buf, &end); 147 *res = memparse((char *)buf, &end);
138 if (*end != '\0') 148 if (*end != '\0')
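The new branch accepts a literal "-1" as shorthand for RESOURCE_MAX (unlimited) before falling through to memparse() for ordinary values with size suffixes. A userspace approximation of the combined parse, with a simplified suffix handler standing in for memparse() and ULLONG_MAX standing in for RESOURCE_MAX:

#include <assert.h>
#include <limits.h>
#include <stdlib.h>

/* Rough stand-in for the kernel's memparse(): number + optional K/M/G. */
static unsigned long long parse_size(const char *s, char **end)
{
        unsigned long long val = strtoull(s, end, 10);

        switch (**end) {
        case 'G': case 'g': val <<= 10; /* fall through */
        case 'M': case 'm': val <<= 10; /* fall through */
        case 'K': case 'k': val <<= 10; (*end)++; break;
        }
        return val;
}

static int parse_limit(const char *buf, unsigned long long *res)
{
        char *end;

        /* "-1" means unlimited. */
        if (*buf == '-') {
                *res = strtoull(buf + 1, &end, 10);
                if (*res != 1 || *end != '\0')
                        return -1;
                *res = ULLONG_MAX;
                return 0;
        }

        *res = parse_size(buf, &end);
        return *end == '\0' ? 0 : -1;
}

int main(void)
{
        unsigned long long v;

        assert(parse_limit("-1", &v) == 0 && v == ULLONG_MAX);
        assert(parse_limit("4K", &v) == 0 && v == 4096);
        assert(parse_limit("-2", &v) != 0);
        return 0;
}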
diff --git a/kernel/resource.c b/kernel/resource.c
index ac5f3a36923f..78b087221c15 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -787,7 +787,7 @@ static int __init reserve_setup(char *str)
787 static struct resource reserve[MAXRESERVE]; 787 static struct resource reserve[MAXRESERVE];
788 788
789 for (;;) { 789 for (;;) {
790 int io_start, io_num; 790 unsigned int io_start, io_num;
791 int x = reserved; 791 int x = reserved;
792 792
793 if (get_option (&str, &io_start) != 2) 793 if (get_option (&str, &io_start) != 2)
diff --git a/kernel/sched.c b/kernel/sched.c
index 8fb88a906aaa..01f55ada3598 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1978,7 +1978,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1978 if (task_hot(p, old_rq->clock, NULL)) 1978 if (task_hot(p, old_rq->clock, NULL))
1979 schedstat_inc(p, se.nr_forced2_migrations); 1979 schedstat_inc(p, se.nr_forced2_migrations);
1980#endif 1980#endif
1981 perf_counter_task_migration(p, new_cpu); 1981 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1982 1, 1, NULL, 0);
1982 } 1983 }
1983 p->se.vruntime -= old_cfsrq->min_vruntime - 1984 p->se.vruntime -= old_cfsrq->min_vruntime -
1984 new_cfsrq->min_vruntime; 1985 new_cfsrq->min_vruntime;
@@ -6540,6 +6541,11 @@ SYSCALL_DEFINE0(sched_yield)
6540 return 0; 6541 return 0;
6541} 6542}
6542 6543
6544static inline int should_resched(void)
6545{
6546 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6547}
6548
6543static void __cond_resched(void) 6549static void __cond_resched(void)
6544{ 6550{
6545#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6551#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6559,8 +6565,7 @@ static void __cond_resched(void)
6559 6565
6560int __sched _cond_resched(void) 6566int __sched _cond_resched(void)
6561{ 6567{
6562 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 6568 if (should_resched()) {
6563 system_state == SYSTEM_RUNNING) {
6564 __cond_resched(); 6569 __cond_resched();
6565 return 1; 6570 return 1;
6566 } 6571 }
@@ -6578,12 +6583,12 @@ EXPORT_SYMBOL(_cond_resched);
6578 */ 6583 */
6579int cond_resched_lock(spinlock_t *lock) 6584int cond_resched_lock(spinlock_t *lock)
6580{ 6585{
6581 int resched = need_resched() && system_state == SYSTEM_RUNNING; 6586 int resched = should_resched();
6582 int ret = 0; 6587 int ret = 0;
6583 6588
6584 if (spin_needbreak(lock) || resched) { 6589 if (spin_needbreak(lock) || resched) {
6585 spin_unlock(lock); 6590 spin_unlock(lock);
6586 if (resched && need_resched()) 6591 if (resched)
6587 __cond_resched(); 6592 __cond_resched();
6588 else 6593 else
6589 cpu_relax(); 6594 cpu_relax();
@@ -6598,7 +6603,7 @@ int __sched cond_resched_softirq(void)
6598{ 6603{
6599 BUG_ON(!in_softirq()); 6604 BUG_ON(!in_softirq());
6600 6605
6601 if (need_resched() && system_state == SYSTEM_RUNNING) { 6606 if (should_resched()) {
6602 local_bh_enable(); 6607 local_bh_enable();
6603 __cond_resched(); 6608 __cond_resched();
6604 local_bh_disable(); 6609 local_bh_disable();
@@ -7045,7 +7050,7 @@ static int migration_thread(void *data)
7045 7050
7046 if (cpu_is_offline(cpu)) { 7051 if (cpu_is_offline(cpu)) {
7047 spin_unlock_irq(&rq->lock); 7052 spin_unlock_irq(&rq->lock);
7048 goto wait_to_die; 7053 break;
7049 } 7054 }
7050 7055
7051 if (rq->active_balance) { 7056 if (rq->active_balance) {
@@ -7071,16 +7076,7 @@ static int migration_thread(void *data)
7071 complete(&req->done); 7076 complete(&req->done);
7072 } 7077 }
7073 __set_current_state(TASK_RUNNING); 7078 __set_current_state(TASK_RUNNING);
7074 return 0;
7075 7079
7076wait_to_die:
7077 /* Wait for kthread_stop */
7078 set_current_state(TASK_INTERRUPTIBLE);
7079 while (!kthread_should_stop()) {
7080 schedule();
7081 set_current_state(TASK_INTERRUPTIBLE);
7082 }
7083 __set_current_state(TASK_RUNNING);
7084 return 0; 7080 return 0;
7085} 7081}
7086 7082
@@ -7494,6 +7490,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7494 rq = task_rq_lock(p, &flags); 7490 rq = task_rq_lock(p, &flags);
7495 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7491 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7496 task_rq_unlock(rq, &flags); 7492 task_rq_unlock(rq, &flags);
7493 get_task_struct(p);
7497 cpu_rq(cpu)->migration_thread = p; 7494 cpu_rq(cpu)->migration_thread = p;
7498 break; 7495 break;
7499 7496
@@ -7524,6 +7521,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7524 kthread_bind(cpu_rq(cpu)->migration_thread, 7521 kthread_bind(cpu_rq(cpu)->migration_thread,
7525 cpumask_any(cpu_online_mask)); 7522 cpumask_any(cpu_online_mask));
7526 kthread_stop(cpu_rq(cpu)->migration_thread); 7523 kthread_stop(cpu_rq(cpu)->migration_thread);
7524 put_task_struct(cpu_rq(cpu)->migration_thread);
7527 cpu_rq(cpu)->migration_thread = NULL; 7525 cpu_rq(cpu)->migration_thread = NULL;
7528 break; 7526 break;
7529 7527
@@ -7533,6 +7531,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7533 migrate_live_tasks(cpu); 7531 migrate_live_tasks(cpu);
7534 rq = cpu_rq(cpu); 7532 rq = cpu_rq(cpu);
7535 kthread_stop(rq->migration_thread); 7533 kthread_stop(rq->migration_thread);
7534 put_task_struct(rq->migration_thread);
7536 rq->migration_thread = NULL; 7535 rq->migration_thread = NULL;
7537 /* Idle task back to normal (off runqueue, low prio) */ 7536 /* Idle task back to normal (off runqueue, low prio) */
7538 spin_lock_irq(&rq->lock); 7537 spin_lock_irq(&rq->lock);
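
With the wait_to_die loop gone, the migration thread can exit on its own once its CPU goes offline, so the hotplug notifier pins the task with get_task_struct() before publishing it and drops that reference after kthread_stop(). A rough user-space model of that pin-before-publish, put-after-stop pattern; struct task, task_alloc() and the other names below are invented for the sketch and only mimic task_struct reference counting.

#include <stdio.h>
#include <stdlib.h>

struct task {
	int refcount;
	const char *name;
};

static struct task *task_alloc(const char *name)
{
	struct task *t = malloc(sizeof(*t));

	t->refcount = 1;        /* reference held by the "thread" itself */
	t->name = name;
	return t;
}

static void get_task(struct task *t) { t->refcount++; }

static void put_task(struct task *t)
{
	if (--t->refcount == 0) {
		printf("freeing %s\n", t->name);
		free(t);
	}
}

/* The thread exiting drops its own reference (it may be the last one). */
static void thread_exit(struct task *t) { put_task(t); }

int main(void)
{
	struct task *migration_thread = task_alloc("migration/0");

	get_task(migration_thread);     /* notifier pins it before publishing */
	thread_exit(migration_thread);  /* thread may exit early on its own   */

	/* kthread_stop()-style teardown can still dereference the task ...   */
	printf("stopping %s\n", migration_thread->name);
	put_task(migration_thread);     /* ... and then drops the notifier's pin */
	return 0;
}
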
@@ -7828,7 +7827,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7828 free_rootdomain(old_rd); 7827 free_rootdomain(old_rd);
7829} 7828}
7830 7829
7831static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7830static int init_rootdomain(struct root_domain *rd, bool bootmem)
7832{ 7831{
7833 gfp_t gfp = GFP_KERNEL; 7832 gfp_t gfp = GFP_KERNEL;
7834 7833
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 7deffc9f0e5f..e6c251790dde 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -152,7 +152,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
152 * 152 *
153 * Returns: -ENOMEM if memory fails. 153 * Returns: -ENOMEM if memory fails.
154 */ 154 */
155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 155int cpupri_init(struct cpupri *cp, bool bootmem)
156{ 156{
157 gfp_t gfp = GFP_KERNEL; 157 gfp_t gfp = GFP_KERNEL;
158 int i; 158 int i;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 467ca72f1657..70c7e0b79946 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162{ 162{
163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
164 spread, rq0_min_vruntime, spread0; 164 spread, rq0_min_vruntime, spread0;
165 struct rq *rq = &per_cpu(runqueues, cpu); 165 struct rq *rq = cpu_rq(cpu);
166 struct sched_entity *last; 166 struct sched_entity *last;
167 unsigned long flags; 167 unsigned long flags;
168 168
@@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
191 if (last) 191 if (last)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
@@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
248 248
249static void print_cpu(struct seq_file *m, int cpu) 249static void print_cpu(struct seq_file *m, int cpu)
250{ 250{
251 struct rq *rq = &per_cpu(runqueues, cpu); 251 struct rq *rq = cpu_rq(cpu);
252 252
253#ifdef CONFIG_X86 253#ifdef CONFIG_X86
254 { 254 {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5f9650e8fe75..ba7fd6e9556f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -430,12 +430,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
430 430
431 for_each_sched_entity(se) { 431 for_each_sched_entity(se) {
432 struct load_weight *load; 432 struct load_weight *load;
433 struct load_weight lw;
433 434
434 cfs_rq = cfs_rq_of(se); 435 cfs_rq = cfs_rq_of(se);
435 load = &cfs_rq->load; 436 load = &cfs_rq->load;
436 437
437 if (unlikely(!se->on_rq)) { 438 if (unlikely(!se->on_rq)) {
438 struct load_weight lw = cfs_rq->load; 439 lw = cfs_rq->load;
439 440
440 update_load_add(&lw, se->load.weight); 441 update_load_add(&lw, se->load.weight);
441 load = &lw; 442 load = &lw;
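
The sched_fair.c hunk is a lifetime fix: lw used to be declared inside the if block while load = &lw was still dereferenced after that block ended, that is, a pointer to an automatic variable that had already gone out of scope. A small stand-alone illustration of why hoisting the declaration matters; plain C, not the scheduler code itself.

#include <stdio.h>

static int broken(int adjust)
{
	int *val;
	int base = 10;

	val = &base;
	if (adjust) {
		int tmp = base + 5;     /* lives only inside this block */
		val = &tmp;
	}
	return *val;                    /* UB when adjust != 0: tmp is gone */
}

static int fixed(int adjust)
{
	int *val;
	int base = 10;
	int tmp;                        /* hoisted: lives as long as val is used */

	val = &base;
	if (adjust) {
		tmp = base + 5;
		val = &tmp;
	}
	return *val;                    /* always well defined */
}

int main(void)
{
	printf("fixed(0)=%d fixed(1)=%d\n", fixed(0), fixed(1));
	(void)broken;                   /* kept only to contrast the lifetimes */
	return 0;
}
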
diff --git a/kernel/signal.c b/kernel/signal.c
index d81f4952eebb..ccf1ceedaebe 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1410,7 +1410,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1410 /* do_notify_parent_cldstop should have been called instead. */ 1410 /* do_notify_parent_cldstop should have been called instead. */
1411 BUG_ON(task_is_stopped_or_traced(tsk)); 1411 BUG_ON(task_is_stopped_or_traced(tsk));
1412 1412
1413 BUG_ON(!tsk->ptrace && 1413 BUG_ON(!task_ptrace(tsk) &&
1414 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1414 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1415 1415
1416 info.si_signo = sig; 1416 info.si_signo = sig;
@@ -1449,7 +1449,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1449 1449
1450 psig = tsk->parent->sighand; 1450 psig = tsk->parent->sighand;
1451 spin_lock_irqsave(&psig->siglock, flags); 1451 spin_lock_irqsave(&psig->siglock, flags);
1452 if (!tsk->ptrace && sig == SIGCHLD && 1452 if (!task_ptrace(tsk) && sig == SIGCHLD &&
1453 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1453 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1454 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1454 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1455 /* 1455 /*
@@ -1486,7 +1486,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1486 struct task_struct *parent; 1486 struct task_struct *parent;
1487 struct sighand_struct *sighand; 1487 struct sighand_struct *sighand;
1488 1488
1489 if (tsk->ptrace & PT_PTRACED) 1489 if (task_ptrace(tsk))
1490 parent = tsk->parent; 1490 parent = tsk->parent;
1491 else { 1491 else {
1492 tsk = tsk->group_leader; 1492 tsk = tsk->group_leader;
@@ -1499,7 +1499,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1499 * see comment in do_notify_parent() abot the following 3 lines 1499 * see comment in do_notify_parent() abot the following 3 lines
1500 */ 1500 */
1501 rcu_read_lock(); 1501 rcu_read_lock();
1502 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1502 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1503 info.si_uid = __task_cred(tsk)->uid; 1503 info.si_uid = __task_cred(tsk)->uid;
1504 rcu_read_unlock(); 1504 rcu_read_unlock();
1505 1505
@@ -1535,7 +1535,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1535 1535
1536static inline int may_ptrace_stop(void) 1536static inline int may_ptrace_stop(void)
1537{ 1537{
1538 if (!likely(current->ptrace & PT_PTRACED)) 1538 if (!likely(task_ptrace(current)))
1539 return 0; 1539 return 0;
1540 /* 1540 /*
1541 * Are we in the middle of do_coredump? 1541 * Are we in the middle of do_coredump?
@@ -1753,7 +1753,7 @@ static int do_signal_stop(int signr)
1753static int ptrace_signal(int signr, siginfo_t *info, 1753static int ptrace_signal(int signr, siginfo_t *info,
1754 struct pt_regs *regs, void *cookie) 1754 struct pt_regs *regs, void *cookie)
1755{ 1755{
1756 if (!(current->ptrace & PT_PTRACED)) 1756 if (!task_ptrace(current))
1757 return signr; 1757 return signr;
1758 1758
1759 ptrace_signal_deliver(regs, cookie); 1759 ptrace_signal_deliver(regs, cookie);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b41fb710e114..3a94905fa5d2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -213,6 +213,7 @@ restart:
213 do { 213 do {
214 if (pending & 1) { 214 if (pending & 1) {
215 int prev_count = preempt_count(); 215 int prev_count = preempt_count();
216 kstat_incr_softirqs_this_cpu(h - softirq_vec);
216 217
217 trace_softirq_entry(h, softirq_vec); 218 trace_softirq_entry(h, softirq_vec);
218 h->action(h); 219 h->action(h);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ab462b9968d5..98e02328c67d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -335,7 +335,10 @@ static struct ctl_table kern_table[] = {
335 .data = &sysctl_timer_migration, 335 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int), 336 .maxlen = sizeof(unsigned int),
337 .mode = 0644, 337 .mode = 0644,
338 .proc_handler = &proc_dointvec, 338 .proc_handler = &proc_dointvec_minmax,
339 .strategy = &sysctl_intvec,
340 .extra1 = &zero,
341 .extra2 = &one,
339 }, 342 },
340#endif 343#endif
341 { 344 {
@@ -744,6 +747,14 @@ static struct ctl_table kern_table[] = {
744 .proc_handler = &proc_dointvec, 747 .proc_handler = &proc_dointvec,
745 }, 748 },
746 { 749 {
750 .ctl_name = CTL_UNNUMBERED,
751 .procname = "panic_on_io_nmi",
752 .data = &panic_on_io_nmi,
753 .maxlen = sizeof(int),
754 .mode = 0644,
755 .proc_handler = &proc_dointvec,
756 },
757 {
747 .ctl_name = KERN_BOOTLOADER_TYPE, 758 .ctl_name = KERN_BOOTLOADER_TYPE,
748 .procname = "bootloader_type", 759 .procname = "bootloader_type",
749 .data = &bootloader_type, 760 .data = &bootloader_type,
@@ -2283,7 +2294,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2283 void *data) 2294 void *data)
2284{ 2295{
2285#define TMPBUFLEN 21 2296#define TMPBUFLEN 21
2286 int *i, vleft, first=1, neg, val; 2297 int *i, vleft, first = 1, neg;
2287 unsigned long lval; 2298 unsigned long lval;
2288 size_t left, len; 2299 size_t left, len;
2289 2300
@@ -2336,8 +2347,6 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2336 len = p-buf; 2347 len = p-buf;
2337 if ((len < left) && *p && !isspace(*p)) 2348 if ((len < left) && *p && !isspace(*p))
2338 break; 2349 break;
2339 if (neg)
2340 val = -val;
2341 s += len; 2350 s += len;
2342 left -= len; 2351 left -= len;
2343 2352
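
In the first sysctl.c hunk, timer_migration moves from proc_dointvec to proc_dointvec_minmax with extra1/extra2 pointing at zero and one, so only values in [0, 1] can end up stored. A tiny user-space analogue of that bounded-knob idea; struct knob and knob_write() are invented for the sketch, not the kernel's ctl_table machinery.

#include <stdio.h>

struct knob {
	const char *name;
	int        *data;
	int         min;
	int         max;
};

/* Keep out-of-range input from ever being stored, like the minmax handler. */
static int knob_write(struct knob *k, int val)
{
	if (val < k->min || val > k->max)
		return -1;
	*k->data = val;
	return 0;
}

int main(void)
{
	int timer_migration = 1;
	struct knob k = { "timer_migration", &timer_migration, 0, 1 };
	int ret;

	ret = knob_write(&k, 0);
	printf("write 0 -> %d (value now %d)\n", ret, timer_migration);
	ret = knob_write(&k, 7);
	printf("write 7 -> %d (value now %d)\n", ret, timer_migration);
	return 0;
}
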
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ad6dd461119..a6dcd67b041d 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -254,15 +254,4 @@ void clockevents_notify(unsigned long reason, void *arg)
254 spin_unlock(&clockevents_lock); 254 spin_unlock(&clockevents_lock);
255} 255}
256EXPORT_SYMBOL_GPL(clockevents_notify); 256EXPORT_SYMBOL_GPL(clockevents_notify);
257
258ktime_t clockevents_get_next_event(int cpu)
259{
260 struct tick_device *td;
261 struct clock_event_device *dev;
262
263 td = &per_cpu(tick_cpu_device, cpu);
264 dev = td->evtdev;
265
266 return dev->next_event;
267}
268#endif 257#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2aff39c6f10c..e0f59a21c061 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle)
222 222
223 cpu = smp_processor_id(); 223 cpu = smp_processor_id();
224 ts = &per_cpu(tick_cpu_sched, cpu); 224 ts = &per_cpu(tick_cpu_sched, cpu);
225
226 /*
227 * Call to tick_nohz_start_idle stops the last_update_time from being
228 * updated. Thus, it must not be called in the event we are called from
229 * irq_exit() with the prior state different than idle.
230 */
231 if (!inidle && !ts->inidle)
232 goto end;
233
225 now = tick_nohz_start_idle(ts); 234 now = tick_nohz_start_idle(ts);
226 235
227 /* 236 /*
@@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
240 goto end; 249 goto end;
241 250
242 if (!inidle && !ts->inidle)
243 goto end;
244
245 ts->inidle = 1; 251 ts->inidle = 1;
246 252
247 if (need_resched()) 253 if (need_resched())
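
The tick-sched change is purely about ordering: the "not actually entering idle" bail-out now runs before tick_nohz_start_idle(), because (per the new comment) that call stops last_update_time from being updated and so must not happen when we arrived here from irq_exit() without being idle. A generic check-before-side-effect sketch of the same shape; all names below are placeholders.

#include <stdio.h>

static int accounting_frozen;           /* models the unwanted side effect */

static void start_idle(void)
{
	accounting_frozen = 1;          /* last_update_time stops moving from here on */
}

static void stop_sched_tick(int inidle, int already_inidle)
{
	/* Decide whether this call counts before touching any state. */
	if (!inidle && !already_inidle)
		return;                 /* e.g. from irq_exit() while not idle */

	start_idle();
}

int main(void)
{
	stop_sched_tick(0, 0);          /* spurious call: must not freeze accounting */
	printf("after spurious call: frozen=%d\n", accounting_frozen);
	stop_sched_tick(1, 0);          /* genuine idle entry */
	printf("after idle entry:   frozen=%d\n", accounting_frozen);
	return 0;
}
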
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
96/* 96/*
97 * Collection status, active/inactive: 97 * Collection status, active/inactive:
98 */ 98 */
99static int __read_mostly active; 99int __read_mostly timer_stats_active;
100 100
101/* 101/*
102 * Beginning/end timestamps of measurement: 102 * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
257 if (!active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
260 entry = tstat_lookup(&input, comm); 260 entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
290 /* 290 /*
291 * If still active then calculate up to now: 291 * If still active then calculate up to now:
292 */ 292 */
293 if (active) 293 if (timer_stats_active)
294 time_stop = ktime_get(); 294 time_stop = ktime_get();
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
368 mutex_lock(&show_mutex); 368 mutex_lock(&show_mutex);
369 switch (ctl[0]) { 369 switch (ctl[0]) {
370 case '0': 370 case '0':
371 if (active) { 371 if (timer_stats_active) {
372 active = 0; 372 timer_stats_active = 0;
373 time_stop = ktime_get(); 373 time_stop = ktime_get();
374 sync_access(); 374 sync_access();
375 } 375 }
376 break; 376 break;
377 case '1': 377 case '1':
378 if (!active) { 378 if (!timer_stats_active) {
379 reset_entries(); 379 reset_entries();
380 time_start = ktime_get(); 380 time_start = ktime_get();
381 smp_mb(); 381 smp_mb();
382 active = 1; 382 timer_stats_active = 1;
383 } 383 }
384 break; 384 break;
385 default: 385 default:
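
Turning the file-local active flag into a global timer_stats_active presumably lets fast paths outside timer_stats.c test it before calling in at all, in the same spirit as the early return added to timer.c just below. A small sketch of that exported read-mostly flag plus inline-check pattern; stats_active, stats_record() and maybe_record() are names made up for the sketch.

#include <stdio.h>

/* Exported flag: cheap to test from any fast path ("read mostly"). */
int stats_active;

/* The slow path, only entered when collection is switched on. */
static void stats_record(const char *what)
{
	printf("recorded: %s\n", what);
}

/* Inline wrapper a hot path can call unconditionally. */
static inline void maybe_record(const char *what)
{
	if (!stats_active)              /* one load and a branch when stats are off */
		return;
	stats_record(what);
}

int main(void)
{
	maybe_record("timer armed");    /* no output: collection is off */
	stats_active = 1;
	maybe_record("timer armed");    /* recorded */
	return 0;
}
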
diff --git a/kernel/timer.c b/kernel/timer.c
index 54d3912f8cad..0b36b9e5cc8b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
380{ 380{
381 unsigned int flag = 0; 381 unsigned int flag = 0;
382 382
383 if (likely(!timer->start_site))
384 return;
383 if (unlikely(tbase_get_deferrable(timer->base))) 385 if (unlikely(tbase_get_deferrable(timer->base)))
384 flag |= TIMER_STATS_FLAG_DEFERRABLE; 386 flag |= TIMER_STATS_FLAG_DEFERRABLE;
385 387
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 61071fecc82e..019f380fd764 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER
18config HAVE_FUNCTION_GRAPH_TRACER 18config HAVE_FUNCTION_GRAPH_TRACER
19 bool 19 bool
20 20
21config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool
23 help
24 An arch may pass in a unique value (frame pointer) to both the
25 entering and exiting of a function. On exit, the value is compared
26 and if it does not match, then it will panic the kernel.
27
21config HAVE_FUNCTION_TRACE_MCOUNT_TEST 28config HAVE_FUNCTION_TRACE_MCOUNT_TEST
22 bool 29 bool
23 help 30 help
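
The new HAVE_FUNCTION_GRAPH_FP_TEST help text describes a sanity check: the architecture hands the same frame-pointer value to the entry and exit hooks, and a mismatch on exit means the return stack got out of step. A toy shadow-stack version of that check; the names are inventions of the sketch, and the kernel's panic is turned into a message here, so this is not the real ftrace return-stack code.

#include <assert.h>
#include <stdio.h>

#define STACK_DEPTH 32

struct ret_entry {
	unsigned long fp;               /* frame pointer recorded at function entry */
};

static struct ret_entry shadow[STACK_DEPTH];
static int depth;

static void func_entry(unsigned long fp)
{
	assert(depth < STACK_DEPTH);
	shadow[depth++].fp = fp;
}

static void func_exit(unsigned long fp)
{
	assert(depth > 0);
	depth--;
	if (shadow[depth].fp != fp) {
		/* The kernel would panic here: the hooks are out of sync. */
		fprintf(stderr, "fp mismatch: saved %#lx, got %#lx\n",
			shadow[depth].fp, fp);
	}
}

int main(void)
{
	func_entry(0x1000);             /* outer frame  */
	func_entry(0x2000);             /* nested frame */
	func_exit(0x2000);              /* matches      */
	func_exit(0x1000);              /* matches      */
	printf("shadow stack balanced\n");
	return 0;
}
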
@@ -121,6 +128,7 @@ config FUNCTION_GRAPH_TRACER
121 bool "Kernel Function Graph Tracer" 128 bool "Kernel Function Graph Tracer"
122 depends on HAVE_FUNCTION_GRAPH_TRACER 129 depends on HAVE_FUNCTION_GRAPH_TRACER
123 depends on FUNCTION_TRACER 130 depends on FUNCTION_TRACER
131 depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
124 default y 132 default y
125 help 133 help
126 Enable the kernel to trace a function at both its return 134 Enable the kernel to trace a function at both its return
@@ -218,13 +226,13 @@ config BOOT_TRACER
218 the timings of the initcalls and traces key events and the identity 226 the timings of the initcalls and traces key events and the identity
219 of tasks that can cause boot delays, such as context-switches. 227 of tasks that can cause boot delays, such as context-switches.
220 228
221 Its aim is to be parsed by the /scripts/bootgraph.pl tool to 229 Its aim is to be parsed by the scripts/bootgraph.pl tool to
222 produce pretty graphics about boot inefficiencies, giving a visual 230 produce pretty graphics about boot inefficiencies, giving a visual
223 representation of the delays during initcalls - but the raw 231 representation of the delays during initcalls - but the raw
224 /debug/tracing/trace text output is readable too. 232 /debug/tracing/trace text output is readable too.
225 233
226 You must pass in ftrace=initcall to the kernel command line 234 You must pass in initcall_debug and ftrace=initcall to the kernel
227 to enable this on bootup. 235 command line to enable this on bootup.
228 236
229config TRACE_BRANCH_PROFILING 237config TRACE_BRANCH_PROFILING
230 bool 238 bool
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 39af8af6fc30..1090b0aed9ba 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -22,6 +22,7 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/smp_lock.h>
25#include <linux/time.h> 26#include <linux/time.h>
26#include <linux/uaccess.h> 27#include <linux/uaccess.h>
27 28
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bb60732ade0c..4521c77d1a1a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -291,7 +291,9 @@ function_stat_next(void *v, int idx)
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); 291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292 292
293 again: 293 again:
294 rec++; 294 if (idx != 0)
295 rec++;
296
295 if ((void *)rec >= (void *)&pg->records[pg->index]) { 297 if ((void *)rec >= (void *)&pg->records[pg->index]) {
296 pg = pg->next; 298 pg = pg->next;
297 if (!pg) 299 if (!pg)
@@ -766,7 +768,7 @@ static struct tracer_stat function_stats __initdata = {
766 .stat_show = function_stat_show 768 .stat_show = function_stat_show
767}; 769};
768 770
769static void ftrace_profile_debugfs(struct dentry *d_tracer) 771static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
770{ 772{
771 struct ftrace_profile_stat *stat; 773 struct ftrace_profile_stat *stat;
772 struct dentry *entry; 774 struct dentry *entry;
@@ -784,7 +786,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
784 * The files created are permanent, if something happens 786 * The files created are permanent, if something happens
785 * we still do not free memory. 787 * we still do not free memory.
786 */ 788 */
787 kfree(stat);
788 WARN(1, 789 WARN(1,
789 "Could not allocate stat file for cpu %d\n", 790 "Could not allocate stat file for cpu %d\n",
790 cpu); 791 cpu);
@@ -811,7 +812,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
811} 812}
812 813
813#else /* CONFIG_FUNCTION_PROFILER */ 814#else /* CONFIG_FUNCTION_PROFILER */
814static void ftrace_profile_debugfs(struct dentry *d_tracer) 815static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
815{ 816{
816} 817}
817#endif /* CONFIG_FUNCTION_PROFILER */ 818#endif /* CONFIG_FUNCTION_PROFILER */
@@ -1224,6 +1225,13 @@ static void ftrace_shutdown(int command)
1224 return; 1225 return;
1225 1226
1226 ftrace_start_up--; 1227 ftrace_start_up--;
1228 /*
1229 * Just warn in case of unbalance, no need to kill ftrace, it's not
1230 * critical but the ftrace_call callers may be never nopped again after
1231 * further ftrace uses.
1232 */
1233 WARN_ON_ONCE(ftrace_start_up < 0);
1234
1227 if (!ftrace_start_up) 1235 if (!ftrace_start_up)
1228 command |= FTRACE_DISABLE_CALLS; 1236 command |= FTRACE_DISABLE_CALLS;
1229 1237
@@ -1410,10 +1418,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1410{ 1418{
1411 struct ftrace_iterator *iter = m->private; 1419 struct ftrace_iterator *iter = m->private;
1412 void *p = NULL; 1420 void *p = NULL;
1421 loff_t l;
1422
1423 if (!(iter->flags & FTRACE_ITER_HASH))
1424 *pos = 0;
1413 1425
1414 iter->flags |= FTRACE_ITER_HASH; 1426 iter->flags |= FTRACE_ITER_HASH;
1415 1427
1416 return t_hash_next(m, p, pos); 1428 iter->hidx = 0;
1429 for (l = 0; l <= *pos; ) {
1430 p = t_hash_next(m, p, &l);
1431 if (!p)
1432 break;
1433 }
1434 return p;
1417} 1435}
1418 1436
1419static int t_hash_show(struct seq_file *m, void *v) 1437static int t_hash_show(struct seq_file *m, void *v)
@@ -1460,8 +1478,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1460 iter->pg = iter->pg->next; 1478 iter->pg = iter->pg->next;
1461 iter->idx = 0; 1479 iter->idx = 0;
1462 goto retry; 1480 goto retry;
1463 } else {
1464 iter->idx = -1;
1465 } 1481 }
1466 } else { 1482 } else {
1467 rec = &iter->pg->records[iter->idx++]; 1483 rec = &iter->pg->records[iter->idx++];
@@ -1490,6 +1506,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1490{ 1506{
1491 struct ftrace_iterator *iter = m->private; 1507 struct ftrace_iterator *iter = m->private;
1492 void *p = NULL; 1508 void *p = NULL;
1509 loff_t l;
1493 1510
1494 mutex_lock(&ftrace_lock); 1511 mutex_lock(&ftrace_lock);
1495 /* 1512 /*
@@ -1501,23 +1518,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1501 if (*pos > 0) 1518 if (*pos > 0)
1502 return t_hash_start(m, pos); 1519 return t_hash_start(m, pos);
1503 iter->flags |= FTRACE_ITER_PRINTALL; 1520 iter->flags |= FTRACE_ITER_PRINTALL;
1504 (*pos)++;
1505 return iter; 1521 return iter;
1506 } 1522 }
1507 1523
1508 if (iter->flags & FTRACE_ITER_HASH) 1524 if (iter->flags & FTRACE_ITER_HASH)
1509 return t_hash_start(m, pos); 1525 return t_hash_start(m, pos);
1510 1526
1511 if (*pos > 0) { 1527 iter->pg = ftrace_pages_start;
1512 if (iter->idx < 0) 1528 iter->idx = 0;
1513 return p; 1529 for (l = 0; l <= *pos; ) {
1514 (*pos)--; 1530 p = t_next(m, p, &l);
1515 iter->idx--; 1531 if (!p)
1532 break;
1516 } 1533 }
1517 1534
1518 p = t_next(m, p, pos); 1535 if (!p && iter->flags & FTRACE_ITER_FILTER)
1519
1520 if (!p)
1521 return t_hash_start(m, pos); 1536 return t_hash_start(m, pos);
1522 1537
1523 return p; 1538 return p;
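
The reworked t_start() stops trying to rewind its private bookkeeping and instead resets to the first page and replays t_next() until *pos is reached, which is the usual way to make a seq_file ->start callback safe at an arbitrary position (the t_hash_start() hunk above does the same). A compact user-space model of that replay loop; the record array and names are made up.

#include <stdio.h>

static const char *records[] = { "alpha", "beta", "gamma", "delta" };
#define NRECORDS (sizeof(records) / sizeof(records[0]))

struct iter { unsigned int idx; };

/* ->next: advance one record, bump *pos, return NULL past the end. */
static const char *t_next(struct iter *it, long *pos)
{
	(*pos)++;
	if (it->idx >= NRECORDS)
		return NULL;
	return records[it->idx++];
}

/* ->start: forget old state and walk forward until *pos is reached. */
static const char *t_start(struct iter *it, long *pos)
{
	const char *p = NULL;
	long l;

	it->idx = 0;
	for (l = 0; l <= *pos; ) {
		p = t_next(it, &l);
		if (!p)
			break;
	}
	return p;
}

int main(void)
{
	struct iter it;
	long pos = 2;

	/* Seeking straight to position 2 yields the third record. */
	printf("start at %ld -> %s\n", pos, t_start(&it, &pos));
	return 0;
}
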
@@ -2493,32 +2508,31 @@ int ftrace_graph_count;
2493unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2508unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2494 2509
2495static void * 2510static void *
2496g_next(struct seq_file *m, void *v, loff_t *pos) 2511__g_next(struct seq_file *m, loff_t *pos)
2497{ 2512{
2498 unsigned long *array = m->private; 2513 unsigned long *array = m->private;
2499 int index = *pos;
2500
2501 (*pos)++;
2502 2514
2503 if (index >= ftrace_graph_count) 2515 if (*pos >= ftrace_graph_count)
2504 return NULL; 2516 return NULL;
2517 return &array[*pos];
2518}
2505 2519
2506 return &array[index]; 2520static void *
2521g_next(struct seq_file *m, void *v, loff_t *pos)
2522{
2523 (*pos)++;
2524 return __g_next(m, pos);
2507} 2525}
2508 2526
2509static void *g_start(struct seq_file *m, loff_t *pos) 2527static void *g_start(struct seq_file *m, loff_t *pos)
2510{ 2528{
2511 void *p = NULL;
2512
2513 mutex_lock(&graph_lock); 2529 mutex_lock(&graph_lock);
2514 2530
2515 /* Nothing, tell g_show to print all functions are enabled */ 2531 /* Nothing, tell g_show to print all functions are enabled */
2516 if (!ftrace_graph_count && !*pos) 2532 if (!ftrace_graph_count && !*pos)
2517 return (void *)1; 2533 return (void *)1;
2518 2534
2519 p = g_next(m, p, pos); 2535 return __g_next(m, pos);
2520
2521 return p;
2522} 2536}
2523 2537
2524static void g_stop(struct seq_file *m, void *p) 2538static void g_stop(struct seq_file *m, void *p)
@@ -3145,10 +3159,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3145 3159
3146 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3160 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
3147 3161
3148 if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) 3162 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3149 goto out; 3163 goto out;
3150 3164
3151 last_ftrace_enabled = ftrace_enabled; 3165 last_ftrace_enabled = !!ftrace_enabled;
3152 3166
3153 if (ftrace_enabled) { 3167 if (ftrace_enabled) {
3154 3168
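
The last ftrace.c hunk normalizes ftrace_enabled with a double negation before caching and comparing it, so writing any non-zero value counts as the same "enabled" state as writing 1. A two-line illustration of why that matters for the comparison; plain C, nothing kernel-specific.

#include <stdio.h>

int main(void)
{
	int last_enabled = 1;   /* previously cached as a boolean   */
	int sysctl_value = 2;   /* user wrote "2" into the sysctl   */

	/* Raw comparison thinks the state changed; the normalized one does not. */
	printf("raw:        %s\n", last_enabled == sysctl_value   ? "unchanged" : "changed");
	printf("normalized: %s\n", last_enabled == !!sysctl_value ? "unchanged" : "changed");
	return 0;
}
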
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 86cdf671d7e2..1edaa9516e81 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -186,7 +186,7 @@ static int kmem_trace_init(struct trace_array *tr)
186 int cpu; 186 int cpu;
187 kmemtrace_array = tr; 187 kmemtrace_array = tr;
188 188
189 for_each_cpu_mask(cpu, cpu_possible_map) 189 for_each_cpu(cpu, cpu_possible_mask)
190 tracing_reset(tr, cpu); 190 tracing_reset(tr, cpu);
191 191
192 kmemtrace_start_probes(); 192 kmemtrace_start_probes();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dc4dc70171ce..bf27bb7a63e2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -206,6 +206,7 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 207#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
209 210
210/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 211/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
211#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 212#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -415,6 +416,8 @@ struct ring_buffer_per_cpu {
415 unsigned long overrun; 416 unsigned long overrun;
416 unsigned long read; 417 unsigned long read;
417 local_t entries; 418 local_t entries;
419 local_t committing;
420 local_t commits;
418 u64 write_stamp; 421 u64 write_stamp;
419 u64 read_stamp; 422 u64 read_stamp;
420 atomic_t record_disabled; 423 atomic_t record_disabled;
@@ -618,12 +621,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
618 kfree(cpu_buffer); 621 kfree(cpu_buffer);
619} 622}
620 623
621/*
622 * Causes compile errors if the struct buffer_page gets bigger
623 * than the struct page.
624 */
625extern int ring_buffer_page_too_big(void);
626
627#ifdef CONFIG_HOTPLUG_CPU 624#ifdef CONFIG_HOTPLUG_CPU
628static int rb_cpu_notify(struct notifier_block *self, 625static int rb_cpu_notify(struct notifier_block *self,
629 unsigned long action, void *hcpu); 626 unsigned long action, void *hcpu);
@@ -646,11 +643,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
646 int bsize; 643 int bsize;
647 int cpu; 644 int cpu;
648 645
649 /* Paranoid! Optimizes out when all is well */
650 if (sizeof(struct buffer_page) > sizeof(struct page))
651 ring_buffer_page_too_big();
652
653
654 /* keep it in its own cache line */ 646 /* keep it in its own cache line */
655 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 647 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
656 GFP_KERNEL); 648 GFP_KERNEL);
@@ -666,8 +658,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
666 buffer->reader_lock_key = key; 658 buffer->reader_lock_key = key;
667 659
668 /* need at least two pages */ 660 /* need at least two pages */
669 if (buffer->pages == 1) 661 if (buffer->pages < 2)
670 buffer->pages++; 662 buffer->pages = 2;
671 663
672 /* 664 /*
673 * In case of non-hotplug cpu, if the ring-buffer is allocated 665 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1011,12 +1003,12 @@ rb_event_index(struct ring_buffer_event *event)
1011{ 1003{
1012 unsigned long addr = (unsigned long)event; 1004 unsigned long addr = (unsigned long)event;
1013 1005
1014 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 1006 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
1015} 1007}
1016 1008
1017static inline int 1009static inline int
1018rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 1010rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1019 struct ring_buffer_event *event) 1011 struct ring_buffer_event *event)
1020{ 1012{
1021 unsigned long addr = (unsigned long)event; 1013 unsigned long addr = (unsigned long)event;
1022 unsigned long index; 1014 unsigned long index;
@@ -1029,31 +1021,6 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1029} 1021}
1030 1022
1031static void 1023static void
1032rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
1033 struct ring_buffer_event *event)
1034{
1035 unsigned long addr = (unsigned long)event;
1036 unsigned long index;
1037
1038 index = rb_event_index(event);
1039 addr &= PAGE_MASK;
1040
1041 while (cpu_buffer->commit_page->page != (void *)addr) {
1042 if (RB_WARN_ON(cpu_buffer,
1043 cpu_buffer->commit_page == cpu_buffer->tail_page))
1044 return;
1045 cpu_buffer->commit_page->page->commit =
1046 cpu_buffer->commit_page->write;
1047 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1048 cpu_buffer->write_stamp =
1049 cpu_buffer->commit_page->page->time_stamp;
1050 }
1051
1052 /* Now set the commit to the event's index */
1053 local_set(&cpu_buffer->commit_page->page->commit, index);
1054}
1055
1056static void
1057rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1024rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1058{ 1025{
1059 /* 1026 /*
@@ -1171,6 +1138,60 @@ static unsigned rb_calculate_event_length(unsigned length)
1171 return length; 1138 return length;
1172} 1139}
1173 1140
1141static inline void
1142rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1143 struct buffer_page *tail_page,
1144 unsigned long tail, unsigned long length)
1145{
1146 struct ring_buffer_event *event;
1147
1148 /*
1149 * Only the event that crossed the page boundary
1150 * must fill the old tail_page with padding.
1151 */
1152 if (tail >= BUF_PAGE_SIZE) {
1153 local_sub(length, &tail_page->write);
1154 return;
1155 }
1156
1157 event = __rb_page_index(tail_page, tail);
1158 kmemcheck_annotate_bitfield(event, bitfield);
1159
1160 /*
1161 * If this event is bigger than the minimum size, then
1162 * we need to be careful that we don't subtract the
1163 * write counter enough to allow another writer to slip
1164 * in on this page.
1165 * We put in a discarded commit instead, to make sure
1166 * that this space is not used again.
1167 *
1168 * If we are less than the minimum size, we don't need to
1169 * worry about it.
1170 */
1171 if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
1172 /* No room for any events */
1173
1174 /* Mark the rest of the page with padding */
1175 rb_event_set_padding(event);
1176
1177 /* Set the write back to the previous setting */
1178 local_sub(length, &tail_page->write);
1179 return;
1180 }
1181
1182 /* Put in a discarded event */
1183 event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
1184 event->type_len = RINGBUF_TYPE_PADDING;
1185 /* time delta must be non zero */
1186 event->time_delta = 1;
1187 /* Account for this as an entry */
1188 local_inc(&tail_page->entries);
1189 local_inc(&cpu_buffer->entries);
1190
1191 /* Set write to end of buffer */
1192 length = (tail + length) - BUF_PAGE_SIZE;
1193 local_sub(length, &tail_page->write);
1194}
1174 1195
1175static struct ring_buffer_event * 1196static struct ring_buffer_event *
1176rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1197rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1180,7 +1201,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1180{ 1201{
1181 struct buffer_page *next_page, *head_page, *reader_page; 1202 struct buffer_page *next_page, *head_page, *reader_page;
1182 struct ring_buffer *buffer = cpu_buffer->buffer; 1203 struct ring_buffer *buffer = cpu_buffer->buffer;
1183 struct ring_buffer_event *event;
1184 bool lock_taken = false; 1204 bool lock_taken = false;
1185 unsigned long flags; 1205 unsigned long flags;
1186 1206
@@ -1265,27 +1285,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1265 cpu_buffer->tail_page->page->time_stamp = *ts; 1285 cpu_buffer->tail_page->page->time_stamp = *ts;
1266 } 1286 }
1267 1287
1268 /* 1288 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1269 * The actual tail page has moved forward.
1270 */
1271 if (tail < BUF_PAGE_SIZE) {
1272 /* Mark the rest of the page with padding */
1273 event = __rb_page_index(tail_page, tail);
1274 kmemcheck_annotate_bitfield(event, bitfield);
1275 rb_event_set_padding(event);
1276 }
1277
1278 /* Set the write back to the previous setting */
1279 local_sub(length, &tail_page->write);
1280
1281 /*
1282 * If this was a commit entry that failed,
1283 * increment that too
1284 */
1285 if (tail_page == cpu_buffer->commit_page &&
1286 tail == rb_commit_index(cpu_buffer)) {
1287 rb_set_commit_to_write(cpu_buffer);
1288 }
1289 1289
1290 __raw_spin_unlock(&cpu_buffer->lock); 1290 __raw_spin_unlock(&cpu_buffer->lock);
1291 local_irq_restore(flags); 1291 local_irq_restore(flags);
@@ -1295,7 +1295,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1295 1295
1296 out_reset: 1296 out_reset:
1297 /* reset write */ 1297 /* reset write */
1298 local_sub(length, &tail_page->write); 1298 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1299 1299
1300 if (likely(lock_taken)) 1300 if (likely(lock_taken))
1301 __raw_spin_unlock(&cpu_buffer->lock); 1301 __raw_spin_unlock(&cpu_buffer->lock);
@@ -1325,9 +1325,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1325 1325
1326 /* We reserved something on the buffer */ 1326 /* We reserved something on the buffer */
1327 1327
1328 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1329 return NULL;
1330
1331 event = __rb_page_index(tail_page, tail); 1328 event = __rb_page_index(tail_page, tail);
1332 kmemcheck_annotate_bitfield(event, bitfield); 1329 kmemcheck_annotate_bitfield(event, bitfield);
1333 rb_update_event(event, type, length); 1330 rb_update_event(event, type, length);
@@ -1337,11 +1334,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1337 local_inc(&tail_page->entries); 1334 local_inc(&tail_page->entries);
1338 1335
1339 /* 1336 /*
1340 * If this is a commit and the tail is zero, then update 1337 * If this is the first commit on the page, then update
1341 * this page's time stamp. 1338 * its timestamp.
1342 */ 1339 */
1343 if (!tail && rb_is_commit(cpu_buffer, event)) 1340 if (!tail)
1344 cpu_buffer->commit_page->page->time_stamp = *ts; 1341 tail_page->page->time_stamp = *ts;
1345 1342
1346 return event; 1343 return event;
1347} 1344}
@@ -1410,16 +1407,16 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1410 return -EAGAIN; 1407 return -EAGAIN;
1411 1408
1412 /* Only a commited time event can update the write stamp */ 1409 /* Only a commited time event can update the write stamp */
1413 if (rb_is_commit(cpu_buffer, event)) { 1410 if (rb_event_is_commit(cpu_buffer, event)) {
1414 /* 1411 /*
1415 * If this is the first on the page, then we need to 1412 * If this is the first on the page, then it was
1416 * update the page itself, and just put in a zero. 1413 * updated with the page itself. Try to discard it
1414 * and if we can't just make it zero.
1417 */ 1415 */
1418 if (rb_event_index(event)) { 1416 if (rb_event_index(event)) {
1419 event->time_delta = *delta & TS_MASK; 1417 event->time_delta = *delta & TS_MASK;
1420 event->array[0] = *delta >> TS_SHIFT; 1418 event->array[0] = *delta >> TS_SHIFT;
1421 } else { 1419 } else {
1422 cpu_buffer->commit_page->page->time_stamp = *ts;
1423 /* try to discard, since we do not need this */ 1420 /* try to discard, since we do not need this */
1424 if (!rb_try_to_discard(cpu_buffer, event)) { 1421 if (!rb_try_to_discard(cpu_buffer, event)) {
1425 /* nope, just zero it */ 1422 /* nope, just zero it */
@@ -1445,6 +1442,44 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1445 return ret; 1442 return ret;
1446} 1443}
1447 1444
1445static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
1446{
1447 local_inc(&cpu_buffer->committing);
1448 local_inc(&cpu_buffer->commits);
1449}
1450
1451static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
1452{
1453 unsigned long commits;
1454
1455 if (RB_WARN_ON(cpu_buffer,
1456 !local_read(&cpu_buffer->committing)))
1457 return;
1458
1459 again:
1460 commits = local_read(&cpu_buffer->commits);
1461 /* synchronize with interrupts */
1462 barrier();
1463 if (local_read(&cpu_buffer->committing) == 1)
1464 rb_set_commit_to_write(cpu_buffer);
1465
1466 local_dec(&cpu_buffer->committing);
1467
1468 /* synchronize with interrupts */
1469 barrier();
1470
1471 /*
1472 * Need to account for interrupts coming in between the
1473 * updating of the commit page and the clearing of the
1474 * committing counter.
1475 */
1476 if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
1477 !local_read(&cpu_buffer->committing)) {
1478 local_inc(&cpu_buffer->committing);
1479 goto again;
1480 }
1481}
1482
1448static struct ring_buffer_event * 1483static struct ring_buffer_event *
1449rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 1484rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1450 unsigned long length) 1485 unsigned long length)
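
rb_start_commit()/rb_end_commit() above replace the old commit-page bookkeeping with two per-cpu counters: committing says how deeply commits are nested, commits counts how many were ever started, and only the outermost writer publishes the commit, re-checking afterwards in case an interrupt squeezed a commit in between the publish and the decrement. A single-threaded user-space model of that hand-off; the atomics, write_pos and published are simplifications, not the ring buffer's real state.

#include <stdatomic.h>
#include <stdio.h>

static atomic_long committing = 0;      /* commits currently open on this "cpu" */
static atomic_long commits    = 0;      /* commits ever started                 */
static long write_pos;                  /* where writers have written so far    */
static long published;                  /* what readers are allowed to see      */

static void start_commit(void)
{
	atomic_fetch_add(&committing, 1);
	atomic_fetch_add(&commits, 1);
}

static void end_commit(void)
{
	long seen;
again:
	seen = atomic_load(&commits);
	if (atomic_load(&committing) == 1)
		published = write_pos;  /* outermost writer publishes */
	atomic_fetch_sub(&committing, 1);

	/*
	 * A commit may have started between the publish and the decrement
	 * (an interrupt, in the kernel); if so, redo the publish for it.
	 */
	if (atomic_load(&commits) != seen && atomic_load(&committing) == 0) {
		atomic_fetch_add(&committing, 1);
		goto again;
	}
}

int main(void)
{
	start_commit();         /* outer writer reserves an event                */
	write_pos = 10;
	start_commit();         /* an "interrupt" nests a second commit          */
	write_pos = 20;
	end_commit();           /* nested end: committing was 2, nothing published */
	end_commit();           /* outermost end: publishes up to write_pos      */
	printf("published=%ld (expected 20)\n", published);
	return 0;
}
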
@@ -1454,6 +1489,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1454 int commit = 0; 1489 int commit = 0;
1455 int nr_loops = 0; 1490 int nr_loops = 0;
1456 1491
1492 rb_start_commit(cpu_buffer);
1493
1457 length = rb_calculate_event_length(length); 1494 length = rb_calculate_event_length(length);
1458 again: 1495 again:
1459 /* 1496 /*
@@ -1466,7 +1503,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1466 * Bail! 1503 * Bail!
1467 */ 1504 */
1468 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 1505 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1469 return NULL; 1506 goto out_fail;
1470 1507
1471 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 1508 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1472 1509
@@ -1497,7 +1534,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1497 1534
1498 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 1535 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1499 if (commit == -EBUSY) 1536 if (commit == -EBUSY)
1500 return NULL; 1537 goto out_fail;
1501 1538
1502 if (commit == -EAGAIN) 1539 if (commit == -EAGAIN)
1503 goto again; 1540 goto again;
@@ -1511,30 +1548,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1511 if (unlikely(PTR_ERR(event) == -EAGAIN)) 1548 if (unlikely(PTR_ERR(event) == -EAGAIN))
1512 goto again; 1549 goto again;
1513 1550
1514 if (!event) { 1551 if (!event)
1515 if (unlikely(commit)) 1552 goto out_fail;
1516 /*
1517 * Ouch! We needed a timestamp and it was commited. But
1518 * we didn't get our event reserved.
1519 */
1520 rb_set_commit_to_write(cpu_buffer);
1521 return NULL;
1522 }
1523 1553
1524 /* 1554 if (!rb_event_is_commit(cpu_buffer, event))
1525 * If the timestamp was commited, make the commit our entry
1526 * now so that we will update it when needed.
1527 */
1528 if (unlikely(commit))
1529 rb_set_commit_event(cpu_buffer, event);
1530 else if (!rb_is_commit(cpu_buffer, event))
1531 delta = 0; 1555 delta = 0;
1532 1556
1533 event->time_delta = delta; 1557 event->time_delta = delta;
1534 1558
1535 return event; 1559 return event;
1560
1561 out_fail:
1562 rb_end_commit(cpu_buffer);
1563 return NULL;
1536} 1564}
1537 1565
1566#ifdef CONFIG_TRACING
1567
1538#define TRACE_RECURSIVE_DEPTH 16 1568#define TRACE_RECURSIVE_DEPTH 16
1539 1569
1540static int trace_recursive_lock(void) 1570static int trace_recursive_lock(void)
@@ -1565,6 +1595,13 @@ static void trace_recursive_unlock(void)
1565 current->trace_recursion--; 1595 current->trace_recursion--;
1566} 1596}
1567 1597
1598#else
1599
1600#define trace_recursive_lock() (0)
1601#define trace_recursive_unlock() do { } while (0)
1602
1603#endif
1604
1568static DEFINE_PER_CPU(int, rb_need_resched); 1605static DEFINE_PER_CPU(int, rb_need_resched);
1569 1606
1570/** 1607/**
@@ -1642,13 +1679,14 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1642{ 1679{
1643 local_inc(&cpu_buffer->entries); 1680 local_inc(&cpu_buffer->entries);
1644 1681
1645 /* Only process further if we own the commit */ 1682 /*
1646 if (!rb_is_commit(cpu_buffer, event)) 1683 * The event first in the commit queue updates the
1647 return; 1684 * time stamp.
1648 1685 */
1649 cpu_buffer->write_stamp += event->time_delta; 1686 if (rb_event_is_commit(cpu_buffer, event))
1687 cpu_buffer->write_stamp += event->time_delta;
1650 1688
1651 rb_set_commit_to_write(cpu_buffer); 1689 rb_end_commit(cpu_buffer);
1652} 1690}
1653 1691
1654/** 1692/**
@@ -1737,15 +1775,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1737 /* The event is discarded regardless */ 1775 /* The event is discarded regardless */
1738 rb_event_discard(event); 1776 rb_event_discard(event);
1739 1777
1778 cpu = smp_processor_id();
1779 cpu_buffer = buffer->buffers[cpu];
1780
1740 /* 1781 /*
1741 * This must only be called if the event has not been 1782 * This must only be called if the event has not been
1742 * committed yet. Thus we can assume that preemption 1783 * committed yet. Thus we can assume that preemption
1743 * is still disabled. 1784 * is still disabled.
1744 */ 1785 */
1745 RB_WARN_ON(buffer, preemptible()); 1786 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1746
1747 cpu = smp_processor_id();
1748 cpu_buffer = buffer->buffers[cpu];
1749 1787
1750 if (!rb_try_to_discard(cpu_buffer, event)) 1788 if (!rb_try_to_discard(cpu_buffer, event))
1751 goto out; 1789 goto out;
@@ -1756,13 +1794,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1756 */ 1794 */
1757 local_inc(&cpu_buffer->entries); 1795 local_inc(&cpu_buffer->entries);
1758 out: 1796 out:
1759 /* 1797 rb_end_commit(cpu_buffer);
1760 * If a write came in and pushed the tail page
1761 * we still need to update the commit pointer
1762 * if we were the commit.
1763 */
1764 if (rb_is_commit(cpu_buffer, event))
1765 rb_set_commit_to_write(cpu_buffer);
1766 1798
1767 trace_recursive_unlock(); 1799 trace_recursive_unlock();
1768 1800
@@ -2446,6 +2478,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2446} 2478}
2447EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); 2479EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
2448 2480
2481static inline int rb_ok_to_lock(void)
2482{
2483 /*
2484 * If an NMI die dumps out the content of the ring buffer
2485 * do not grab locks. We also permanently disable the ring
2486 * buffer too. A one time deal is all you get from reading
2487 * the ring buffer from an NMI.
2488 */
2489 if (likely(!in_nmi() && !oops_in_progress))
2490 return 1;
2491
2492 tracing_off_permanent();
2493 return 0;
2494}
2495
2449/** 2496/**
2450 * ring_buffer_peek - peek at the next event to be read 2497 * ring_buffer_peek - peek at the next event to be read
2451 * @buffer: The ring buffer to read 2498 * @buffer: The ring buffer to read
@@ -2461,14 +2508,20 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2461 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2508 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2462 struct ring_buffer_event *event; 2509 struct ring_buffer_event *event;
2463 unsigned long flags; 2510 unsigned long flags;
2511 int dolock;
2464 2512
2465 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2513 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2466 return NULL; 2514 return NULL;
2467 2515
2516 dolock = rb_ok_to_lock();
2468 again: 2517 again:
2469 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2518 local_irq_save(flags);
2519 if (dolock)
2520 spin_lock(&cpu_buffer->reader_lock);
2470 event = rb_buffer_peek(buffer, cpu, ts); 2521 event = rb_buffer_peek(buffer, cpu, ts);
2471 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2522 if (dolock)
2523 spin_unlock(&cpu_buffer->reader_lock);
2524 local_irq_restore(flags);
2472 2525
2473 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 2526 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2474 cpu_relax(); 2527 cpu_relax();
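
rb_ok_to_lock() lets the read side skip the reader lock when it is reached from an NMI or an oops dump, where taking a spinlock could deadlock; the cost is that the ring buffer is then permanently disabled, so such a read is a one-shot affair. A condensed sketch of that conditional-locking shape with a pthread mutex standing in for the spinlock; the in_nmi_ctx and oops flags here are fakes for the sketch.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t reader_lock = PTHREAD_MUTEX_INITIALIZER;
static int in_nmi_ctx;          /* stand-in for in_nmi()           */
static int oops_in_progress;    /* stand-in for the real variable  */
static int buffer_disabled;

static int ok_to_lock(void)
{
	if (!in_nmi_ctx && !oops_in_progress)
		return 1;
	buffer_disabled = 1;    /* one-shot: no further use allowed */
	return 0;
}

static void peek(void)
{
	int dolock = ok_to_lock();

	if (dolock)
		pthread_mutex_lock(&reader_lock);
	printf("peeked (locked=%d, buffer_disabled=%d)\n",
	       dolock, buffer_disabled);
	if (dolock)
		pthread_mutex_unlock(&reader_lock);
}

int main(void)
{
	peek();                 /* normal context: takes the lock       */
	in_nmi_ctx = 1;
	peek();                 /* "NMI": lockless, disables the buffer */
	return 0;
}
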
@@ -2520,6 +2573,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2520 struct ring_buffer_per_cpu *cpu_buffer; 2573 struct ring_buffer_per_cpu *cpu_buffer;
2521 struct ring_buffer_event *event = NULL; 2574 struct ring_buffer_event *event = NULL;
2522 unsigned long flags; 2575 unsigned long flags;
2576 int dolock;
2577
2578 dolock = rb_ok_to_lock();
2523 2579
2524 again: 2580 again:
2525 /* might be called in atomic */ 2581 /* might be called in atomic */
@@ -2529,7 +2585,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2529 goto out; 2585 goto out;
2530 2586
2531 cpu_buffer = buffer->buffers[cpu]; 2587 cpu_buffer = buffer->buffers[cpu];
2532 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2588 local_irq_save(flags);
2589 if (dolock)
2590 spin_lock(&cpu_buffer->reader_lock);
2533 2591
2534 event = rb_buffer_peek(buffer, cpu, ts); 2592 event = rb_buffer_peek(buffer, cpu, ts);
2535 if (!event) 2593 if (!event)
@@ -2538,7 +2596,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2538 rb_advance_reader(cpu_buffer); 2596 rb_advance_reader(cpu_buffer);
2539 2597
2540 out_unlock: 2598 out_unlock:
2541 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2599 if (dolock)
2600 spin_unlock(&cpu_buffer->reader_lock);
2601 local_irq_restore(flags);
2542 2602
2543 out: 2603 out:
2544 preempt_enable(); 2604 preempt_enable();
@@ -2680,6 +2740,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2680 cpu_buffer->overrun = 0; 2740 cpu_buffer->overrun = 0;
2681 cpu_buffer->read = 0; 2741 cpu_buffer->read = 0;
2682 local_set(&cpu_buffer->entries, 0); 2742 local_set(&cpu_buffer->entries, 0);
2743 local_set(&cpu_buffer->committing, 0);
2744 local_set(&cpu_buffer->commits, 0);
2683 2745
2684 cpu_buffer->write_stamp = 0; 2746 cpu_buffer->write_stamp = 0;
2685 cpu_buffer->read_stamp = 0; 2747 cpu_buffer->read_stamp = 0;
@@ -2734,12 +2796,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
2734int ring_buffer_empty(struct ring_buffer *buffer) 2796int ring_buffer_empty(struct ring_buffer *buffer)
2735{ 2797{
2736 struct ring_buffer_per_cpu *cpu_buffer; 2798 struct ring_buffer_per_cpu *cpu_buffer;
2799 unsigned long flags;
2800 int dolock;
2737 int cpu; 2801 int cpu;
2802 int ret;
2803
2804 dolock = rb_ok_to_lock();
2738 2805
2739 /* yes this is racy, but if you don't like the race, lock the buffer */ 2806 /* yes this is racy, but if you don't like the race, lock the buffer */
2740 for_each_buffer_cpu(buffer, cpu) { 2807 for_each_buffer_cpu(buffer, cpu) {
2741 cpu_buffer = buffer->buffers[cpu]; 2808 cpu_buffer = buffer->buffers[cpu];
2742 if (!rb_per_cpu_empty(cpu_buffer)) 2809 local_irq_save(flags);
2810 if (dolock)
2811 spin_lock(&cpu_buffer->reader_lock);
2812 ret = rb_per_cpu_empty(cpu_buffer);
2813 if (dolock)
2814 spin_unlock(&cpu_buffer->reader_lock);
2815 local_irq_restore(flags);
2816
2817 if (!ret)
2743 return 0; 2818 return 0;
2744 } 2819 }
2745 2820
@@ -2755,14 +2830,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2755int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 2830int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2756{ 2831{
2757 struct ring_buffer_per_cpu *cpu_buffer; 2832 struct ring_buffer_per_cpu *cpu_buffer;
2833 unsigned long flags;
2834 int dolock;
2758 int ret; 2835 int ret;
2759 2836
2760 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2837 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2761 return 1; 2838 return 1;
2762 2839
2840 dolock = rb_ok_to_lock();
2841
2763 cpu_buffer = buffer->buffers[cpu]; 2842 cpu_buffer = buffer->buffers[cpu];
2843 local_irq_save(flags);
2844 if (dolock)
2845 spin_lock(&cpu_buffer->reader_lock);
2764 ret = rb_per_cpu_empty(cpu_buffer); 2846 ret = rb_per_cpu_empty(cpu_buffer);
2765 2847 if (dolock)
2848 spin_unlock(&cpu_buffer->reader_lock);
2849 local_irq_restore(flags);
2766 2850
2767 return ret; 2851 return ret;
2768} 2852}
@@ -3029,6 +3113,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3029} 3113}
3030EXPORT_SYMBOL_GPL(ring_buffer_read_page); 3114EXPORT_SYMBOL_GPL(ring_buffer_read_page);
3031 3115
3116#ifdef CONFIG_TRACING
3032static ssize_t 3117static ssize_t
3033rb_simple_read(struct file *filp, char __user *ubuf, 3118rb_simple_read(struct file *filp, char __user *ubuf,
3034 size_t cnt, loff_t *ppos) 3119 size_t cnt, loff_t *ppos)
@@ -3096,6 +3181,7 @@ static __init int rb_init_debugfs(void)
3096} 3181}
3097 3182
3098fs_initcall(rb_init_debugfs); 3183fs_initcall(rb_init_debugfs);
3184#endif
3099 3185
3100#ifdef CONFIG_HOTPLUG_CPU 3186#ifdef CONFIG_HOTPLUG_CPU
3101static int rb_cpu_notify(struct notifier_block *self, 3187static int rb_cpu_notify(struct notifier_block *self,
@@ -3108,7 +3194,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3108 switch (action) { 3194 switch (action) {
3109 case CPU_UP_PREPARE: 3195 case CPU_UP_PREPARE:
3110 case CPU_UP_PREPARE_FROZEN: 3196 case CPU_UP_PREPARE_FROZEN:
3111 if (cpu_isset(cpu, *buffer->cpumask)) 3197 if (cpumask_test_cpu(cpu, buffer->cpumask))
3112 return NOTIFY_OK; 3198 return NOTIFY_OK;
3113 3199
3114 buffer->buffers[cpu] = 3200 buffer->buffers[cpu] =
@@ -3119,7 +3205,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3119 return NOTIFY_OK; 3205 return NOTIFY_OK;
3120 } 3206 }
3121 smp_wmb(); 3207 smp_wmb();
3122 cpu_set(cpu, *buffer->cpumask); 3208 cpumask_set_cpu(cpu, buffer->cpumask);
3123 break; 3209 break;
3124 case CPU_DOWN_PREPARE: 3210 case CPU_DOWN_PREPARE:
3125 case CPU_DOWN_PREPARE_FROZEN: 3211 case CPU_DOWN_PREPARE_FROZEN:
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 8d68e149a8b3..573d3cc762c3 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -102,8 +102,10 @@ static enum event_status read_page(int cpu)
102 event = (void *)&rpage->data[i]; 102 event = (void *)&rpage->data[i];
103 switch (event->type_len) { 103 switch (event->type_len) {
104 case RINGBUF_TYPE_PADDING: 104 case RINGBUF_TYPE_PADDING:
105 /* We don't expect any padding */ 105 /* failed writes may be discarded events */
106 KILL_TEST(); 106 if (!event->time_delta)
107 KILL_TEST();
108 inc = event->array[0] + 4;
107 break; 109 break;
108 case RINGBUF_TYPE_TIME_EXTEND: 110 case RINGBUF_TYPE_TIME_EXTEND:
109 inc = 8; 111 inc = 8;
@@ -119,7 +121,7 @@ static enum event_status read_page(int cpu)
119 KILL_TEST(); 121 KILL_TEST();
120 break; 122 break;
121 } 123 }
122 inc = event->array[0]; 124 inc = event->array[0] + 4;
123 break; 125 break;
124 default: 126 default:
125 entry = ring_buffer_event_data(event); 127 entry = ring_buffer_event_data(event);
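
Both read_page() fixes above are about how far to advance while walking a page of variable-length records: the stored length covers only the payload, so the 4-byte event header has to be added on top, and a padding record with a non-zero time delta is now treated as a discarded event to skip rather than an error. A small stand-alone walker over the same kind of length-prefixed records; the struct rec_hdr layout is invented for the sketch and is not the real ring_buffer_event.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy record: a 4-byte header (type + payload length) then the payload. */
struct rec_hdr {
	uint16_t type;          /* 0 = padding/discarded, 1 = data    */
	uint16_t len;           /* payload bytes, header not included */
};

int main(void)
{
	unsigned char page[64];
	const struct rec_hdr recs[] = { {1, 8}, {0, 12}, {1, 4} };
	struct rec_hdr h;
	size_t off, i;

	/* Build a page: data record, discarded record, data record. */
	for (i = 0, off = 0; i < 3; i++) {
		memcpy(page + off, &recs[i], sizeof(recs[i]));
		off += sizeof(recs[i]) + recs[i].len;
	}

	/* Walk it: always advance by header size plus payload length. */
	for (i = 0, off = 0; i < 3; i++) {
		memcpy(&h, page + off, sizeof(h));
		printf("record %zu: type=%u len=%u\n",
		       i, (unsigned)h.type, (unsigned)h.len);
		off += sizeof(h) + h.len;       /* the "+ 4" in the hunks above */
	}
	return 0;
}
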
@@ -201,7 +203,7 @@ static void ring_buffer_producer(void)
201 * Hammer the buffer for 10 secs (this may 203 * Hammer the buffer for 10 secs (this may
202 * make the system stall) 204 * make the system stall)
203 */ 205 */
204 pr_info("Starting ring buffer hammer\n"); 206 trace_printk("Starting ring buffer hammer\n");
205 do_gettimeofday(&start_tv); 207 do_gettimeofday(&start_tv);
206 do { 208 do {
207 struct ring_buffer_event *event; 209 struct ring_buffer_event *event;
@@ -237,7 +239,7 @@ static void ring_buffer_producer(void)
237#endif 239#endif
238 240
239 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); 241 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
240 pr_info("End ring buffer hammer\n"); 242 trace_printk("End ring buffer hammer\n");
241 243
242 if (consumer) { 244 if (consumer) {
243 /* Init both completions here to avoid races */ 245 /* Init both completions here to avoid races */
@@ -260,49 +262,50 @@ static void ring_buffer_producer(void)
260 overruns = ring_buffer_overruns(buffer); 262 overruns = ring_buffer_overruns(buffer);
261 263
262 if (kill_test) 264 if (kill_test)
263 pr_info("ERROR!\n"); 265 trace_printk("ERROR!\n");
264 pr_info("Time: %lld (usecs)\n", time); 266 trace_printk("Time: %lld (usecs)\n", time);
265 pr_info("Overruns: %lld\n", overruns); 267 trace_printk("Overruns: %lld\n", overruns);
266 if (disable_reader) 268 if (disable_reader)
267 pr_info("Read: (reader disabled)\n"); 269 trace_printk("Read: (reader disabled)\n");
268 else 270 else
269 pr_info("Read: %ld (by %s)\n", read, 271 trace_printk("Read: %ld (by %s)\n", read,
270 read_events ? "events" : "pages"); 272 read_events ? "events" : "pages");
271 pr_info("Entries: %lld\n", entries); 273 trace_printk("Entries: %lld\n", entries);
272 pr_info("Total: %lld\n", entries + overruns + read); 274 trace_printk("Total: %lld\n", entries + overruns + read);
273 pr_info("Missed: %ld\n", missed); 275 trace_printk("Missed: %ld\n", missed);
274 pr_info("Hit: %ld\n", hit); 276 trace_printk("Hit: %ld\n", hit);
275 277
276 /* Convert time from usecs to millisecs */ 278 /* Convert time from usecs to millisecs */
277 do_div(time, USEC_PER_MSEC); 279 do_div(time, USEC_PER_MSEC);
278 if (time) 280 if (time)
279 hit /= (long)time; 281 hit /= (long)time;
280 else 282 else
281 pr_info("TIME IS ZERO??\n"); 283 trace_printk("TIME IS ZERO??\n");
282 284
283 pr_info("Entries per millisec: %ld\n", hit); 285 trace_printk("Entries per millisec: %ld\n", hit);
284 286
285 if (hit) { 287 if (hit) {
286 /* Calculate the average time in nanosecs */ 288 /* Calculate the average time in nanosecs */
287 avg = NSEC_PER_MSEC / hit; 289 avg = NSEC_PER_MSEC / hit;
288 pr_info("%ld ns per entry\n", avg); 290 trace_printk("%ld ns per entry\n", avg);
289 } 291 }
290 292
291 if (missed) { 293 if (missed) {
292 if (time) 294 if (time)
293 missed /= (long)time; 295 missed /= (long)time;
294 296
295 pr_info("Total iterations per millisec: %ld\n", hit + missed); 297 trace_printk("Total iterations per millisec: %ld\n",
298 hit + missed);
296 299
297 /* it is possible that hit + missed will overflow and be zero */ 300 /* it is possible that hit + missed will overflow and be zero */
298 if (!(hit + missed)) { 301 if (!(hit + missed)) {
299 pr_info("hit + missed overflowed and totalled zero!\n"); 302 trace_printk("hit + missed overflowed and totalled zero!\n");
300 hit--; /* make it non zero */ 303 hit--; /* make it non zero */
301 } 304 }
302 305
303 /* Caculate the average time in nanosecs */ 306 /* Caculate the average time in nanosecs */
304 avg = NSEC_PER_MSEC / (hit + missed); 307 avg = NSEC_PER_MSEC / (hit + missed);
305 pr_info("%ld ns per entry\n", avg); 308 trace_printk("%ld ns per entry\n", avg);
306 } 309 }
307} 310}
308 311
@@ -353,7 +356,7 @@ static int ring_buffer_producer_thread(void *arg)
353 356
354 ring_buffer_producer(); 357 ring_buffer_producer();
355 358
356 pr_info("Sleeping for 10 secs\n"); 359 trace_printk("Sleeping for 10 secs\n");
357 set_current_state(TASK_INTERRUPTIBLE); 360 set_current_state(TASK_INTERRUPTIBLE);
358 schedule_timeout(HZ * SLEEP_TIME); 361 schedule_timeout(HZ * SLEEP_TIME);
359 __set_current_state(TASK_RUNNING); 362 __set_current_state(TASK_RUNNING);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c1878bfb2e1e..8bc8d8afea6a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
20#include <linux/notifier.h> 21#include <linux/notifier.h>
21#include <linux/irqflags.h> 22#include <linux/irqflags.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
@@ -284,13 +285,12 @@ void trace_wake_up(void)
284static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
285{ 286{
286 unsigned long buf_size; 287 unsigned long buf_size;
287 int ret;
288 288
289 if (!str) 289 if (!str)
290 return 0; 290 return 0;
291 ret = strict_strtoul(str, 0, &buf_size); 291 buf_size = memparse(str, &str);
292 /* nr_entries can not be zero */ 292 /* nr_entries can not be zero */
293 if (ret < 0 || buf_size == 0) 293 if (buf_size == 0)
294 return 0; 294 return 0;
295 trace_buf_size = buf_size; 295 trace_buf_size = buf_size;
296 return 1; 296 return 1;
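
Switching set_buf_size() from strict_strtoul() to memparse() means the trace_buf_size= parameter now accepts the usual K/M/G suffixes. A user-space approximation of what memparse does with such a string, simplified to those three suffixes and to strtoul's own error handling.

#include <stdio.h>
#include <stdlib.h>

/* Rough analogue of the kernel's memparse(): number plus optional suffix. */
static unsigned long parse_size(const char *str, char **retptr)
{
	char *end;
	unsigned long val = strtoul(str, &end, 0);

	switch (*end) {
	case 'G': case 'g': val <<= 10;         /* fall through */
	case 'M': case 'm': val <<= 10;         /* fall through */
	case 'K': case 'k': val <<= 10; end++; break;
	default: break;
	}
	if (retptr)
		*retptr = end;
	return val;
}

int main(void)
{
	printf("'1441792' -> %lu\n", parse_size("1441792", NULL));
	printf("'1408K'   -> %lu\n", parse_size("1408K", NULL));
	printf("'16M'     -> %lu\n", parse_size("16M", NULL));
	return 0;
}
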
@@ -2053,25 +2053,23 @@ static int tracing_open(struct inode *inode, struct file *file)
2053static void * 2053static void *
2054t_next(struct seq_file *m, void *v, loff_t *pos) 2054t_next(struct seq_file *m, void *v, loff_t *pos)
2055{ 2055{
2056 struct tracer *t = m->private; 2056 struct tracer *t = v;
2057 2057
2058 (*pos)++; 2058 (*pos)++;
2059 2059
2060 if (t) 2060 if (t)
2061 t = t->next; 2061 t = t->next;
2062 2062
2063 m->private = t;
2064
2065 return t; 2063 return t;
2066} 2064}
2067 2065
2068static void *t_start(struct seq_file *m, loff_t *pos) 2066static void *t_start(struct seq_file *m, loff_t *pos)
2069{ 2067{
2070 struct tracer *t = m->private; 2068 struct tracer *t;
2071 loff_t l = 0; 2069 loff_t l = 0;
2072 2070
2073 mutex_lock(&trace_types_lock); 2071 mutex_lock(&trace_types_lock);
2074 for (; t && l < *pos; t = t_next(m, t, &l)) 2072 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
2075 ; 2073 ;
2076 2074
2077 return t; 2075 return t;
@@ -2107,18 +2105,10 @@ static struct seq_operations show_traces_seq_ops = {
2107 2105
2108static int show_traces_open(struct inode *inode, struct file *file) 2106static int show_traces_open(struct inode *inode, struct file *file)
2109{ 2107{
2110 int ret;
2111
2112 if (tracing_disabled) 2108 if (tracing_disabled)
2113 return -ENODEV; 2109 return -ENODEV;
2114 2110
2115 ret = seq_open(file, &show_traces_seq_ops); 2111 return seq_open(file, &show_traces_seq_ops);
2116 if (!ret) {
2117 struct seq_file *m = file->private_data;
2118 m->private = trace_types;
2119 }
2120
2121 return ret;
2122} 2112}
2123 2113
2124static ssize_t 2114static ssize_t
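
The t_next()/t_start() and show_traces_open() hunks above move trace.c to the usual seq_file idiom: no iterator state is cached in m->private, and ->start() re-derives the cursor from *pos on every call. A minimal sketch of that idiom with invented names:

#include <linux/mutex.h>
#include <linux/seq_file.h>

struct demo_item {				/* stand-in for the real list type */
	struct demo_item	*next;
	const char		*name;
};

static struct demo_item *demo_list;
static DEFINE_MUTEX(demo_lock);

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct demo_item *it = v;

	(*pos)++;
	return it ? it->next : NULL;
}

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	struct demo_item *it;
	loff_t l = 0;

	mutex_lock(&demo_lock);
	/* Walk from the list head to *pos instead of trusting a cursor
	 * stashed in m->private by an earlier read. */
	for (it = demo_list; it && l < *pos; it = demo_next(m, it, &l))
		;
	return it;
}

static void demo_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&demo_lock);
}
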
@@ -2191,11 +2181,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2191 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 2181 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2192 return -ENOMEM; 2182 return -ENOMEM;
2193 2183
2194 mutex_lock(&tracing_cpumask_update_lock);
2195 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2184 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2196 if (err) 2185 if (err)
2197 goto err_unlock; 2186 goto err_unlock;
2198 2187
2188 mutex_lock(&tracing_cpumask_update_lock);
2189
2199 local_irq_disable(); 2190 local_irq_disable();
2200 __raw_spin_lock(&ftrace_max_lock); 2191 __raw_spin_lock(&ftrace_max_lock);
2201 for_each_tracing_cpu(cpu) { 2192 for_each_tracing_cpu(cpu) {
@@ -2223,8 +2214,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2223 return count; 2214 return count;
2224 2215
2225err_unlock: 2216err_unlock:
2226 mutex_unlock(&tracing_cpumask_update_lock); 2217 free_cpumask_var(tracing_cpumask_new);
2227 free_cpumask_var(tracing_cpumask);
2228 2218
2229 return err; 2219 return err;
2230} 2220}
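
The tracing_cpumask_write() hunks above parse the user buffer before taking tracing_cpumask_update_lock and make the error path free the newly allocated mask rather than the live tracing_cpumask. A skeleton of that shape, with invented names and the locking elided:

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/uaccess.h>

static ssize_t demo_cpumask_write(const char __user *ubuf, size_t count)
{
	cpumask_var_t new_mask;
	int err;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	/* Parse first, before any update lock is held ... */
	err = cpumask_parse_user(ubuf, count, new_mask);
	if (err) {
		/* ... and on failure release the mask we just allocated,
		 * never the cpumask that is still in use. */
		free_cpumask_var(new_mask);
		return err;
	}

	/* take the update mutex, copy new_mask into place, drop the mutex */

	free_cpumask_var(new_mask);
	return count;
}
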
@@ -3626,7 +3616,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3626 struct trace_seq *s; 3616 struct trace_seq *s;
3627 unsigned long cnt; 3617 unsigned long cnt;
3628 3618
3629 s = kmalloc(sizeof(*s), GFP_ATOMIC); 3619 s = kmalloc(sizeof(*s), GFP_KERNEL);
3630 if (!s) 3620 if (!s)
3631 return ENOMEM; 3621 return ENOMEM;
3632 3622
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6e735d4771f8..3548ae5cc780 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -597,6 +597,7 @@ print_graph_function(struct trace_iterator *iter)
597 597
598extern struct pid *ftrace_pid_trace; 598extern struct pid *ftrace_pid_trace;
599 599
600#ifdef CONFIG_FUNCTION_TRACER
600static inline int ftrace_trace_task(struct task_struct *task) 601static inline int ftrace_trace_task(struct task_struct *task)
601{ 602{
602 if (!ftrace_pid_trace) 603 if (!ftrace_pid_trace)
@@ -604,6 +605,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
604 605
605 return test_tsk_trace_trace(task); 606 return test_tsk_trace_trace(task);
606} 607}
608#else
609static inline int ftrace_trace_task(struct task_struct *task)
610{
611 return 1;
612}
613#endif
607 614
608/* 615/*
609 * trace_iterator_flags is an enumeration that defines bit 616 * trace_iterator_flags is an enumeration that defines bit
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 5e32e375134d..6db005e12487 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore, 26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT( 27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func) 28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
29 TRACE_FIELD(int, ret.depth, depth) 32 TRACE_FIELD(int, ret.depth, depth)
30 ), 33 ),
31 TP_RAW_FMT("<-- %lx (%d)") 34 TP_RAW_FMT("<-- %lx (%d)")
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa08be69a1b6..53c8fd376a88 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -300,10 +300,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
300 300
301static void *t_start(struct seq_file *m, loff_t *pos) 301static void *t_start(struct seq_file *m, loff_t *pos)
302{ 302{
303 struct ftrace_event_call *call = NULL;
304 loff_t l;
305
303 mutex_lock(&event_mutex); 306 mutex_lock(&event_mutex);
304 if (*pos == 0) 307
305 m->private = ftrace_events.next; 308 m->private = ftrace_events.next;
306 return t_next(m, NULL, pos); 309 for (l = 0; l <= *pos; ) {
310 call = t_next(m, NULL, &l);
311 if (!call)
312 break;
313 }
314 return call;
307} 315}
308 316
309static void * 317static void *
@@ -332,10 +340,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
332 340
333static void *s_start(struct seq_file *m, loff_t *pos) 341static void *s_start(struct seq_file *m, loff_t *pos)
334{ 342{
343 struct ftrace_event_call *call = NULL;
344 loff_t l;
345
335 mutex_lock(&event_mutex); 346 mutex_lock(&event_mutex);
336 if (*pos == 0) 347
337 m->private = ftrace_events.next; 348 m->private = ftrace_events.next;
338 return s_next(m, NULL, pos); 349 for (l = 0; l <= *pos; ) {
350 call = s_next(m, NULL, &l);
351 if (!call)
352 break;
353 }
354 return call;
339} 355}
340 356
341static int t_show(struct seq_file *m, void *v) 357static int t_show(struct seq_file *m, void *v)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index db6e54bdb596..936c621bbf46 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,8 +27,6 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30static DEFINE_MUTEX(filter_mutex);
31
32enum filter_op_ids 30enum filter_op_ids
33{ 31{
34 OP_OR, 32 OP_OR,
@@ -178,7 +176,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
178static int filter_pred_strloc(struct filter_pred *pred, void *event, 176static int filter_pred_strloc(struct filter_pred *pred, void *event,
179 int val1, int val2) 177 int val1, int val2)
180{ 178{
181 int str_loc = *(int *)(event + pred->offset); 179 unsigned short str_loc = *(unsigned short *)(event + pred->offset);
182 char *addr = (char *)(event + str_loc); 180 char *addr = (char *)(event + str_loc);
183 int cmp, match; 181 int cmp, match;
184 182
@@ -294,12 +292,12 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
294{ 292{
295 struct event_filter *filter = call->filter; 293 struct event_filter *filter = call->filter;
296 294
297 mutex_lock(&filter_mutex); 295 mutex_lock(&event_mutex);
298 if (filter->filter_string) 296 if (filter->filter_string)
299 trace_seq_printf(s, "%s\n", filter->filter_string); 297 trace_seq_printf(s, "%s\n", filter->filter_string);
300 else 298 else
301 trace_seq_printf(s, "none\n"); 299 trace_seq_printf(s, "none\n");
302 mutex_unlock(&filter_mutex); 300 mutex_unlock(&event_mutex);
303} 301}
304 302
305void print_subsystem_event_filter(struct event_subsystem *system, 303void print_subsystem_event_filter(struct event_subsystem *system,
@@ -307,12 +305,12 @@ void print_subsystem_event_filter(struct event_subsystem *system,
307{ 305{
308 struct event_filter *filter = system->filter; 306 struct event_filter *filter = system->filter;
309 307
310 mutex_lock(&filter_mutex); 308 mutex_lock(&event_mutex);
311 if (filter->filter_string) 309 if (filter->filter_string)
312 trace_seq_printf(s, "%s\n", filter->filter_string); 310 trace_seq_printf(s, "%s\n", filter->filter_string);
313 else 311 else
314 trace_seq_printf(s, "none\n"); 312 trace_seq_printf(s, "none\n");
315 mutex_unlock(&filter_mutex); 313 mutex_unlock(&event_mutex);
316} 314}
317 315
318static struct ftrace_event_field * 316static struct ftrace_event_field *
@@ -381,6 +379,7 @@ void destroy_preds(struct ftrace_event_call *call)
381 filter_free_pred(filter->preds[i]); 379 filter_free_pred(filter->preds[i]);
382 } 380 }
383 kfree(filter->preds); 381 kfree(filter->preds);
382 kfree(filter->filter_string);
384 kfree(filter); 383 kfree(filter);
385 call->filter = NULL; 384 call->filter = NULL;
386} 385}
@@ -433,7 +432,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
433 filter->n_preds = 0; 432 filter->n_preds = 0;
434 } 433 }
435 434
436 mutex_lock(&event_mutex);
437 list_for_each_entry(call, &ftrace_events, list) { 435 list_for_each_entry(call, &ftrace_events, list) {
438 if (!call->define_fields) 436 if (!call->define_fields)
439 continue; 437 continue;
@@ -443,7 +441,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
443 remove_filter_string(call->filter); 441 remove_filter_string(call->filter);
444 } 442 }
445 } 443 }
446 mutex_unlock(&event_mutex);
447} 444}
448 445
449static int filter_add_pred_fn(struct filter_parse_state *ps, 446static int filter_add_pred_fn(struct filter_parse_state *ps,
@@ -546,6 +543,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
546 filter_pred_fn_t fn; 543 filter_pred_fn_t fn;
547 unsigned long long val; 544 unsigned long long val;
548 int string_type; 545 int string_type;
546 int ret;
549 547
550 pred->fn = filter_pred_none; 548 pred->fn = filter_pred_none;
551 549
@@ -581,7 +579,11 @@ static int filter_add_pred(struct filter_parse_state *ps,
581 pred->not = 1; 579 pred->not = 1;
582 return filter_add_pred_fn(ps, call, pred, fn); 580 return filter_add_pred_fn(ps, call, pred, fn);
583 } else { 581 } else {
584 if (strict_strtoull(pred->str_val, 0, &val)) { 582 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val);
584 else
585 ret = strict_strtoull(pred->str_val, 0, &val);
586 if (ret) {
585 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 587 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
586 return -EINVAL; 588 return -EINVAL;
587 } 589 }
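
The filter_add_pred() hunk above selects the parser by field signedness so that signed fields can carry negative literals. A hedged sketch of that selection (helper name invented; strict_strtoll()/strict_strtoull() are the interfaces of this era, later replaced by kstrtoll()/kstrtoull()):

#include <linux/kernel.h>
#include <linux/types.h>

static int demo_parse_value(const char *str, bool is_signed,
			    unsigned long long *val)
{
	int ret;

	if (is_signed) {
		long long sval;

		ret = strict_strtoll(str, 0, &sval);
		if (!ret)
			*val = (unsigned long long)sval;	/* e.g. "-5" */
	} else {
		ret = strict_strtoull(str, 0, val);
	}
	return ret;		/* non-zero: malformed integer literal */
}
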
@@ -625,7 +627,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
625 filter->preds[filter->n_preds] = pred; 627 filter->preds[filter->n_preds] = pred;
626 filter->n_preds++; 628 filter->n_preds++;
627 629
628 mutex_lock(&event_mutex);
629 list_for_each_entry(call, &ftrace_events, list) { 630 list_for_each_entry(call, &ftrace_events, list) {
630 631
631 if (!call->define_fields) 632 if (!call->define_fields)
@@ -636,14 +637,12 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
636 637
637 err = filter_add_pred(ps, call, pred); 638 err = filter_add_pred(ps, call, pred);
638 if (err) { 639 if (err) {
639 mutex_unlock(&event_mutex);
640 filter_free_subsystem_preds(system); 640 filter_free_subsystem_preds(system);
641 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 641 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
642 goto out; 642 goto out;
643 } 643 }
644 replace_filter_string(call->filter, filter_string); 644 replace_filter_string(call->filter, filter_string);
645 } 645 }
646 mutex_unlock(&event_mutex);
647out: 646out:
648 return err; 647 return err;
649} 648}
@@ -1070,12 +1069,12 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1070 1069
1071 struct filter_parse_state *ps; 1070 struct filter_parse_state *ps;
1072 1071
1073 mutex_lock(&filter_mutex); 1072 mutex_lock(&event_mutex);
1074 1073
1075 if (!strcmp(strstrip(filter_string), "0")) { 1074 if (!strcmp(strstrip(filter_string), "0")) {
1076 filter_disable_preds(call); 1075 filter_disable_preds(call);
1077 remove_filter_string(call->filter); 1076 remove_filter_string(call->filter);
1078 mutex_unlock(&filter_mutex); 1077 mutex_unlock(&event_mutex);
1079 return 0; 1078 return 0;
1080 } 1079 }
1081 1080
@@ -1103,7 +1102,7 @@ out:
1103 postfix_clear(ps); 1102 postfix_clear(ps);
1104 kfree(ps); 1103 kfree(ps);
1105out_unlock: 1104out_unlock:
1106 mutex_unlock(&filter_mutex); 1105 mutex_unlock(&event_mutex);
1107 1106
1108 return err; 1107 return err;
1109} 1108}
@@ -1115,12 +1114,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1115 1114
1116 struct filter_parse_state *ps; 1115 struct filter_parse_state *ps;
1117 1116
1118 mutex_lock(&filter_mutex); 1117 mutex_lock(&event_mutex);
1119 1118
1120 if (!strcmp(strstrip(filter_string), "0")) { 1119 if (!strcmp(strstrip(filter_string), "0")) {
1121 filter_free_subsystem_preds(system); 1120 filter_free_subsystem_preds(system);
1122 remove_filter_string(system->filter); 1121 remove_filter_string(system->filter);
1123 mutex_unlock(&filter_mutex); 1122 mutex_unlock(&event_mutex);
1124 return 0; 1123 return 0;
1125 } 1124 }
1126 1125
@@ -1148,7 +1147,7 @@ out:
1148 postfix_clear(ps); 1147 postfix_clear(ps);
1149 kfree(ps); 1148 kfree(ps);
1150out_unlock: 1149out_unlock:
1151 mutex_unlock(&filter_mutex); 1150 mutex_unlock(&event_mutex);
1152 1151
1153 return err; 1152 return err;
1154} 1153}
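
Across this file the private filter_mutex is dropped in favour of the existing event_mutex, and the subsystem helpers stop taking event_mutex themselves because their callers now hold it. A schematic of the resulting single-lock shape, with invented names:

#include <linux/mutex.h>

static DEFINE_MUTEX(demo_event_mutex);

/* Runs only with demo_event_mutex held; it takes no lock of its own,
 * which is what deleting the inner mutex_lock(&event_mutex) achieves. */
static void demo_walk_events_locked(void)
{
	/* iterate the event list, update per-event filter state */
}

static int demo_apply_filter(const char *filter_string)
{
	mutex_lock(&demo_event_mutex);
	demo_walk_events_locked();
	mutex_unlock(&demo_event_mutex);
	return 0;
}
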
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c9a0b7df44ff..7402144bff21 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -193,9 +193,11 @@ static void tracing_start_function_trace(void)
193static void tracing_stop_function_trace(void) 193static void tracing_stop_function_trace(void)
194{ 194{
195 ftrace_function_enabled = 0; 195 ftrace_function_enabled = 0;
196 /* OK if they are not registered */ 196
197 unregister_ftrace_function(&trace_stack_ops); 197 if (func_flags.val & TRACE_FUNC_OPT_STACK)
198 unregister_ftrace_function(&trace_ops); 198 unregister_ftrace_function(&trace_stack_ops);
199 else
200 unregister_ftrace_function(&trace_ops);
199} 201}
200 202
201static int func_set_flag(u32 old_flags, u32 bit, int set) 203static int func_set_flag(u32 old_flags, u32 bit, int set)
@@ -300,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
300 if (count == -1) 302 if (count == -1)
301 seq_printf(m, ":unlimited\n"); 303 seq_printf(m, ":unlimited\n");
302 else 304 else
303 seq_printf(m, ":count=%ld", count); 305 seq_printf(m, ":count=%ld\n", count);
304 seq_putc(m, '\n');
305 306
306 return 0; 307 return 0;
307} 308}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8b592418d8b2..d2249abafb53 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -57,7 +57,8 @@ static struct tracer_flags tracer_flags = {
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
59int 59int
60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) 60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
61 unsigned long frame_pointer)
61{ 62{
62 unsigned long long calltime; 63 unsigned long long calltime;
63 int index; 64 int index;
@@ -85,6 +86,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
85 current->ret_stack[index].func = func; 86 current->ret_stack[index].func = func;
86 current->ret_stack[index].calltime = calltime; 87 current->ret_stack[index].calltime = calltime;
87 current->ret_stack[index].subtime = 0; 88 current->ret_stack[index].subtime = 0;
89 current->ret_stack[index].fp = frame_pointer;
88 *depth = index; 90 *depth = index;
89 91
90 return 0; 92 return 0;
@@ -92,7 +94,8 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
92 94
93/* Retrieve a function return address to the trace stack on thread info.*/ 95/* Retrieve a function return address to the trace stack on thread info.*/
94static void 96static void
95ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) 97ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
98 unsigned long frame_pointer)
96{ 99{
97 int index; 100 int index;
98 101
@@ -106,6 +109,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
106 return; 109 return;
107 } 110 }
108 111
112#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
113 /*
114 * The arch may choose to record the frame pointer used
115 * and check it here to make sure that it is what we expect it
116 * to be. If gcc does not set the place holder of the return
117 * address in the frame pointer, and does a copy instead, then
118 * the function graph trace will fail. This test detects this
119 * case.
120 *
121 * Currently, x86_32 with optimize for size (-Os) makes the latest
122 * gcc do the above.
123 */
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n",
128 current->ret_stack[index].fp,
129 frame_pointer,
130 (void *)current->ret_stack[index].func,
131 current->ret_stack[index].ret);
132 *ret = (unsigned long)panic;
133 return;
134 }
135#endif
136
109 *ret = current->ret_stack[index].ret; 137 *ret = current->ret_stack[index].ret;
110 trace->func = current->ret_stack[index].func; 138 trace->func = current->ret_stack[index].func;
111 trace->calltime = current->ret_stack[index].calltime; 139 trace->calltime = current->ret_stack[index].calltime;
@@ -117,12 +145,12 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
117 * Send the trace to the ring-buffer. 145 * Send the trace to the ring-buffer.
118 * @return the original return address. 146 * @return the original return address.
119 */ 147 */
120unsigned long ftrace_return_to_handler(void) 148unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
121{ 149{
122 struct ftrace_graph_ret trace; 150 struct ftrace_graph_ret trace;
123 unsigned long ret; 151 unsigned long ret;
124 152
125 ftrace_pop_return_trace(&trace, &ret); 153 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
126 trace.rettime = trace_clock_local(); 154 trace.rettime = trace_clock_local();
127 ftrace_graph_return(&trace); 155 ftrace_graph_return(&trace);
128 barrier(); 156 barrier();
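
The new CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST block above records the frame pointer when the return address is hooked and verifies it when the hook fires; a mismatch means the saved return address cannot be trusted. A purely conceptual sketch of that pairing (types and names invented, not the ftrace ret_stack):

#include <linux/kernel.h>

struct demo_ret_slot {
	unsigned long ret;	/* original return address */
	unsigned long fp;	/* frame pointer recorded at entry */
};

static void demo_push(struct demo_ret_slot *slot, unsigned long ret,
		      unsigned long frame_pointer)
{
	slot->ret = ret;
	slot->fp = frame_pointer;
}

static unsigned long demo_pop(struct demo_ret_slot *slot,
			      unsigned long frame_pointer)
{
	if (WARN_ON_ONCE(slot->fp != frame_pointer))
		return 0;	/* caller must treat this as fatal */
	return slot->ret;
}
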
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7938f3ae93e3..e0c2545622e8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 29
30 s->buffer[len] = 0; 30 seq_write(m, s->buffer, len);
31 seq_puts(m, s->buffer);
32 31
33 trace_seq_init(s); 32 trace_seq_init(s);
34} 33}
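
seq_write() takes an explicit length, so the scratch buffer no longer needs to be NUL-terminated in place the way the removed s->buffer[len] = 0 did for seq_puts(). A minimal illustration:

#include <linux/seq_file.h>

/* Illustrative helper: flush 'len' bytes of a scratch buffer into a
 * seq_file without modifying the buffer. */
static void demo_flush(struct seq_file *m, const char *buf, size_t len)
{
	/* seq_puts(m, buf) would require buf[len] == '\0';
	 * seq_write() copies exactly 'len' bytes as-is. */
	seq_write(m, buf, len);
}
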
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 9bece9687b62..7b6278110827 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
155EXPORT_SYMBOL_GPL(__ftrace_vprintk); 155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156 156
157static void * 157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos) 158t_start(struct seq_file *m, loff_t *pos)
159{ 159{
160 const char **fmt = m->private; 160 const char **fmt = __start___trace_bprintk_fmt + *pos;
161 const char **next = fmt;
162
163 (*pos)++;
164 161
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) 162 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL; 163 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt; 164 return fmt;
172} 165}
173 166
174static void *t_start(struct seq_file *m, loff_t *pos) 167static void *t_next(struct seq_file *m, void * v, loff_t *pos)
175{ 168{
176 return t_next(m, NULL, pos); 169 (*pos)++;
170 return t_start(m, pos);
177} 171}
178 172
179static int t_show(struct seq_file *m, void *v) 173static int t_show(struct seq_file *m, void *v)
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
224static int 218static int
225ftrace_formats_open(struct inode *inode, struct file *file) 219ftrace_formats_open(struct inode *inode, struct file *file)
226{ 220{
227 int ret; 221 return seq_open(file, &show_format_seq_ops);
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236} 222}
237 223
238static const struct file_operations ftrace_formats_fops = { 224static const struct file_operations ftrace_formats_fops = {
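
The reworked iterator above derives its position from the __start/__stop section bounds on every ->start() call instead of caching a cursor in m->private. The same pattern for an invented section (the symbols below assume a matching entry in the linker script):

#include <linux/seq_file.h>

extern const char *__start_demo_fmt[];		/* hypothetical section */
extern const char *__stop_demo_fmt[];

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	const char **fmt = __start_demo_fmt + *pos;

	if (fmt >= __stop_demo_fmt)
		return NULL;			/* past the end */
	return fmt;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return demo_start(m, pos);		/* re-derive from *pos */
}
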
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2d7aebd71dbd..e644af910124 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -326,10 +326,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
326 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 326 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
327 327
328 if (ret || !write || 328 if (ret || !write ||
329 (last_stack_tracer_enabled == stack_tracer_enabled)) 329 (last_stack_tracer_enabled == !!stack_tracer_enabled))
330 goto out; 330 goto out;
331 331
332 last_stack_tracer_enabled = stack_tracer_enabled; 332 last_stack_tracer_enabled = !!stack_tracer_enabled;
333 333
334 if (stack_tracer_enabled) 334 if (stack_tracer_enabled)
335 register_ftrace_function(&trace_ops); 335 register_ftrace_function(&trace_ops);
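
stack_tracer_enabled is an int that a sysctl write can set to any value, while last_stack_tracer_enabled only ever holds 0 or 1; the added '!!' normalizes the int before comparing and storing. A trivial standalone illustration:

#include <stdio.h>

int main(void)
{
	int raw[] = { 0, 1, 2, -1, 42 };
	unsigned int i;

	for (i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
		printf("%3d -> %d\n", raw[i], !!raw[i]);	/* non-zero -> 1 */
	return 0;
}
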
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index c00643733f4c..e66f5e493342 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -199,17 +199,13 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
199 mutex_lock(&session->stat_mutex); 199 mutex_lock(&session->stat_mutex);
200 200
201 /* If we are in the beginning of the file, print the headers */ 201 /* If we are in the beginning of the file, print the headers */
202 if (!*pos && session->ts->stat_headers) { 202 if (!*pos && session->ts->stat_headers)
203 (*pos)++;
204 return SEQ_START_TOKEN; 203 return SEQ_START_TOKEN;
205 }
206 204
207 node = rb_first(&session->stat_root); 205 node = rb_first(&session->stat_root);
208 for (i = 0; node && i < *pos; i++) 206 for (i = 0; node && i < *pos; i++)
209 node = rb_next(node); 207 node = rb_next(node);
210 208
211 (*pos)++;
212
213 return node; 209 return node;
214} 210}
215 211
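
Dropping the (*pos)++ calls follows seq_file semantics: ->start() only positions the iterator, and advancing *pos is ->next()'s job. A compact sketch of a header-plus-rbtree iterator that keeps that split (names invented; the header is assumed to occupy position 0):

#include <linux/rbtree.h>
#include <linux/seq_file.h>

static struct rb_root demo_root;

static void *demo_start(struct seq_file *s, loff_t *pos)
{
	struct rb_node *node;
	loff_t i;

	if (!*pos)
		return SEQ_START_TOKEN;		/* ->show() prints headers */

	/* Position only; *pos is not modified here. */
	node = rb_first(&demo_root);
	for (i = 1; node && i < *pos; i++)
		node = rb_next(node);
	return node;
}

static void *demo_next(struct seq_file *s, void *v, loff_t *pos)
{
	(*pos)++;
	if (v == SEQ_START_TOKEN)
		return rb_first(&demo_root);
	return rb_next(v);
}
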
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 815237a55af8..8a82b4b8ea52 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,16 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17 17
18static struct uts_namespace *create_uts_ns(void)
19{
20 struct uts_namespace *uts_ns;
21
22 uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
23 if (uts_ns)
24 kref_init(&uts_ns->kref);
25 return uts_ns;
26}
27
18/* 28/*
19 * Clone a new ns copying an original utsname, setting refcount to 1 29 * Clone a new ns copying an original utsname, setting refcount to 1
20 * @old_ns: namespace to clone 30 * @old_ns: namespace to clone
@@ -24,14 +34,13 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
24{ 34{
25 struct uts_namespace *ns; 35 struct uts_namespace *ns;
26 36
27 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); 37 ns = create_uts_ns();
28 if (!ns) 38 if (!ns)
29 return ERR_PTR(-ENOMEM); 39 return ERR_PTR(-ENOMEM);
30 40
31 down_read(&uts_sem); 41 down_read(&uts_sem);
32 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
33 up_read(&uts_sem); 43 up_read(&uts_sem);
34 kref_init(&ns->kref);
35 return ns; 44 return ns;
36} 45}
37 46
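
create_uts_ns() pairs the allocation with kref_init() in one place so any caller gets a namespace that already holds one reference. The usual kref lifetime pattern around such a helper, sketched with an invented type:

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct demo_ns {
	struct kref	kref;
	char		name[16];
};

static void demo_ns_release(struct kref *kref)
{
	struct demo_ns *ns = container_of(kref, struct demo_ns, kref);

	kfree(ns);
}

static struct demo_ns *demo_ns_create(void)
{
	struct demo_ns *ns = kmalloc(sizeof(*ns), GFP_KERNEL);

	if (ns)
		kref_init(&ns->kref);	/* refcount starts at 1 */
	return ns;
}

/* Owners drop their reference with kref_put(&ns->kref, demo_ns_release);
 * the last put calls demo_ns_release() and frees the object. */
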