aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorJames Morris <jmorris@namei.org>2009-06-29 19:10:35 -0400
committerJames Morris <jmorris@namei.org>2009-06-29 19:10:35 -0400
commitac7242142b03421c96b0a2f8d99f146d075614c2 (patch)
treeb0b2ead65858c7a343d38affed86fe815e37e7e9 /kernel
parent89c86576ecde504da1eeb4f4882b2189ac2f9c4a (diff)
parent2bfdd79eaa0043346e773ba5f6cfd811ea31b73d (diff)
Merge branch 'master' into next
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/audit.c146
-rw-r--r--kernel/audit.h43
-rw-r--r--kernel/audit_tree.c66
-rw-r--r--kernel/audit_watch.c543
-rw-r--r--kernel/auditfilter.c518
-rw-r--r--kernel/auditsc.c33
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/futex.c45
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/perf_counter.c312
-rw-r--r--kernel/sched.c5
-rw-r--r--kernel/sched_cpupri.c2
-rw-r--r--kernel/sched_debug.c6
-rw-r--r--kernel/sched_fair.c3
-rw-r--r--kernel/sysctl.c13
-rw-r--r--kernel/time/tick-sched.c12
-rw-r--r--kernel/time/timer_stats.c16
-rw-r--r--kernel/timer.c2
-rw-r--r--kernel/trace/Kconfig8
-rw-r--r--kernel/trace/ftrace.c63
-rw-r--r--kernel/trace/kmemtrace.c2
-rw-r--r--kernel/trace/ring_buffer.c322
-rw-r--r--kernel/trace/ring_buffer_benchmark.c45
-rw-r--r--kernel/trace/trace.c31
-rw-r--r--kernel/trace/trace.h7
-rw-r--r--kernel/trace/trace_events.c28
-rw-r--r--kernel/trace/trace_events_filter.c37
-rw-r--r--kernel/trace/trace_functions.c11
-rw-r--r--kernel/trace/trace_functions_graph.c36
-rw-r--r--kernel/trace/trace_printk.c26
-rw-r--r--kernel/trace/trace_stat.c6
33 files changed, 1362 insertions, 1050 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0a32cb21ec97..2093a691f1c2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,7 +69,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o
69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
74obj-$(CONFIG_GCOV_KERNEL) += gcov/ 74obj-$(CONFIG_GCOV_KERNEL) += gcov/
75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
@@ -96,6 +96,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
96obj-$(CONFIG_FUNCTION_TRACER) += trace/ 96obj-$(CONFIG_FUNCTION_TRACER) += trace/
97obj-$(CONFIG_TRACING) += trace/ 97obj-$(CONFIG_TRACING) += trace/
98obj-$(CONFIG_X86_DS) += trace/ 98obj-$(CONFIG_X86_DS) += trace/
99obj-$(CONFIG_RING_BUFFER) += trace/
99obj-$(CONFIG_SMP) += sched_cpupri.o 100obj-$(CONFIG_SMP) += sched_cpupri.o
100obj-$(CONFIG_SLOW_WORK) += slow-work.o 101obj-$(CONFIG_SLOW_WORK) += slow-work.o
101obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 102obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba9..defc2e6f1e3b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
115/* The netlink socket. */ 115/* The netlink socket. */
116static struct sock *audit_sock; 116static struct sock *audit_sock;
117 117
118/* Inotify handle. */
119struct inotify_handle *audit_ih;
120
121/* Hash for inode-based rules */ 118/* Hash for inode-based rules */
122struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 119struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
123 120
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
136static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 133static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
137 134
138/* Serialize requests from userspace. */ 135/* Serialize requests from userspace. */
139static DEFINE_MUTEX(audit_cmd_mutex); 136DEFINE_MUTEX(audit_cmd_mutex);
140 137
141/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 138/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
142 * audit records. Since printk uses a 1024 byte buffer, this buffer 139 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
375 kfree_skb(skb); 372 kfree_skb(skb);
376} 373}
377 374
375/*
376 * For one reason or another this nlh isn't getting delivered to the userspace
377 * audit daemon, just send it to printk.
378 */
379static void audit_printk_skb(struct sk_buff *skb)
380{
381 struct nlmsghdr *nlh = nlmsg_hdr(skb);
382 char *data = NLMSG_DATA(nlh);
383
384 if (nlh->nlmsg_type != AUDIT_EOE) {
385 if (printk_ratelimit())
386 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
387 else
388 audit_log_lost("printk limit exceeded\n");
389 }
390
391 audit_hold_skb(skb);
392}
393
378static void kauditd_send_skb(struct sk_buff *skb) 394static void kauditd_send_skb(struct sk_buff *skb)
379{ 395{
380 int err; 396 int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
427 if (skb) { 443 if (skb) {
428 if (audit_pid) 444 if (audit_pid)
429 kauditd_send_skb(skb); 445 kauditd_send_skb(skb);
430 else { 446 else
431 if (printk_ratelimit()) 447 audit_printk_skb(skb);
432 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
433 else
434 audit_log_lost("printk limit exceeded\n");
435
436 audit_hold_skb(skb);
437 }
438 } else { 448 } else {
439 DECLARE_WAITQUEUE(wait, current); 449 DECLARE_WAITQUEUE(wait, current);
440 set_current_state(TASK_INTERRUPTIBLE); 450 set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
495 return 0; 505 return 0;
496} 506}
497 507
498#ifdef CONFIG_AUDIT_TREE
499static int prune_tree_thread(void *unused)
500{
501 mutex_lock(&audit_cmd_mutex);
502 audit_prune_trees();
503 mutex_unlock(&audit_cmd_mutex);
504 return 0;
505}
506
507void audit_schedule_prune(void)
508{
509 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
510}
511#endif
512
513struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
514 int multi, void *payload, int size) 509 int multi, void *payload, int size)
515{ 510{
516 struct sk_buff *skb; 511 struct sk_buff *skb;
517 struct nlmsghdr *nlh; 512 struct nlmsghdr *nlh;
518 int len = NLMSG_SPACE(size);
519 void *data; 513 void *data;
520 int flags = multi ? NLM_F_MULTI : 0; 514 int flags = multi ? NLM_F_MULTI : 0;
521 int t = done ? NLMSG_DONE : type; 515 int t = done ? NLMSG_DONE : type;
522 516
523 skb = alloc_skb(len, GFP_KERNEL); 517 skb = nlmsg_new(size, GFP_KERNEL);
524 if (!skb) 518 if (!skb)
525 return NULL; 519 return NULL;
526 520
527 nlh = NLMSG_PUT(skb, pid, seq, t, size); 521 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
528 nlh->nlmsg_flags = flags; 522 data = NLMSG_DATA(nlh);
529 data = NLMSG_DATA(nlh);
530 memcpy(data, payload, size); 523 memcpy(data, payload, size);
531 return skb; 524 return skb;
532 525
533nlmsg_failure: /* Used by NLMSG_PUT */ 526nlmsg_failure: /* Used by NLMSG_NEW */
534 if (skb) 527 if (skb)
535 kfree_skb(skb); 528 kfree_skb(skb);
536 return NULL; 529 return NULL;
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
926} 919}
927 920
928/* 921/*
929 * Get message from skb (based on rtnetlink_rcv_skb). Each message is 922 * Get message from skb. Each message is processed by audit_receive_msg.
930 * processed by audit_receive_msg. Malformed skbs with wrong length are 923 * Malformed skbs with wrong length are discarded silently.
931 * discarded silently.
932 */ 924 */
933static void audit_receive_skb(struct sk_buff *skb) 925static void audit_receive_skb(struct sk_buff *skb)
934{ 926{
935 int err; 927 struct nlmsghdr *nlh;
936 struct nlmsghdr *nlh; 928 /*
937 u32 rlen; 929 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
930 * if the nlmsg_len was not aligned
931 */
932 int len;
933 int err;
938 934
939 while (skb->len >= NLMSG_SPACE(0)) { 935 nlh = nlmsg_hdr(skb);
940 nlh = nlmsg_hdr(skb); 936 len = skb->len;
941 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) 937
942 return; 938 while (NLMSG_OK(nlh, len)) {
943 rlen = NLMSG_ALIGN(nlh->nlmsg_len); 939 err = audit_receive_msg(skb, nlh);
944 if (rlen > skb->len) 940 /* if err or if this message says it wants a response */
945 rlen = skb->len; 941 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
946 if ((err = audit_receive_msg(skb, nlh))) {
947 netlink_ack(skb, nlh, err); 942 netlink_ack(skb, nlh, err);
948 } else if (nlh->nlmsg_flags & NLM_F_ACK) 943
949 netlink_ack(skb, nlh, 0); 944 nlh = NLMSG_NEXT(nlh, len);
950 skb_pull(skb, rlen);
951 } 945 }
952} 946}
953 947
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
959 mutex_unlock(&audit_cmd_mutex); 953 mutex_unlock(&audit_cmd_mutex);
960} 954}
961 955
962#ifdef CONFIG_AUDITSYSCALL
963static const struct inotify_operations audit_inotify_ops = {
964 .handle_event = audit_handle_ievent,
965 .destroy_watch = audit_free_parent,
966};
967#endif
968
969/* Initialize audit support at boot time. */ 956/* Initialize audit support at boot time. */
970static int __init audit_init(void) 957static int __init audit_init(void)
971{ 958{
@@ -991,12 +978,6 @@ static int __init audit_init(void)
991 978
992 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 979 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
993 980
994#ifdef CONFIG_AUDITSYSCALL
995 audit_ih = inotify_init(&audit_inotify_ops);
996 if (IS_ERR(audit_ih))
997 audit_panic("cannot initialize inotify handle");
998#endif
999
1000 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) 981 for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
1001 INIT_LIST_HEAD(&audit_inode_hash[i]); 982 INIT_LIST_HEAD(&audit_inode_hash[i]);
1002 983
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1070 goto err; 1051 goto err;
1071 } 1052 }
1072 1053
1073 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
1074 if (!ab->skb)
1075 goto err;
1076
1077 ab->ctx = ctx; 1054 ab->ctx = ctx;
1078 ab->gfp_mask = gfp_mask; 1055 ab->gfp_mask = gfp_mask;
1079 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); 1056
1080 nlh->nlmsg_type = type; 1057 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1081 nlh->nlmsg_flags = 0; 1058 if (!ab->skb)
1082 nlh->nlmsg_pid = 0; 1059 goto nlmsg_failure;
1083 nlh->nlmsg_seq = 0; 1060
1061 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
1062
1084 return ab; 1063 return ab;
1064
1065nlmsg_failure: /* Used by NLMSG_NEW */
1066 kfree_skb(ab->skb);
1067 ab->skb = NULL;
1085err: 1068err:
1086 audit_buffer_free(ab); 1069 audit_buffer_free(ab);
1087 return NULL; 1070 return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1452 kfree(pathname); 1435 kfree(pathname);
1453} 1436}
1454 1437
1438void audit_log_key(struct audit_buffer *ab, char *key)
1439{
1440 audit_log_format(ab, " key=");
1441 if (key)
1442 audit_log_untrustedstring(ab, key);
1443 else
1444 audit_log_format(ab, "(null)");
1445}
1446
1455/** 1447/**
1456 * audit_log_end - end one audit record 1448 * audit_log_end - end one audit record
1457 * @ab: the audit_buffer 1449 * @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
1475 skb_queue_tail(&audit_skb_queue, ab->skb); 1467 skb_queue_tail(&audit_skb_queue, ab->skb);
1476 wake_up_interruptible(&kauditd_wait); 1468 wake_up_interruptible(&kauditd_wait);
1477 } else { 1469 } else {
1478 if (nlh->nlmsg_type != AUDIT_EOE) { 1470 audit_printk_skb(ab->skb);
1479 if (printk_ratelimit()) {
1480 printk(KERN_NOTICE "type=%d %s\n",
1481 nlh->nlmsg_type,
1482 ab->skb->data + NLMSG_SPACE(0));
1483 } else
1484 audit_log_lost("printk limit exceeded\n");
1485 }
1486 audit_hold_skb(ab->skb);
1487 } 1471 }
1488 ab->skb = NULL; 1472 ab->skb = NULL;
1489 } 1473 }
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661b..208687be4f30 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
53}; 53};
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_parent; 56struct audit_watch;
57
58struct audit_watch {
59 atomic_t count; /* reference count */
60 char *path; /* insertion path */
61 dev_t dev; /* associated superblock device */
62 unsigned long ino; /* associated inode number */
63 struct audit_parent *parent; /* associated parent */
64 struct list_head wlist; /* entry in parent->watches list */
65 struct list_head rules; /* associated rules */
66};
67
68struct audit_tree; 57struct audit_tree;
69struct audit_chunk; 58struct audit_chunk;
70 59
@@ -108,19 +97,28 @@ struct audit_netlink_list {
108 97
109int audit_send_list(void *); 98int audit_send_list(void *);
110 99
111struct inotify_watch;
112/* Inotify handle */
113extern struct inotify_handle *audit_ih;
114
115extern void audit_free_parent(struct inotify_watch *);
116extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
117 const char *, struct inode *);
118extern int selinux_audit_rule_update(void); 100extern int selinux_audit_rule_update(void);
119 101
120extern struct mutex audit_filter_mutex; 102extern struct mutex audit_filter_mutex;
121extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
122extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
123 105
106/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch);
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule);
113extern void audit_remove_watch(struct audit_watch *watch);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch);
118
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
120 struct audit_watch *watch);
121
124#ifdef CONFIG_AUDIT_TREE 122#ifdef CONFIG_AUDIT_TREE
125extern struct audit_chunk *audit_tree_lookup(const struct inode *); 123extern struct audit_chunk *audit_tree_lookup(const struct inode *);
126extern void audit_put_chunk(struct audit_chunk *); 124extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
130extern int audit_remove_tree_rule(struct audit_krule *); 128extern int audit_remove_tree_rule(struct audit_krule *);
131extern void audit_trim_trees(void); 129extern void audit_trim_trees(void);
132extern int audit_tag_tree(char *old, char *new); 130extern int audit_tag_tree(char *old, char *new);
133extern void audit_schedule_prune(void);
134extern void audit_prune_trees(void);
135extern const char *audit_tree_path(struct audit_tree *); 131extern const char *audit_tree_path(struct audit_tree *);
136extern void audit_put_tree(struct audit_tree *); 132extern void audit_put_tree(struct audit_tree *);
133extern void audit_kill_trees(struct list_head *);
137#else 134#else
138#define audit_remove_tree_rule(rule) BUG() 135#define audit_remove_tree_rule(rule) BUG()
139#define audit_add_tree_rule(rule) -EINVAL 136#define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
142#define audit_put_tree(tree) (void)0 139#define audit_put_tree(tree) (void)0
143#define audit_tag_tree(old, new) -EINVAL 140#define audit_tag_tree(old, new) -EINVAL
144#define audit_tree_path(rule) "" /* never called */ 141#define audit_tree_path(rule) "" /* never called */
142#define audit_kill_trees(list) BUG()
145#endif 143#endif
146 144
147extern char *audit_unpack_string(void **, size_t *, size_t); 145extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
160 return 0; 158 return 0;
161} 159}
162extern void audit_filter_inodes(struct task_struct *, struct audit_context *); 160extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
161extern struct list_head *audit_killed_trees(void);
163#else 162#else
164#define audit_signal_info(s,t) AUDIT_DISABLED 163#define audit_signal_info(s,t) AUDIT_DISABLED
165#define audit_filter_inodes(t,c) AUDIT_DISABLED 164#define audit_filter_inodes(t,c) AUDIT_DISABLED
166#endif 165#endif
166
167extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1f6396d76687..2451dc6f3282 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h>
5 6
6struct audit_tree; 7struct audit_tree;
7struct audit_chunk; 8struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
441 if (rule->tree) { 442 if (rule->tree) {
442 /* not a half-baked one */ 443 /* not a half-baked one */
443 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 444 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
444 audit_log_format(ab, "op=remove rule dir="); 445 audit_log_format(ab, "op=");
446 audit_log_string(ab, "remove rule");
447 audit_log_format(ab, " dir=");
445 audit_log_untrustedstring(ab, rule->tree->pathname); 448 audit_log_untrustedstring(ab, rule->tree->pathname);
446 if (rule->filterkey) { 449 audit_log_key(ab, rule->filterkey);
447 audit_log_format(ab, " key=");
448 audit_log_untrustedstring(ab, rule->filterkey);
449 } else
450 audit_log_format(ab, " key=(null)");
451 audit_log_format(ab, " list=%d res=1", rule->listnr); 450 audit_log_format(ab, " list=%d res=1", rule->listnr);
452 audit_log_end(ab); 451 audit_log_end(ab);
453 rule->tree = NULL; 452 rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
519 } 518 }
520} 519}
521 520
521static void audit_schedule_prune(void);
522
522/* called with audit_filter_mutex */ 523/* called with audit_filter_mutex */
523int audit_remove_tree_rule(struct audit_krule *rule) 524int audit_remove_tree_rule(struct audit_krule *rule)
524{ 525{
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
824 825
825/* 826/*
826 * That gets run when evict_chunk() ends up needing to kill audit_tree. 827 * That gets run when evict_chunk() ends up needing to kill audit_tree.
827 * Runs from a separate thread, with audit_cmd_mutex held. 828 * Runs from a separate thread.
828 */ 829 */
829void audit_prune_trees(void) 830static int prune_tree_thread(void *unused)
830{ 831{
832 mutex_lock(&audit_cmd_mutex);
831 mutex_lock(&audit_filter_mutex); 833 mutex_lock(&audit_filter_mutex);
832 834
833 while (!list_empty(&prune_list)) { 835 while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
844 } 846 }
845 847
846 mutex_unlock(&audit_filter_mutex); 848 mutex_unlock(&audit_filter_mutex);
849 mutex_unlock(&audit_cmd_mutex);
850 return 0;
851}
852
853static void audit_schedule_prune(void)
854{
855 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
856}
857
858/*
859 * ... and that one is done if evict_chunk() decides to delay until the end
860 * of syscall. Runs synchronously.
861 */
862void audit_kill_trees(struct list_head *list)
863{
864 mutex_lock(&audit_cmd_mutex);
865 mutex_lock(&audit_filter_mutex);
866
867 while (!list_empty(list)) {
868 struct audit_tree *victim;
869
870 victim = list_entry(list->next, struct audit_tree, list);
871 kill_rules(victim);
872 list_del_init(&victim->list);
873
874 mutex_unlock(&audit_filter_mutex);
875
876 prune_one(victim);
877
878 mutex_lock(&audit_filter_mutex);
879 }
880
881 mutex_unlock(&audit_filter_mutex);
882 mutex_unlock(&audit_cmd_mutex);
847} 883}
848 884
849/* 885/*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
854static void evict_chunk(struct audit_chunk *chunk) 890static void evict_chunk(struct audit_chunk *chunk)
855{ 891{
856 struct audit_tree *owner; 892 struct audit_tree *owner;
893 struct list_head *postponed = audit_killed_trees();
894 int need_prune = 0;
857 int n; 895 int n;
858 896
859 if (chunk->dead) 897 if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
869 owner->root = NULL; 907 owner->root = NULL;
870 list_del_init(&owner->same_root); 908 list_del_init(&owner->same_root);
871 spin_unlock(&hash_lock); 909 spin_unlock(&hash_lock);
872 kill_rules(owner); 910 if (!postponed) {
873 list_move(&owner->list, &prune_list); 911 kill_rules(owner);
874 audit_schedule_prune(); 912 list_move(&owner->list, &prune_list);
913 need_prune = 1;
914 } else {
915 list_move(&owner->list, postponed);
916 }
875 spin_lock(&hash_lock); 917 spin_lock(&hash_lock);
876 } 918 }
877 list_del_rcu(&chunk->hash); 919 list_del_rcu(&chunk->hash);
878 for (n = 0; n < chunk->count; n++) 920 for (n = 0; n < chunk->count; n++)
879 list_del_init(&chunk->owners[n].list); 921 list_del_init(&chunk->owners[n].list);
880 spin_unlock(&hash_lock); 922 spin_unlock(&hash_lock);
923 if (need_prune)
924 audit_schedule_prune();
881 mutex_unlock(&audit_filter_mutex); 925 mutex_unlock(&audit_filter_mutex);
882} 926}
883 927
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 000000000000..0e96dbc60ea9
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
1/* audit_watch.c -- watching inodes
2 *
3 * Copyright 2003-2009 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/mutex.h>
26#include <linux/fs.h>
27#include <linux/namei.h>
28#include <linux/netlink.h>
29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h>
32#include "audit.h"
33
34/*
35 * Reference counting:
36 *
37 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
38 * event. Each audit_watch holds a reference to its associated parent.
39 *
40 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
41 * audit_remove_watch(). Additionally, an audit_watch may exist
42 * temporarily to assist in searching existing filter data. Each
43 * audit_krule holds a reference to its associated watch.
44 */
45
46struct audit_watch {
47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */
50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */
53 struct list_head rules; /* associated rules */
54};
55
56struct audit_parent {
57 struct list_head ilist; /* entry in inotify registration list */
58 struct list_head watches; /* associated watches */
59 struct inotify_watch wdata; /* inotify watch data */
60 unsigned flags; /* status flags */
61};
62
63/* Inotify handle. */
64struct inotify_handle *audit_ih;
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Inotify events we care about. */
78#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
79
80static void audit_free_parent(struct inotify_watch *i_watch)
81{
82 struct audit_parent *parent;
83
84 parent = container_of(i_watch, struct audit_parent, wdata);
85 WARN_ON(!list_empty(&parent->watches));
86 kfree(parent);
87}
88
89void audit_get_watch(struct audit_watch *watch)
90{
91 atomic_inc(&watch->count);
92}
93
94void audit_put_watch(struct audit_watch *watch)
95{
96 if (atomic_dec_and_test(&watch->count)) {
97 WARN_ON(watch->parent);
98 WARN_ON(!list_empty(&watch->rules));
99 kfree(watch->path);
100 kfree(watch);
101 }
102}
103
104void audit_remove_watch(struct audit_watch *watch)
105{
106 list_del(&watch->wlist);
107 put_inotify_watch(&watch->parent->wdata);
108 watch->parent = NULL;
109 audit_put_watch(watch); /* match initial get */
110}
111
112char *audit_watch_path(struct audit_watch *watch)
113{
114 return watch->path;
115}
116
117struct list_head *audit_watch_rules(struct audit_watch *watch)
118{
119 return &watch->rules;
120}
121
122unsigned long audit_watch_inode(struct audit_watch *watch)
123{
124 return watch->ino;
125}
126
127dev_t audit_watch_dev(struct audit_watch *watch)
128{
129 return watch->dev;
130}
131
132/* Initialize a parent watch entry. */
133static struct audit_parent *audit_init_parent(struct nameidata *ndp)
134{
135 struct audit_parent *parent;
136 s32 wd;
137
138 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
139 if (unlikely(!parent))
140 return ERR_PTR(-ENOMEM);
141
142 INIT_LIST_HEAD(&parent->watches);
143 parent->flags = 0;
144
145 inotify_init_watch(&parent->wdata);
146 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
147 get_inotify_watch(&parent->wdata);
148 wd = inotify_add_watch(audit_ih, &parent->wdata,
149 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
150 if (wd < 0) {
151 audit_free_parent(&parent->wdata);
152 return ERR_PTR(wd);
153 }
154
155 return parent;
156}
157
158/* Initialize a watch entry. */
159static struct audit_watch *audit_init_watch(char *path)
160{
161 struct audit_watch *watch;
162
163 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
164 if (unlikely(!watch))
165 return ERR_PTR(-ENOMEM);
166
167 INIT_LIST_HEAD(&watch->rules);
168 atomic_set(&watch->count, 1);
169 watch->path = path;
170 watch->dev = (dev_t)-1;
171 watch->ino = (unsigned long)-1;
172
173 return watch;
174}
175
176/* Translate a watch string to kernel respresentation. */
177int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
178{
179 struct audit_watch *watch;
180
181 if (!audit_ih)
182 return -EOPNOTSUPP;
183
184 if (path[0] != '/' || path[len-1] == '/' ||
185 krule->listnr != AUDIT_FILTER_EXIT ||
186 op != Audit_equal ||
187 krule->inode_f || krule->watch || krule->tree)
188 return -EINVAL;
189
190 watch = audit_init_watch(path);
191 if (IS_ERR(watch))
192 return PTR_ERR(watch);
193
194 audit_get_watch(watch);
195 krule->watch = watch;
196
197 return 0;
198}
199
200/* Duplicate the given audit watch. The new watch's rules list is initialized
201 * to an empty list and wlist is undefined. */
202static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
203{
204 char *path;
205 struct audit_watch *new;
206
207 path = kstrdup(old->path, GFP_KERNEL);
208 if (unlikely(!path))
209 return ERR_PTR(-ENOMEM);
210
211 new = audit_init_watch(path);
212 if (IS_ERR(new)) {
213 kfree(path);
214 goto out;
215 }
216
217 new->dev = old->dev;
218 new->ino = old->ino;
219 get_inotify_watch(&old->parent->wdata);
220 new->parent = old->parent;
221
222out:
223 return new;
224}
225
226static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
227{
228 if (audit_enabled) {
229 struct audit_buffer *ab;
230 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
231 audit_log_format(ab, "auid=%u ses=%u op=",
232 audit_get_loginuid(current),
233 audit_get_sessionid(current));
234 audit_log_string(ab, op);
235 audit_log_format(ab, " path=");
236 audit_log_untrustedstring(ab, w->path);
237 audit_log_key(ab, r->filterkey);
238 audit_log_format(ab, " list=%d res=1", r->listnr);
239 audit_log_end(ab);
240 }
241}
242
243/* Update inode info in audit rules based on filesystem event. */
244static void audit_update_watch(struct audit_parent *parent,
245 const char *dname, dev_t dev,
246 unsigned long ino, unsigned invalidating)
247{
248 struct audit_watch *owatch, *nwatch, *nextw;
249 struct audit_krule *r, *nextr;
250 struct audit_entry *oentry, *nentry;
251
252 mutex_lock(&audit_filter_mutex);
253 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
254 if (audit_compare_dname_path(dname, owatch->path, NULL))
255 continue;
256
257 /* If the update involves invalidating rules, do the inode-based
258 * filtering now, so we don't omit records. */
259 if (invalidating && current->audit_context)
260 audit_filter_inodes(current, current->audit_context);
261
262 nwatch = audit_dupe_watch(owatch);
263 if (IS_ERR(nwatch)) {
264 mutex_unlock(&audit_filter_mutex);
265 audit_panic("error updating watch, skipping");
266 return;
267 }
268 nwatch->dev = dev;
269 nwatch->ino = ino;
270
271 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
272
273 oentry = container_of(r, struct audit_entry, rule);
274 list_del(&oentry->rule.rlist);
275 list_del_rcu(&oentry->list);
276
277 nentry = audit_dupe_rule(&oentry->rule, nwatch);
278 if (IS_ERR(nentry)) {
279 list_del(&oentry->rule.list);
280 audit_panic("error updating watch, removing");
281 } else {
282 int h = audit_hash_ino((u32)ino);
283 list_add(&nentry->rule.rlist, &nwatch->rules);
284 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
285 list_replace(&oentry->rule.list,
286 &nentry->rule.list);
287 }
288
289 audit_watch_log_rule_change(r, owatch, "updated rules");
290
291 call_rcu(&oentry->rcu, audit_free_rule_rcu);
292 }
293
294 audit_remove_watch(owatch);
295 goto add_watch_to_parent; /* event applies to a single watch */
296 }
297 mutex_unlock(&audit_filter_mutex);
298 return;
299
300add_watch_to_parent:
301 list_add(&nwatch->wlist, &parent->watches);
302 mutex_unlock(&audit_filter_mutex);
303 return;
304}
305
306/* Remove all watches & rules associated with a parent that is going away. */
307static void audit_remove_parent_watches(struct audit_parent *parent)
308{
309 struct audit_watch *w, *nextw;
310 struct audit_krule *r, *nextr;
311 struct audit_entry *e;
312
313 mutex_lock(&audit_filter_mutex);
314 parent->flags |= AUDIT_PARENT_INVALID;
315 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
316 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
317 e = container_of(r, struct audit_entry, rule);
318 audit_watch_log_rule_change(r, w, "remove rule");
319 list_del(&r->rlist);
320 list_del(&r->list);
321 list_del_rcu(&e->list);
322 call_rcu(&e->rcu, audit_free_rule_rcu);
323 }
324 audit_remove_watch(w);
325 }
326 mutex_unlock(&audit_filter_mutex);
327}
328
329/* Unregister inotify watches for parents on in_list.
330 * Generates an IN_IGNORED event. */
/* NOTE(review): this makes inotify calls, so it is presumably invoked
 * without audit_filter_mutex held (see the locking comment in
 * audit_add_watch()) -- confirm at call sites. */
331void audit_inotify_unregister(struct list_head *in_list)
332{
333	struct audit_parent *p, *n;
334
335	list_for_each_entry_safe(p, n, in_list, ilist) {
		/* Parent was queued on in_list by audit_remove_watch_rule();
		 * take it off before the final inotify teardown. */
336		list_del(&p->ilist);
337		inotify_rm_watch(audit_ih, &p->wdata);
338		/* the unpin matching the pin in audit_do_del_rule() */
339		unpin_inotify_watch(&p->wdata);
340	}
341}
342
343/* Get path information necessary for adding watches. */
/* On success the caller owns *ndp (and *ndw when non-NULL) and must
 * release both with audit_put_nd(). */
344static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
345{
346	struct nameidata *ndparent, *ndwatch;
347	int err;
348
349	ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
350	if (unlikely(!ndparent))
351		return -ENOMEM;
352
353	ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
354	if (unlikely(!ndwatch)) {
355		kfree(ndparent);
356		return -ENOMEM;
357	}
358
	/* Resolve the parent directory of @path; failure here is fatal,
	 * since a watch must be attached to a valid parent. */
359	err = path_lookup(path, LOOKUP_PARENT, ndparent);
360	if (err) {
361		kfree(ndparent);
362		kfree(ndwatch);
363		return err;
364	}
365
	/* Resolve the watched path itself.  This may fail (e.g. the file
	 * does not exist yet); that is not an error -- report *ndw as
	 * NULL and still return success. */
366	err = path_lookup(path, 0, ndwatch);
367	if (err) {
368		kfree(ndwatch);
369		ndwatch = NULL;
370	}
371
372	*ndp = ndparent;
373	*ndw = ndwatch;
374
375	return 0;
376}
377
378/* Release resources used for watch path information. */
/* Counterpart of audit_get_nd(); either pointer may be NULL. */
379static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
380{
381	if (ndp) {
		/* Drop the path reference taken by path_lookup(), then the
		 * nameidata allocation itself. */
382		path_put(&ndp->path);
383		kfree(ndp);
384	}
385	if (ndw) {
386		path_put(&ndw->path);
387		kfree(ndw);
388	}
389}
390
391/* Associate the given rule with an existing parent inotify_watch.
392 * Caller must hold audit_filter_mutex. */
/* krule->watch arrives as a temporary watch holding two references
 * (the krule's and the initial one).  If a watch for the same path
 * already exists under @parent, the rule is repointed at it and the
 * temporary watch is released; otherwise the temporary watch becomes
 * the permanent one. */
393static void audit_add_to_parent(struct audit_krule *krule,
394				struct audit_parent *parent)
395{
396	struct audit_watch *w, *watch = krule->watch;
397	int watch_found = 0;
398
399	list_for_each_entry(w, &parent->watches, wlist) {
400		if (strcmp(watch->path, w->path))
401			continue;
402
403		watch_found = 1;
404
405		/* put krule's and initial refs to temporary watch */
406		audit_put_watch(watch);
407		audit_put_watch(watch);
408
		/* Reuse the already-registered watch: take a reference for
		 * the rule and repoint it. */
409		audit_get_watch(w);
410		krule->watch = watch = w;
411		break;
412	}
413
414	if (!watch_found) {
		/* No watch for this path yet -- keep the temporary watch.
		 * It takes its own reference on the parent. */
415		get_inotify_watch(&parent->wdata);
416		watch->parent = parent;
417
418		list_add(&watch->wlist, &parent->watches);
419	}
420	list_add(&krule->rlist, &watch->rules);
421}
422
423/* Find a matching watch entry, or add this one.
424 * Caller must hold audit_filter_mutex. */
/* The mutex is dropped immediately (path_lookup and inotify calls must
 * not run under it) and is re-taken on every path before returning, so
 * the caller's locking expectation is preserved. */
425int audit_add_watch(struct audit_krule *krule)
426{
427	struct audit_watch *watch = krule->watch;
428	struct inotify_watch *i_watch;
429	struct audit_parent *parent;
430	struct nameidata *ndp = NULL, *ndw = NULL;
431	int ret = 0;
432
433	mutex_unlock(&audit_filter_mutex);
434
435	/* Avoid calling path_lookup under audit_filter_mutex. */
436	ret = audit_get_nd(watch->path, &ndp, &ndw);
437	if (ret) {
438		/* caller expects mutex locked */
439		mutex_lock(&audit_filter_mutex);
440		goto error;
441	}
442
443	/* update watch filter fields */
	/* ndw is NULL when the watched path does not (yet) exist; in that
	 * case dev/ino keep their sentinel values. */
444	if (ndw) {
445		watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
446		watch->ino = ndw->path.dentry->d_inode->i_ino;
447	}
448
449	/* The audit_filter_mutex must not be held during inotify calls because
450	 * we hold it during inotify event callback processing. If an existing
451	 * inotify watch is found, inotify_find_watch() grabs a reference before
452	 * returning.
453	 */
454	if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
455			       &i_watch) < 0) {
		/* No inotify watch on the parent directory yet: create and
		 * register a new audit_parent for it. */
456		parent = audit_init_parent(ndp);
457		if (IS_ERR(parent)) {
458			/* caller expects mutex locked */
459			mutex_lock(&audit_filter_mutex);
460			ret = PTR_ERR(parent);
461			goto error;
462		}
463	} else
464		parent = container_of(i_watch, struct audit_parent, wdata);
465
466	mutex_lock(&audit_filter_mutex);
467
468	/* parent was moved before we took audit_filter_mutex */
469	if (parent->flags & AUDIT_PARENT_INVALID)
470		ret = -ENOENT;
471	else
472		audit_add_to_parent(krule, parent);
473
474	/* match get in audit_init_parent or inotify_find_watch */
475	put_inotify_watch(&parent->wdata);
476
477error:
478	audit_put_nd(ndp, ndw);		/* NULL args OK */
479	return ret;
480
481}
482
/* Detach @krule from its watch.  If that leaves the watch without rules
 * the watch is removed too, and if that in turn leaves the parent
 * without watches the parent is queued on @list for the caller to pass
 * to audit_inotify_unregister().
 * NOTE(review): presumably called with audit_filter_mutex held (the
 * comment below refers to "releasing audit_filter_mutex") -- confirm
 * at call sites. */
483void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
484{
485	struct audit_watch *watch = krule->watch;
486	struct audit_parent *parent = watch->parent;
487
488	list_del(&krule->rlist);
489
490	if (list_empty(&watch->rules)) {
		/* Last rule on this watch: tear the watch down as well. */
491		audit_remove_watch(watch);
492
493		if (list_empty(&parent->watches)) {
494			/* Put parent on the inotify un-registration
495			 * list. Grab a reference before releasing
496			 * audit_filter_mutex, to be released in
497			 * audit_inotify_unregister().
498			 * If filesystem is going away, just leave
499			 * the sucker alone, eviction will take
500			 * care of it. */
501			if (pin_inotify_watch(&parent->wdata))
502				list_add(&parent->ilist, list);
503		}
504	}
505}
506
507/* Update watch data in audit rules based on inotify events. */
/* inotify callback for audit's handle: @mask selects the action, @dname
 * names the affected child (for create/delete/move-in/move-out events),
 * @inode is the child's inode when available. */
508static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
509			 u32 cookie, const char *dname, struct inode *inode)
510{
511	struct audit_parent *parent;
512
513	parent = container_of(i_watch, struct audit_parent, wdata);
514
	/* Child appeared: refresh dev/ino in rules watching this name. */
515	if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
516		audit_update_watch(parent, dname, inode->i_sb->s_dev,
517				   inode->i_ino, 0);
	/* Child gone: invalidate dev/ino with the -1 sentinels. */
518	else if (mask & (IN_DELETE|IN_MOVED_FROM))
519		audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
520	/* inotify automatically removes the watch and sends IN_IGNORED */
521	else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
522		audit_remove_parent_watches(parent);
523	/* inotify does not remove the watch, so remove it manually */
524	else if(mask & IN_MOVE_SELF) {
525		audit_remove_parent_watches(parent);
526		inotify_remove_watch_locked(audit_ih, i_watch);
527	} else if (mask & IN_IGNORED)
		/* Final event for this watch: drop the reference held on
		 * the parent's inotify watch. */
528		put_inotify_watch(i_watch);
529}
530
/* Callbacks the inotify core invokes for audit's watch handle. */
531static const struct inotify_operations audit_inotify_ops = {
532	.handle_event	= audit_handle_ievent,
533	.destroy_watch	= audit_free_parent,
534};
535
/* Boot-time setup: create the shared inotify handle (audit_ih) used by
 * all audit watches. */
536static int __init audit_watch_init(void)
537{
538	audit_ih = inotify_init(&audit_inotify_ops);
539	if (IS_ERR(audit_ih))
		/* Failure is reported via audit_panic(); we still return 0,
		 * leaving audit_ih as an ERR_PTR. */
540		audit_panic("cannot initialize inotify handle");
541	return 0;
542}
543subsys_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a02..a70604047f3c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h> 30#include <linux/security.h>
32#include "audit.h" 31#include "audit.h"
33 32
@@ -44,36 +43,6 @@
44 * be written directly provided audit_filter_mutex is held. 43 * be written directly provided audit_filter_mutex is held.
45 */ 44 */
46 45
47/*
48 * Reference counting:
49 *
50 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
51 * event. Each audit_watch holds a reference to its associated parent.
52 *
53 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
54 * audit_remove_watch(). Additionally, an audit_watch may exist
55 * temporarily to assist in searching existing filter data. Each
56 * audit_krule holds a reference to its associated watch.
57 */
58
59struct audit_parent {
60 struct list_head ilist; /* entry in inotify registration list */
61 struct list_head watches; /* associated watches */
62 struct inotify_watch wdata; /* inotify watch data */
63 unsigned flags; /* status flags */
64};
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Audit filter lists, defined in <linux/audit.h> */ 46/* Audit filter lists, defined in <linux/audit.h> */
78struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { 47struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
79 LIST_HEAD_INIT(audit_filter_list[0]), 48 LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
97 66
98DEFINE_MUTEX(audit_filter_mutex); 67DEFINE_MUTEX(audit_filter_mutex);
99 68
100/* Inotify events we care about. */
101#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
102
103void audit_free_parent(struct inotify_watch *i_watch)
104{
105 struct audit_parent *parent;
106
107 parent = container_of(i_watch, struct audit_parent, wdata);
108 WARN_ON(!list_empty(&parent->watches));
109 kfree(parent);
110}
111
112static inline void audit_get_watch(struct audit_watch *watch)
113{
114 atomic_inc(&watch->count);
115}
116
117static void audit_put_watch(struct audit_watch *watch)
118{
119 if (atomic_dec_and_test(&watch->count)) {
120 WARN_ON(watch->parent);
121 WARN_ON(!list_empty(&watch->rules));
122 kfree(watch->path);
123 kfree(watch);
124 }
125}
126
127static void audit_remove_watch(struct audit_watch *watch)
128{
129 list_del(&watch->wlist);
130 put_inotify_watch(&watch->parent->wdata);
131 watch->parent = NULL;
132 audit_put_watch(watch); /* match initial get */
133}
134
135static inline void audit_free_rule(struct audit_entry *e) 69static inline void audit_free_rule(struct audit_entry *e)
136{ 70{
137 int i; 71 int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
156 audit_free_rule(e); 90 audit_free_rule(e);
157} 91}
158 92
159/* Initialize a parent watch entry. */
160static struct audit_parent *audit_init_parent(struct nameidata *ndp)
161{
162 struct audit_parent *parent;
163 s32 wd;
164
165 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
166 if (unlikely(!parent))
167 return ERR_PTR(-ENOMEM);
168
169 INIT_LIST_HEAD(&parent->watches);
170 parent->flags = 0;
171
172 inotify_init_watch(&parent->wdata);
173 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
174 get_inotify_watch(&parent->wdata);
175 wd = inotify_add_watch(audit_ih, &parent->wdata,
176 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
177 if (wd < 0) {
178 audit_free_parent(&parent->wdata);
179 return ERR_PTR(wd);
180 }
181
182 return parent;
183}
184
185/* Initialize a watch entry. */
186static struct audit_watch *audit_init_watch(char *path)
187{
188 struct audit_watch *watch;
189
190 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
191 if (unlikely(!watch))
192 return ERR_PTR(-ENOMEM);
193
194 INIT_LIST_HEAD(&watch->rules);
195 atomic_set(&watch->count, 1);
196 watch->path = path;
197 watch->dev = (dev_t)-1;
198 watch->ino = (unsigned long)-1;
199
200 return watch;
201}
202
203/* Initialize an audit filterlist entry. */ 93/* Initialize an audit filterlist entry. */
204static inline struct audit_entry *audit_init_entry(u32 field_count) 94static inline struct audit_entry *audit_init_entry(u32 field_count)
205{ 95{
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
260 return 0; 150 return 0;
261} 151}
262 152
263/* Translate a watch string to kernel respresentation. */
264static int audit_to_watch(struct audit_krule *krule, char *path, int len,
265 u32 op)
266{
267 struct audit_watch *watch;
268
269 if (!audit_ih)
270 return -EOPNOTSUPP;
271
272 if (path[0] != '/' || path[len-1] == '/' ||
273 krule->listnr != AUDIT_FILTER_EXIT ||
274 op != Audit_equal ||
275 krule->inode_f || krule->watch || krule->tree)
276 return -EINVAL;
277
278 watch = audit_init_watch(path);
279 if (IS_ERR(watch))
280 return PTR_ERR(watch);
281
282 audit_get_watch(watch);
283 krule->watch = watch;
284
285 return 0;
286}
287
288static __u32 *classes[AUDIT_SYSCALL_CLASSES]; 153static __u32 *classes[AUDIT_SYSCALL_CLASSES];
289 154
290int __init audit_register_class(int class, unsigned *list) 155int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
766 break; 631 break;
767 case AUDIT_WATCH: 632 case AUDIT_WATCH:
768 data->buflen += data->values[i] = 633 data->buflen += data->values[i] =
769 audit_pack_string(&bufp, krule->watch->path); 634 audit_pack_string(&bufp,
635 audit_watch_path(krule->watch));
770 break; 636 break;
771 case AUDIT_DIR: 637 case AUDIT_DIR:
772 data->buflen += data->values[i] = 638 data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
818 return 1; 684 return 1;
819 break; 685 break;
820 case AUDIT_WATCH: 686 case AUDIT_WATCH:
821 if (strcmp(a->watch->path, b->watch->path)) 687 if (strcmp(audit_watch_path(a->watch),
688 audit_watch_path(b->watch)))
822 return 1; 689 return 1;
823 break; 690 break;
824 case AUDIT_DIR: 691 case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
844 return 0; 711 return 0;
845} 712}
846 713
847/* Duplicate the given audit watch. The new watch's rules list is initialized
848 * to an empty list and wlist is undefined. */
849static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
850{
851 char *path;
852 struct audit_watch *new;
853
854 path = kstrdup(old->path, GFP_KERNEL);
855 if (unlikely(!path))
856 return ERR_PTR(-ENOMEM);
857
858 new = audit_init_watch(path);
859 if (IS_ERR(new)) {
860 kfree(path);
861 goto out;
862 }
863
864 new->dev = old->dev;
865 new->ino = old->ino;
866 get_inotify_watch(&old->parent->wdata);
867 new->parent = old->parent;
868
869out:
870 return new;
871}
872
873/* Duplicate LSM field information. The lsm_rule is opaque, so must be 714/* Duplicate LSM field information. The lsm_rule is opaque, so must be
874 * re-initialized. */ 715 * re-initialized. */
875static inline int audit_dupe_lsm_field(struct audit_field *df, 716static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
904 * rule with the new rule in the filterlist, then free the old rule. 745 * rule with the new rule in the filterlist, then free the old rule.
905 * The rlist element is undefined; list manipulations are handled apart from 746 * The rlist element is undefined; list manipulations are handled apart from
906 * the initial copy. */ 747 * the initial copy. */
907static struct audit_entry *audit_dupe_rule(struct audit_krule *old, 748struct audit_entry *audit_dupe_rule(struct audit_krule *old,
908 struct audit_watch *watch) 749 struct audit_watch *watch)
909{ 750{
910 u32 fcount = old->field_count; 751 u32 fcount = old->field_count;
911 struct audit_entry *entry; 752 struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
977 return entry; 818 return entry;
978} 819}
979 820
980/* Update inode info in audit rules based on filesystem event. */
981static void audit_update_watch(struct audit_parent *parent,
982 const char *dname, dev_t dev,
983 unsigned long ino, unsigned invalidating)
984{
985 struct audit_watch *owatch, *nwatch, *nextw;
986 struct audit_krule *r, *nextr;
987 struct audit_entry *oentry, *nentry;
988
989 mutex_lock(&audit_filter_mutex);
990 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
991 if (audit_compare_dname_path(dname, owatch->path, NULL))
992 continue;
993
994 /* If the update involves invalidating rules, do the inode-based
995 * filtering now, so we don't omit records. */
996 if (invalidating && current->audit_context)
997 audit_filter_inodes(current, current->audit_context);
998
999 nwatch = audit_dupe_watch(owatch);
1000 if (IS_ERR(nwatch)) {
1001 mutex_unlock(&audit_filter_mutex);
1002 audit_panic("error updating watch, skipping");
1003 return;
1004 }
1005 nwatch->dev = dev;
1006 nwatch->ino = ino;
1007
1008 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
1009
1010 oentry = container_of(r, struct audit_entry, rule);
1011 list_del(&oentry->rule.rlist);
1012 list_del_rcu(&oentry->list);
1013
1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1017 audit_panic("error updating watch, removing");
1018 } else {
1019 int h = audit_hash_ino((u32)ino);
1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1024 }
1025
1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
1027 }
1028
1029 if (audit_enabled) {
1030 struct audit_buffer *ab;
1031 ab = audit_log_start(NULL, GFP_NOFS,
1032 AUDIT_CONFIG_CHANGE);
1033 audit_log_format(ab, "auid=%u ses=%u",
1034 audit_get_loginuid(current),
1035 audit_get_sessionid(current));
1036 audit_log_format(ab,
1037 " op=updated rules specifying path=");
1038 audit_log_untrustedstring(ab, owatch->path);
1039 audit_log_format(ab, " with dev=%u ino=%lu\n",
1040 dev, ino);
1041 audit_log_format(ab, " list=%d res=1", r->listnr);
1042 audit_log_end(ab);
1043 }
1044 audit_remove_watch(owatch);
1045 goto add_watch_to_parent; /* event applies to a single watch */
1046 }
1047 mutex_unlock(&audit_filter_mutex);
1048 return;
1049
1050add_watch_to_parent:
1051 list_add(&nwatch->wlist, &parent->watches);
1052 mutex_unlock(&audit_filter_mutex);
1053 return;
1054}
1055
1056/* Remove all watches & rules associated with a parent that is going away. */
1057static void audit_remove_parent_watches(struct audit_parent *parent)
1058{
1059 struct audit_watch *w, *nextw;
1060 struct audit_krule *r, *nextr;
1061 struct audit_entry *e;
1062
1063 mutex_lock(&audit_filter_mutex);
1064 parent->flags |= AUDIT_PARENT_INVALID;
1065 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
1066 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
1067 e = container_of(r, struct audit_entry, rule);
1068 if (audit_enabled) {
1069 struct audit_buffer *ab;
1070 ab = audit_log_start(NULL, GFP_NOFS,
1071 AUDIT_CONFIG_CHANGE);
1072 audit_log_format(ab, "auid=%u ses=%u",
1073 audit_get_loginuid(current),
1074 audit_get_sessionid(current));
1075 audit_log_format(ab, " op=remove rule path=");
1076 audit_log_untrustedstring(ab, w->path);
1077 if (r->filterkey) {
1078 audit_log_format(ab, " key=");
1079 audit_log_untrustedstring(ab,
1080 r->filterkey);
1081 } else
1082 audit_log_format(ab, " key=(null)");
1083 audit_log_format(ab, " list=%d res=1",
1084 r->listnr);
1085 audit_log_end(ab);
1086 }
1087 list_del(&r->rlist);
1088 list_del(&r->list);
1089 list_del_rcu(&e->list);
1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1091 }
1092 audit_remove_watch(w);
1093 }
1094 mutex_unlock(&audit_filter_mutex);
1095}
1096
1097/* Unregister inotify watches for parents on in_list.
1098 * Generates an IN_IGNORED event. */
1099static void audit_inotify_unregister(struct list_head *in_list)
1100{
1101 struct audit_parent *p, *n;
1102
1103 list_for_each_entry_safe(p, n, in_list, ilist) {
1104 list_del(&p->ilist);
1105 inotify_rm_watch(audit_ih, &p->wdata);
1106 /* the unpin matching the pin in audit_do_del_rule() */
1107 unpin_inotify_watch(&p->wdata);
1108 }
1109}
1110
1111/* Find an existing audit rule. 821/* Find an existing audit rule.
1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 822 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1113static struct audit_entry *audit_find_rule(struct audit_entry *entry, 823static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
1145 return found; 855 return found;
1146} 856}
1147 857
1148/* Get path information necessary for adding watches. */
1149static int audit_get_nd(char *path, struct nameidata **ndp,
1150 struct nameidata **ndw)
1151{
1152 struct nameidata *ndparent, *ndwatch;
1153 int err;
1154
1155 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
1156 if (unlikely(!ndparent))
1157 return -ENOMEM;
1158
1159 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
1160 if (unlikely(!ndwatch)) {
1161 kfree(ndparent);
1162 return -ENOMEM;
1163 }
1164
1165 err = path_lookup(path, LOOKUP_PARENT, ndparent);
1166 if (err) {
1167 kfree(ndparent);
1168 kfree(ndwatch);
1169 return err;
1170 }
1171
1172 err = path_lookup(path, 0, ndwatch);
1173 if (err) {
1174 kfree(ndwatch);
1175 ndwatch = NULL;
1176 }
1177
1178 *ndp = ndparent;
1179 *ndw = ndwatch;
1180
1181 return 0;
1182}
1183
1184/* Release resources used for watch path information. */
1185static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
1186{
1187 if (ndp) {
1188 path_put(&ndp->path);
1189 kfree(ndp);
1190 }
1191 if (ndw) {
1192 path_put(&ndw->path);
1193 kfree(ndw);
1194 }
1195}
1196
1197/* Associate the given rule with an existing parent inotify_watch.
1198 * Caller must hold audit_filter_mutex. */
1199static void audit_add_to_parent(struct audit_krule *krule,
1200 struct audit_parent *parent)
1201{
1202 struct audit_watch *w, *watch = krule->watch;
1203 int watch_found = 0;
1204
1205 list_for_each_entry(w, &parent->watches, wlist) {
1206 if (strcmp(watch->path, w->path))
1207 continue;
1208
1209 watch_found = 1;
1210
1211 /* put krule's and initial refs to temporary watch */
1212 audit_put_watch(watch);
1213 audit_put_watch(watch);
1214
1215 audit_get_watch(w);
1216 krule->watch = watch = w;
1217 break;
1218 }
1219
1220 if (!watch_found) {
1221 get_inotify_watch(&parent->wdata);
1222 watch->parent = parent;
1223
1224 list_add(&watch->wlist, &parent->watches);
1225 }
1226 list_add(&krule->rlist, &watch->rules);
1227}
1228
1229/* Find a matching watch entry, or add this one.
1230 * Caller must hold audit_filter_mutex. */
1231static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1232 struct nameidata *ndw)
1233{
1234 struct audit_watch *watch = krule->watch;
1235 struct inotify_watch *i_watch;
1236 struct audit_parent *parent;
1237 int ret = 0;
1238
1239 /* update watch filter fields */
1240 if (ndw) {
1241 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
1242 watch->ino = ndw->path.dentry->d_inode->i_ino;
1243 }
1244
1245 /* The audit_filter_mutex must not be held during inotify calls because
1246 * we hold it during inotify event callback processing. If an existing
1247 * inotify watch is found, inotify_find_watch() grabs a reference before
1248 * returning.
1249 */
1250 mutex_unlock(&audit_filter_mutex);
1251
1252 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
1253 &i_watch) < 0) {
1254 parent = audit_init_parent(ndp);
1255 if (IS_ERR(parent)) {
1256 /* caller expects mutex locked */
1257 mutex_lock(&audit_filter_mutex);
1258 return PTR_ERR(parent);
1259 }
1260 } else
1261 parent = container_of(i_watch, struct audit_parent, wdata);
1262
1263 mutex_lock(&audit_filter_mutex);
1264
1265 /* parent was moved before we took audit_filter_mutex */
1266 if (parent->flags & AUDIT_PARENT_INVALID)
1267 ret = -ENOENT;
1268 else
1269 audit_add_to_parent(krule, parent);
1270
1271 /* match get in audit_init_parent or inotify_find_watch */
1272 put_inotify_watch(&parent->wdata);
1273 return ret;
1274}
1275
1276static u64 prio_low = ~0ULL/2; 858static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1; 859static u64 prio_high = ~0ULL/2 - 1;
1278 860
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
1282 struct audit_entry *e; 864 struct audit_entry *e;
1283 struct audit_watch *watch = entry->rule.watch; 865 struct audit_watch *watch = entry->rule.watch;
1284 struct audit_tree *tree = entry->rule.tree; 866 struct audit_tree *tree = entry->rule.tree;
1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list; 867 struct list_head *list;
1287 int h, err; 868 int h, err;
1288#ifdef CONFIG_AUDITSYSCALL 869#ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
1296 877
1297 mutex_lock(&audit_filter_mutex); 878 mutex_lock(&audit_filter_mutex);
1298 e = audit_find_rule(entry, &list); 879 e = audit_find_rule(entry, &list);
1299 mutex_unlock(&audit_filter_mutex);
1300 if (e) { 880 if (e) {
881 mutex_unlock(&audit_filter_mutex);
1301 err = -EEXIST; 882 err = -EEXIST;
1302 /* normally audit_add_tree_rule() will free it on failure */ 883 /* normally audit_add_tree_rule() will free it on failure */
1303 if (tree) 884 if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
1305 goto error; 886 goto error;
1306 } 887 }
1307 888
1308 /* Avoid calling path_lookup under audit_filter_mutex. */
1309 if (watch) {
1310 err = audit_get_nd(watch->path, &ndp, &ndw);
1311 if (err)
1312 goto error;
1313 }
1314
1315 mutex_lock(&audit_filter_mutex);
1316 if (watch) { 889 if (watch) {
1317 /* audit_filter_mutex is dropped and re-taken during this call */ 890 /* audit_filter_mutex is dropped and re-taken during this call */
1318 err = audit_add_watch(&entry->rule, ndp, ndw); 891 err = audit_add_watch(&entry->rule);
1319 if (err) { 892 if (err) {
1320 mutex_unlock(&audit_filter_mutex); 893 mutex_unlock(&audit_filter_mutex);
1321 goto error; 894 goto error;
1322 } 895 }
1323 h = audit_hash_ino((u32)watch->ino); 896 /* entry->rule.watch may have changed during audit_add_watch() */
897 watch = entry->rule.watch;
898 h = audit_hash_ino((u32)audit_watch_inode(watch));
1324 list = &audit_inode_hash[h]; 899 list = &audit_inode_hash[h];
1325 } 900 }
1326 if (tree) { 901 if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
1358#endif 933#endif
1359 mutex_unlock(&audit_filter_mutex); 934 mutex_unlock(&audit_filter_mutex);
1360 935
1361 audit_put_nd(ndp, ndw); /* NULL args OK */
1362 return 0; 936 return 0;
1363 937
1364error: 938error:
1365 audit_put_nd(ndp, ndw); /* NULL args OK */
1366 if (watch) 939 if (watch)
1367 audit_put_watch(watch); /* tmp watch, matches initial get */ 940 audit_put_watch(watch); /* tmp watch, matches initial get */
1368 return err; 941 return err;
@@ -1372,7 +945,7 @@ error:
1372static inline int audit_del_rule(struct audit_entry *entry) 945static inline int audit_del_rule(struct audit_entry *entry)
1373{ 946{
1374 struct audit_entry *e; 947 struct audit_entry *e;
1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 948 struct audit_watch *watch = entry->rule.watch;
1376 struct audit_tree *tree = entry->rule.tree; 949 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list; 950 struct list_head *list;
1378 LIST_HEAD(inotify_list); 951 LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1394 goto out; 967 goto out;
1395 } 968 }
1396 969
1397 watch = e->rule.watch; 970 if (e->rule.watch)
1398 if (watch) { 971 audit_remove_watch_rule(&e->rule, &inotify_list);
1399 struct audit_parent *parent = watch->parent;
1400
1401 list_del(&e->rule.rlist);
1402
1403 if (list_empty(&watch->rules)) {
1404 audit_remove_watch(watch);
1405
1406 if (list_empty(&parent->watches)) {
1407 /* Put parent on the inotify un-registration
1408 * list. Grab a reference before releasing
1409 * audit_filter_mutex, to be released in
1410 * audit_inotify_unregister().
1411 * If filesystem is going away, just leave
1412 * the sucker alone, eviction will take
1413 * care of it.
1414 */
1415 if (pin_inotify_watch(&parent->wdata))
1416 list_add(&parent->ilist, &inotify_list);
1417 }
1418 }
1419 }
1420 972
1421 if (e->rule.tree) 973 if (e->rule.tree)
1422 audit_remove_tree_rule(&e->rule); 974 audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1438 audit_inotify_unregister(&inotify_list); 990 audit_inotify_unregister(&inotify_list);
1439 991
1440out: 992out:
1441 if (tmp_watch) 993 if (watch)
1442 audit_put_watch(tmp_watch); /* match initial get */ 994 audit_put_watch(watch); /* match initial get */
1443 if (tree) 995 if (tree)
1444 audit_put_tree(tree); /* that's the temporary one */ 996 audit_put_tree(tree); /* that's the temporary one */
1445 997
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1527 security_release_secctx(ctx, len); 1079 security_release_secctx(ctx, len);
1528 } 1080 }
1529 } 1081 }
1530 audit_log_format(ab, " op=%s rule key=", action); 1082 audit_log_format(ab, " op=");
1531 if (rule->filterkey) 1083 audit_log_string(ab, action);
1532 audit_log_untrustedstring(ab, rule->filterkey); 1084 audit_log_key(ab, rule->filterkey);
1533 else
1534 audit_log_format(ab, "(null)");
1535 audit_log_format(ab, " list=%d res=%d", rule->listnr, res); 1085 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1536 audit_log_end(ab); 1086 audit_log_end(ab);
1537} 1087}
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1595 return PTR_ERR(entry); 1145 return PTR_ERR(entry);
1596 1146
1597 err = audit_add_rule(entry); 1147 err = audit_add_rule(entry);
1598 audit_log_rule_change(loginuid, sessionid, sid, "add", 1148 audit_log_rule_change(loginuid, sessionid, sid, "add rule",
1599 &entry->rule, !err); 1149 &entry->rule, !err);
1600 1150
1601 if (err) 1151 if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1611 return PTR_ERR(entry); 1161 return PTR_ERR(entry);
1612 1162
1613 err = audit_del_rule(entry); 1163 err = audit_del_rule(entry);
1614 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1164 audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
1615 &entry->rule, !err); 1165 &entry->rule, !err);
1616 1166
1617 audit_free_rule(entry); 1167 audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
1793 list_del(&r->list); 1343 list_del(&r->list);
1794 } else { 1344 } else {
1795 if (watch) { 1345 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules); 1346 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1797 list_del(&r->rlist); 1347 list_del(&r->rlist);
1798 } else if (tree) 1348 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist); 1349 list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
1829 1379
1830 return err; 1380 return err;
1831} 1381}
1832
1833/* Update watch data in audit rules based on inotify events. */
1834void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1835 u32 cookie, const char *dname, struct inode *inode)
1836{
1837 struct audit_parent *parent;
1838
1839 parent = container_of(i_watch, struct audit_parent, wdata);
1840
1841 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1842 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1843 inode->i_ino, 0);
1844 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1845 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1846 /* inotify automatically removes the watch and sends IN_IGNORED */
1847 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1848 audit_remove_parent_watches(parent);
1849 /* inotify does not remove the watch, so remove it manually */
1850 else if(mask & IN_MOVE_SELF) {
1851 audit_remove_parent_watches(parent);
1852 inotify_remove_watch_locked(audit_ih, i_watch);
1853 } else if (mask & IN_IGNORED)
1854 put_inotify_watch(i_watch);
1855}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f414..68d3c6a0ecd6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count; 201 int tree_count;
202 struct list_head killed_trees;
202 203
203 int type; 204 int type;
204 union { 205 union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
548 } 549 }
549 break; 550 break;
550 case AUDIT_WATCH: 551 case AUDIT_WATCH:
551 if (name && rule->watch->ino != (unsigned long)-1) 552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
552 result = (name->dev == rule->watch->dev && 553 result = (name->dev == audit_watch_dev(rule->watch) &&
553 name->ino == rule->watch->ino); 554 name->ino == audit_watch_inode(rule->watch));
554 break; 555 break;
555 case AUDIT_DIR: 556 case AUDIT_DIR:
556 if (ctx) 557 if (ctx)
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
853 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) 854 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
854 return NULL; 855 return NULL;
855 audit_zero_context(context, state); 856 audit_zero_context(context, state);
857 INIT_LIST_HEAD(&context->killed_trees);
856 return context; 858 return context;
857} 859}
858 860
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1024{ 1026{
1025 char arg_num_len_buf[12]; 1027 char arg_num_len_buf[12];
1026 const char __user *tmp_p = p; 1028 const char __user *tmp_p = p;
1027 /* how many digits are in arg_num? 3 is the length of " a=" */ 1029 /* how many digits are in arg_num? 5 is the length of ' a=""' */
1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1030 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
1029 size_t len, len_left, to_send; 1031 size_t len, len_left, to_send;
1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1032 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
1031 unsigned int i, has_cntl = 0, too_long = 0; 1033 unsigned int i, has_cntl = 0, too_long = 0;
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1137 if (has_cntl) 1139 if (has_cntl)
1138 audit_log_n_hex(*ab, buf, to_send); 1140 audit_log_n_hex(*ab, buf, to_send);
1139 else 1141 else
1140 audit_log_format(*ab, "\"%s\"", buf); 1142 audit_log_string(*ab, buf);
1141 1143
1142 p += to_send; 1144 p += to_send;
1143 len_left -= to_send; 1145 len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1372 1374
1373 1375
1374 audit_log_task_info(ab, tsk); 1376 audit_log_task_info(ab, tsk);
1375 if (context->filterkey) { 1377 audit_log_key(ab, context->filterkey);
1376 audit_log_format(ab, " key=");
1377 audit_log_untrustedstring(ab, context->filterkey);
1378 } else
1379 audit_log_format(ab, " key=(null)");
1380 audit_log_end(ab); 1378 audit_log_end(ab);
1381 1379
1382 for (aux = context->aux; aux; aux = aux->next) { 1380 for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
1549 /* that can happen only if we are called from do_exit() */ 1547 /* that can happen only if we are called from do_exit() */
1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1548 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1551 audit_log_exit(context, tsk); 1549 audit_log_exit(context, tsk);
1550 if (!list_empty(&context->killed_trees))
1551 audit_kill_trees(&context->killed_trees);
1552 1552
1553 audit_free_context(context); 1553 audit_free_context(context);
1554} 1554}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
1692 context->in_syscall = 0; 1692 context->in_syscall = 0;
1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1694 1694
1695 if (!list_empty(&context->killed_trees))
1696 audit_kill_trees(&context->killed_trees);
1697
1695 if (context->previous) { 1698 if (context->previous) {
1696 struct audit_context *new_context = context->previous; 1699 struct audit_context *new_context = context->previous;
1697 context->previous = NULL; 1700 context->previous = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
2525 audit_log_format(ab, " sig=%ld", signr); 2528 audit_log_format(ab, " sig=%ld", signr);
2526 audit_log_end(ab); 2529 audit_log_end(ab);
2527} 2530}
2531
2532struct list_head *audit_killed_trees(void)
2533{
2534 struct audit_context *ctx = current->audit_context;
2535 if (likely(!ctx || !ctx->in_syscall))
2536 return NULL;
2537 return &ctx->killed_trees;
2538}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8d..8ce10043e4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
34 * an ongoing cpu hotplug operation. 34 * an ongoing cpu hotplug operation.
35 */ 35 */
36 int refcount; 36 int refcount;
37} cpu_hotplug; 37} cpu_hotplug = {
38 38 .active_writer = NULL,
39void __init cpu_hotplug_init(void) 39 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
40{ 40 .refcount = 0,
41 cpu_hotplug.active_writer = NULL; 41};
42 mutex_init(&cpu_hotplug.lock);
43 cpu_hotplug.refcount = 0;
44}
45 42
46#ifdef CONFIG_HOTPLUG_CPU 43#ifdef CONFIG_HOTPLUG_CPU
47 44
diff --git a/kernel/exit.c b/kernel/exit.c
index 13ae64001fec..628d41f0dd54 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1197,8 +1197,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1197 } 1197 }
1198 1198
1199 traced = ptrace_reparented(p); 1199 traced = ptrace_reparented(p);
1200 1200 /*
1201 if (likely(!traced)) { 1201 * It can be ptraced but not reparented, check
1202 * !task_detached() to filter out sub-threads.
1203 */
1204 if (likely(!traced) && likely(!task_detached(p))) {
1202 struct signal_struct *psig; 1205 struct signal_struct *psig;
1203 struct signal_struct *sig; 1206 struct signal_struct *sig;
1204 1207
diff --git a/kernel/futex.c b/kernel/futex.c
index 80b5ce716596..794c862125fe 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -284,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
284 drop_futex_key_refs(key); 284 drop_futex_key_refs(key);
285} 285}
286 286
287/*
288 * fault_in_user_writeable - fault in user address and verify RW access
289 * @uaddr: pointer to faulting user space address
290 *
291 * Slow path to fixup the fault we just took in the atomic write
292 * access to @uaddr.
293 *
294 * We have no generic implementation of a non destructive write to the
295 * user address. We know that we faulted in the atomic pagefault
296 * disabled section so we can as well avoid the #PF overhead by
297 * calling get_user_pages() right away.
298 */
299static int fault_in_user_writeable(u32 __user *uaddr)
300{
301 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
302 1, 1, 0, NULL, NULL);
303 return ret < 0 ? ret : 0;
304}
305
287/** 306/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex 307 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in 308 * @hb: the hash bucket the futex_q's reside in
@@ -896,7 +915,6 @@ retry:
896retry_private: 915retry_private:
897 op_ret = futex_atomic_op_inuser(op, uaddr2); 916 op_ret = futex_atomic_op_inuser(op, uaddr2);
898 if (unlikely(op_ret < 0)) { 917 if (unlikely(op_ret < 0)) {
899 u32 dummy;
900 918
901 double_unlock_hb(hb1, hb2); 919 double_unlock_hb(hb1, hb2);
902 920
@@ -914,7 +932,7 @@ retry_private:
914 goto out_put_keys; 932 goto out_put_keys;
915 } 933 }
916 934
917 ret = get_user(dummy, uaddr2); 935 ret = fault_in_user_writeable(uaddr2);
918 if (ret) 936 if (ret)
919 goto out_put_keys; 937 goto out_put_keys;
920 938
@@ -1204,7 +1222,7 @@ retry_private:
1204 double_unlock_hb(hb1, hb2); 1222 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2); 1223 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1); 1224 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2); 1225 ret = fault_in_user_writeable(uaddr2);
1208 if (!ret) 1226 if (!ret)
1209 goto retry; 1227 goto retry;
1210 goto out; 1228 goto out;
@@ -1482,7 +1500,7 @@ retry:
1482handle_fault: 1500handle_fault:
1483 spin_unlock(q->lock_ptr); 1501 spin_unlock(q->lock_ptr);
1484 1502
1485 ret = get_user(uval, uaddr); 1503 ret = fault_in_user_writeable(uaddr);
1486 1504
1487 spin_lock(q->lock_ptr); 1505 spin_lock(q->lock_ptr);
1488 1506
@@ -1807,7 +1825,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1807{ 1825{
1808 struct hrtimer_sleeper timeout, *to = NULL; 1826 struct hrtimer_sleeper timeout, *to = NULL;
1809 struct futex_hash_bucket *hb; 1827 struct futex_hash_bucket *hb;
1810 u32 uval;
1811 struct futex_q q; 1828 struct futex_q q;
1812 int res, ret; 1829 int res, ret;
1813 1830
@@ -1909,16 +1926,9 @@ out:
1909 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1926 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1910 1927
1911uaddr_faulted: 1928uaddr_faulted:
1912 /*
1913 * We have to r/w *(int __user *)uaddr, and we have to modify it
1914 * atomically. Therefore, if we continue to fault after get_user()
1915 * below, we need to handle the fault ourselves, while still holding
1916 * the mmap_sem. This can occur if the uaddr is under contention as
1917 * we have to drop the mmap_sem in order to call get_user().
1918 */
1919 queue_unlock(&q, hb); 1929 queue_unlock(&q, hb);
1920 1930
1921 ret = get_user(uval, uaddr); 1931 ret = fault_in_user_writeable(uaddr);
1922 if (ret) 1932 if (ret)
1923 goto out_put_key; 1933 goto out_put_key;
1924 1934
@@ -2013,17 +2023,10 @@ out:
2013 return ret; 2023 return ret;
2014 2024
2015pi_faulted: 2025pi_faulted:
2016 /*
2017 * We have to r/w *(int __user *)uaddr, and we have to modify it
2018 * atomically. Therefore, if we continue to fault after get_user()
2019 * below, we need to handle the fault ourselves, while still holding
2020 * the mmap_sem. This can occur if the uaddr is under contention as
2021 * we have to drop the mmap_sem in order to call get_user().
2022 */
2023 spin_unlock(&hb->lock); 2026 spin_unlock(&hb->lock);
2024 put_futex_key(fshared, &key); 2027 put_futex_key(fshared, &key);
2025 2028
2026 ret = get_user(uval, uaddr); 2029 ret = fault_in_user_writeable(uaddr);
2027 if (!ret) 2030 if (!ret)
2028 goto retry; 2031 goto retry;
2029 2032
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index aaf5c9d05770..50da67672901 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -856,7 +856,7 @@ EXPORT_SYMBOL(free_irq);
856 * still called in hard interrupt context and has to check 856 * still called in hard interrupt context and has to check
857 * whether the interrupt originates from the device. If yes it 857 * whether the interrupt originates from the device. If yes it
858 * needs to disable the interrupt on the device and return 858 * needs to disable the interrupt on the device and return
859 * IRQ_THREAD_WAKE which will wake up the handler thread and run 859 * IRQ_WAKE_THREAD which will wake up the handler thread and run
860 * @thread_fn. This split handler design is necessary to support 860 * @thread_fn. This split handler design is necessary to support
861 * shared interrupts. 861 * shared interrupts.
862 * 862 *
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 29b685f551aa..1a933a221ea4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -124,7 +124,7 @@ void perf_enable(void)
124 124
125static void get_ctx(struct perf_counter_context *ctx) 125static void get_ctx(struct perf_counter_context *ctx)
126{ 126{
127 atomic_inc(&ctx->refcount); 127 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
128} 128}
129 129
130static void free_ctx(struct rcu_head *head) 130static void free_ctx(struct rcu_head *head)
@@ -175,6 +175,11 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
175 spin_unlock_irqrestore(&ctx->lock, *flags); 175 spin_unlock_irqrestore(&ctx->lock, *flags);
176 goto retry; 176 goto retry;
177 } 177 }
178
179 if (!atomic_inc_not_zero(&ctx->refcount)) {
180 spin_unlock_irqrestore(&ctx->lock, *flags);
181 ctx = NULL;
182 }
178 } 183 }
179 rcu_read_unlock(); 184 rcu_read_unlock();
180 return ctx; 185 return ctx;
@@ -193,7 +198,6 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta
193 ctx = perf_lock_task_context(task, &flags); 198 ctx = perf_lock_task_context(task, &flags);
194 if (ctx) { 199 if (ctx) {
195 ++ctx->pin_count; 200 ++ctx->pin_count;
196 get_ctx(ctx);
197 spin_unlock_irqrestore(&ctx->lock, flags); 201 spin_unlock_irqrestore(&ctx->lock, flags);
198 } 202 }
199 return ctx; 203 return ctx;
@@ -1283,7 +1287,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1283 if (!interrupts) { 1287 if (!interrupts) {
1284 perf_disable(); 1288 perf_disable();
1285 counter->pmu->disable(counter); 1289 counter->pmu->disable(counter);
1286 atomic_set(&hwc->period_left, 0); 1290 atomic64_set(&hwc->period_left, 0);
1287 counter->pmu->enable(counter); 1291 counter->pmu->enable(counter);
1288 perf_enable(); 1292 perf_enable();
1289 } 1293 }
@@ -1459,11 +1463,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1459 put_ctx(parent_ctx); 1463 put_ctx(parent_ctx);
1460 ctx->parent_ctx = NULL; /* no longer a clone */ 1464 ctx->parent_ctx = NULL; /* no longer a clone */
1461 } 1465 }
1462 /*
1463 * Get an extra reference before dropping the lock so that
1464 * this context won't get freed if the task exits.
1465 */
1466 get_ctx(ctx);
1467 spin_unlock_irqrestore(&ctx->lock, flags); 1466 spin_unlock_irqrestore(&ctx->lock, flags);
1468 } 1467 }
1469 1468
@@ -1553,7 +1552,7 @@ static int perf_release(struct inode *inode, struct file *file)
1553static ssize_t 1552static ssize_t
1554perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) 1553perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1555{ 1554{
1556 u64 values[3]; 1555 u64 values[4];
1557 int n; 1556 int n;
1558 1557
1559 /* 1558 /*
@@ -1620,22 +1619,6 @@ static void perf_counter_reset(struct perf_counter *counter)
1620 perf_counter_update_userpage(counter); 1619 perf_counter_update_userpage(counter);
1621} 1620}
1622 1621
1623static void perf_counter_for_each_sibling(struct perf_counter *counter,
1624 void (*func)(struct perf_counter *))
1625{
1626 struct perf_counter_context *ctx = counter->ctx;
1627 struct perf_counter *sibling;
1628
1629 WARN_ON_ONCE(ctx->parent_ctx);
1630 mutex_lock(&ctx->mutex);
1631 counter = counter->group_leader;
1632
1633 func(counter);
1634 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1635 func(sibling);
1636 mutex_unlock(&ctx->mutex);
1637}
1638
1639/* 1622/*
1640 * Holding the top-level counter's child_mutex means that any 1623 * Holding the top-level counter's child_mutex means that any
1641 * descendant process that has inherited this counter will block 1624 * descendant process that has inherited this counter will block
@@ -1658,14 +1641,18 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
1658static void perf_counter_for_each(struct perf_counter *counter, 1641static void perf_counter_for_each(struct perf_counter *counter,
1659 void (*func)(struct perf_counter *)) 1642 void (*func)(struct perf_counter *))
1660{ 1643{
1661 struct perf_counter *child; 1644 struct perf_counter_context *ctx = counter->ctx;
1645 struct perf_counter *sibling;
1662 1646
1663 WARN_ON_ONCE(counter->ctx->parent_ctx); 1647 WARN_ON_ONCE(ctx->parent_ctx);
1664 mutex_lock(&counter->child_mutex); 1648 mutex_lock(&ctx->mutex);
1665 perf_counter_for_each_sibling(counter, func); 1649 counter = counter->group_leader;
1666 list_for_each_entry(child, &counter->child_list, child_list) 1650
1667 perf_counter_for_each_sibling(child, func); 1651 perf_counter_for_each_child(counter, func);
1668 mutex_unlock(&counter->child_mutex); 1652 func(counter);
1653 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1654 perf_counter_for_each_child(counter, func);
1655 mutex_unlock(&ctx->mutex);
1669} 1656}
1670 1657
1671static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) 1658static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
@@ -1806,6 +1793,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1806 struct perf_mmap_data *data; 1793 struct perf_mmap_data *data;
1807 int ret = VM_FAULT_SIGBUS; 1794 int ret = VM_FAULT_SIGBUS;
1808 1795
1796 if (vmf->flags & FAULT_FLAG_MKWRITE) {
1797 if (vmf->pgoff == 0)
1798 ret = 0;
1799 return ret;
1800 }
1801
1809 rcu_read_lock(); 1802 rcu_read_lock();
1810 data = rcu_dereference(counter->data); 1803 data = rcu_dereference(counter->data);
1811 if (!data) 1804 if (!data)
@@ -1819,9 +1812,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1819 if ((unsigned)nr > data->nr_pages) 1812 if ((unsigned)nr > data->nr_pages)
1820 goto unlock; 1813 goto unlock;
1821 1814
1815 if (vmf->flags & FAULT_FLAG_WRITE)
1816 goto unlock;
1817
1822 vmf->page = virt_to_page(data->data_pages[nr]); 1818 vmf->page = virt_to_page(data->data_pages[nr]);
1823 } 1819 }
1820
1824 get_page(vmf->page); 1821 get_page(vmf->page);
1822 vmf->page->mapping = vma->vm_file->f_mapping;
1823 vmf->page->index = vmf->pgoff;
1824
1825 ret = 0; 1825 ret = 0;
1826unlock: 1826unlock:
1827 rcu_read_unlock(); 1827 rcu_read_unlock();
@@ -1874,6 +1874,14 @@ fail:
1874 return -ENOMEM; 1874 return -ENOMEM;
1875} 1875}
1876 1876
1877static void perf_mmap_free_page(unsigned long addr)
1878{
1879 struct page *page = virt_to_page(addr);
1880
1881 page->mapping = NULL;
1882 __free_page(page);
1883}
1884
1877static void __perf_mmap_data_free(struct rcu_head *rcu_head) 1885static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1878{ 1886{
1879 struct perf_mmap_data *data; 1887 struct perf_mmap_data *data;
@@ -1881,9 +1889,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1881 1889
1882 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 1890 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
1883 1891
1884 free_page((unsigned long)data->user_page); 1892 perf_mmap_free_page((unsigned long)data->user_page);
1885 for (i = 0; i < data->nr_pages; i++) 1893 for (i = 0; i < data->nr_pages; i++)
1886 free_page((unsigned long)data->data_pages[i]); 1894 perf_mmap_free_page((unsigned long)data->data_pages[i]);
1895
1887 kfree(data); 1896 kfree(data);
1888} 1897}
1889 1898
@@ -1920,9 +1929,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
1920} 1929}
1921 1930
1922static struct vm_operations_struct perf_mmap_vmops = { 1931static struct vm_operations_struct perf_mmap_vmops = {
1923 .open = perf_mmap_open, 1932 .open = perf_mmap_open,
1924 .close = perf_mmap_close, 1933 .close = perf_mmap_close,
1925 .fault = perf_mmap_fault, 1934 .fault = perf_mmap_fault,
1935 .page_mkwrite = perf_mmap_fault,
1926}; 1936};
1927 1937
1928static int perf_mmap(struct file *file, struct vm_area_struct *vma) 1938static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1936,7 +1946,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1936 long user_extra, extra; 1946 long user_extra, extra;
1937 int ret = 0; 1947 int ret = 0;
1938 1948
1939 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) 1949 if (!(vma->vm_flags & VM_SHARED))
1940 return -EINVAL; 1950 return -EINVAL;
1941 1951
1942 vma_size = vma->vm_end - vma->vm_start; 1952 vma_size = vma->vm_end - vma->vm_start;
@@ -1995,10 +2005,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1995 atomic_long_add(user_extra, &user->locked_vm); 2005 atomic_long_add(user_extra, &user->locked_vm);
1996 vma->vm_mm->locked_vm += extra; 2006 vma->vm_mm->locked_vm += extra;
1997 counter->data->nr_locked = extra; 2007 counter->data->nr_locked = extra;
2008 if (vma->vm_flags & VM_WRITE)
2009 counter->data->writable = 1;
2010
1998unlock: 2011unlock:
1999 mutex_unlock(&counter->mmap_mutex); 2012 mutex_unlock(&counter->mmap_mutex);
2000 2013
2001 vma->vm_flags &= ~VM_MAYWRITE;
2002 vma->vm_flags |= VM_RESERVED; 2014 vma->vm_flags |= VM_RESERVED;
2003 vma->vm_ops = &perf_mmap_vmops; 2015 vma->vm_ops = &perf_mmap_vmops;
2004 2016
@@ -2175,11 +2187,38 @@ struct perf_output_handle {
2175 unsigned long head; 2187 unsigned long head;
2176 unsigned long offset; 2188 unsigned long offset;
2177 int nmi; 2189 int nmi;
2178 int overflow; 2190 int sample;
2179 int locked; 2191 int locked;
2180 unsigned long flags; 2192 unsigned long flags;
2181}; 2193};
2182 2194
2195static bool perf_output_space(struct perf_mmap_data *data,
2196 unsigned int offset, unsigned int head)
2197{
2198 unsigned long tail;
2199 unsigned long mask;
2200
2201 if (!data->writable)
2202 return true;
2203
2204 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2205 /*
2206 * Userspace could choose to issue a mb() before updating the tail
2207 * pointer. So that all reads will be completed before the write is
2208 * issued.
2209 */
2210 tail = ACCESS_ONCE(data->user_page->data_tail);
2211 smp_rmb();
2212
2213 offset = (offset - tail) & mask;
2214 head = (head - tail) & mask;
2215
2216 if ((int)(head - offset) < 0)
2217 return false;
2218
2219 return true;
2220}
2221
2183static void perf_output_wakeup(struct perf_output_handle *handle) 2222static void perf_output_wakeup(struct perf_output_handle *handle)
2184{ 2223{
2185 atomic_set(&handle->data->poll, POLL_IN); 2224 atomic_set(&handle->data->poll, POLL_IN);
@@ -2270,12 +2309,57 @@ out:
2270 local_irq_restore(handle->flags); 2309 local_irq_restore(handle->flags);
2271} 2310}
2272 2311
2312static void perf_output_copy(struct perf_output_handle *handle,
2313 const void *buf, unsigned int len)
2314{
2315 unsigned int pages_mask;
2316 unsigned int offset;
2317 unsigned int size;
2318 void **pages;
2319
2320 offset = handle->offset;
2321 pages_mask = handle->data->nr_pages - 1;
2322 pages = handle->data->data_pages;
2323
2324 do {
2325 unsigned int page_offset;
2326 int nr;
2327
2328 nr = (offset >> PAGE_SHIFT) & pages_mask;
2329 page_offset = offset & (PAGE_SIZE - 1);
2330 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2331
2332 memcpy(pages[nr] + page_offset, buf, size);
2333
2334 len -= size;
2335 buf += size;
2336 offset += size;
2337 } while (len);
2338
2339 handle->offset = offset;
2340
2341 /*
2342 * Check we didn't copy past our reservation window, taking the
2343 * possible unsigned int wrap into account.
2344 */
2345 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2346}
2347
2348#define perf_output_put(handle, x) \
2349 perf_output_copy((handle), &(x), sizeof(x))
2350
2273static int perf_output_begin(struct perf_output_handle *handle, 2351static int perf_output_begin(struct perf_output_handle *handle,
2274 struct perf_counter *counter, unsigned int size, 2352 struct perf_counter *counter, unsigned int size,
2275 int nmi, int overflow) 2353 int nmi, int sample)
2276{ 2354{
2277 struct perf_mmap_data *data; 2355 struct perf_mmap_data *data;
2278 unsigned int offset, head; 2356 unsigned int offset, head;
2357 int have_lost;
2358 struct {
2359 struct perf_event_header header;
2360 u64 id;
2361 u64 lost;
2362 } lost_event;
2279 2363
2280 /* 2364 /*
2281 * For inherited counters we send all the output towards the parent. 2365 * For inherited counters we send all the output towards the parent.
@@ -2288,19 +2372,25 @@ static int perf_output_begin(struct perf_output_handle *handle,
2288 if (!data) 2372 if (!data)
2289 goto out; 2373 goto out;
2290 2374
2291 handle->data = data; 2375 handle->data = data;
2292 handle->counter = counter; 2376 handle->counter = counter;
2293 handle->nmi = nmi; 2377 handle->nmi = nmi;
2294 handle->overflow = overflow; 2378 handle->sample = sample;
2295 2379
2296 if (!data->nr_pages) 2380 if (!data->nr_pages)
2297 goto fail; 2381 goto fail;
2298 2382
2383 have_lost = atomic_read(&data->lost);
2384 if (have_lost)
2385 size += sizeof(lost_event);
2386
2299 perf_output_lock(handle); 2387 perf_output_lock(handle);
2300 2388
2301 do { 2389 do {
2302 offset = head = atomic_long_read(&data->head); 2390 offset = head = atomic_long_read(&data->head);
2303 head += size; 2391 head += size;
2392 if (unlikely(!perf_output_space(data, offset, head)))
2393 goto fail;
2304 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 2394 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2305 2395
2306 handle->offset = offset; 2396 handle->offset = offset;
@@ -2309,55 +2399,27 @@ static int perf_output_begin(struct perf_output_handle *handle,
2309 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) 2399 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2310 atomic_set(&data->wakeup, 1); 2400 atomic_set(&data->wakeup, 1);
2311 2401
2402 if (have_lost) {
2403 lost_event.header.type = PERF_EVENT_LOST;
2404 lost_event.header.misc = 0;
2405 lost_event.header.size = sizeof(lost_event);
2406 lost_event.id = counter->id;
2407 lost_event.lost = atomic_xchg(&data->lost, 0);
2408
2409 perf_output_put(handle, lost_event);
2410 }
2411
2312 return 0; 2412 return 0;
2313 2413
2314fail: 2414fail:
2315 perf_output_wakeup(handle); 2415 atomic_inc(&data->lost);
2416 perf_output_unlock(handle);
2316out: 2417out:
2317 rcu_read_unlock(); 2418 rcu_read_unlock();
2318 2419
2319 return -ENOSPC; 2420 return -ENOSPC;
2320} 2421}
2321 2422
2322static void perf_output_copy(struct perf_output_handle *handle,
2323 const void *buf, unsigned int len)
2324{
2325 unsigned int pages_mask;
2326 unsigned int offset;
2327 unsigned int size;
2328 void **pages;
2329
2330 offset = handle->offset;
2331 pages_mask = handle->data->nr_pages - 1;
2332 pages = handle->data->data_pages;
2333
2334 do {
2335 unsigned int page_offset;
2336 int nr;
2337
2338 nr = (offset >> PAGE_SHIFT) & pages_mask;
2339 page_offset = offset & (PAGE_SIZE - 1);
2340 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2341
2342 memcpy(pages[nr] + page_offset, buf, size);
2343
2344 len -= size;
2345 buf += size;
2346 offset += size;
2347 } while (len);
2348
2349 handle->offset = offset;
2350
2351 /*
2352 * Check we didn't copy past our reservation window, taking the
2353 * possible unsigned int wrap into account.
2354 */
2355 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2356}
2357
2358#define perf_output_put(handle, x) \
2359 perf_output_copy((handle), &(x), sizeof(x))
2360
2361static void perf_output_end(struct perf_output_handle *handle) 2423static void perf_output_end(struct perf_output_handle *handle)
2362{ 2424{
2363 struct perf_counter *counter = handle->counter; 2425 struct perf_counter *counter = handle->counter;
@@ -2365,7 +2427,7 @@ static void perf_output_end(struct perf_output_handle *handle)
2365 2427
2366 int wakeup_events = counter->attr.wakeup_events; 2428 int wakeup_events = counter->attr.wakeup_events;
2367 2429
2368 if (handle->overflow && wakeup_events) { 2430 if (handle->sample && wakeup_events) {
2369 int events = atomic_inc_return(&data->events); 2431 int events = atomic_inc_return(&data->events);
2370 if (events >= wakeup_events) { 2432 if (events >= wakeup_events) {
2371 atomic_sub(wakeup_events, &data->events); 2433 atomic_sub(wakeup_events, &data->events);
@@ -2970,7 +3032,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
2970} 3032}
2971 3033
2972/* 3034/*
2973 * Generic counter overflow handling. 3035 * Generic counter overflow handling, sampling.
2974 */ 3036 */
2975 3037
2976int perf_counter_overflow(struct perf_counter *counter, int nmi, 3038int perf_counter_overflow(struct perf_counter *counter, int nmi,
@@ -3109,20 +3171,15 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3109} 3171}
3110 3172
3111static void perf_swcounter_overflow(struct perf_counter *counter, 3173static void perf_swcounter_overflow(struct perf_counter *counter,
3112 int nmi, struct pt_regs *regs, u64 addr) 3174 int nmi, struct perf_sample_data *data)
3113{ 3175{
3114 struct perf_sample_data data = { 3176 data->period = counter->hw.last_period;
3115 .regs = regs,
3116 .addr = addr,
3117 .period = counter->hw.last_period,
3118 };
3119 3177
3120 perf_swcounter_update(counter); 3178 perf_swcounter_update(counter);
3121 perf_swcounter_set_period(counter); 3179 perf_swcounter_set_period(counter);
3122 if (perf_counter_overflow(counter, nmi, &data)) 3180 if (perf_counter_overflow(counter, nmi, data))
3123 /* soft-disable the counter */ 3181 /* soft-disable the counter */
3124 ; 3182 ;
3125
3126} 3183}
3127 3184
3128static int perf_swcounter_is_counting(struct perf_counter *counter) 3185static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3187,18 +3244,18 @@ static int perf_swcounter_match(struct perf_counter *counter,
3187} 3244}
3188 3245
3189static void perf_swcounter_add(struct perf_counter *counter, u64 nr, 3246static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3190 int nmi, struct pt_regs *regs, u64 addr) 3247 int nmi, struct perf_sample_data *data)
3191{ 3248{
3192 int neg = atomic64_add_negative(nr, &counter->hw.count); 3249 int neg = atomic64_add_negative(nr, &counter->hw.count);
3193 3250
3194 if (counter->hw.sample_period && !neg && regs) 3251 if (counter->hw.sample_period && !neg && data->regs)
3195 perf_swcounter_overflow(counter, nmi, regs, addr); 3252 perf_swcounter_overflow(counter, nmi, data);
3196} 3253}
3197 3254
3198static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, 3255static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3199 enum perf_type_id type, u32 event, 3256 enum perf_type_id type,
3200 u64 nr, int nmi, struct pt_regs *regs, 3257 u32 event, u64 nr, int nmi,
3201 u64 addr) 3258 struct perf_sample_data *data)
3202{ 3259{
3203 struct perf_counter *counter; 3260 struct perf_counter *counter;
3204 3261
@@ -3207,8 +3264,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3207 3264
3208 rcu_read_lock(); 3265 rcu_read_lock();
3209 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3266 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3210 if (perf_swcounter_match(counter, type, event, regs)) 3267 if (perf_swcounter_match(counter, type, event, data->regs))
3211 perf_swcounter_add(counter, nr, nmi, regs, addr); 3268 perf_swcounter_add(counter, nr, nmi, data);
3212 } 3269 }
3213 rcu_read_unlock(); 3270 rcu_read_unlock();
3214} 3271}
@@ -3227,9 +3284,9 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3227 return &cpuctx->recursion[0]; 3284 return &cpuctx->recursion[0];
3228} 3285}
3229 3286
3230static void __perf_swcounter_event(enum perf_type_id type, u32 event, 3287static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3231 u64 nr, int nmi, struct pt_regs *regs, 3288 u64 nr, int nmi,
3232 u64 addr) 3289 struct perf_sample_data *data)
3233{ 3290{
3234 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3291 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3235 int *recursion = perf_swcounter_recursion_context(cpuctx); 3292 int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3242,7 +3299,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3242 barrier(); 3299 barrier();
3243 3300
3244 perf_swcounter_ctx_event(&cpuctx->ctx, type, event, 3301 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3245 nr, nmi, regs, addr); 3302 nr, nmi, data);
3246 rcu_read_lock(); 3303 rcu_read_lock();
3247 /* 3304 /*
3248 * doesn't really matter which of the child contexts the 3305 * doesn't really matter which of the child contexts the
@@ -3250,7 +3307,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3250 */ 3307 */
3251 ctx = rcu_dereference(current->perf_counter_ctxp); 3308 ctx = rcu_dereference(current->perf_counter_ctxp);
3252 if (ctx) 3309 if (ctx)
3253 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); 3310 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3254 rcu_read_unlock(); 3311 rcu_read_unlock();
3255 3312
3256 barrier(); 3313 barrier();
@@ -3263,7 +3320,12 @@ out:
3263void 3320void
3264perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) 3321perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
3265{ 3322{
3266 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); 3323 struct perf_sample_data data = {
3324 .regs = regs,
3325 .addr = addr,
3326 };
3327
3328 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3267} 3329}
3268 3330
3269static void perf_swcounter_read(struct perf_counter *counter) 3331static void perf_swcounter_read(struct perf_counter *counter)
@@ -3404,36 +3466,18 @@ static const struct pmu perf_ops_task_clock = {
3404 .read = task_clock_perf_counter_read, 3466 .read = task_clock_perf_counter_read,
3405}; 3467};
3406 3468
3407/*
3408 * Software counter: cpu migrations
3409 */
3410void perf_counter_task_migration(struct task_struct *task, int cpu)
3411{
3412 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3413 struct perf_counter_context *ctx;
3414
3415 perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
3416 PERF_COUNT_SW_CPU_MIGRATIONS,
3417 1, 1, NULL, 0);
3418
3419 ctx = perf_pin_task_context(task);
3420 if (ctx) {
3421 perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
3422 PERF_COUNT_SW_CPU_MIGRATIONS,
3423 1, 1, NULL, 0);
3424 perf_unpin_context(ctx);
3425 }
3426}
3427
3428#ifdef CONFIG_EVENT_PROFILE 3469#ifdef CONFIG_EVENT_PROFILE
3429void perf_tpcounter_event(int event_id) 3470void perf_tpcounter_event(int event_id)
3430{ 3471{
3431 struct pt_regs *regs = get_irq_regs(); 3472 struct perf_sample_data data = {
3473 .regs = get_irq_regs();
3474 .addr = 0,
3475 };
3432 3476
3433 if (!regs) 3477 if (!data.regs)
3434 regs = task_pt_regs(current); 3478 data.regs = task_pt_regs(current);
3435 3479
3436 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); 3480 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
3437} 3481}
3438EXPORT_SYMBOL_GPL(perf_tpcounter_event); 3482EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3439 3483
diff --git a/kernel/sched.c b/kernel/sched.c
index 247fd0fedd0b..7c9098d186e6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1978,7 +1978,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1978 if (task_hot(p, old_rq->clock, NULL)) 1978 if (task_hot(p, old_rq->clock, NULL))
1979 schedstat_inc(p, se.nr_forced2_migrations); 1979 schedstat_inc(p, se.nr_forced2_migrations);
1980#endif 1980#endif
1981 perf_counter_task_migration(p, new_cpu); 1981 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1982 1, 1, NULL, 0);
1982 } 1983 }
1983 p->se.vruntime -= old_cfsrq->min_vruntime - 1984 p->se.vruntime -= old_cfsrq->min_vruntime -
1984 new_cfsrq->min_vruntime; 1985 new_cfsrq->min_vruntime;
@@ -7822,7 +7823,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7822 free_rootdomain(old_rd); 7823 free_rootdomain(old_rd);
7823} 7824}
7824 7825
7825static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7826static int init_rootdomain(struct root_domain *rd, bool bootmem)
7826{ 7827{
7827 gfp_t gfp = GFP_KERNEL; 7828 gfp_t gfp = GFP_KERNEL;
7828 7829
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 7deffc9f0e5f..e6c251790dde 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -152,7 +152,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
152 * 152 *
153 * Returns: -ENOMEM if memory fails. 153 * Returns: -ENOMEM if memory fails.
154 */ 154 */
155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 155int cpupri_init(struct cpupri *cp, bool bootmem)
156{ 156{
157 gfp_t gfp = GFP_KERNEL; 157 gfp_t gfp = GFP_KERNEL;
158 int i; 158 int i;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 467ca72f1657..70c7e0b79946 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162{ 162{
163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
164 spread, rq0_min_vruntime, spread0; 164 spread, rq0_min_vruntime, spread0;
165 struct rq *rq = &per_cpu(runqueues, cpu); 165 struct rq *rq = cpu_rq(cpu);
166 struct sched_entity *last; 166 struct sched_entity *last;
167 unsigned long flags; 167 unsigned long flags;
168 168
@@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
191 if (last) 191 if (last)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
@@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
248 248
249static void print_cpu(struct seq_file *m, int cpu) 249static void print_cpu(struct seq_file *m, int cpu)
250{ 250{
251 struct rq *rq = &per_cpu(runqueues, cpu); 251 struct rq *rq = cpu_rq(cpu);
252 252
253#ifdef CONFIG_X86 253#ifdef CONFIG_X86
254 { 254 {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5f9650e8fe75..ba7fd6e9556f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -430,12 +430,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
430 430
431 for_each_sched_entity(se) { 431 for_each_sched_entity(se) {
432 struct load_weight *load; 432 struct load_weight *load;
433 struct load_weight lw;
433 434
434 cfs_rq = cfs_rq_of(se); 435 cfs_rq = cfs_rq_of(se);
435 load = &cfs_rq->load; 436 load = &cfs_rq->load;
436 437
437 if (unlikely(!se->on_rq)) { 438 if (unlikely(!se->on_rq)) {
438 struct load_weight lw = cfs_rq->load; 439 lw = cfs_rq->load;
439 440
440 update_load_add(&lw, se->load.weight); 441 update_load_add(&lw, se->load.weight);
441 load = &lw; 442 load = &lw;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62e4ff9968b5..98e02328c67d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -335,7 +335,10 @@ static struct ctl_table kern_table[] = {
335 .data = &sysctl_timer_migration, 335 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int), 336 .maxlen = sizeof(unsigned int),
337 .mode = 0644, 337 .mode = 0644,
338 .proc_handler = &proc_dointvec, 338 .proc_handler = &proc_dointvec_minmax,
339 .strategy = &sysctl_intvec,
340 .extra1 = &zero,
341 .extra2 = &one,
339 }, 342 },
340#endif 343#endif
341 { 344 {
@@ -744,6 +747,14 @@ static struct ctl_table kern_table[] = {
744 .proc_handler = &proc_dointvec, 747 .proc_handler = &proc_dointvec,
745 }, 748 },
746 { 749 {
750 .ctl_name = CTL_UNNUMBERED,
751 .procname = "panic_on_io_nmi",
752 .data = &panic_on_io_nmi,
753 .maxlen = sizeof(int),
754 .mode = 0644,
755 .proc_handler = &proc_dointvec,
756 },
757 {
747 .ctl_name = KERN_BOOTLOADER_TYPE, 758 .ctl_name = KERN_BOOTLOADER_TYPE,
748 .procname = "bootloader_type", 759 .procname = "bootloader_type",
749 .data = &bootloader_type, 760 .data = &bootloader_type,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2aff39c6f10c..e0f59a21c061 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle)
222 222
223 cpu = smp_processor_id(); 223 cpu = smp_processor_id();
224 ts = &per_cpu(tick_cpu_sched, cpu); 224 ts = &per_cpu(tick_cpu_sched, cpu);
225
226 /*
227 * Call to tick_nohz_start_idle stops the last_update_time from being
228 * updated. Thus, it must not be called in the event we are called from
229 * irq_exit() with the prior state different than idle.
230 */
231 if (!inidle && !ts->inidle)
232 goto end;
233
225 now = tick_nohz_start_idle(ts); 234 now = tick_nohz_start_idle(ts);
226 235
227 /* 236 /*
@@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
240 goto end; 249 goto end;
241 250
242 if (!inidle && !ts->inidle)
243 goto end;
244
245 ts->inidle = 1; 251 ts->inidle = 1;
246 252
247 if (need_resched()) 253 if (need_resched())
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
96/* 96/*
97 * Collection status, active/inactive: 97 * Collection status, active/inactive:
98 */ 98 */
99static int __read_mostly active; 99int __read_mostly timer_stats_active;
100 100
101/* 101/*
102 * Beginning/end timestamps of measurement: 102 * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
257 if (!active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
260 entry = tstat_lookup(&input, comm); 260 entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
290 /* 290 /*
291 * If still active then calculate up to now: 291 * If still active then calculate up to now:
292 */ 292 */
293 if (active) 293 if (timer_stats_active)
294 time_stop = ktime_get(); 294 time_stop = ktime_get();
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
368 mutex_lock(&show_mutex); 368 mutex_lock(&show_mutex);
369 switch (ctl[0]) { 369 switch (ctl[0]) {
370 case '0': 370 case '0':
371 if (active) { 371 if (timer_stats_active) {
372 active = 0; 372 timer_stats_active = 0;
373 time_stop = ktime_get(); 373 time_stop = ktime_get();
374 sync_access(); 374 sync_access();
375 } 375 }
376 break; 376 break;
377 case '1': 377 case '1':
378 if (!active) { 378 if (!timer_stats_active) {
379 reset_entries(); 379 reset_entries();
380 time_start = ktime_get(); 380 time_start = ktime_get();
381 smp_mb(); 381 smp_mb();
382 active = 1; 382 timer_stats_active = 1;
383 } 383 }
384 break; 384 break;
385 default: 385 default:
diff --git a/kernel/timer.c b/kernel/timer.c
index 54d3912f8cad..0b36b9e5cc8b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
380{ 380{
381 unsigned int flag = 0; 381 unsigned int flag = 0;
382 382
383 if (likely(!timer->start_site))
384 return;
383 if (unlikely(tbase_get_deferrable(timer->base))) 385 if (unlikely(tbase_get_deferrable(timer->base)))
384 flag |= TIMER_STATS_FLAG_DEFERRABLE; 386 flag |= TIMER_STATS_FLAG_DEFERRABLE;
385 387
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 61071fecc82e..1551f47e7669 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER
18config HAVE_FUNCTION_GRAPH_TRACER 18config HAVE_FUNCTION_GRAPH_TRACER
19 bool 19 bool
20 20
21config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool
23 help
24 An arch may pass in a unique value (frame pointer) to both the
25 entering and exiting of a function. On exit, the value is compared
26 and if it does not match, then it will panic the kernel.
27
21config HAVE_FUNCTION_TRACE_MCOUNT_TEST 28config HAVE_FUNCTION_TRACE_MCOUNT_TEST
22 bool 29 bool
23 help 30 help
@@ -121,6 +128,7 @@ config FUNCTION_GRAPH_TRACER
121 bool "Kernel Function Graph Tracer" 128 bool "Kernel Function Graph Tracer"
122 depends on HAVE_FUNCTION_GRAPH_TRACER 129 depends on HAVE_FUNCTION_GRAPH_TRACER
123 depends on FUNCTION_TRACER 130 depends on FUNCTION_TRACER
131 depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
124 default y 132 default y
125 help 133 help
126 Enable the kernel to trace a function at both its return 134 Enable the kernel to trace a function at both its return
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bb60732ade0c..f3716bf04df6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -291,7 +291,9 @@ function_stat_next(void *v, int idx)
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); 291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292 292
293 again: 293 again:
294 rec++; 294 if (idx != 0)
295 rec++;
296
295 if ((void *)rec >= (void *)&pg->records[pg->index]) { 297 if ((void *)rec >= (void *)&pg->records[pg->index]) {
296 pg = pg->next; 298 pg = pg->next;
297 if (!pg) 299 if (!pg)
@@ -1224,6 +1226,13 @@ static void ftrace_shutdown(int command)
1224 return; 1226 return;
1225 1227
1226 ftrace_start_up--; 1228 ftrace_start_up--;
1229 /*
1230 * Just warn in case of unbalance, no need to kill ftrace, it's not
1231 * critical but the ftrace_call callers may be never nopped again after
1232 * further ftrace uses.
1233 */
1234 WARN_ON_ONCE(ftrace_start_up < 0);
1235
1227 if (!ftrace_start_up) 1236 if (!ftrace_start_up)
1228 command |= FTRACE_DISABLE_CALLS; 1237 command |= FTRACE_DISABLE_CALLS;
1229 1238
@@ -1410,10 +1419,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1410{ 1419{
1411 struct ftrace_iterator *iter = m->private; 1420 struct ftrace_iterator *iter = m->private;
1412 void *p = NULL; 1421 void *p = NULL;
1422 loff_t l;
1423
1424 if (!(iter->flags & FTRACE_ITER_HASH))
1425 *pos = 0;
1413 1426
1414 iter->flags |= FTRACE_ITER_HASH; 1427 iter->flags |= FTRACE_ITER_HASH;
1415 1428
1416 return t_hash_next(m, p, pos); 1429 iter->hidx = 0;
1430 for (l = 0; l <= *pos; ) {
1431 p = t_hash_next(m, p, &l);
1432 if (!p)
1433 break;
1434 }
1435 return p;
1417} 1436}
1418 1437
1419static int t_hash_show(struct seq_file *m, void *v) 1438static int t_hash_show(struct seq_file *m, void *v)
@@ -1460,8 +1479,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1460 iter->pg = iter->pg->next; 1479 iter->pg = iter->pg->next;
1461 iter->idx = 0; 1480 iter->idx = 0;
1462 goto retry; 1481 goto retry;
1463 } else {
1464 iter->idx = -1;
1465 } 1482 }
1466 } else { 1483 } else {
1467 rec = &iter->pg->records[iter->idx++]; 1484 rec = &iter->pg->records[iter->idx++];
@@ -1490,6 +1507,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1490{ 1507{
1491 struct ftrace_iterator *iter = m->private; 1508 struct ftrace_iterator *iter = m->private;
1492 void *p = NULL; 1509 void *p = NULL;
1510 loff_t l;
1493 1511
1494 mutex_lock(&ftrace_lock); 1512 mutex_lock(&ftrace_lock);
1495 /* 1513 /*
@@ -1501,23 +1519,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1501 if (*pos > 0) 1519 if (*pos > 0)
1502 return t_hash_start(m, pos); 1520 return t_hash_start(m, pos);
1503 iter->flags |= FTRACE_ITER_PRINTALL; 1521 iter->flags |= FTRACE_ITER_PRINTALL;
1504 (*pos)++;
1505 return iter; 1522 return iter;
1506 } 1523 }
1507 1524
1508 if (iter->flags & FTRACE_ITER_HASH) 1525 if (iter->flags & FTRACE_ITER_HASH)
1509 return t_hash_start(m, pos); 1526 return t_hash_start(m, pos);
1510 1527
1511 if (*pos > 0) { 1528 iter->pg = ftrace_pages_start;
1512 if (iter->idx < 0) 1529 iter->idx = 0;
1513 return p; 1530 for (l = 0; l <= *pos; ) {
1514 (*pos)--; 1531 p = t_next(m, p, &l);
1515 iter->idx--; 1532 if (!p)
1533 break;
1516 } 1534 }
1517 1535
1518 p = t_next(m, p, pos); 1536 if (!p && iter->flags & FTRACE_ITER_FILTER)
1519
1520 if (!p)
1521 return t_hash_start(m, pos); 1537 return t_hash_start(m, pos);
1522 1538
1523 return p; 1539 return p;
@@ -2493,32 +2509,31 @@ int ftrace_graph_count;
2493unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2509unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2494 2510
2495static void * 2511static void *
2496g_next(struct seq_file *m, void *v, loff_t *pos) 2512__g_next(struct seq_file *m, loff_t *pos)
2497{ 2513{
2498 unsigned long *array = m->private; 2514 unsigned long *array = m->private;
2499 int index = *pos;
2500
2501 (*pos)++;
2502 2515
2503 if (index >= ftrace_graph_count) 2516 if (*pos >= ftrace_graph_count)
2504 return NULL; 2517 return NULL;
2518 return &array[*pos];
2519}
2505 2520
2506 return &array[index]; 2521static void *
2522g_next(struct seq_file *m, void *v, loff_t *pos)
2523{
2524 (*pos)++;
2525 return __g_next(m, pos);
2507} 2526}
2508 2527
2509static void *g_start(struct seq_file *m, loff_t *pos) 2528static void *g_start(struct seq_file *m, loff_t *pos)
2510{ 2529{
2511 void *p = NULL;
2512
2513 mutex_lock(&graph_lock); 2530 mutex_lock(&graph_lock);
2514 2531
2515 /* Nothing, tell g_show to print all functions are enabled */ 2532 /* Nothing, tell g_show to print all functions are enabled */
2516 if (!ftrace_graph_count && !*pos) 2533 if (!ftrace_graph_count && !*pos)
2517 return (void *)1; 2534 return (void *)1;
2518 2535
2519 p = g_next(m, p, pos); 2536 return __g_next(m, pos);
2520
2521 return p;
2522} 2537}
2523 2538
2524static void g_stop(struct seq_file *m, void *p) 2539static void g_stop(struct seq_file *m, void *p)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 86cdf671d7e2..1edaa9516e81 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -186,7 +186,7 @@ static int kmem_trace_init(struct trace_array *tr)
186 int cpu; 186 int cpu;
187 kmemtrace_array = tr; 187 kmemtrace_array = tr;
188 188
189 for_each_cpu_mask(cpu, cpu_possible_map) 189 for_each_cpu(cpu, cpu_possible_mask)
190 tracing_reset(tr, cpu); 190 tracing_reset(tr, cpu);
191 191
192 kmemtrace_start_probes(); 192 kmemtrace_start_probes();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dc4dc70171ce..bf27bb7a63e2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -206,6 +206,7 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 207#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
209 210
210/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 211/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
211#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 212#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -415,6 +416,8 @@ struct ring_buffer_per_cpu {
415 unsigned long overrun; 416 unsigned long overrun;
416 unsigned long read; 417 unsigned long read;
417 local_t entries; 418 local_t entries;
419 local_t committing;
420 local_t commits;
418 u64 write_stamp; 421 u64 write_stamp;
419 u64 read_stamp; 422 u64 read_stamp;
420 atomic_t record_disabled; 423 atomic_t record_disabled;
@@ -618,12 +621,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
618 kfree(cpu_buffer); 621 kfree(cpu_buffer);
619} 622}
620 623
621/*
622 * Causes compile errors if the struct buffer_page gets bigger
623 * than the struct page.
624 */
625extern int ring_buffer_page_too_big(void);
626
627#ifdef CONFIG_HOTPLUG_CPU 624#ifdef CONFIG_HOTPLUG_CPU
628static int rb_cpu_notify(struct notifier_block *self, 625static int rb_cpu_notify(struct notifier_block *self,
629 unsigned long action, void *hcpu); 626 unsigned long action, void *hcpu);
@@ -646,11 +643,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
646 int bsize; 643 int bsize;
647 int cpu; 644 int cpu;
648 645
649 /* Paranoid! Optimizes out when all is well */
650 if (sizeof(struct buffer_page) > sizeof(struct page))
651 ring_buffer_page_too_big();
652
653
654 /* keep it in its own cache line */ 646 /* keep it in its own cache line */
655 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 647 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
656 GFP_KERNEL); 648 GFP_KERNEL);
@@ -666,8 +658,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
666 buffer->reader_lock_key = key; 658 buffer->reader_lock_key = key;
667 659
668 /* need at least two pages */ 660 /* need at least two pages */
669 if (buffer->pages == 1) 661 if (buffer->pages < 2)
670 buffer->pages++; 662 buffer->pages = 2;
671 663
672 /* 664 /*
673 * In case of non-hotplug cpu, if the ring-buffer is allocated 665 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1011,12 +1003,12 @@ rb_event_index(struct ring_buffer_event *event)
1011{ 1003{
1012 unsigned long addr = (unsigned long)event; 1004 unsigned long addr = (unsigned long)event;
1013 1005
1014 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 1006 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
1015} 1007}
1016 1008
1017static inline int 1009static inline int
1018rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 1010rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1019 struct ring_buffer_event *event) 1011 struct ring_buffer_event *event)
1020{ 1012{
1021 unsigned long addr = (unsigned long)event; 1013 unsigned long addr = (unsigned long)event;
1022 unsigned long index; 1014 unsigned long index;
@@ -1029,31 +1021,6 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1029} 1021}
1030 1022
1031static void 1023static void
1032rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
1033 struct ring_buffer_event *event)
1034{
1035 unsigned long addr = (unsigned long)event;
1036 unsigned long index;
1037
1038 index = rb_event_index(event);
1039 addr &= PAGE_MASK;
1040
1041 while (cpu_buffer->commit_page->page != (void *)addr) {
1042 if (RB_WARN_ON(cpu_buffer,
1043 cpu_buffer->commit_page == cpu_buffer->tail_page))
1044 return;
1045 cpu_buffer->commit_page->page->commit =
1046 cpu_buffer->commit_page->write;
1047 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1048 cpu_buffer->write_stamp =
1049 cpu_buffer->commit_page->page->time_stamp;
1050 }
1051
1052 /* Now set the commit to the event's index */
1053 local_set(&cpu_buffer->commit_page->page->commit, index);
1054}
1055
1056static void
1057rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1024rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1058{ 1025{
1059 /* 1026 /*
@@ -1171,6 +1138,60 @@ static unsigned rb_calculate_event_length(unsigned length)
1171 return length; 1138 return length;
1172} 1139}
1173 1140
1141static inline void
1142rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1143 struct buffer_page *tail_page,
1144 unsigned long tail, unsigned long length)
1145{
1146 struct ring_buffer_event *event;
1147
1148 /*
1149 * Only the event that crossed the page boundary
1150 * must fill the old tail_page with padding.
1151 */
1152 if (tail >= BUF_PAGE_SIZE) {
1153 local_sub(length, &tail_page->write);
1154 return;
1155 }
1156
1157 event = __rb_page_index(tail_page, tail);
1158 kmemcheck_annotate_bitfield(event, bitfield);
1159
1160 /*
1161 * If this event is bigger than the minimum size, then
1162 * we need to be careful that we don't subtract the
1163 * write counter enough to allow another writer to slip
1164 * in on this page.
1165 * We put in a discarded commit instead, to make sure
1166 * that this space is not used again.
1167 *
1168 * If we are less than the minimum size, we don't need to
1169 * worry about it.
1170 */
1171 if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
1172 /* No room for any events */
1173
1174 /* Mark the rest of the page with padding */
1175 rb_event_set_padding(event);
1176
1177 /* Set the write back to the previous setting */
1178 local_sub(length, &tail_page->write);
1179 return;
1180 }
1181
1182 /* Put in a discarded event */
1183 event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
1184 event->type_len = RINGBUF_TYPE_PADDING;
1185 /* time delta must be non zero */
1186 event->time_delta = 1;
1187 /* Account for this as an entry */
1188 local_inc(&tail_page->entries);
1189 local_inc(&cpu_buffer->entries);
1190
1191 /* Set write to end of buffer */
1192 length = (tail + length) - BUF_PAGE_SIZE;
1193 local_sub(length, &tail_page->write);
1194}
1174 1195
1175static struct ring_buffer_event * 1196static struct ring_buffer_event *
1176rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1197rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1180,7 +1201,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1180{ 1201{
1181 struct buffer_page *next_page, *head_page, *reader_page; 1202 struct buffer_page *next_page, *head_page, *reader_page;
1182 struct ring_buffer *buffer = cpu_buffer->buffer; 1203 struct ring_buffer *buffer = cpu_buffer->buffer;
1183 struct ring_buffer_event *event;
1184 bool lock_taken = false; 1204 bool lock_taken = false;
1185 unsigned long flags; 1205 unsigned long flags;
1186 1206
@@ -1265,27 +1285,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1265 cpu_buffer->tail_page->page->time_stamp = *ts; 1285 cpu_buffer->tail_page->page->time_stamp = *ts;
1266 } 1286 }
1267 1287
1268 /* 1288 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1269 * The actual tail page has moved forward.
1270 */
1271 if (tail < BUF_PAGE_SIZE) {
1272 /* Mark the rest of the page with padding */
1273 event = __rb_page_index(tail_page, tail);
1274 kmemcheck_annotate_bitfield(event, bitfield);
1275 rb_event_set_padding(event);
1276 }
1277
1278 /* Set the write back to the previous setting */
1279 local_sub(length, &tail_page->write);
1280
1281 /*
1282 * If this was a commit entry that failed,
1283 * increment that too
1284 */
1285 if (tail_page == cpu_buffer->commit_page &&
1286 tail == rb_commit_index(cpu_buffer)) {
1287 rb_set_commit_to_write(cpu_buffer);
1288 }
1289 1289
1290 __raw_spin_unlock(&cpu_buffer->lock); 1290 __raw_spin_unlock(&cpu_buffer->lock);
1291 local_irq_restore(flags); 1291 local_irq_restore(flags);
@@ -1295,7 +1295,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1295 1295
1296 out_reset: 1296 out_reset:
1297 /* reset write */ 1297 /* reset write */
1298 local_sub(length, &tail_page->write); 1298 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1299 1299
1300 if (likely(lock_taken)) 1300 if (likely(lock_taken))
1301 __raw_spin_unlock(&cpu_buffer->lock); 1301 __raw_spin_unlock(&cpu_buffer->lock);
@@ -1325,9 +1325,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1325 1325
1326 /* We reserved something on the buffer */ 1326 /* We reserved something on the buffer */
1327 1327
1328 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1329 return NULL;
1330
1331 event = __rb_page_index(tail_page, tail); 1328 event = __rb_page_index(tail_page, tail);
1332 kmemcheck_annotate_bitfield(event, bitfield); 1329 kmemcheck_annotate_bitfield(event, bitfield);
1333 rb_update_event(event, type, length); 1330 rb_update_event(event, type, length);
@@ -1337,11 +1334,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1337 local_inc(&tail_page->entries); 1334 local_inc(&tail_page->entries);
1338 1335
1339 /* 1336 /*
1340 * If this is a commit and the tail is zero, then update 1337 * If this is the first commit on the page, then update
1341 * this page's time stamp. 1338 * its timestamp.
1342 */ 1339 */
1343 if (!tail && rb_is_commit(cpu_buffer, event)) 1340 if (!tail)
1344 cpu_buffer->commit_page->page->time_stamp = *ts; 1341 tail_page->page->time_stamp = *ts;
1345 1342
1346 return event; 1343 return event;
1347} 1344}
@@ -1410,16 +1407,16 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1410 return -EAGAIN; 1407 return -EAGAIN;
1411 1408
1412 /* Only a commited time event can update the write stamp */ 1409 /* Only a commited time event can update the write stamp */
1413 if (rb_is_commit(cpu_buffer, event)) { 1410 if (rb_event_is_commit(cpu_buffer, event)) {
1414 /* 1411 /*
1415 * If this is the first on the page, then we need to 1412 * If this is the first on the page, then it was
1416 * update the page itself, and just put in a zero. 1413 * updated with the page itself. Try to discard it
1414 * and if we can't just make it zero.
1417 */ 1415 */
1418 if (rb_event_index(event)) { 1416 if (rb_event_index(event)) {
1419 event->time_delta = *delta & TS_MASK; 1417 event->time_delta = *delta & TS_MASK;
1420 event->array[0] = *delta >> TS_SHIFT; 1418 event->array[0] = *delta >> TS_SHIFT;
1421 } else { 1419 } else {
1422 cpu_buffer->commit_page->page->time_stamp = *ts;
1423 /* try to discard, since we do not need this */ 1420 /* try to discard, since we do not need this */
1424 if (!rb_try_to_discard(cpu_buffer, event)) { 1421 if (!rb_try_to_discard(cpu_buffer, event)) {
1425 /* nope, just zero it */ 1422 /* nope, just zero it */
@@ -1445,6 +1442,44 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1445 return ret; 1442 return ret;
1446} 1443}
1447 1444
1445static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
1446{
1447 local_inc(&cpu_buffer->committing);
1448 local_inc(&cpu_buffer->commits);
1449}
1450
1451static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
1452{
1453 unsigned long commits;
1454
1455 if (RB_WARN_ON(cpu_buffer,
1456 !local_read(&cpu_buffer->committing)))
1457 return;
1458
1459 again:
1460 commits = local_read(&cpu_buffer->commits);
1461 /* synchronize with interrupts */
1462 barrier();
1463 if (local_read(&cpu_buffer->committing) == 1)
1464 rb_set_commit_to_write(cpu_buffer);
1465
1466 local_dec(&cpu_buffer->committing);
1467
1468 /* synchronize with interrupts */
1469 barrier();
1470
1471 /*
1472 * Need to account for interrupts coming in between the
1473 * updating of the commit page and the clearing of the
1474 * committing counter.
1475 */
1476 if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
1477 !local_read(&cpu_buffer->committing)) {
1478 local_inc(&cpu_buffer->committing);
1479 goto again;
1480 }
1481}
1482
1448static struct ring_buffer_event * 1483static struct ring_buffer_event *
1449rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 1484rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1450 unsigned long length) 1485 unsigned long length)
@@ -1454,6 +1489,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1454 int commit = 0; 1489 int commit = 0;
1455 int nr_loops = 0; 1490 int nr_loops = 0;
1456 1491
1492 rb_start_commit(cpu_buffer);
1493
1457 length = rb_calculate_event_length(length); 1494 length = rb_calculate_event_length(length);
1458 again: 1495 again:
1459 /* 1496 /*
@@ -1466,7 +1503,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1466 * Bail! 1503 * Bail!
1467 */ 1504 */
1468 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 1505 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1469 return NULL; 1506 goto out_fail;
1470 1507
1471 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 1508 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1472 1509
@@ -1497,7 +1534,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1497 1534
1498 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 1535 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1499 if (commit == -EBUSY) 1536 if (commit == -EBUSY)
1500 return NULL; 1537 goto out_fail;
1501 1538
1502 if (commit == -EAGAIN) 1539 if (commit == -EAGAIN)
1503 goto again; 1540 goto again;
@@ -1511,30 +1548,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1511 if (unlikely(PTR_ERR(event) == -EAGAIN)) 1548 if (unlikely(PTR_ERR(event) == -EAGAIN))
1512 goto again; 1549 goto again;
1513 1550
1514 if (!event) { 1551 if (!event)
1515 if (unlikely(commit)) 1552 goto out_fail;
1516 /*
1517 * Ouch! We needed a timestamp and it was commited. But
1518 * we didn't get our event reserved.
1519 */
1520 rb_set_commit_to_write(cpu_buffer);
1521 return NULL;
1522 }
1523 1553
1524 /* 1554 if (!rb_event_is_commit(cpu_buffer, event))
1525 * If the timestamp was commited, make the commit our entry
1526 * now so that we will update it when needed.
1527 */
1528 if (unlikely(commit))
1529 rb_set_commit_event(cpu_buffer, event);
1530 else if (!rb_is_commit(cpu_buffer, event))
1531 delta = 0; 1555 delta = 0;
1532 1556
1533 event->time_delta = delta; 1557 event->time_delta = delta;
1534 1558
1535 return event; 1559 return event;
1560
1561 out_fail:
1562 rb_end_commit(cpu_buffer);
1563 return NULL;
1536} 1564}
1537 1565
1566#ifdef CONFIG_TRACING
1567
1538#define TRACE_RECURSIVE_DEPTH 16 1568#define TRACE_RECURSIVE_DEPTH 16
1539 1569
1540static int trace_recursive_lock(void) 1570static int trace_recursive_lock(void)
@@ -1565,6 +1595,13 @@ static void trace_recursive_unlock(void)
1565 current->trace_recursion--; 1595 current->trace_recursion--;
1566} 1596}
1567 1597
1598#else
1599
1600#define trace_recursive_lock() (0)
1601#define trace_recursive_unlock() do { } while (0)
1602
1603#endif
1604
1568static DEFINE_PER_CPU(int, rb_need_resched); 1605static DEFINE_PER_CPU(int, rb_need_resched);
1569 1606
1570/** 1607/**
@@ -1642,13 +1679,14 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1642{ 1679{
1643 local_inc(&cpu_buffer->entries); 1680 local_inc(&cpu_buffer->entries);
1644 1681
1645 /* Only process further if we own the commit */ 1682 /*
1646 if (!rb_is_commit(cpu_buffer, event)) 1683 * The event first in the commit queue updates the
1647 return; 1684 * time stamp.
1648 1685 */
1649 cpu_buffer->write_stamp += event->time_delta; 1686 if (rb_event_is_commit(cpu_buffer, event))
1687 cpu_buffer->write_stamp += event->time_delta;
1650 1688
1651 rb_set_commit_to_write(cpu_buffer); 1689 rb_end_commit(cpu_buffer);
1652} 1690}
1653 1691
1654/** 1692/**
@@ -1737,15 +1775,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1737 /* The event is discarded regardless */ 1775 /* The event is discarded regardless */
1738 rb_event_discard(event); 1776 rb_event_discard(event);
1739 1777
1778 cpu = smp_processor_id();
1779 cpu_buffer = buffer->buffers[cpu];
1780
1740 /* 1781 /*
1741 * This must only be called if the event has not been 1782 * This must only be called if the event has not been
1742 * committed yet. Thus we can assume that preemption 1783 * committed yet. Thus we can assume that preemption
1743 * is still disabled. 1784 * is still disabled.
1744 */ 1785 */
1745 RB_WARN_ON(buffer, preemptible()); 1786 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1746
1747 cpu = smp_processor_id();
1748 cpu_buffer = buffer->buffers[cpu];
1749 1787
1750 if (!rb_try_to_discard(cpu_buffer, event)) 1788 if (!rb_try_to_discard(cpu_buffer, event))
1751 goto out; 1789 goto out;
@@ -1756,13 +1794,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1756 */ 1794 */
1757 local_inc(&cpu_buffer->entries); 1795 local_inc(&cpu_buffer->entries);
1758 out: 1796 out:
1759 /* 1797 rb_end_commit(cpu_buffer);
1760 * If a write came in and pushed the tail page
1761 * we still need to update the commit pointer
1762 * if we were the commit.
1763 */
1764 if (rb_is_commit(cpu_buffer, event))
1765 rb_set_commit_to_write(cpu_buffer);
1766 1798
1767 trace_recursive_unlock(); 1799 trace_recursive_unlock();
1768 1800
@@ -2446,6 +2478,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2446} 2478}
2447EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); 2479EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
2448 2480
2481static inline int rb_ok_to_lock(void)
2482{
2483 /*
2484 * If an NMI die dumps out the content of the ring buffer
2485 * do not grab locks. We also permanently disable the ring
2486 * buffer too. A one time deal is all you get from reading
2487 * the ring buffer from an NMI.
2488 */
2489 if (likely(!in_nmi() && !oops_in_progress))
2490 return 1;
2491
2492 tracing_off_permanent();
2493 return 0;
2494}
2495
2449/** 2496/**
2450 * ring_buffer_peek - peek at the next event to be read 2497 * ring_buffer_peek - peek at the next event to be read
2451 * @buffer: The ring buffer to read 2498 * @buffer: The ring buffer to read
@@ -2461,14 +2508,20 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2461 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2508 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2462 struct ring_buffer_event *event; 2509 struct ring_buffer_event *event;
2463 unsigned long flags; 2510 unsigned long flags;
2511 int dolock;
2464 2512
2465 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2513 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2466 return NULL; 2514 return NULL;
2467 2515
2516 dolock = rb_ok_to_lock();
2468 again: 2517 again:
2469 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2518 local_irq_save(flags);
2519 if (dolock)
2520 spin_lock(&cpu_buffer->reader_lock);
2470 event = rb_buffer_peek(buffer, cpu, ts); 2521 event = rb_buffer_peek(buffer, cpu, ts);
2471 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2522 if (dolock)
2523 spin_unlock(&cpu_buffer->reader_lock);
2524 local_irq_restore(flags);
2472 2525
2473 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 2526 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2474 cpu_relax(); 2527 cpu_relax();
@@ -2520,6 +2573,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2520 struct ring_buffer_per_cpu *cpu_buffer; 2573 struct ring_buffer_per_cpu *cpu_buffer;
2521 struct ring_buffer_event *event = NULL; 2574 struct ring_buffer_event *event = NULL;
2522 unsigned long flags; 2575 unsigned long flags;
2576 int dolock;
2577
2578 dolock = rb_ok_to_lock();
2523 2579
2524 again: 2580 again:
2525 /* might be called in atomic */ 2581 /* might be called in atomic */
@@ -2529,7 +2585,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2529 goto out; 2585 goto out;
2530 2586
2531 cpu_buffer = buffer->buffers[cpu]; 2587 cpu_buffer = buffer->buffers[cpu];
2532 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2588 local_irq_save(flags);
2589 if (dolock)
2590 spin_lock(&cpu_buffer->reader_lock);
2533 2591
2534 event = rb_buffer_peek(buffer, cpu, ts); 2592 event = rb_buffer_peek(buffer, cpu, ts);
2535 if (!event) 2593 if (!event)
@@ -2538,7 +2596,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2538 rb_advance_reader(cpu_buffer); 2596 rb_advance_reader(cpu_buffer);
2539 2597
2540 out_unlock: 2598 out_unlock:
2541 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2599 if (dolock)
2600 spin_unlock(&cpu_buffer->reader_lock);
2601 local_irq_restore(flags);
2542 2602
2543 out: 2603 out:
2544 preempt_enable(); 2604 preempt_enable();
@@ -2680,6 +2740,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2680 cpu_buffer->overrun = 0; 2740 cpu_buffer->overrun = 0;
2681 cpu_buffer->read = 0; 2741 cpu_buffer->read = 0;
2682 local_set(&cpu_buffer->entries, 0); 2742 local_set(&cpu_buffer->entries, 0);
2743 local_set(&cpu_buffer->committing, 0);
2744 local_set(&cpu_buffer->commits, 0);
2683 2745
2684 cpu_buffer->write_stamp = 0; 2746 cpu_buffer->write_stamp = 0;
2685 cpu_buffer->read_stamp = 0; 2747 cpu_buffer->read_stamp = 0;
@@ -2734,12 +2796,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
2734int ring_buffer_empty(struct ring_buffer *buffer) 2796int ring_buffer_empty(struct ring_buffer *buffer)
2735{ 2797{
2736 struct ring_buffer_per_cpu *cpu_buffer; 2798 struct ring_buffer_per_cpu *cpu_buffer;
2799 unsigned long flags;
2800 int dolock;
2737 int cpu; 2801 int cpu;
2802 int ret;
2803
2804 dolock = rb_ok_to_lock();
2738 2805
2739 /* yes this is racy, but if you don't like the race, lock the buffer */ 2806 /* yes this is racy, but if you don't like the race, lock the buffer */
2740 for_each_buffer_cpu(buffer, cpu) { 2807 for_each_buffer_cpu(buffer, cpu) {
2741 cpu_buffer = buffer->buffers[cpu]; 2808 cpu_buffer = buffer->buffers[cpu];
2742 if (!rb_per_cpu_empty(cpu_buffer)) 2809 local_irq_save(flags);
2810 if (dolock)
2811 spin_lock(&cpu_buffer->reader_lock);
2812 ret = rb_per_cpu_empty(cpu_buffer);
2813 if (dolock)
2814 spin_unlock(&cpu_buffer->reader_lock);
2815 local_irq_restore(flags);
2816
2817 if (!ret)
2743 return 0; 2818 return 0;
2744 } 2819 }
2745 2820
@@ -2755,14 +2830,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2755int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 2830int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2756{ 2831{
2757 struct ring_buffer_per_cpu *cpu_buffer; 2832 struct ring_buffer_per_cpu *cpu_buffer;
2833 unsigned long flags;
2834 int dolock;
2758 int ret; 2835 int ret;
2759 2836
2760 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2837 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2761 return 1; 2838 return 1;
2762 2839
2840 dolock = rb_ok_to_lock();
2841
2763 cpu_buffer = buffer->buffers[cpu]; 2842 cpu_buffer = buffer->buffers[cpu];
2843 local_irq_save(flags);
2844 if (dolock)
2845 spin_lock(&cpu_buffer->reader_lock);
2764 ret = rb_per_cpu_empty(cpu_buffer); 2846 ret = rb_per_cpu_empty(cpu_buffer);
2765 2847 if (dolock)
2848 spin_unlock(&cpu_buffer->reader_lock);
2849 local_irq_restore(flags);
2766 2850
2767 return ret; 2851 return ret;
2768} 2852}
@@ -3029,6 +3113,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3029} 3113}
3030EXPORT_SYMBOL_GPL(ring_buffer_read_page); 3114EXPORT_SYMBOL_GPL(ring_buffer_read_page);
3031 3115
3116#ifdef CONFIG_TRACING
3032static ssize_t 3117static ssize_t
3033rb_simple_read(struct file *filp, char __user *ubuf, 3118rb_simple_read(struct file *filp, char __user *ubuf,
3034 size_t cnt, loff_t *ppos) 3119 size_t cnt, loff_t *ppos)
@@ -3096,6 +3181,7 @@ static __init int rb_init_debugfs(void)
3096} 3181}
3097 3182
3098fs_initcall(rb_init_debugfs); 3183fs_initcall(rb_init_debugfs);
3184#endif
3099 3185
3100#ifdef CONFIG_HOTPLUG_CPU 3186#ifdef CONFIG_HOTPLUG_CPU
3101static int rb_cpu_notify(struct notifier_block *self, 3187static int rb_cpu_notify(struct notifier_block *self,
@@ -3108,7 +3194,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3108 switch (action) { 3194 switch (action) {
3109 case CPU_UP_PREPARE: 3195 case CPU_UP_PREPARE:
3110 case CPU_UP_PREPARE_FROZEN: 3196 case CPU_UP_PREPARE_FROZEN:
3111 if (cpu_isset(cpu, *buffer->cpumask)) 3197 if (cpumask_test_cpu(cpu, buffer->cpumask))
3112 return NOTIFY_OK; 3198 return NOTIFY_OK;
3113 3199
3114 buffer->buffers[cpu] = 3200 buffer->buffers[cpu] =
@@ -3119,7 +3205,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3119 return NOTIFY_OK; 3205 return NOTIFY_OK;
3120 } 3206 }
3121 smp_wmb(); 3207 smp_wmb();
3122 cpu_set(cpu, *buffer->cpumask); 3208 cpumask_set_cpu(cpu, buffer->cpumask);
3123 break; 3209 break;
3124 case CPU_DOWN_PREPARE: 3210 case CPU_DOWN_PREPARE:
3125 case CPU_DOWN_PREPARE_FROZEN: 3211 case CPU_DOWN_PREPARE_FROZEN:
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 8d68e149a8b3..573d3cc762c3 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -102,8 +102,10 @@ static enum event_status read_page(int cpu)
102 event = (void *)&rpage->data[i]; 102 event = (void *)&rpage->data[i];
103 switch (event->type_len) { 103 switch (event->type_len) {
104 case RINGBUF_TYPE_PADDING: 104 case RINGBUF_TYPE_PADDING:
105 /* We don't expect any padding */ 105 /* failed writes may be discarded events */
106 KILL_TEST(); 106 if (!event->time_delta)
107 KILL_TEST();
108 inc = event->array[0] + 4;
107 break; 109 break;
108 case RINGBUF_TYPE_TIME_EXTEND: 110 case RINGBUF_TYPE_TIME_EXTEND:
109 inc = 8; 111 inc = 8;
@@ -119,7 +121,7 @@ static enum event_status read_page(int cpu)
119 KILL_TEST(); 121 KILL_TEST();
120 break; 122 break;
121 } 123 }
122 inc = event->array[0]; 124 inc = event->array[0] + 4;
123 break; 125 break;
124 default: 126 default:
125 entry = ring_buffer_event_data(event); 127 entry = ring_buffer_event_data(event);
@@ -201,7 +203,7 @@ static void ring_buffer_producer(void)
201 * Hammer the buffer for 10 secs (this may 203 * Hammer the buffer for 10 secs (this may
202 * make the system stall) 204 * make the system stall)
203 */ 205 */
204 pr_info("Starting ring buffer hammer\n"); 206 trace_printk("Starting ring buffer hammer\n");
205 do_gettimeofday(&start_tv); 207 do_gettimeofday(&start_tv);
206 do { 208 do {
207 struct ring_buffer_event *event; 209 struct ring_buffer_event *event;
@@ -237,7 +239,7 @@ static void ring_buffer_producer(void)
237#endif 239#endif
238 240
239 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); 241 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
240 pr_info("End ring buffer hammer\n"); 242 trace_printk("End ring buffer hammer\n");
241 243
242 if (consumer) { 244 if (consumer) {
243 /* Init both completions here to avoid races */ 245 /* Init both completions here to avoid races */
@@ -260,49 +262,50 @@ static void ring_buffer_producer(void)
260 overruns = ring_buffer_overruns(buffer); 262 overruns = ring_buffer_overruns(buffer);
261 263
262 if (kill_test) 264 if (kill_test)
263 pr_info("ERROR!\n"); 265 trace_printk("ERROR!\n");
264 pr_info("Time: %lld (usecs)\n", time); 266 trace_printk("Time: %lld (usecs)\n", time);
265 pr_info("Overruns: %lld\n", overruns); 267 trace_printk("Overruns: %lld\n", overruns);
266 if (disable_reader) 268 if (disable_reader)
267 pr_info("Read: (reader disabled)\n"); 269 trace_printk("Read: (reader disabled)\n");
268 else 270 else
269 pr_info("Read: %ld (by %s)\n", read, 271 trace_printk("Read: %ld (by %s)\n", read,
270 read_events ? "events" : "pages"); 272 read_events ? "events" : "pages");
271 pr_info("Entries: %lld\n", entries); 273 trace_printk("Entries: %lld\n", entries);
272 pr_info("Total: %lld\n", entries + overruns + read); 274 trace_printk("Total: %lld\n", entries + overruns + read);
273 pr_info("Missed: %ld\n", missed); 275 trace_printk("Missed: %ld\n", missed);
274 pr_info("Hit: %ld\n", hit); 276 trace_printk("Hit: %ld\n", hit);
275 277
276 /* Convert time from usecs to millisecs */ 278 /* Convert time from usecs to millisecs */
277 do_div(time, USEC_PER_MSEC); 279 do_div(time, USEC_PER_MSEC);
278 if (time) 280 if (time)
279 hit /= (long)time; 281 hit /= (long)time;
280 else 282 else
281 pr_info("TIME IS ZERO??\n"); 283 trace_printk("TIME IS ZERO??\n");
282 284
283 pr_info("Entries per millisec: %ld\n", hit); 285 trace_printk("Entries per millisec: %ld\n", hit);
284 286
285 if (hit) { 287 if (hit) {
286 /* Calculate the average time in nanosecs */ 288 /* Calculate the average time in nanosecs */
287 avg = NSEC_PER_MSEC / hit; 289 avg = NSEC_PER_MSEC / hit;
288 pr_info("%ld ns per entry\n", avg); 290 trace_printk("%ld ns per entry\n", avg);
289 } 291 }
290 292
291 if (missed) { 293 if (missed) {
292 if (time) 294 if (time)
293 missed /= (long)time; 295 missed /= (long)time;
294 296
295 pr_info("Total iterations per millisec: %ld\n", hit + missed); 297 trace_printk("Total iterations per millisec: %ld\n",
298 hit + missed);
296 299
297 /* it is possible that hit + missed will overflow and be zero */ 300 /* it is possible that hit + missed will overflow and be zero */
298 if (!(hit + missed)) { 301 if (!(hit + missed)) {
299 pr_info("hit + missed overflowed and totalled zero!\n"); 302 trace_printk("hit + missed overflowed and totalled zero!\n");
300 hit--; /* make it non zero */ 303 hit--; /* make it non zero */
301 } 304 }
302 305
303 /* Caculate the average time in nanosecs */ 306 /* Caculate the average time in nanosecs */
304 avg = NSEC_PER_MSEC / (hit + missed); 307 avg = NSEC_PER_MSEC / (hit + missed);
305 pr_info("%ld ns per entry\n", avg); 308 trace_printk("%ld ns per entry\n", avg);
306 } 309 }
307} 310}
308 311
@@ -353,7 +356,7 @@ static int ring_buffer_producer_thread(void *arg)
353 356
354 ring_buffer_producer(); 357 ring_buffer_producer();
355 358
356 pr_info("Sleeping for 10 secs\n"); 359 trace_printk("Sleeping for 10 secs\n");
357 set_current_state(TASK_INTERRUPTIBLE); 360 set_current_state(TASK_INTERRUPTIBLE);
358 schedule_timeout(HZ * SLEEP_TIME); 361 schedule_timeout(HZ * SLEEP_TIME);
359 __set_current_state(TASK_RUNNING); 362 __set_current_state(TASK_RUNNING);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c1878bfb2e1e..3aa0a0dfdfa8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -284,13 +284,12 @@ void trace_wake_up(void)
284static int __init set_buf_size(char *str) 284static int __init set_buf_size(char *str)
285{ 285{
286 unsigned long buf_size; 286 unsigned long buf_size;
287 int ret;
288 287
289 if (!str) 288 if (!str)
290 return 0; 289 return 0;
291 ret = strict_strtoul(str, 0, &buf_size); 290 buf_size = memparse(str, &str);
292 /* nr_entries can not be zero */ 291 /* nr_entries can not be zero */
293 if (ret < 0 || buf_size == 0) 292 if (buf_size == 0)
294 return 0; 293 return 0;
295 trace_buf_size = buf_size; 294 trace_buf_size = buf_size;
296 return 1; 295 return 1;
@@ -2053,25 +2052,23 @@ static int tracing_open(struct inode *inode, struct file *file)
2053static void * 2052static void *
2054t_next(struct seq_file *m, void *v, loff_t *pos) 2053t_next(struct seq_file *m, void *v, loff_t *pos)
2055{ 2054{
2056 struct tracer *t = m->private; 2055 struct tracer *t = v;
2057 2056
2058 (*pos)++; 2057 (*pos)++;
2059 2058
2060 if (t) 2059 if (t)
2061 t = t->next; 2060 t = t->next;
2062 2061
2063 m->private = t;
2064
2065 return t; 2062 return t;
2066} 2063}
2067 2064
2068static void *t_start(struct seq_file *m, loff_t *pos) 2065static void *t_start(struct seq_file *m, loff_t *pos)
2069{ 2066{
2070 struct tracer *t = m->private; 2067 struct tracer *t;
2071 loff_t l = 0; 2068 loff_t l = 0;
2072 2069
2073 mutex_lock(&trace_types_lock); 2070 mutex_lock(&trace_types_lock);
2074 for (; t && l < *pos; t = t_next(m, t, &l)) 2071 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
2075 ; 2072 ;
2076 2073
2077 return t; 2074 return t;
@@ -2107,18 +2104,10 @@ static struct seq_operations show_traces_seq_ops = {
2107 2104
2108static int show_traces_open(struct inode *inode, struct file *file) 2105static int show_traces_open(struct inode *inode, struct file *file)
2109{ 2106{
2110 int ret;
2111
2112 if (tracing_disabled) 2107 if (tracing_disabled)
2113 return -ENODEV; 2108 return -ENODEV;
2114 2109
2115 ret = seq_open(file, &show_traces_seq_ops); 2110 return seq_open(file, &show_traces_seq_ops);
2116 if (!ret) {
2117 struct seq_file *m = file->private_data;
2118 m->private = trace_types;
2119 }
2120
2121 return ret;
2122} 2111}
2123 2112
2124static ssize_t 2113static ssize_t
@@ -2191,11 +2180,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2191 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 2180 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2192 return -ENOMEM; 2181 return -ENOMEM;
2193 2182
2194 mutex_lock(&tracing_cpumask_update_lock);
2195 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2183 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2196 if (err) 2184 if (err)
2197 goto err_unlock; 2185 goto err_unlock;
2198 2186
2187 mutex_lock(&tracing_cpumask_update_lock);
2188
2199 local_irq_disable(); 2189 local_irq_disable();
2200 __raw_spin_lock(&ftrace_max_lock); 2190 __raw_spin_lock(&ftrace_max_lock);
2201 for_each_tracing_cpu(cpu) { 2191 for_each_tracing_cpu(cpu) {
@@ -2223,8 +2213,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2223 return count; 2213 return count;
2224 2214
2225err_unlock: 2215err_unlock:
2226 mutex_unlock(&tracing_cpumask_update_lock); 2216 free_cpumask_var(tracing_cpumask_new);
2227 free_cpumask_var(tracing_cpumask);
2228 2217
2229 return err; 2218 return err;
2230} 2219}
@@ -3626,7 +3615,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3626 struct trace_seq *s; 3615 struct trace_seq *s;
3627 unsigned long cnt; 3616 unsigned long cnt;
3628 3617
3629 s = kmalloc(sizeof(*s), GFP_ATOMIC); 3618 s = kmalloc(sizeof(*s), GFP_KERNEL);
3630 if (!s) 3619 if (!s)
3631 return ENOMEM; 3620 return ENOMEM;
3632 3621
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6e735d4771f8..3548ae5cc780 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -597,6 +597,7 @@ print_graph_function(struct trace_iterator *iter)
597 597
598extern struct pid *ftrace_pid_trace; 598extern struct pid *ftrace_pid_trace;
599 599
600#ifdef CONFIG_FUNCTION_TRACER
600static inline int ftrace_trace_task(struct task_struct *task) 601static inline int ftrace_trace_task(struct task_struct *task)
601{ 602{
602 if (!ftrace_pid_trace) 603 if (!ftrace_pid_trace)
@@ -604,6 +605,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
604 605
605 return test_tsk_trace_trace(task); 606 return test_tsk_trace_trace(task);
606} 607}
608#else
609static inline int ftrace_trace_task(struct task_struct *task)
610{
611 return 1;
612}
613#endif
607 614
608/* 615/*
609 * trace_iterator_flags is an enumeration that defines bit 616 * trace_iterator_flags is an enumeration that defines bit
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa08be69a1b6..53c8fd376a88 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -300,10 +300,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
300 300
301static void *t_start(struct seq_file *m, loff_t *pos) 301static void *t_start(struct seq_file *m, loff_t *pos)
302{ 302{
303 struct ftrace_event_call *call = NULL;
304 loff_t l;
305
303 mutex_lock(&event_mutex); 306 mutex_lock(&event_mutex);
304 if (*pos == 0) 307
305 m->private = ftrace_events.next; 308 m->private = ftrace_events.next;
306 return t_next(m, NULL, pos); 309 for (l = 0; l <= *pos; ) {
310 call = t_next(m, NULL, &l);
311 if (!call)
312 break;
313 }
314 return call;
307} 315}
308 316
309static void * 317static void *
@@ -332,10 +340,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
332 340
333static void *s_start(struct seq_file *m, loff_t *pos) 341static void *s_start(struct seq_file *m, loff_t *pos)
334{ 342{
343 struct ftrace_event_call *call = NULL;
344 loff_t l;
345
335 mutex_lock(&event_mutex); 346 mutex_lock(&event_mutex);
336 if (*pos == 0) 347
337 m->private = ftrace_events.next; 348 m->private = ftrace_events.next;
338 return s_next(m, NULL, pos); 349 for (l = 0; l <= *pos; ) {
350 call = s_next(m, NULL, &l);
351 if (!call)
352 break;
353 }
354 return call;
339} 355}
340 356
341static int t_show(struct seq_file *m, void *v) 357static int t_show(struct seq_file *m, void *v)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index db6e54bdb596..936c621bbf46 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,8 +27,6 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30static DEFINE_MUTEX(filter_mutex);
31
32enum filter_op_ids 30enum filter_op_ids
33{ 31{
34 OP_OR, 32 OP_OR,
@@ -178,7 +176,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
178static int filter_pred_strloc(struct filter_pred *pred, void *event, 176static int filter_pred_strloc(struct filter_pred *pred, void *event,
179 int val1, int val2) 177 int val1, int val2)
180{ 178{
181 int str_loc = *(int *)(event + pred->offset); 179 unsigned short str_loc = *(unsigned short *)(event + pred->offset);
182 char *addr = (char *)(event + str_loc); 180 char *addr = (char *)(event + str_loc);
183 int cmp, match; 181 int cmp, match;
184 182
@@ -294,12 +292,12 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
294{ 292{
295 struct event_filter *filter = call->filter; 293 struct event_filter *filter = call->filter;
296 294
297 mutex_lock(&filter_mutex); 295 mutex_lock(&event_mutex);
298 if (filter->filter_string) 296 if (filter->filter_string)
299 trace_seq_printf(s, "%s\n", filter->filter_string); 297 trace_seq_printf(s, "%s\n", filter->filter_string);
300 else 298 else
301 trace_seq_printf(s, "none\n"); 299 trace_seq_printf(s, "none\n");
302 mutex_unlock(&filter_mutex); 300 mutex_unlock(&event_mutex);
303} 301}
304 302
305void print_subsystem_event_filter(struct event_subsystem *system, 303void print_subsystem_event_filter(struct event_subsystem *system,
@@ -307,12 +305,12 @@ void print_subsystem_event_filter(struct event_subsystem *system,
307{ 305{
308 struct event_filter *filter = system->filter; 306 struct event_filter *filter = system->filter;
309 307
310 mutex_lock(&filter_mutex); 308 mutex_lock(&event_mutex);
311 if (filter->filter_string) 309 if (filter->filter_string)
312 trace_seq_printf(s, "%s\n", filter->filter_string); 310 trace_seq_printf(s, "%s\n", filter->filter_string);
313 else 311 else
314 trace_seq_printf(s, "none\n"); 312 trace_seq_printf(s, "none\n");
315 mutex_unlock(&filter_mutex); 313 mutex_unlock(&event_mutex);
316} 314}
317 315
318static struct ftrace_event_field * 316static struct ftrace_event_field *
@@ -381,6 +379,7 @@ void destroy_preds(struct ftrace_event_call *call)
381 filter_free_pred(filter->preds[i]); 379 filter_free_pred(filter->preds[i]);
382 } 380 }
383 kfree(filter->preds); 381 kfree(filter->preds);
382 kfree(filter->filter_string);
384 kfree(filter); 383 kfree(filter);
385 call->filter = NULL; 384 call->filter = NULL;
386} 385}
@@ -433,7 +432,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
433 filter->n_preds = 0; 432 filter->n_preds = 0;
434 } 433 }
435 434
436 mutex_lock(&event_mutex);
437 list_for_each_entry(call, &ftrace_events, list) { 435 list_for_each_entry(call, &ftrace_events, list) {
438 if (!call->define_fields) 436 if (!call->define_fields)
439 continue; 437 continue;
@@ -443,7 +441,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
443 remove_filter_string(call->filter); 441 remove_filter_string(call->filter);
444 } 442 }
445 } 443 }
446 mutex_unlock(&event_mutex);
447} 444}
448 445
449static int filter_add_pred_fn(struct filter_parse_state *ps, 446static int filter_add_pred_fn(struct filter_parse_state *ps,
@@ -546,6 +543,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
546 filter_pred_fn_t fn; 543 filter_pred_fn_t fn;
547 unsigned long long val; 544 unsigned long long val;
548 int string_type; 545 int string_type;
546 int ret;
549 547
550 pred->fn = filter_pred_none; 548 pred->fn = filter_pred_none;
551 549
@@ -581,7 +579,11 @@ static int filter_add_pred(struct filter_parse_state *ps,
581 pred->not = 1; 579 pred->not = 1;
582 return filter_add_pred_fn(ps, call, pred, fn); 580 return filter_add_pred_fn(ps, call, pred, fn);
583 } else { 581 } else {
584 if (strict_strtoull(pred->str_val, 0, &val)) { 582 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val);
584 else
585 ret = strict_strtoull(pred->str_val, 0, &val);
586 if (ret) {
585 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 587 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
586 return -EINVAL; 588 return -EINVAL;
587 } 589 }
@@ -625,7 +627,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
625 filter->preds[filter->n_preds] = pred; 627 filter->preds[filter->n_preds] = pred;
626 filter->n_preds++; 628 filter->n_preds++;
627 629
628 mutex_lock(&event_mutex);
629 list_for_each_entry(call, &ftrace_events, list) { 630 list_for_each_entry(call, &ftrace_events, list) {
630 631
631 if (!call->define_fields) 632 if (!call->define_fields)
@@ -636,14 +637,12 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
636 637
637 err = filter_add_pred(ps, call, pred); 638 err = filter_add_pred(ps, call, pred);
638 if (err) { 639 if (err) {
639 mutex_unlock(&event_mutex);
640 filter_free_subsystem_preds(system); 640 filter_free_subsystem_preds(system);
641 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 641 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
642 goto out; 642 goto out;
643 } 643 }
644 replace_filter_string(call->filter, filter_string); 644 replace_filter_string(call->filter, filter_string);
645 } 645 }
646 mutex_unlock(&event_mutex);
647out: 646out:
648 return err; 647 return err;
649} 648}
@@ -1070,12 +1069,12 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1070 1069
1071 struct filter_parse_state *ps; 1070 struct filter_parse_state *ps;
1072 1071
1073 mutex_lock(&filter_mutex); 1072 mutex_lock(&event_mutex);
1074 1073
1075 if (!strcmp(strstrip(filter_string), "0")) { 1074 if (!strcmp(strstrip(filter_string), "0")) {
1076 filter_disable_preds(call); 1075 filter_disable_preds(call);
1077 remove_filter_string(call->filter); 1076 remove_filter_string(call->filter);
1078 mutex_unlock(&filter_mutex); 1077 mutex_unlock(&event_mutex);
1079 return 0; 1078 return 0;
1080 } 1079 }
1081 1080
@@ -1103,7 +1102,7 @@ out:
1103 postfix_clear(ps); 1102 postfix_clear(ps);
1104 kfree(ps); 1103 kfree(ps);
1105out_unlock: 1104out_unlock:
1106 mutex_unlock(&filter_mutex); 1105 mutex_unlock(&event_mutex);
1107 1106
1108 return err; 1107 return err;
1109} 1108}
@@ -1115,12 +1114,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1115 1114
1116 struct filter_parse_state *ps; 1115 struct filter_parse_state *ps;
1117 1116
1118 mutex_lock(&filter_mutex); 1117 mutex_lock(&event_mutex);
1119 1118
1120 if (!strcmp(strstrip(filter_string), "0")) { 1119 if (!strcmp(strstrip(filter_string), "0")) {
1121 filter_free_subsystem_preds(system); 1120 filter_free_subsystem_preds(system);
1122 remove_filter_string(system->filter); 1121 remove_filter_string(system->filter);
1123 mutex_unlock(&filter_mutex); 1122 mutex_unlock(&event_mutex);
1124 return 0; 1123 return 0;
1125 } 1124 }
1126 1125
@@ -1148,7 +1147,7 @@ out:
1148 postfix_clear(ps); 1147 postfix_clear(ps);
1149 kfree(ps); 1148 kfree(ps);
1150out_unlock: 1149out_unlock:
1151 mutex_unlock(&filter_mutex); 1150 mutex_unlock(&event_mutex);
1152 1151
1153 return err; 1152 return err;
1154} 1153}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c9a0b7df44ff..7402144bff21 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -193,9 +193,11 @@ static void tracing_start_function_trace(void)
193static void tracing_stop_function_trace(void) 193static void tracing_stop_function_trace(void)
194{ 194{
195 ftrace_function_enabled = 0; 195 ftrace_function_enabled = 0;
196 /* OK if they are not registered */ 196
197 unregister_ftrace_function(&trace_stack_ops); 197 if (func_flags.val & TRACE_FUNC_OPT_STACK)
198 unregister_ftrace_function(&trace_ops); 198 unregister_ftrace_function(&trace_stack_ops);
199 else
200 unregister_ftrace_function(&trace_ops);
199} 201}
200 202
201static int func_set_flag(u32 old_flags, u32 bit, int set) 203static int func_set_flag(u32 old_flags, u32 bit, int set)
@@ -300,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
300 if (count == -1) 302 if (count == -1)
301 seq_printf(m, ":unlimited\n"); 303 seq_printf(m, ":unlimited\n");
302 else 304 else
303 seq_printf(m, ":count=%ld", count); 305 seq_printf(m, ":count=%ld\n", count);
304 seq_putc(m, '\n');
305 306
306 return 0; 307 return 0;
307} 308}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8b592418d8b2..d2249abafb53 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -57,7 +57,8 @@ static struct tracer_flags tracer_flags = {
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
59int 59int
60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) 60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
61 unsigned long frame_pointer)
61{ 62{
62 unsigned long long calltime; 63 unsigned long long calltime;
63 int index; 64 int index;
@@ -85,6 +86,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
85 current->ret_stack[index].func = func; 86 current->ret_stack[index].func = func;
86 current->ret_stack[index].calltime = calltime; 87 current->ret_stack[index].calltime = calltime;
87 current->ret_stack[index].subtime = 0; 88 current->ret_stack[index].subtime = 0;
89 current->ret_stack[index].fp = frame_pointer;
88 *depth = index; 90 *depth = index;
89 91
90 return 0; 92 return 0;
@@ -92,7 +94,8 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
92 94
93/* Retrieve a function return address to the trace stack on thread info.*/ 95/* Retrieve a function return address to the trace stack on thread info.*/
94static void 96static void
95ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) 97ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
98 unsigned long frame_pointer)
96{ 99{
97 int index; 100 int index;
98 101
@@ -106,6 +109,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
106 return; 109 return;
107 } 110 }
108 111
112#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
113 /*
114 * The arch may choose to record the frame pointer used
115 * and check it here to make sure that it is what we expect it
116 * to be. If gcc does not set the place holder of the return
117 * address in the frame pointer, and does a copy instead, then
118 * the function graph trace will fail. This test detects this
119 * case.
120 *
121 * Currently, x86_32 with optimize for size (-Os) makes the latest
122 * gcc do the above.
123 */
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n",
128 current->ret_stack[index].fp,
129 frame_pointer,
130 (void *)current->ret_stack[index].func,
131 current->ret_stack[index].ret);
132 *ret = (unsigned long)panic;
133 return;
134 }
135#endif
136
109 *ret = current->ret_stack[index].ret; 137 *ret = current->ret_stack[index].ret;
110 trace->func = current->ret_stack[index].func; 138 trace->func = current->ret_stack[index].func;
111 trace->calltime = current->ret_stack[index].calltime; 139 trace->calltime = current->ret_stack[index].calltime;
@@ -117,12 +145,12 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
117 * Send the trace to the ring-buffer. 145 * Send the trace to the ring-buffer.
118 * @return the original return address. 146 * @return the original return address.
119 */ 147 */
120unsigned long ftrace_return_to_handler(void) 148unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
121{ 149{
122 struct ftrace_graph_ret trace; 150 struct ftrace_graph_ret trace;
123 unsigned long ret; 151 unsigned long ret;
124 152
125 ftrace_pop_return_trace(&trace, &ret); 153 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
126 trace.rettime = trace_clock_local(); 154 trace.rettime = trace_clock_local();
127 ftrace_graph_return(&trace); 155 ftrace_graph_return(&trace);
128 barrier(); 156 barrier();
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 9bece9687b62..7b6278110827 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
155EXPORT_SYMBOL_GPL(__ftrace_vprintk); 155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156 156
157static void * 157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos) 158t_start(struct seq_file *m, loff_t *pos)
159{ 159{
160 const char **fmt = m->private; 160 const char **fmt = __start___trace_bprintk_fmt + *pos;
161 const char **next = fmt;
162
163 (*pos)++;
164 161
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) 162 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL; 163 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt; 164 return fmt;
172} 165}
173 166
174static void *t_start(struct seq_file *m, loff_t *pos) 167static void *t_next(struct seq_file *m, void * v, loff_t *pos)
175{ 168{
176 return t_next(m, NULL, pos); 169 (*pos)++;
170 return t_start(m, pos);
177} 171}
178 172
179static int t_show(struct seq_file *m, void *v) 173static int t_show(struct seq_file *m, void *v)
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
224static int 218static int
225ftrace_formats_open(struct inode *inode, struct file *file) 219ftrace_formats_open(struct inode *inode, struct file *file)
226{ 220{
227 int ret; 221 return seq_open(file, &show_format_seq_ops);
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236} 222}
237 223
238static const struct file_operations ftrace_formats_fops = { 224static const struct file_operations ftrace_formats_fops = {
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index c00643733f4c..e66f5e493342 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -199,17 +199,13 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
199 mutex_lock(&session->stat_mutex); 199 mutex_lock(&session->stat_mutex);
200 200
201 /* If we are in the beginning of the file, print the headers */ 201 /* If we are in the beginning of the file, print the headers */
202 if (!*pos && session->ts->stat_headers) { 202 if (!*pos && session->ts->stat_headers)
203 (*pos)++;
204 return SEQ_START_TOKEN; 203 return SEQ_START_TOKEN;
205 }
206 204
207 node = rb_first(&session->stat_root); 205 node = rb_first(&session->stat_root);
208 for (i = 0; node && i < *pos; i++) 206 for (i = 0; node && i < *pos; i++)
209 node = rb_next(node); 207 node = rb_next(node);
210 208
211 (*pos)++;
212
213 return node; 209 return node;
214} 210}
215 211