about summary refs log tree commit diff stats
path: root/kernel
diff options
context:
space:
mode:
author Ingo Molnar <mingo@elte.hu> 2009-08-15 12:55:58 -0400
committer Ingo Molnar <mingo@elte.hu> 2009-08-15 12:56:13 -0400
commit fa08661af834875c9bd6f7f0b1b9388dc72a6585 (patch)
tree c381fcfcfeb38515bfa93445c80ad9231343414d /kernel
parent 240ebbf81f149b11a31e060ebe5ee51a3c775360 (diff)
parent 64f1607ffbbc772685733ea63e6f7f4183df1b16 (diff)
Merge commit 'v2.6.31-rc6' into core/rcu
Merge reason: the branch was on pre-rc1 .30, update to latest.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile 3
-rw-r--r-- kernel/acct.c 6
-rw-r--r-- kernel/audit.c 146
-rw-r--r-- kernel/audit.h 43
-rw-r--r-- kernel/audit_tree.c 66
-rw-r--r-- kernel/audit_watch.c 543
-rw-r--r-- kernel/auditfilter.c 518
-rw-r--r-- kernel/auditsc.c 33
-rw-r--r-- kernel/cgroup.c 151
-rw-r--r-- kernel/exit.c 1
-rw-r--r-- kernel/fork.c 31
-rw-r--r-- kernel/freezer.c 7
-rw-r--r-- kernel/futex.c 74
-rw-r--r-- kernel/futex_compat.c 6
-rw-r--r-- kernel/hrtimer.c 110
-rw-r--r-- kernel/irq/internals.h 3
-rw-r--r-- kernel/irq/manage.c 72
-rw-r--r-- kernel/irq/migration.c 2
-rw-r--r-- kernel/irq/numa_migrate.c 4
-rw-r--r-- kernel/kexec.c 2
-rw-r--r-- kernel/kmod.c 1
-rw-r--r-- kernel/kprobes.c 8
-rw-r--r-- kernel/kthread.c 10
-rw-r--r-- kernel/lockdep_proc.c 3
-rw-r--r-- kernel/module.c 9
-rw-r--r-- kernel/panic.c 1
-rw-r--r-- kernel/perf_counter.c 1088
-rw-r--r-- kernel/posix-cpu-timers.c 7
-rw-r--r-- kernel/posix-timers.c 7
-rw-r--r-- kernel/power/user.c 1
-rw-r--r-- kernel/profile.c 5
-rw-r--r-- kernel/ptrace.c 4
-rw-r--r-- kernel/resource.c 2
-rw-r--r-- kernel/rtmutex.c 4
-rw-r--r-- kernel/sched.c 61
-rw-r--r-- kernel/sched_cpupri.c 15
-rw-r--r-- kernel/sched_fair.c 45
-rw-r--r-- kernel/sched_rt.c 18
-rw-r--r-- kernel/signal.c 25
-rw-r--r-- kernel/smp.c 2
-rw-r--r-- kernel/softirq.c 64
-rw-r--r-- kernel/sysctl.c 13
-rw-r--r-- kernel/time/clockevents.c 11
-rw-r--r-- kernel/time/clocksource.c 2
-rw-r--r-- kernel/time/timer_stats.c 16
-rw-r--r-- kernel/timer.c 4
-rw-r--r-- kernel/trace/Kconfig 6
-rw-r--r-- kernel/trace/blktrace.c 13
-rw-r--r-- kernel/trace/ftrace.c 84
-rw-r--r-- kernel/trace/ring_buffer.c 26
-rw-r--r-- kernel/trace/trace.c 37
-rw-r--r-- kernel/trace/trace.h 11
-rw-r--r-- kernel/trace/trace_event_profile.c 2
-rw-r--r-- kernel/trace/trace_event_types.h 3
-rw-r--r-- kernel/trace/trace_events.c 32
-rw-r--r-- kernel/trace/trace_events_filter.c 20
-rw-r--r-- kernel/trace/trace_functions.c 5
-rw-r--r-- kernel/trace/trace_functions_graph.c 11
-rw-r--r-- kernel/trace/trace_output.c 3
-rw-r--r-- kernel/trace/trace_printk.c 28
-rw-r--r-- kernel/trace/trace_stack.c 11
-rw-r--r-- kernel/trace/trace_stat.c 40
-rw-r--r-- kernel/wait.c 5
63 files changed, 2244 insertions, 1340 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index d6042bcc9e99..2419c9d43918 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,7 +69,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o
69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
74obj-$(CONFIG_GCOV_KERNEL) += gcov/ 74obj-$(CONFIG_GCOV_KERNEL) += gcov/
75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
@@ -95,6 +95,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
95obj-$(CONFIG_FUNCTION_TRACER) += trace/ 95obj-$(CONFIG_FUNCTION_TRACER) += trace/
96obj-$(CONFIG_TRACING) += trace/ 96obj-$(CONFIG_TRACING) += trace/
97obj-$(CONFIG_X86_DS) += trace/ 97obj-$(CONFIG_X86_DS) += trace/
98obj-$(CONFIG_RING_BUFFER) += trace/
98obj-$(CONFIG_SMP) += sched_cpupri.o 99obj-$(CONFIG_SMP) += sched_cpupri.o
99obj-$(CONFIG_SLOW_WORK) += slow-work.o 100obj-$(CONFIG_SLOW_WORK) += slow-work.o
100obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 101obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa31564162..9f3391090b3e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
215static int acct_on(char *name) 215static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt;
218 int error; 219 int error;
219 struct pid_namespace *ns; 220 struct pid_namespace *ns;
220 struct bsd_acct_struct *acct = NULL; 221 struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
256 acct = NULL; 257 acct = NULL;
257 } 258 }
258 259
259 mnt_pin(file->f_path.mnt); 260 mnt = file->f_path.mnt;
261 mnt_pin(mnt);
260 acct_file_reopen(ns->bacct, file, ns); 262 acct_file_reopen(ns->bacct, file, ns);
261 spin_unlock(&acct_lock); 263 spin_unlock(&acct_lock);
262 264
263 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ 265 mntput(mnt); /* it's pinned, now give up active reference */
264 kfree(acct); 266 kfree(acct);
265 267
266 return 0; 268 return 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba9..defc2e6f1e3b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
115/* The netlink socket. */ 115/* The netlink socket. */
116static struct sock *audit_sock; 116static struct sock *audit_sock;
117 117
118/* Inotify handle. */
119struct inotify_handle *audit_ih;
120
121/* Hash for inode-based rules */ 118/* Hash for inode-based rules */
122struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 119struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
123 120
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
136static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 133static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
137 134
138/* Serialize requests from userspace. */ 135/* Serialize requests from userspace. */
139static DEFINE_MUTEX(audit_cmd_mutex); 136DEFINE_MUTEX(audit_cmd_mutex);
140 137
141/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 138/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
142 * audit records. Since printk uses a 1024 byte buffer, this buffer 139 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
375 kfree_skb(skb); 372 kfree_skb(skb);
376} 373}
377 374
375/*
376 * For one reason or another this nlh isn't getting delivered to the userspace
377 * audit daemon, just send it to printk.
378 */
379static void audit_printk_skb(struct sk_buff *skb)
380{
381 struct nlmsghdr *nlh = nlmsg_hdr(skb);
382 char *data = NLMSG_DATA(nlh);
383
384 if (nlh->nlmsg_type != AUDIT_EOE) {
385 if (printk_ratelimit())
386 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
387 else
388 audit_log_lost("printk limit exceeded\n");
389 }
390
391 audit_hold_skb(skb);
392}
393
378static void kauditd_send_skb(struct sk_buff *skb) 394static void kauditd_send_skb(struct sk_buff *skb)
379{ 395{
380 int err; 396 int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
427 if (skb) { 443 if (skb) {
428 if (audit_pid) 444 if (audit_pid)
429 kauditd_send_skb(skb); 445 kauditd_send_skb(skb);
430 else { 446 else
431 if (printk_ratelimit()) 447 audit_printk_skb(skb);
432 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
433 else
434 audit_log_lost("printk limit exceeded\n");
435
436 audit_hold_skb(skb);
437 }
438 } else { 448 } else {
439 DECLARE_WAITQUEUE(wait, current); 449 DECLARE_WAITQUEUE(wait, current);
440 set_current_state(TASK_INTERRUPTIBLE); 450 set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
495 return 0; 505 return 0;
496} 506}
497 507
498#ifdef CONFIG_AUDIT_TREE
499static int prune_tree_thread(void *unused)
500{
501 mutex_lock(&audit_cmd_mutex);
502 audit_prune_trees();
503 mutex_unlock(&audit_cmd_mutex);
504 return 0;
505}
506
507void audit_schedule_prune(void)
508{
509 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
510}
511#endif
512
513struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
514 int multi, void *payload, int size) 509 int multi, void *payload, int size)
515{ 510{
516 struct sk_buff *skb; 511 struct sk_buff *skb;
517 struct nlmsghdr *nlh; 512 struct nlmsghdr *nlh;
518 int len = NLMSG_SPACE(size);
519 void *data; 513 void *data;
520 int flags = multi ? NLM_F_MULTI : 0; 514 int flags = multi ? NLM_F_MULTI : 0;
521 int t = done ? NLMSG_DONE : type; 515 int t = done ? NLMSG_DONE : type;
522 516
523 skb = alloc_skb(len, GFP_KERNEL); 517 skb = nlmsg_new(size, GFP_KERNEL);
524 if (!skb) 518 if (!skb)
525 return NULL; 519 return NULL;
526 520
527 nlh = NLMSG_PUT(skb, pid, seq, t, size); 521 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
528 nlh->nlmsg_flags = flags; 522 data = NLMSG_DATA(nlh);
529 data = NLMSG_DATA(nlh);
530 memcpy(data, payload, size); 523 memcpy(data, payload, size);
531 return skb; 524 return skb;
532 525
533nlmsg_failure: /* Used by NLMSG_PUT */ 526nlmsg_failure: /* Used by NLMSG_NEW */
534 if (skb) 527 if (skb)
535 kfree_skb(skb); 528 kfree_skb(skb);
536 return NULL; 529 return NULL;
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
926} 919}
927 920
928/* 921/*
929 * Get message from skb (based on rtnetlink_rcv_skb). Each message is 922 * Get message from skb. Each message is processed by audit_receive_msg.
930 * processed by audit_receive_msg. Malformed skbs with wrong length are 923 * Malformed skbs with wrong length are discarded silently.
931 * discarded silently.
932 */ 924 */
933static void audit_receive_skb(struct sk_buff *skb) 925static void audit_receive_skb(struct sk_buff *skb)
934{ 926{
935 int err; 927 struct nlmsghdr *nlh;
936 struct nlmsghdr *nlh; 928 /*
937 u32 rlen; 929 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
930 * if the nlmsg_len was not aligned
931 */
932 int len;
933 int err;
938 934
939 while (skb->len >= NLMSG_SPACE(0)) { 935 nlh = nlmsg_hdr(skb);
940 nlh = nlmsg_hdr(skb); 936 len = skb->len;
941 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) 937
942 return; 938 while (NLMSG_OK(nlh, len)) {
943 rlen = NLMSG_ALIGN(nlh->nlmsg_len); 939 err = audit_receive_msg(skb, nlh);
944 if (rlen > skb->len) 940 /* if err or if this message says it wants a response */
945 rlen = skb->len; 941 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
946 if ((err = audit_receive_msg(skb, nlh))) {
947 netlink_ack(skb, nlh, err); 942 netlink_ack(skb, nlh, err);
948 } else if (nlh->nlmsg_flags & NLM_F_ACK) 943
949 netlink_ack(skb, nlh, 0); 944 nlh = NLMSG_NEXT(nlh, len);
950 skb_pull(skb, rlen);
951 } 945 }
952} 946}
953 947
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
959 mutex_unlock(&audit_cmd_mutex); 953 mutex_unlock(&audit_cmd_mutex);
960} 954}
961 955
962#ifdef CONFIG_AUDITSYSCALL
963static const struct inotify_operations audit_inotify_ops = {
964 .handle_event = audit_handle_ievent,
965 .destroy_watch = audit_free_parent,
966};
967#endif
968
969/* Initialize audit support at boot time. */ 956/* Initialize audit support at boot time. */
970static int __init audit_init(void) 957static int __init audit_init(void)
971{ 958{
@@ -991,12 +978,6 @@ static int __init audit_init(void)
991 978
992 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 979 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
993 980
994#ifdef CONFIG_AUDITSYSCALL
995 audit_ih = inotify_init(&audit_inotify_ops);
996 if (IS_ERR(audit_ih))
997 audit_panic("cannot initialize inotify handle");
998#endif
999
1000 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) 981 for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
1001 INIT_LIST_HEAD(&audit_inode_hash[i]); 982 INIT_LIST_HEAD(&audit_inode_hash[i]);
1002 983
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1070 goto err; 1051 goto err;
1071 } 1052 }
1072 1053
1073 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
1074 if (!ab->skb)
1075 goto err;
1076
1077 ab->ctx = ctx; 1054 ab->ctx = ctx;
1078 ab->gfp_mask = gfp_mask; 1055 ab->gfp_mask = gfp_mask;
1079 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); 1056
1080 nlh->nlmsg_type = type; 1057 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1081 nlh->nlmsg_flags = 0; 1058 if (!ab->skb)
1082 nlh->nlmsg_pid = 0; 1059 goto nlmsg_failure;
1083 nlh->nlmsg_seq = 0; 1060
1061 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
1062
1084 return ab; 1063 return ab;
1064
1065nlmsg_failure: /* Used by NLMSG_NEW */
1066 kfree_skb(ab->skb);
1067 ab->skb = NULL;
1085err: 1068err:
1086 audit_buffer_free(ab); 1069 audit_buffer_free(ab);
1087 return NULL; 1070 return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1452 kfree(pathname); 1435 kfree(pathname);
1453} 1436}
1454 1437
1438void audit_log_key(struct audit_buffer *ab, char *key)
1439{
1440 audit_log_format(ab, " key=");
1441 if (key)
1442 audit_log_untrustedstring(ab, key);
1443 else
1444 audit_log_format(ab, "(null)");
1445}
1446
1455/** 1447/**
1456 * audit_log_end - end one audit record 1448 * audit_log_end - end one audit record
1457 * @ab: the audit_buffer 1449 * @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
1475 skb_queue_tail(&audit_skb_queue, ab->skb); 1467 skb_queue_tail(&audit_skb_queue, ab->skb);
1476 wake_up_interruptible(&kauditd_wait); 1468 wake_up_interruptible(&kauditd_wait);
1477 } else { 1469 } else {
1478 if (nlh->nlmsg_type != AUDIT_EOE) { 1470 audit_printk_skb(ab->skb);
1479 if (printk_ratelimit()) {
1480 printk(KERN_NOTICE "type=%d %s\n",
1481 nlh->nlmsg_type,
1482 ab->skb->data + NLMSG_SPACE(0));
1483 } else
1484 audit_log_lost("printk limit exceeded\n");
1485 }
1486 audit_hold_skb(ab->skb);
1487 } 1471 }
1488 ab->skb = NULL; 1472 ab->skb = NULL;
1489 } 1473 }
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661b..208687be4f30 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
53}; 53};
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_parent; 56struct audit_watch;
57
58struct audit_watch {
59 atomic_t count; /* reference count */
60 char *path; /* insertion path */
61 dev_t dev; /* associated superblock device */
62 unsigned long ino; /* associated inode number */
63 struct audit_parent *parent; /* associated parent */
64 struct list_head wlist; /* entry in parent->watches list */
65 struct list_head rules; /* associated rules */
66};
67
68struct audit_tree; 57struct audit_tree;
69struct audit_chunk; 58struct audit_chunk;
70 59
@@ -108,19 +97,28 @@ struct audit_netlink_list {
108 97
109int audit_send_list(void *); 98int audit_send_list(void *);
110 99
111struct inotify_watch;
112/* Inotify handle */
113extern struct inotify_handle *audit_ih;
114
115extern void audit_free_parent(struct inotify_watch *);
116extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
117 const char *, struct inode *);
118extern int selinux_audit_rule_update(void); 100extern int selinux_audit_rule_update(void);
119 101
120extern struct mutex audit_filter_mutex; 102extern struct mutex audit_filter_mutex;
121extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
122extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
123 105
106/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch);
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule);
113extern void audit_remove_watch(struct audit_watch *watch);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch);
118
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
120 struct audit_watch *watch);
121
124#ifdef CONFIG_AUDIT_TREE 122#ifdef CONFIG_AUDIT_TREE
125extern struct audit_chunk *audit_tree_lookup(const struct inode *); 123extern struct audit_chunk *audit_tree_lookup(const struct inode *);
126extern void audit_put_chunk(struct audit_chunk *); 124extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
130extern int audit_remove_tree_rule(struct audit_krule *); 128extern int audit_remove_tree_rule(struct audit_krule *);
131extern void audit_trim_trees(void); 129extern void audit_trim_trees(void);
132extern int audit_tag_tree(char *old, char *new); 130extern int audit_tag_tree(char *old, char *new);
133extern void audit_schedule_prune(void);
134extern void audit_prune_trees(void);
135extern const char *audit_tree_path(struct audit_tree *); 131extern const char *audit_tree_path(struct audit_tree *);
136extern void audit_put_tree(struct audit_tree *); 132extern void audit_put_tree(struct audit_tree *);
133extern void audit_kill_trees(struct list_head *);
137#else 134#else
138#define audit_remove_tree_rule(rule) BUG() 135#define audit_remove_tree_rule(rule) BUG()
139#define audit_add_tree_rule(rule) -EINVAL 136#define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
142#define audit_put_tree(tree) (void)0 139#define audit_put_tree(tree) (void)0
143#define audit_tag_tree(old, new) -EINVAL 140#define audit_tag_tree(old, new) -EINVAL
144#define audit_tree_path(rule) "" /* never called */ 141#define audit_tree_path(rule) "" /* never called */
142#define audit_kill_trees(list) BUG()
145#endif 143#endif
146 144
147extern char *audit_unpack_string(void **, size_t *, size_t); 145extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
160 return 0; 158 return 0;
161} 159}
162extern void audit_filter_inodes(struct task_struct *, struct audit_context *); 160extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
161extern struct list_head *audit_killed_trees(void);
163#else 162#else
164#define audit_signal_info(s,t) AUDIT_DISABLED 163#define audit_signal_info(s,t) AUDIT_DISABLED
165#define audit_filter_inodes(t,c) AUDIT_DISABLED 164#define audit_filter_inodes(t,c) AUDIT_DISABLED
166#endif 165#endif
166
167extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1f6396d76687..2451dc6f3282 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h>
5 6
6struct audit_tree; 7struct audit_tree;
7struct audit_chunk; 8struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
441 if (rule->tree) { 442 if (rule->tree) {
442 /* not a half-baked one */ 443 /* not a half-baked one */
443 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 444 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
444 audit_log_format(ab, "op=remove rule dir="); 445 audit_log_format(ab, "op=");
446 audit_log_string(ab, "remove rule");
447 audit_log_format(ab, " dir=");
445 audit_log_untrustedstring(ab, rule->tree->pathname); 448 audit_log_untrustedstring(ab, rule->tree->pathname);
446 if (rule->filterkey) { 449 audit_log_key(ab, rule->filterkey);
447 audit_log_format(ab, " key=");
448 audit_log_untrustedstring(ab, rule->filterkey);
449 } else
450 audit_log_format(ab, " key=(null)");
451 audit_log_format(ab, " list=%d res=1", rule->listnr); 450 audit_log_format(ab, " list=%d res=1", rule->listnr);
452 audit_log_end(ab); 451 audit_log_end(ab);
453 rule->tree = NULL; 452 rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
519 } 518 }
520} 519}
521 520
521static void audit_schedule_prune(void);
522
522/* called with audit_filter_mutex */ 523/* called with audit_filter_mutex */
523int audit_remove_tree_rule(struct audit_krule *rule) 524int audit_remove_tree_rule(struct audit_krule *rule)
524{ 525{
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
824 825
825/* 826/*
826 * That gets run when evict_chunk() ends up needing to kill audit_tree. 827 * That gets run when evict_chunk() ends up needing to kill audit_tree.
827 * Runs from a separate thread, with audit_cmd_mutex held. 828 * Runs from a separate thread.
828 */ 829 */
829void audit_prune_trees(void) 830static int prune_tree_thread(void *unused)
830{ 831{
832 mutex_lock(&audit_cmd_mutex);
831 mutex_lock(&audit_filter_mutex); 833 mutex_lock(&audit_filter_mutex);
832 834
833 while (!list_empty(&prune_list)) { 835 while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
844 } 846 }
845 847
846 mutex_unlock(&audit_filter_mutex); 848 mutex_unlock(&audit_filter_mutex);
849 mutex_unlock(&audit_cmd_mutex);
850 return 0;
851}
852
853static void audit_schedule_prune(void)
854{
855 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
856}
857
858/*
859 * ... and that one is done if evict_chunk() decides to delay until the end
860 * of syscall. Runs synchronously.
861 */
862void audit_kill_trees(struct list_head *list)
863{
864 mutex_lock(&audit_cmd_mutex);
865 mutex_lock(&audit_filter_mutex);
866
867 while (!list_empty(list)) {
868 struct audit_tree *victim;
869
870 victim = list_entry(list->next, struct audit_tree, list);
871 kill_rules(victim);
872 list_del_init(&victim->list);
873
874 mutex_unlock(&audit_filter_mutex);
875
876 prune_one(victim);
877
878 mutex_lock(&audit_filter_mutex);
879 }
880
881 mutex_unlock(&audit_filter_mutex);
882 mutex_unlock(&audit_cmd_mutex);
847} 883}
848 884
849/* 885/*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
854static void evict_chunk(struct audit_chunk *chunk) 890static void evict_chunk(struct audit_chunk *chunk)
855{ 891{
856 struct audit_tree *owner; 892 struct audit_tree *owner;
893 struct list_head *postponed = audit_killed_trees();
894 int need_prune = 0;
857 int n; 895 int n;
858 896
859 if (chunk->dead) 897 if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
869 owner->root = NULL; 907 owner->root = NULL;
870 list_del_init(&owner->same_root); 908 list_del_init(&owner->same_root);
871 spin_unlock(&hash_lock); 909 spin_unlock(&hash_lock);
872 kill_rules(owner); 910 if (!postponed) {
873 list_move(&owner->list, &prune_list); 911 kill_rules(owner);
874 audit_schedule_prune(); 912 list_move(&owner->list, &prune_list);
913 need_prune = 1;
914 } else {
915 list_move(&owner->list, postponed);
916 }
875 spin_lock(&hash_lock); 917 spin_lock(&hash_lock);
876 } 918 }
877 list_del_rcu(&chunk->hash); 919 list_del_rcu(&chunk->hash);
878 for (n = 0; n < chunk->count; n++) 920 for (n = 0; n < chunk->count; n++)
879 list_del_init(&chunk->owners[n].list); 921 list_del_init(&chunk->owners[n].list);
880 spin_unlock(&hash_lock); 922 spin_unlock(&hash_lock);
923 if (need_prune)
924 audit_schedule_prune();
881 mutex_unlock(&audit_filter_mutex); 925 mutex_unlock(&audit_filter_mutex);
882} 926}
883 927
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 000000000000..0e96dbc60ea9
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
1/* audit_watch.c -- watching inodes
2 *
3 * Copyright 2003-2009 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/mutex.h>
26#include <linux/fs.h>
27#include <linux/namei.h>
28#include <linux/netlink.h>
29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h>
32#include "audit.h"
33
34/*
35 * Reference counting:
36 *
37 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
38 * event. Each audit_watch holds a reference to its associated parent.
39 *
40 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
41 * audit_remove_watch(). Additionally, an audit_watch may exist
42 * temporarily to assist in searching existing filter data. Each
43 * audit_krule holds a reference to its associated watch.
44 */
45
46struct audit_watch {
47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */
50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */
53 struct list_head rules; /* associated rules */
54};
55
56struct audit_parent {
57 struct list_head ilist; /* entry in inotify registration list */
58 struct list_head watches; /* associated watches */
59 struct inotify_watch wdata; /* inotify watch data */
60 unsigned flags; /* status flags */
61};
62
63/* Inotify handle. */
64struct inotify_handle *audit_ih;
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Inotify events we care about. */
78#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
79
80static void audit_free_parent(struct inotify_watch *i_watch)
81{
82 struct audit_parent *parent;
83
84 parent = container_of(i_watch, struct audit_parent, wdata);
85 WARN_ON(!list_empty(&parent->watches));
86 kfree(parent);
87}
88
89void audit_get_watch(struct audit_watch *watch)
90{
91 atomic_inc(&watch->count);
92}
93
94void audit_put_watch(struct audit_watch *watch)
95{
96 if (atomic_dec_and_test(&watch->count)) {
97 WARN_ON(watch->parent);
98 WARN_ON(!list_empty(&watch->rules));
99 kfree(watch->path);
100 kfree(watch);
101 }
102}
103
104void audit_remove_watch(struct audit_watch *watch)
105{
106 list_del(&watch->wlist);
107 put_inotify_watch(&watch->parent->wdata);
108 watch->parent = NULL;
109 audit_put_watch(watch); /* match initial get */
110}
111
112char *audit_watch_path(struct audit_watch *watch)
113{
114 return watch->path;
115}
116
117struct list_head *audit_watch_rules(struct audit_watch *watch)
118{
119 return &watch->rules;
120}
121
122unsigned long audit_watch_inode(struct audit_watch *watch)
123{
124 return watch->ino;
125}
126
127dev_t audit_watch_dev(struct audit_watch *watch)
128{
129 return watch->dev;
130}
131
132/* Initialize a parent watch entry. */
133static struct audit_parent *audit_init_parent(struct nameidata *ndp)
134{
135 struct audit_parent *parent;
136 s32 wd;
137
138 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
139 if (unlikely(!parent))
140 return ERR_PTR(-ENOMEM);
141
142 INIT_LIST_HEAD(&parent->watches);
143 parent->flags = 0;
144
145 inotify_init_watch(&parent->wdata);
146 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
147 get_inotify_watch(&parent->wdata);
148 wd = inotify_add_watch(audit_ih, &parent->wdata,
149 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
150 if (wd < 0) {
151 audit_free_parent(&parent->wdata);
152 return ERR_PTR(wd);
153 }
154
155 return parent;
156}
157
158/* Initialize a watch entry. */
159static struct audit_watch *audit_init_watch(char *path)
160{
161 struct audit_watch *watch;
162
163 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
164 if (unlikely(!watch))
165 return ERR_PTR(-ENOMEM);
166
167 INIT_LIST_HEAD(&watch->rules);
168 atomic_set(&watch->count, 1);
169 watch->path = path;
170 watch->dev = (dev_t)-1;
171 watch->ino = (unsigned long)-1;
172
173 return watch;
174}
175
176/* Translate a watch string to kernel respresentation. */
177int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
178{
179 struct audit_watch *watch;
180
181 if (!audit_ih)
182 return -EOPNOTSUPP;
183
184 if (path[0] != '/' || path[len-1] == '/' ||
185 krule->listnr != AUDIT_FILTER_EXIT ||
186 op != Audit_equal ||
187 krule->inode_f || krule->watch || krule->tree)
188 return -EINVAL;
189
190 watch = audit_init_watch(path);
191 if (IS_ERR(watch))
192 return PTR_ERR(watch);
193
194 audit_get_watch(watch);
195 krule->watch = watch;
196
197 return 0;
198}
199
200/* Duplicate the given audit watch. The new watch's rules list is initialized
201 * to an empty list and wlist is undefined. */
202static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
203{
204 char *path;
205 struct audit_watch *new;
206
207 path = kstrdup(old->path, GFP_KERNEL);
208 if (unlikely(!path))
209 return ERR_PTR(-ENOMEM);
210
211 new = audit_init_watch(path);
212 if (IS_ERR(new)) {
213 kfree(path);
214 goto out;
215 }
216
217 new->dev = old->dev;
218 new->ino = old->ino;
219 get_inotify_watch(&old->parent->wdata);
220 new->parent = old->parent;
221
222out:
223 return new;
224}
225
226static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
227{
228 if (audit_enabled) {
229 struct audit_buffer *ab;
230 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
231 audit_log_format(ab, "auid=%u ses=%u op=",
232 audit_get_loginuid(current),
233 audit_get_sessionid(current));
234 audit_log_string(ab, op);
235 audit_log_format(ab, " path=");
236 audit_log_untrustedstring(ab, w->path);
237 audit_log_key(ab, r->filterkey);
238 audit_log_format(ab, " list=%d res=1", r->listnr);
239 audit_log_end(ab);
240 }
241}
242
243/* Update inode info in audit rules based on filesystem event. */
244static void audit_update_watch(struct audit_parent *parent,
245 const char *dname, dev_t dev,
246 unsigned long ino, unsigned invalidating)
247{
248 struct audit_watch *owatch, *nwatch, *nextw;
249 struct audit_krule *r, *nextr;
250 struct audit_entry *oentry, *nentry;
251
252 mutex_lock(&audit_filter_mutex);
253 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
254 if (audit_compare_dname_path(dname, owatch->path, NULL))
255 continue;
256
257 /* If the update involves invalidating rules, do the inode-based
258 * filtering now, so we don't omit records. */
259 if (invalidating && current->audit_context)
260 audit_filter_inodes(current, current->audit_context);
261
262 nwatch = audit_dupe_watch(owatch);
263 if (IS_ERR(nwatch)) {
264 mutex_unlock(&audit_filter_mutex);
265 audit_panic("error updating watch, skipping");
266 return;
267 }
268 nwatch->dev = dev;
269 nwatch->ino = ino;
270
271 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
272
273 oentry = container_of(r, struct audit_entry, rule);
274 list_del(&oentry->rule.rlist);
275 list_del_rcu(&oentry->list);
276
277 nentry = audit_dupe_rule(&oentry->rule, nwatch);
278 if (IS_ERR(nentry)) {
279 list_del(&oentry->rule.list);
280 audit_panic("error updating watch, removing");
281 } else {
282 int h = audit_hash_ino((u32)ino);
283 list_add(&nentry->rule.rlist, &nwatch->rules);
284 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
285 list_replace(&oentry->rule.list,
286 &nentry->rule.list);
287 }
288
289 audit_watch_log_rule_change(r, owatch, "updated rules");
290
291 call_rcu(&oentry->rcu, audit_free_rule_rcu);
292 }
293
294 audit_remove_watch(owatch);
295 goto add_watch_to_parent; /* event applies to a single watch */
296 }
297 mutex_unlock(&audit_filter_mutex);
298 return;
299
300add_watch_to_parent:
301 list_add(&nwatch->wlist, &parent->watches);
302 mutex_unlock(&audit_filter_mutex);
303 return;
304}
305
306/* Remove all watches & rules associated with a parent that is going away. */
307static void audit_remove_parent_watches(struct audit_parent *parent)
308{
309 struct audit_watch *w, *nextw;
310 struct audit_krule *r, *nextr;
311 struct audit_entry *e;
312
313 mutex_lock(&audit_filter_mutex);
314 parent->flags |= AUDIT_PARENT_INVALID;
315 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
316 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
317 e = container_of(r, struct audit_entry, rule);
318 audit_watch_log_rule_change(r, w, "remove rule");
319 list_del(&r->rlist);
320 list_del(&r->list);
321 list_del_rcu(&e->list);
322 call_rcu(&e->rcu, audit_free_rule_rcu);
323 }
324 audit_remove_watch(w);
325 }
326 mutex_unlock(&audit_filter_mutex);
327}
328
329/* Unregister inotify watches for parents on in_list.
330 * Generates an IN_IGNORED event. */
331void audit_inotify_unregister(struct list_head *in_list)
332{
333 struct audit_parent *p, *n;
334
335 list_for_each_entry_safe(p, n, in_list, ilist) {
336 list_del(&p->ilist);
337 inotify_rm_watch(audit_ih, &p->wdata);
338 /* the unpin matching the pin in audit_do_del_rule() */
339 unpin_inotify_watch(&p->wdata);
340 }
341}
342
343/* Get path information necessary for adding watches. */
344static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
345{
346 struct nameidata *ndparent, *ndwatch;
347 int err;
348
349 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
350 if (unlikely(!ndparent))
351 return -ENOMEM;
352
353 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
354 if (unlikely(!ndwatch)) {
355 kfree(ndparent);
356 return -ENOMEM;
357 }
358
359 err = path_lookup(path, LOOKUP_PARENT, ndparent);
360 if (err) {
361 kfree(ndparent);
362 kfree(ndwatch);
363 return err;
364 }
365
366 err = path_lookup(path, 0, ndwatch);
367 if (err) {
368 kfree(ndwatch);
369 ndwatch = NULL;
370 }
371
372 *ndp = ndparent;
373 *ndw = ndwatch;
374
375 return 0;
376}
377
378/* Release resources used for watch path information. */
379static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
380{
381 if (ndp) {
382 path_put(&ndp->path);
383 kfree(ndp);
384 }
385 if (ndw) {
386 path_put(&ndw->path);
387 kfree(ndw);
388 }
389}
390
391/* Associate the given rule with an existing parent inotify_watch.
392 * Caller must hold audit_filter_mutex. */
393static void audit_add_to_parent(struct audit_krule *krule,
394 struct audit_parent *parent)
395{
396 struct audit_watch *w, *watch = krule->watch;
397 int watch_found = 0;
398
399 list_for_each_entry(w, &parent->watches, wlist) {
400 if (strcmp(watch->path, w->path))
401 continue;
402
403 watch_found = 1;
404
405 /* put krule's and initial refs to temporary watch */
406 audit_put_watch(watch);
407 audit_put_watch(watch);
408
409 audit_get_watch(w);
410 krule->watch = watch = w;
411 break;
412 }
413
414 if (!watch_found) {
415 get_inotify_watch(&parent->wdata);
416 watch->parent = parent;
417
418 list_add(&watch->wlist, &parent->watches);
419 }
420 list_add(&krule->rlist, &watch->rules);
421}
422
423/* Find a matching watch entry, or add this one.
424 * Caller must hold audit_filter_mutex. */
425int audit_add_watch(struct audit_krule *krule)
426{
427 struct audit_watch *watch = krule->watch;
428 struct inotify_watch *i_watch;
429 struct audit_parent *parent;
430 struct nameidata *ndp = NULL, *ndw = NULL;
431 int ret = 0;
432
433 mutex_unlock(&audit_filter_mutex);
434
435 /* Avoid calling path_lookup under audit_filter_mutex. */
436 ret = audit_get_nd(watch->path, &ndp, &ndw);
437 if (ret) {
438 /* caller expects mutex locked */
439 mutex_lock(&audit_filter_mutex);
440 goto error;
441 }
442
443 /* update watch filter fields */
444 if (ndw) {
445 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
446 watch->ino = ndw->path.dentry->d_inode->i_ino;
447 }
448
449 /* The audit_filter_mutex must not be held during inotify calls because
450 * we hold it during inotify event callback processing. If an existing
451 * inotify watch is found, inotify_find_watch() grabs a reference before
452 * returning.
453 */
454 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
455 &i_watch) < 0) {
456 parent = audit_init_parent(ndp);
457 if (IS_ERR(parent)) {
458 /* caller expects mutex locked */
459 mutex_lock(&audit_filter_mutex);
460 ret = PTR_ERR(parent);
461 goto error;
462 }
463 } else
464 parent = container_of(i_watch, struct audit_parent, wdata);
465
466 mutex_lock(&audit_filter_mutex);
467
468 /* parent was moved before we took audit_filter_mutex */
469 if (parent->flags & AUDIT_PARENT_INVALID)
470 ret = -ENOENT;
471 else
472 audit_add_to_parent(krule, parent);
473
474 /* match get in audit_init_parent or inotify_find_watch */
475 put_inotify_watch(&parent->wdata);
476
477error:
478 audit_put_nd(ndp, ndw); /* NULL args OK */
479 return ret;
480
481}
482
483void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
484{
485 struct audit_watch *watch = krule->watch;
486 struct audit_parent *parent = watch->parent;
487
488 list_del(&krule->rlist);
489
490 if (list_empty(&watch->rules)) {
491 audit_remove_watch(watch);
492
493 if (list_empty(&parent->watches)) {
494 /* Put parent on the inotify un-registration
495 * list. Grab a reference before releasing
496 * audit_filter_mutex, to be released in
497 * audit_inotify_unregister().
498 * If filesystem is going away, just leave
499 * the sucker alone, eviction will take
500 * care of it. */
501 if (pin_inotify_watch(&parent->wdata))
502 list_add(&parent->ilist, list);
503 }
504 }
505}
506
507/* Update watch data in audit rules based on inotify events. */
508static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
509 u32 cookie, const char *dname, struct inode *inode)
510{
511 struct audit_parent *parent;
512
513 parent = container_of(i_watch, struct audit_parent, wdata);
514
515 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
516 audit_update_watch(parent, dname, inode->i_sb->s_dev,
517 inode->i_ino, 0);
518 else if (mask & (IN_DELETE|IN_MOVED_FROM))
519 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
520 /* inotify automatically removes the watch and sends IN_IGNORED */
521 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
522 audit_remove_parent_watches(parent);
523 /* inotify does not remove the watch, so remove it manually */
524 else if(mask & IN_MOVE_SELF) {
525 audit_remove_parent_watches(parent);
526 inotify_remove_watch_locked(audit_ih, i_watch);
527 } else if (mask & IN_IGNORED)
528 put_inotify_watch(i_watch);
529}
530
/* Callback table passed to inotify_init() by audit_watch_init(): events are
 * routed to audit_handle_ievent() and watch destruction to
 * audit_free_parent(). */
531static const struct inotify_operations audit_inotify_ops = {
532 .handle_event = audit_handle_ievent,
533 .destroy_watch = audit_free_parent,
534};
535
536static int __init audit_watch_init(void)
537{
538 audit_ih = inotify_init(&audit_inotify_ops);
539 if (IS_ERR(audit_ih))
540 audit_panic("cannot initialize inotify handle");
541 return 0;
542}
543subsys_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a02..a70604047f3c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h> 30#include <linux/security.h>
32#include "audit.h" 31#include "audit.h"
33 32
@@ -44,36 +43,6 @@
44 * be written directly provided audit_filter_mutex is held. 43 * be written directly provided audit_filter_mutex is held.
45 */ 44 */
46 45
47/*
48 * Reference counting:
49 *
50 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
51 * event. Each audit_watch holds a reference to its associated parent.
52 *
53 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
54 * audit_remove_watch(). Additionally, an audit_watch may exist
55 * temporarily to assist in searching existing filter data. Each
56 * audit_krule holds a reference to its associated watch.
57 */
58
59struct audit_parent {
60 struct list_head ilist; /* entry in inotify registration list */
61 struct list_head watches; /* associated watches */
62 struct inotify_watch wdata; /* inotify watch data */
63 unsigned flags; /* status flags */
64};
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Audit filter lists, defined in <linux/audit.h> */ 46/* Audit filter lists, defined in <linux/audit.h> */
78struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { 47struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
79 LIST_HEAD_INIT(audit_filter_list[0]), 48 LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
97 66
98DEFINE_MUTEX(audit_filter_mutex); 67DEFINE_MUTEX(audit_filter_mutex);
99 68
100/* Inotify events we care about. */
101#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
102
103void audit_free_parent(struct inotify_watch *i_watch)
104{
105 struct audit_parent *parent;
106
107 parent = container_of(i_watch, struct audit_parent, wdata);
108 WARN_ON(!list_empty(&parent->watches));
109 kfree(parent);
110}
111
112static inline void audit_get_watch(struct audit_watch *watch)
113{
114 atomic_inc(&watch->count);
115}
116
117static void audit_put_watch(struct audit_watch *watch)
118{
119 if (atomic_dec_and_test(&watch->count)) {
120 WARN_ON(watch->parent);
121 WARN_ON(!list_empty(&watch->rules));
122 kfree(watch->path);
123 kfree(watch);
124 }
125}
126
127static void audit_remove_watch(struct audit_watch *watch)
128{
129 list_del(&watch->wlist);
130 put_inotify_watch(&watch->parent->wdata);
131 watch->parent = NULL;
132 audit_put_watch(watch); /* match initial get */
133}
134
135static inline void audit_free_rule(struct audit_entry *e) 69static inline void audit_free_rule(struct audit_entry *e)
136{ 70{
137 int i; 71 int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
156 audit_free_rule(e); 90 audit_free_rule(e);
157} 91}
158 92
159/* Initialize a parent watch entry. */
160static struct audit_parent *audit_init_parent(struct nameidata *ndp)
161{
162 struct audit_parent *parent;
163 s32 wd;
164
165 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
166 if (unlikely(!parent))
167 return ERR_PTR(-ENOMEM);
168
169 INIT_LIST_HEAD(&parent->watches);
170 parent->flags = 0;
171
172 inotify_init_watch(&parent->wdata);
173 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
174 get_inotify_watch(&parent->wdata);
175 wd = inotify_add_watch(audit_ih, &parent->wdata,
176 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
177 if (wd < 0) {
178 audit_free_parent(&parent->wdata);
179 return ERR_PTR(wd);
180 }
181
182 return parent;
183}
184
185/* Initialize a watch entry. */
186static struct audit_watch *audit_init_watch(char *path)
187{
188 struct audit_watch *watch;
189
190 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
191 if (unlikely(!watch))
192 return ERR_PTR(-ENOMEM);
193
194 INIT_LIST_HEAD(&watch->rules);
195 atomic_set(&watch->count, 1);
196 watch->path = path;
197 watch->dev = (dev_t)-1;
198 watch->ino = (unsigned long)-1;
199
200 return watch;
201}
202
203/* Initialize an audit filterlist entry. */ 93/* Initialize an audit filterlist entry. */
204static inline struct audit_entry *audit_init_entry(u32 field_count) 94static inline struct audit_entry *audit_init_entry(u32 field_count)
205{ 95{
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
260 return 0; 150 return 0;
261} 151}
262 152
263/* Translate a watch string to kernel respresentation. */
264static int audit_to_watch(struct audit_krule *krule, char *path, int len,
265 u32 op)
266{
267 struct audit_watch *watch;
268
269 if (!audit_ih)
270 return -EOPNOTSUPP;
271
272 if (path[0] != '/' || path[len-1] == '/' ||
273 krule->listnr != AUDIT_FILTER_EXIT ||
274 op != Audit_equal ||
275 krule->inode_f || krule->watch || krule->tree)
276 return -EINVAL;
277
278 watch = audit_init_watch(path);
279 if (IS_ERR(watch))
280 return PTR_ERR(watch);
281
282 audit_get_watch(watch);
283 krule->watch = watch;
284
285 return 0;
286}
287
288static __u32 *classes[AUDIT_SYSCALL_CLASSES]; 153static __u32 *classes[AUDIT_SYSCALL_CLASSES];
289 154
290int __init audit_register_class(int class, unsigned *list) 155int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
766 break; 631 break;
767 case AUDIT_WATCH: 632 case AUDIT_WATCH:
768 data->buflen += data->values[i] = 633 data->buflen += data->values[i] =
769 audit_pack_string(&bufp, krule->watch->path); 634 audit_pack_string(&bufp,
635 audit_watch_path(krule->watch));
770 break; 636 break;
771 case AUDIT_DIR: 637 case AUDIT_DIR:
772 data->buflen += data->values[i] = 638 data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
818 return 1; 684 return 1;
819 break; 685 break;
820 case AUDIT_WATCH: 686 case AUDIT_WATCH:
821 if (strcmp(a->watch->path, b->watch->path)) 687 if (strcmp(audit_watch_path(a->watch),
688 audit_watch_path(b->watch)))
822 return 1; 689 return 1;
823 break; 690 break;
824 case AUDIT_DIR: 691 case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
844 return 0; 711 return 0;
845} 712}
846 713
847/* Duplicate the given audit watch. The new watch's rules list is initialized
848 * to an empty list and wlist is undefined. */
849static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
850{
851 char *path;
852 struct audit_watch *new;
853
854 path = kstrdup(old->path, GFP_KERNEL);
855 if (unlikely(!path))
856 return ERR_PTR(-ENOMEM);
857
858 new = audit_init_watch(path);
859 if (IS_ERR(new)) {
860 kfree(path);
861 goto out;
862 }
863
864 new->dev = old->dev;
865 new->ino = old->ino;
866 get_inotify_watch(&old->parent->wdata);
867 new->parent = old->parent;
868
869out:
870 return new;
871}
872
873/* Duplicate LSM field information. The lsm_rule is opaque, so must be 714/* Duplicate LSM field information. The lsm_rule is opaque, so must be
874 * re-initialized. */ 715 * re-initialized. */
875static inline int audit_dupe_lsm_field(struct audit_field *df, 716static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
904 * rule with the new rule in the filterlist, then free the old rule. 745 * rule with the new rule in the filterlist, then free the old rule.
905 * The rlist element is undefined; list manipulations are handled apart from 746 * The rlist element is undefined; list manipulations are handled apart from
906 * the initial copy. */ 747 * the initial copy. */
907static struct audit_entry *audit_dupe_rule(struct audit_krule *old, 748struct audit_entry *audit_dupe_rule(struct audit_krule *old,
908 struct audit_watch *watch) 749 struct audit_watch *watch)
909{ 750{
910 u32 fcount = old->field_count; 751 u32 fcount = old->field_count;
911 struct audit_entry *entry; 752 struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
977 return entry; 818 return entry;
978} 819}
979 820
980/* Update inode info in audit rules based on filesystem event. */
981static void audit_update_watch(struct audit_parent *parent,
982 const char *dname, dev_t dev,
983 unsigned long ino, unsigned invalidating)
984{
985 struct audit_watch *owatch, *nwatch, *nextw;
986 struct audit_krule *r, *nextr;
987 struct audit_entry *oentry, *nentry;
988
989 mutex_lock(&audit_filter_mutex);
990 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
991 if (audit_compare_dname_path(dname, owatch->path, NULL))
992 continue;
993
994 /* If the update involves invalidating rules, do the inode-based
995 * filtering now, so we don't omit records. */
996 if (invalidating && current->audit_context)
997 audit_filter_inodes(current, current->audit_context);
998
999 nwatch = audit_dupe_watch(owatch);
1000 if (IS_ERR(nwatch)) {
1001 mutex_unlock(&audit_filter_mutex);
1002 audit_panic("error updating watch, skipping");
1003 return;
1004 }
1005 nwatch->dev = dev;
1006 nwatch->ino = ino;
1007
1008 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
1009
1010 oentry = container_of(r, struct audit_entry, rule);
1011 list_del(&oentry->rule.rlist);
1012 list_del_rcu(&oentry->list);
1013
1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1017 audit_panic("error updating watch, removing");
1018 } else {
1019 int h = audit_hash_ino((u32)ino);
1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1024 }
1025
1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
1027 }
1028
1029 if (audit_enabled) {
1030 struct audit_buffer *ab;
1031 ab = audit_log_start(NULL, GFP_NOFS,
1032 AUDIT_CONFIG_CHANGE);
1033 audit_log_format(ab, "auid=%u ses=%u",
1034 audit_get_loginuid(current),
1035 audit_get_sessionid(current));
1036 audit_log_format(ab,
1037 " op=updated rules specifying path=");
1038 audit_log_untrustedstring(ab, owatch->path);
1039 audit_log_format(ab, " with dev=%u ino=%lu\n",
1040 dev, ino);
1041 audit_log_format(ab, " list=%d res=1", r->listnr);
1042 audit_log_end(ab);
1043 }
1044 audit_remove_watch(owatch);
1045 goto add_watch_to_parent; /* event applies to a single watch */
1046 }
1047 mutex_unlock(&audit_filter_mutex);
1048 return;
1049
1050add_watch_to_parent:
1051 list_add(&nwatch->wlist, &parent->watches);
1052 mutex_unlock(&audit_filter_mutex);
1053 return;
1054}
1055
1056/* Remove all watches & rules associated with a parent that is going away. */
1057static void audit_remove_parent_watches(struct audit_parent *parent)
1058{
1059 struct audit_watch *w, *nextw;
1060 struct audit_krule *r, *nextr;
1061 struct audit_entry *e;
1062
1063 mutex_lock(&audit_filter_mutex);
1064 parent->flags |= AUDIT_PARENT_INVALID;
1065 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
1066 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
1067 e = container_of(r, struct audit_entry, rule);
1068 if (audit_enabled) {
1069 struct audit_buffer *ab;
1070 ab = audit_log_start(NULL, GFP_NOFS,
1071 AUDIT_CONFIG_CHANGE);
1072 audit_log_format(ab, "auid=%u ses=%u",
1073 audit_get_loginuid(current),
1074 audit_get_sessionid(current));
1075 audit_log_format(ab, " op=remove rule path=");
1076 audit_log_untrustedstring(ab, w->path);
1077 if (r->filterkey) {
1078 audit_log_format(ab, " key=");
1079 audit_log_untrustedstring(ab,
1080 r->filterkey);
1081 } else
1082 audit_log_format(ab, " key=(null)");
1083 audit_log_format(ab, " list=%d res=1",
1084 r->listnr);
1085 audit_log_end(ab);
1086 }
1087 list_del(&r->rlist);
1088 list_del(&r->list);
1089 list_del_rcu(&e->list);
1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1091 }
1092 audit_remove_watch(w);
1093 }
1094 mutex_unlock(&audit_filter_mutex);
1095}
1096
1097/* Unregister inotify watches for parents on in_list.
1098 * Generates an IN_IGNORED event. */
1099static void audit_inotify_unregister(struct list_head *in_list)
1100{
1101 struct audit_parent *p, *n;
1102
1103 list_for_each_entry_safe(p, n, in_list, ilist) {
1104 list_del(&p->ilist);
1105 inotify_rm_watch(audit_ih, &p->wdata);
1106 /* the unpin matching the pin in audit_do_del_rule() */
1107 unpin_inotify_watch(&p->wdata);
1108 }
1109}
1110
1111/* Find an existing audit rule. 821/* Find an existing audit rule.
1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 822 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1113static struct audit_entry *audit_find_rule(struct audit_entry *entry, 823static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
1145 return found; 855 return found;
1146} 856}
1147 857
1148/* Get path information necessary for adding watches. */
1149static int audit_get_nd(char *path, struct nameidata **ndp,
1150 struct nameidata **ndw)
1151{
1152 struct nameidata *ndparent, *ndwatch;
1153 int err;
1154
1155 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
1156 if (unlikely(!ndparent))
1157 return -ENOMEM;
1158
1159 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
1160 if (unlikely(!ndwatch)) {
1161 kfree(ndparent);
1162 return -ENOMEM;
1163 }
1164
1165 err = path_lookup(path, LOOKUP_PARENT, ndparent);
1166 if (err) {
1167 kfree(ndparent);
1168 kfree(ndwatch);
1169 return err;
1170 }
1171
1172 err = path_lookup(path, 0, ndwatch);
1173 if (err) {
1174 kfree(ndwatch);
1175 ndwatch = NULL;
1176 }
1177
1178 *ndp = ndparent;
1179 *ndw = ndwatch;
1180
1181 return 0;
1182}
1183
1184/* Release resources used for watch path information. */
1185static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
1186{
1187 if (ndp) {
1188 path_put(&ndp->path);
1189 kfree(ndp);
1190 }
1191 if (ndw) {
1192 path_put(&ndw->path);
1193 kfree(ndw);
1194 }
1195}
1196
1197/* Associate the given rule with an existing parent inotify_watch.
1198 * Caller must hold audit_filter_mutex. */
1199static void audit_add_to_parent(struct audit_krule *krule,
1200 struct audit_parent *parent)
1201{
1202 struct audit_watch *w, *watch = krule->watch;
1203 int watch_found = 0;
1204
1205 list_for_each_entry(w, &parent->watches, wlist) {
1206 if (strcmp(watch->path, w->path))
1207 continue;
1208
1209 watch_found = 1;
1210
1211 /* put krule's and initial refs to temporary watch */
1212 audit_put_watch(watch);
1213 audit_put_watch(watch);
1214
1215 audit_get_watch(w);
1216 krule->watch = watch = w;
1217 break;
1218 }
1219
1220 if (!watch_found) {
1221 get_inotify_watch(&parent->wdata);
1222 watch->parent = parent;
1223
1224 list_add(&watch->wlist, &parent->watches);
1225 }
1226 list_add(&krule->rlist, &watch->rules);
1227}
1228
1229/* Find a matching watch entry, or add this one.
1230 * Caller must hold audit_filter_mutex. */
1231static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1232 struct nameidata *ndw)
1233{
1234 struct audit_watch *watch = krule->watch;
1235 struct inotify_watch *i_watch;
1236 struct audit_parent *parent;
1237 int ret = 0;
1238
1239 /* update watch filter fields */
1240 if (ndw) {
1241 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
1242 watch->ino = ndw->path.dentry->d_inode->i_ino;
1243 }
1244
1245 /* The audit_filter_mutex must not be held during inotify calls because
1246 * we hold it during inotify event callback processing. If an existing
1247 * inotify watch is found, inotify_find_watch() grabs a reference before
1248 * returning.
1249 */
1250 mutex_unlock(&audit_filter_mutex);
1251
1252 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
1253 &i_watch) < 0) {
1254 parent = audit_init_parent(ndp);
1255 if (IS_ERR(parent)) {
1256 /* caller expects mutex locked */
1257 mutex_lock(&audit_filter_mutex);
1258 return PTR_ERR(parent);
1259 }
1260 } else
1261 parent = container_of(i_watch, struct audit_parent, wdata);
1262
1263 mutex_lock(&audit_filter_mutex);
1264
1265 /* parent was moved before we took audit_filter_mutex */
1266 if (parent->flags & AUDIT_PARENT_INVALID)
1267 ret = -ENOENT;
1268 else
1269 audit_add_to_parent(krule, parent);
1270
1271 /* match get in audit_init_parent or inotify_find_watch */
1272 put_inotify_watch(&parent->wdata);
1273 return ret;
1274}
1275
1276static u64 prio_low = ~0ULL/2; 858static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1; 859static u64 prio_high = ~0ULL/2 - 1;
1278 860
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
1282 struct audit_entry *e; 864 struct audit_entry *e;
1283 struct audit_watch *watch = entry->rule.watch; 865 struct audit_watch *watch = entry->rule.watch;
1284 struct audit_tree *tree = entry->rule.tree; 866 struct audit_tree *tree = entry->rule.tree;
1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list; 867 struct list_head *list;
1287 int h, err; 868 int h, err;
1288#ifdef CONFIG_AUDITSYSCALL 869#ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
1296 877
1297 mutex_lock(&audit_filter_mutex); 878 mutex_lock(&audit_filter_mutex);
1298 e = audit_find_rule(entry, &list); 879 e = audit_find_rule(entry, &list);
1299 mutex_unlock(&audit_filter_mutex);
1300 if (e) { 880 if (e) {
881 mutex_unlock(&audit_filter_mutex);
1301 err = -EEXIST; 882 err = -EEXIST;
1302 /* normally audit_add_tree_rule() will free it on failure */ 883 /* normally audit_add_tree_rule() will free it on failure */
1303 if (tree) 884 if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
1305 goto error; 886 goto error;
1306 } 887 }
1307 888
1308 /* Avoid calling path_lookup under audit_filter_mutex. */
1309 if (watch) {
1310 err = audit_get_nd(watch->path, &ndp, &ndw);
1311 if (err)
1312 goto error;
1313 }
1314
1315 mutex_lock(&audit_filter_mutex);
1316 if (watch) { 889 if (watch) {
1317 /* audit_filter_mutex is dropped and re-taken during this call */ 890 /* audit_filter_mutex is dropped and re-taken during this call */
1318 err = audit_add_watch(&entry->rule, ndp, ndw); 891 err = audit_add_watch(&entry->rule);
1319 if (err) { 892 if (err) {
1320 mutex_unlock(&audit_filter_mutex); 893 mutex_unlock(&audit_filter_mutex);
1321 goto error; 894 goto error;
1322 } 895 }
1323 h = audit_hash_ino((u32)watch->ino); 896 /* entry->rule.watch may have changed during audit_add_watch() */
897 watch = entry->rule.watch;
898 h = audit_hash_ino((u32)audit_watch_inode(watch));
1324 list = &audit_inode_hash[h]; 899 list = &audit_inode_hash[h];
1325 } 900 }
1326 if (tree) { 901 if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
1358#endif 933#endif
1359 mutex_unlock(&audit_filter_mutex); 934 mutex_unlock(&audit_filter_mutex);
1360 935
1361 audit_put_nd(ndp, ndw); /* NULL args OK */
1362 return 0; 936 return 0;
1363 937
1364error: 938error:
1365 audit_put_nd(ndp, ndw); /* NULL args OK */
1366 if (watch) 939 if (watch)
1367 audit_put_watch(watch); /* tmp watch, matches initial get */ 940 audit_put_watch(watch); /* tmp watch, matches initial get */
1368 return err; 941 return err;
@@ -1372,7 +945,7 @@ error:
1372static inline int audit_del_rule(struct audit_entry *entry) 945static inline int audit_del_rule(struct audit_entry *entry)
1373{ 946{
1374 struct audit_entry *e; 947 struct audit_entry *e;
1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 948 struct audit_watch *watch = entry->rule.watch;
1376 struct audit_tree *tree = entry->rule.tree; 949 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list; 950 struct list_head *list;
1378 LIST_HEAD(inotify_list); 951 LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1394 goto out; 967 goto out;
1395 } 968 }
1396 969
1397 watch = e->rule.watch; 970 if (e->rule.watch)
1398 if (watch) { 971 audit_remove_watch_rule(&e->rule, &inotify_list);
1399 struct audit_parent *parent = watch->parent;
1400
1401 list_del(&e->rule.rlist);
1402
1403 if (list_empty(&watch->rules)) {
1404 audit_remove_watch(watch);
1405
1406 if (list_empty(&parent->watches)) {
1407 /* Put parent on the inotify un-registration
1408 * list. Grab a reference before releasing
1409 * audit_filter_mutex, to be released in
1410 * audit_inotify_unregister().
1411 * If filesystem is going away, just leave
1412 * the sucker alone, eviction will take
1413 * care of it.
1414 */
1415 if (pin_inotify_watch(&parent->wdata))
1416 list_add(&parent->ilist, &inotify_list);
1417 }
1418 }
1419 }
1420 972
1421 if (e->rule.tree) 973 if (e->rule.tree)
1422 audit_remove_tree_rule(&e->rule); 974 audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1438 audit_inotify_unregister(&inotify_list); 990 audit_inotify_unregister(&inotify_list);
1439 991
1440out: 992out:
1441 if (tmp_watch) 993 if (watch)
1442 audit_put_watch(tmp_watch); /* match initial get */ 994 audit_put_watch(watch); /* match initial get */
1443 if (tree) 995 if (tree)
1444 audit_put_tree(tree); /* that's the temporary one */ 996 audit_put_tree(tree); /* that's the temporary one */
1445 997
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1527 security_release_secctx(ctx, len); 1079 security_release_secctx(ctx, len);
1528 } 1080 }
1529 } 1081 }
1530 audit_log_format(ab, " op=%s rule key=", action); 1082 audit_log_format(ab, " op=");
1531 if (rule->filterkey) 1083 audit_log_string(ab, action);
1532 audit_log_untrustedstring(ab, rule->filterkey); 1084 audit_log_key(ab, rule->filterkey);
1533 else
1534 audit_log_format(ab, "(null)");
1535 audit_log_format(ab, " list=%d res=%d", rule->listnr, res); 1085 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1536 audit_log_end(ab); 1086 audit_log_end(ab);
1537} 1087}
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1595 return PTR_ERR(entry); 1145 return PTR_ERR(entry);
1596 1146
1597 err = audit_add_rule(entry); 1147 err = audit_add_rule(entry);
1598 audit_log_rule_change(loginuid, sessionid, sid, "add", 1148 audit_log_rule_change(loginuid, sessionid, sid, "add rule",
1599 &entry->rule, !err); 1149 &entry->rule, !err);
1600 1150
1601 if (err) 1151 if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1611 return PTR_ERR(entry); 1161 return PTR_ERR(entry);
1612 1162
1613 err = audit_del_rule(entry); 1163 err = audit_del_rule(entry);
1614 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1164 audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
1615 &entry->rule, !err); 1165 &entry->rule, !err);
1616 1166
1617 audit_free_rule(entry); 1167 audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
1793 list_del(&r->list); 1343 list_del(&r->list);
1794 } else { 1344 } else {
1795 if (watch) { 1345 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules); 1346 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1797 list_del(&r->rlist); 1347 list_del(&r->rlist);
1798 } else if (tree) 1348 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist); 1349 list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
1829 1379
1830 return err; 1380 return err;
1831} 1381}
1832
1833/* Update watch data in audit rules based on inotify events. */
1834void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1835 u32 cookie, const char *dname, struct inode *inode)
1836{
1837 struct audit_parent *parent;
1838
1839 parent = container_of(i_watch, struct audit_parent, wdata);
1840
1841 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1842 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1843 inode->i_ino, 0);
1844 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1845 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1846 /* inotify automatically removes the watch and sends IN_IGNORED */
1847 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1848 audit_remove_parent_watches(parent);
1849 /* inotify does not remove the watch, so remove it manually */
1850 else if(mask & IN_MOVE_SELF) {
1851 audit_remove_parent_watches(parent);
1852 inotify_remove_watch_locked(audit_ih, i_watch);
1853 } else if (mask & IN_IGNORED)
1854 put_inotify_watch(i_watch);
1855}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f414..68d3c6a0ecd6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count; 201 int tree_count;
202 struct list_head killed_trees;
202 203
203 int type; 204 int type;
204 union { 205 union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
548 } 549 }
549 break; 550 break;
550 case AUDIT_WATCH: 551 case AUDIT_WATCH:
551 if (name && rule->watch->ino != (unsigned long)-1) 552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
552 result = (name->dev == rule->watch->dev && 553 result = (name->dev == audit_watch_dev(rule->watch) &&
553 name->ino == rule->watch->ino); 554 name->ino == audit_watch_inode(rule->watch));
554 break; 555 break;
555 case AUDIT_DIR: 556 case AUDIT_DIR:
556 if (ctx) 557 if (ctx)
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
853 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) 854 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
854 return NULL; 855 return NULL;
855 audit_zero_context(context, state); 856 audit_zero_context(context, state);
857 INIT_LIST_HEAD(&context->killed_trees);
856 return context; 858 return context;
857} 859}
858 860
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1024{ 1026{
1025 char arg_num_len_buf[12]; 1027 char arg_num_len_buf[12];
1026 const char __user *tmp_p = p; 1028 const char __user *tmp_p = p;
1027 /* how many digits are in arg_num? 3 is the length of " a=" */ 1029 /* how many digits are in arg_num? 5 is the length of ' a=""' */
1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1030 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
1029 size_t len, len_left, to_send; 1031 size_t len, len_left, to_send;
1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1032 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
1031 unsigned int i, has_cntl = 0, too_long = 0; 1033 unsigned int i, has_cntl = 0, too_long = 0;
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1137 if (has_cntl) 1139 if (has_cntl)
1138 audit_log_n_hex(*ab, buf, to_send); 1140 audit_log_n_hex(*ab, buf, to_send);
1139 else 1141 else
1140 audit_log_format(*ab, "\"%s\"", buf); 1142 audit_log_string(*ab, buf);
1141 1143
1142 p += to_send; 1144 p += to_send;
1143 len_left -= to_send; 1145 len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1372 1374
1373 1375
1374 audit_log_task_info(ab, tsk); 1376 audit_log_task_info(ab, tsk);
1375 if (context->filterkey) { 1377 audit_log_key(ab, context->filterkey);
1376 audit_log_format(ab, " key=");
1377 audit_log_untrustedstring(ab, context->filterkey);
1378 } else
1379 audit_log_format(ab, " key=(null)");
1380 audit_log_end(ab); 1378 audit_log_end(ab);
1381 1379
1382 for (aux = context->aux; aux; aux = aux->next) { 1380 for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
1549 /* that can happen only if we are called from do_exit() */ 1547 /* that can happen only if we are called from do_exit() */
1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1548 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1551 audit_log_exit(context, tsk); 1549 audit_log_exit(context, tsk);
1550 if (!list_empty(&context->killed_trees))
1551 audit_kill_trees(&context->killed_trees);
1552 1552
1553 audit_free_context(context); 1553 audit_free_context(context);
1554} 1554}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
1692 context->in_syscall = 0; 1692 context->in_syscall = 0;
1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1694 1694
1695 if (!list_empty(&context->killed_trees))
1696 audit_kill_trees(&context->killed_trees);
1697
1695 if (context->previous) { 1698 if (context->previous) {
1696 struct audit_context *new_context = context->previous; 1699 struct audit_context *new_context = context->previous;
1697 context->previous = NULL; 1700 context->previous = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
2525 audit_log_format(ab, " sig=%ld", signr); 2528 audit_log_format(ab, " sig=%ld", signr);
2526 audit_log_end(ab); 2529 audit_log_end(ab);
2527} 2530}
2531
2532struct list_head *audit_killed_trees(void)
2533{
2534 struct audit_context *ctx = current->audit_context;
2535 if (likely(!ctx || !ctx->in_syscall))
2536 return NULL;
2537 return &ctx->killed_trees;
2538}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3737a682cdf5..b6eadfe30e7b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h> 49#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h>
50 51
51#include <asm/atomic.h> 52#include <asm/atomic.h>
52 53
@@ -734,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
734 * reference to css->refcnt. In general, this refcnt is expected to goes down 735 * reference to css->refcnt. In general, this refcnt is expected to goes down
735 * to zero, soon. 736 * to zero, soon.
736 * 737 *
737 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; 738 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
738 */ 739 */
739DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 740DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
740 741
741static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) 742static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
742{ 743{
743 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 744 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
744 wake_up_all(&cgroup_rmdir_waitq); 745 wake_up_all(&cgroup_rmdir_waitq);
745} 746}
746 747
748void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
749{
750 css_get(css);
751}
752
753void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
754{
755 cgroup_wakeup_rmdir_waiter(css->cgroup);
756 css_put(css);
757}
758
759
747static int rebind_subsystems(struct cgroupfs_root *root, 760static int rebind_subsystems(struct cgroupfs_root *root,
748 unsigned long final_bits) 761 unsigned long final_bits)
749{ 762{
@@ -960,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
960 INIT_LIST_HEAD(&cgrp->children); 973 INIT_LIST_HEAD(&cgrp->children);
961 INIT_LIST_HEAD(&cgrp->css_sets); 974 INIT_LIST_HEAD(&cgrp->css_sets);
962 INIT_LIST_HEAD(&cgrp->release_list); 975 INIT_LIST_HEAD(&cgrp->release_list);
976 INIT_LIST_HEAD(&cgrp->pids_list);
963 init_rwsem(&cgrp->pids_mutex); 977 init_rwsem(&cgrp->pids_mutex);
964} 978}
965static void init_cgroup_root(struct cgroupfs_root *root) 979static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1357,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1357 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1371 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1358 * is no longer empty. 1372 * is no longer empty.
1359 */ 1373 */
1360 cgroup_wakeup_rmdir_waiters(cgrp); 1374 cgroup_wakeup_rmdir_waiter(cgrp);
1361 return 0; 1375 return 0;
1362} 1376}
1363 1377
@@ -2201,12 +2215,30 @@ err:
2201 return ret; 2215 return ret;
2202} 2216}
2203 2217
2218/*
2219 * Cache pids for all threads in the same pid namespace that are
2220 * opening the same "tasks" file.
2221 */
2222struct cgroup_pids {
2223 /* The node in cgrp->pids_list */
2224 struct list_head list;
2225 /* The cgroup those pids belong to */
2226 struct cgroup *cgrp;
2228 /* The namespace those pids belong to */
2228 struct pid_namespace *ns;
2229 /* Array of process ids in the cgroup */
2230 pid_t *tasks_pids;
2232 /* How many files are using this tasks_pids array */
2232 int use_count;
2233 /* Length of the current tasks_pids array */
2234 int length;
2235};
2236
2204static int cmppid(const void *a, const void *b) 2237static int cmppid(const void *a, const void *b)
2205{ 2238{
2206 return *(pid_t *)a - *(pid_t *)b; 2239 return *(pid_t *)a - *(pid_t *)b;
2207} 2240}
2208 2241
2209
2210/* 2242/*
2211 * seq_file methods for the "tasks" file. The seq_file position is the 2243 * seq_file methods for the "tasks" file. The seq_file position is the
2212 * next pid to display; the seq_file iterator is a pointer to the pid 2244 * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2221,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2221 * after a seek to the start). Use a binary-search to find the 2253 * after a seek to the start). Use a binary-search to find the
2222 * next pid to display, if any 2254 * next pid to display, if any
2223 */ 2255 */
2224 struct cgroup *cgrp = s->private; 2256 struct cgroup_pids *cp = s->private;
2257 struct cgroup *cgrp = cp->cgrp;
2225 int index = 0, pid = *pos; 2258 int index = 0, pid = *pos;
2226 int *iter; 2259 int *iter;
2227 2260
2228 down_read(&cgrp->pids_mutex); 2261 down_read(&cgrp->pids_mutex);
2229 if (pid) { 2262 if (pid) {
2230 int end = cgrp->pids_length; 2263 int end = cp->length;
2231 2264
2232 while (index < end) { 2265 while (index < end) {
2233 int mid = (index + end) / 2; 2266 int mid = (index + end) / 2;
2234 if (cgrp->tasks_pids[mid] == pid) { 2267 if (cp->tasks_pids[mid] == pid) {
2235 index = mid; 2268 index = mid;
2236 break; 2269 break;
2237 } else if (cgrp->tasks_pids[mid] <= pid) 2270 } else if (cp->tasks_pids[mid] <= pid)
2238 index = mid + 1; 2271 index = mid + 1;
2239 else 2272 else
2240 end = mid; 2273 end = mid;
2241 } 2274 }
2242 } 2275 }
2243 /* If we're off the end of the array, we're done */ 2276 /* If we're off the end of the array, we're done */
2244 if (index >= cgrp->pids_length) 2277 if (index >= cp->length)
2245 return NULL; 2278 return NULL;
2246 /* Update the abstract position to be the actual pid that we found */ 2279 /* Update the abstract position to be the actual pid that we found */
2247 iter = cgrp->tasks_pids + index; 2280 iter = cp->tasks_pids + index;
2248 *pos = *iter; 2281 *pos = *iter;
2249 return iter; 2282 return iter;
2250} 2283}
2251 2284
2252static void cgroup_tasks_stop(struct seq_file *s, void *v) 2285static void cgroup_tasks_stop(struct seq_file *s, void *v)
2253{ 2286{
2254 struct cgroup *cgrp = s->private; 2287 struct cgroup_pids *cp = s->private;
2288 struct cgroup *cgrp = cp->cgrp;
2255 up_read(&cgrp->pids_mutex); 2289 up_read(&cgrp->pids_mutex);
2256} 2290}
2257 2291
2258static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2292static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2259{ 2293{
2260 struct cgroup *cgrp = s->private; 2294 struct cgroup_pids *cp = s->private;
2261 int *p = v; 2295 int *p = v;
2262 int *end = cgrp->tasks_pids + cgrp->pids_length; 2296 int *end = cp->tasks_pids + cp->length;
2263 2297
2264 /* 2298 /*
2265 * Advance to the next pid in the array. If this goes off the 2299 * Advance to the next pid in the array. If this goes off the
@@ -2286,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
2286 .show = cgroup_tasks_show, 2320 .show = cgroup_tasks_show,
2287}; 2321};
2288 2322
2289static void release_cgroup_pid_array(struct cgroup *cgrp) 2323static void release_cgroup_pid_array(struct cgroup_pids *cp)
2290{ 2324{
2325 struct cgroup *cgrp = cp->cgrp;
2326
2291 down_write(&cgrp->pids_mutex); 2327 down_write(&cgrp->pids_mutex);
2292 BUG_ON(!cgrp->pids_use_count); 2328 BUG_ON(!cp->use_count);
2293 if (!--cgrp->pids_use_count) { 2329 if (!--cp->use_count) {
2294 kfree(cgrp->tasks_pids); 2330 list_del(&cp->list);
2295 cgrp->tasks_pids = NULL; 2331 put_pid_ns(cp->ns);
2296 cgrp->pids_length = 0; 2332 kfree(cp->tasks_pids);
2333 kfree(cp);
2297 } 2334 }
2298 up_write(&cgrp->pids_mutex); 2335 up_write(&cgrp->pids_mutex);
2299} 2336}
2300 2337
2301static int cgroup_tasks_release(struct inode *inode, struct file *file) 2338static int cgroup_tasks_release(struct inode *inode, struct file *file)
2302{ 2339{
2303 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2340 struct seq_file *seq;
2341 struct cgroup_pids *cp;
2304 2342
2305 if (!(file->f_mode & FMODE_READ)) 2343 if (!(file->f_mode & FMODE_READ))
2306 return 0; 2344 return 0;
2307 2345
2308 release_cgroup_pid_array(cgrp); 2346 seq = file->private_data;
2347 cp = seq->private;
2348
2349 release_cgroup_pid_array(cp);
2309 return seq_release(inode, file); 2350 return seq_release(inode, file);
2310} 2351}
2311 2352
@@ -2324,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {
2324static int cgroup_tasks_open(struct inode *unused, struct file *file) 2365static int cgroup_tasks_open(struct inode *unused, struct file *file)
2325{ 2366{
2326 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2367 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2368 struct pid_namespace *ns = current->nsproxy->pid_ns;
2369 struct cgroup_pids *cp;
2327 pid_t *pidarray; 2370 pid_t *pidarray;
2328 int npids; 2371 int npids;
2329 int retval; 2372 int retval;
@@ -2350,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2350 * array if necessary 2393 * array if necessary
2351 */ 2394 */
2352 down_write(&cgrp->pids_mutex); 2395 down_write(&cgrp->pids_mutex);
2353 kfree(cgrp->tasks_pids); 2396
2354 cgrp->tasks_pids = pidarray; 2397 list_for_each_entry(cp, &cgrp->pids_list, list) {
2355 cgrp->pids_length = npids; 2398 if (ns == cp->ns)
2356 cgrp->pids_use_count++; 2399 goto found;
2400 }
2401
2402 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2403 if (!cp) {
2404 up_write(&cgrp->pids_mutex);
2405 kfree(pidarray);
2406 return -ENOMEM;
2407 }
2408 cp->cgrp = cgrp;
2409 cp->ns = ns;
2410 get_pid_ns(ns);
2411 list_add(&cp->list, &cgrp->pids_list);
2412found:
2413 kfree(cp->tasks_pids);
2414 cp->tasks_pids = pidarray;
2415 cp->length = npids;
2416 cp->use_count++;
2357 up_write(&cgrp->pids_mutex); 2417 up_write(&cgrp->pids_mutex);
2358 2418
2359 file->f_op = &cgroup_tasks_operations; 2419 file->f_op = &cgroup_tasks_operations;
2360 2420
2361 retval = seq_open(file, &cgroup_tasks_seq_operations); 2421 retval = seq_open(file, &cgroup_tasks_seq_operations);
2362 if (retval) { 2422 if (retval) {
2363 release_cgroup_pid_array(cgrp); 2423 release_cgroup_pid_array(cp);
2364 return retval; 2424 return retval;
2365 } 2425 }
2366 ((struct seq_file *)file->private_data)->private = cgrp; 2426 ((struct seq_file *)file->private_data)->private = cp;
2367 return 0; 2427 return 0;
2368} 2428}
2369 2429
@@ -2696,33 +2756,42 @@ again:
2696 mutex_unlock(&cgroup_mutex); 2756 mutex_unlock(&cgroup_mutex);
2697 2757
2698 /* 2758 /*
2759 * In general, subsystem has no css->refcnt after pre_destroy(). But
2760 * in racy cases, subsystem may have to get css->refcnt after
2761 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
2762 * makes rmdir return -EBUSY too often. To avoid that, we use waitqueue
2763 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
2764 * and subsystem's reference count handling. Please see css_get/put
2765 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
2766 */
2767 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2768
2769 /*
2699 * Call pre_destroy handlers of subsys. Notify subsystems 2770 * Call pre_destroy handlers of subsys. Notify subsystems
2700 * that rmdir() request comes. 2771 * that rmdir() request comes.
2701 */ 2772 */
2702 ret = cgroup_call_pre_destroy(cgrp); 2773 ret = cgroup_call_pre_destroy(cgrp);
2703 if (ret) 2774 if (ret) {
2775 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2704 return ret; 2776 return ret;
2777 }
2705 2778
2706 mutex_lock(&cgroup_mutex); 2779 mutex_lock(&cgroup_mutex);
2707 parent = cgrp->parent; 2780 parent = cgrp->parent;
2708 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 2781 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2782 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2709 mutex_unlock(&cgroup_mutex); 2783 mutex_unlock(&cgroup_mutex);
2710 return -EBUSY; 2784 return -EBUSY;
2711 } 2785 }
2712 /*
2713 * css_put/get is provided for subsys to grab refcnt to css. In typical
2714 * case, subsystem has no reference after pre_destroy(). But, under
2715 * hierarchy management, some *temporal* refcnt can be hold.
2716 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2717 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2718 * is called when css_put() is called and refcnt goes down to 0.
2719 */
2720 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2721 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 2786 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2722
2723 if (!cgroup_clear_css_refs(cgrp)) { 2787 if (!cgroup_clear_css_refs(cgrp)) {
2724 mutex_unlock(&cgroup_mutex); 2788 mutex_unlock(&cgroup_mutex);
2725 schedule(); 2789 /*
2790 * Because someone may call cgroup_wakeup_rmdir_waiter() before
2791 * prepare_to_wait(), we need to check this flag.
2792 */
2793 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
2794 schedule();
2726 finish_wait(&cgroup_rmdir_waitq, &wait); 2795 finish_wait(&cgroup_rmdir_waitq, &wait);
2727 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 2796 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2728 if (signal_pending(current)) 2797 if (signal_pending(current))
@@ -3294,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
3294 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3363 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3295 check_for_release(cgrp); 3364 check_for_release(cgrp);
3296 } 3365 }
3297 cgroup_wakeup_rmdir_waiters(cgrp); 3366 cgroup_wakeup_rmdir_waiter(cgrp);
3298 } 3367 }
3299 rcu_read_unlock(); 3368 rcu_read_unlock();
3300} 3369}
diff --git a/kernel/exit.c b/kernel/exit.c
index 628d41f0dd54..869dc221733e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h> 15#include <linux/iocontext.h>
17#include <linux/key.h> 16#include <linux/key.h>
18#include <linux/security.h> 17#include <linux/security.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 467746b3f0aa..021e1138556e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/mnt_namespace.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/mempolicy.h> 21#include <linux/mempolicy.h>
23#include <linux/sem.h> 22#include <linux/sem.h>
@@ -427,6 +426,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
427 init_rwsem(&mm->mmap_sem); 426 init_rwsem(&mm->mmap_sem);
428 INIT_LIST_HEAD(&mm->mmlist); 427 INIT_LIST_HEAD(&mm->mmlist);
429 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; 428 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
429 mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0;
430 mm->core_state = NULL; 430 mm->core_state = NULL;
431 mm->nr_ptes = 0; 431 mm->nr_ptes = 0;
432 set_mm_counter(mm, file_rss, 0); 432 set_mm_counter(mm, file_rss, 0);
@@ -568,18 +568,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
568 * the value intact in a core dump, and to save the unnecessary 568 * the value intact in a core dump, and to save the unnecessary
569 * trouble otherwise. Userland only wants this done for a sys_exit. 569 * trouble otherwise. Userland only wants this done for a sys_exit.
570 */ 570 */
571 if (tsk->clear_child_tid 571 if (tsk->clear_child_tid) {
572 && !(tsk->flags & PF_SIGNALED) 572 if (!(tsk->flags & PF_SIGNALED) &&
573 && atomic_read(&mm->mm_users) > 1) { 573 atomic_read(&mm->mm_users) > 1) {
574 u32 __user * tidptr = tsk->clear_child_tid; 574 /*
575 * We don't check the error code - if userspace has
576 * not set up a proper pointer then tough luck.
577 */
578 put_user(0, tsk->clear_child_tid);
579 sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
580 1, NULL, NULL, 0);
581 }
575 tsk->clear_child_tid = NULL; 582 tsk->clear_child_tid = NULL;
576
577 /*
578 * We don't check the error code - if userspace has
579 * not set up a proper pointer then tough luck.
580 */
581 put_user(0, tidptr);
582 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
583 } 583 }
584} 584}
585 585
@@ -1269,6 +1269,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1269 write_unlock_irq(&tasklist_lock); 1269 write_unlock_irq(&tasklist_lock);
1270 proc_fork_connector(p); 1270 proc_fork_connector(p);
1271 cgroup_post_fork(p); 1271 cgroup_post_fork(p);
1272 perf_counter_fork(p);
1272 return p; 1273 return p;
1273 1274
1274bad_fork_free_pid: 1275bad_fork_free_pid:
@@ -1408,12 +1409,6 @@ long do_fork(unsigned long clone_flags,
1408 if (clone_flags & CLONE_VFORK) { 1409 if (clone_flags & CLONE_VFORK) {
1409 p->vfork_done = &vfork; 1410 p->vfork_done = &vfork;
1410 init_completion(&vfork); 1411 init_completion(&vfork);
1411 } else if (!(clone_flags & CLONE_VM)) {
1412 /*
1413 * vfork will do an exec which will call
1414 * set_task_comm()
1415 */
1416 perf_counter_fork(p);
1417 } 1412 }
1418 1413
1419 audit_finish_fork(p); 1414 audit_finish_fork(p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf7083..bd1d42b17cb2 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
44 recalc_sigpending(); /* We sent fake signal, clean it up */ 44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock); 45 spin_unlock_irq(&current->sighand->siglock);
46 46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
47 for (;;) { 50 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE); 51 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current)) 52 if (!frozen(current))
50 break; 53 break;
51 schedule(); 54 schedule();
52 } 55 }
56
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
53 pr_debug("%s left refrigerator\n", current->comm); 60 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save); 61 __set_current_state(save);
55} 62}
diff --git a/kernel/futex.c b/kernel/futex.c
index 80b5ce716596..e18cfbdc7190 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -247,6 +247,7 @@ again:
247 if (err < 0) 247 if (err < 0)
248 return err; 248 return err;
249 249
250 page = compound_head(page);
250 lock_page(page); 251 lock_page(page);
251 if (!page->mapping) { 252 if (!page->mapping) {
252 unlock_page(page); 253 unlock_page(page);
@@ -284,6 +285,25 @@ void put_futex_key(int fshared, union futex_key *key)
284 drop_futex_key_refs(key); 285 drop_futex_key_refs(key);
285} 286}
286 287
288/*
289 * fault_in_user_writeable - fault in user address and verify RW access
290 * @uaddr: pointer to faulting user space address
291 *
292 * Slow path to fixup the fault we just took in the atomic write
293 * access to @uaddr.
294 *
295 * We have no generic implementation of a non destructive write to the
296 * user address. We know that we faulted in the atomic pagefault
297 * disabled section so we can as well avoid the #PF overhead by
298 * calling get_user_pages() right away.
299 */
300static int fault_in_user_writeable(u32 __user *uaddr)
301{
302 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
303 1, 1, 0, NULL, NULL);
304 return ret < 0 ? ret : 0;
305}
306
287/** 307/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex 308 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in 309 * @hb: the hash bucket the futex_q's reside in
@@ -896,7 +916,6 @@ retry:
896retry_private: 916retry_private:
897 op_ret = futex_atomic_op_inuser(op, uaddr2); 917 op_ret = futex_atomic_op_inuser(op, uaddr2);
898 if (unlikely(op_ret < 0)) { 918 if (unlikely(op_ret < 0)) {
899 u32 dummy;
900 919
901 double_unlock_hb(hb1, hb2); 920 double_unlock_hb(hb1, hb2);
902 921
@@ -914,7 +933,7 @@ retry_private:
914 goto out_put_keys; 933 goto out_put_keys;
915 } 934 }
916 935
917 ret = get_user(dummy, uaddr2); 936 ret = fault_in_user_writeable(uaddr2);
918 if (ret) 937 if (ret)
919 goto out_put_keys; 938 goto out_put_keys;
920 939
@@ -991,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
991 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1010 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
992 * q: the futex_q 1011 * q: the futex_q
993 * key: the key of the requeue target futex 1012 * key: the key of the requeue target futex
1013 * hb: the hash_bucket of the requeue target futex
994 * 1014 *
995 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1015 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
996 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1016 * target futex if it is uncontended or via a lock steal. Set the futex_q key
997 * to the requeue target futex so the waiter can detect the wakeup on the right 1017 * to the requeue target futex so the waiter can detect the wakeup on the right
998 * futex, but remove it from the hb and NULL the rt_waiter so it can detect 1018 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
999 * atomic lock acquisition. Must be called with the q->lock_ptr held. 1019 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1020 * to protect access to the pi_state to fixup the owner later. Must be called
1021 * with both q->lock_ptr and hb->lock held.
1000 */ 1022 */
1001static inline 1023static inline
1002void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) 1024void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1025 struct futex_hash_bucket *hb)
1003{ 1026{
1004 drop_futex_key_refs(&q->key); 1027 drop_futex_key_refs(&q->key);
1005 get_futex_key_refs(key); 1028 get_futex_key_refs(key);
@@ -1011,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
1011 WARN_ON(!q->rt_waiter); 1034 WARN_ON(!q->rt_waiter);
1012 q->rt_waiter = NULL; 1035 q->rt_waiter = NULL;
1013 1036
1037 q->lock_ptr = &hb->lock;
1038#ifdef CONFIG_DEBUG_PI_LIST
1039 q->list.plist.lock = &hb->lock;
1040#endif
1041
1014 wake_up_state(q->task, TASK_NORMAL); 1042 wake_up_state(q->task, TASK_NORMAL);
1015} 1043}
1016 1044
@@ -1069,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1069 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1097 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1070 set_waiters); 1098 set_waiters);
1071 if (ret == 1) 1099 if (ret == 1)
1072 requeue_pi_wake_futex(top_waiter, key2); 1100 requeue_pi_wake_futex(top_waiter, key2, hb2);
1073 1101
1074 return ret; 1102 return ret;
1075} 1103}
@@ -1204,7 +1232,7 @@ retry_private:
1204 double_unlock_hb(hb1, hb2); 1232 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2); 1233 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1); 1234 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2); 1235 ret = fault_in_user_writeable(uaddr2);
1208 if (!ret) 1236 if (!ret)
1209 goto retry; 1237 goto retry;
1210 goto out; 1238 goto out;
@@ -1228,8 +1256,15 @@ retry_private:
1228 if (!match_futex(&this->key, &key1)) 1256 if (!match_futex(&this->key, &key1))
1229 continue; 1257 continue;
1230 1258
1231 WARN_ON(!requeue_pi && this->rt_waiter); 1259 /*
1232 WARN_ON(requeue_pi && !this->rt_waiter); 1260 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1261 * be paired with each other and no other futex ops.
1262 */
1263 if ((requeue_pi && !this->rt_waiter) ||
1264 (!requeue_pi && this->rt_waiter)) {
1265 ret = -EINVAL;
1266 break;
1267 }
1233 1268
1234 /* 1269 /*
1235 * Wake nr_wake waiters. For requeue_pi, if we acquired the 1270 * Wake nr_wake waiters. For requeue_pi, if we acquired the
@@ -1254,7 +1289,7 @@ retry_private:
1254 this->task, 1); 1289 this->task, 1);
1255 if (ret == 1) { 1290 if (ret == 1) {
1256 /* We got the lock. */ 1291 /* We got the lock. */
1257 requeue_pi_wake_futex(this, &key2); 1292 requeue_pi_wake_futex(this, &key2, hb2);
1258 continue; 1293 continue;
1259 } else if (ret) { 1294 } else if (ret) {
1260 /* -EDEADLK */ 1295 /* -EDEADLK */
@@ -1482,7 +1517,7 @@ retry:
1482handle_fault: 1517handle_fault:
1483 spin_unlock(q->lock_ptr); 1518 spin_unlock(q->lock_ptr);
1484 1519
1485 ret = get_user(uval, uaddr); 1520 ret = fault_in_user_writeable(uaddr);
1486 1521
1487 spin_lock(q->lock_ptr); 1522 spin_lock(q->lock_ptr);
1488 1523
@@ -1807,7 +1842,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1807{ 1842{
1808 struct hrtimer_sleeper timeout, *to = NULL; 1843 struct hrtimer_sleeper timeout, *to = NULL;
1809 struct futex_hash_bucket *hb; 1844 struct futex_hash_bucket *hb;
1810 u32 uval;
1811 struct futex_q q; 1845 struct futex_q q;
1812 int res, ret; 1846 int res, ret;
1813 1847
@@ -1909,16 +1943,9 @@ out:
1909 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1943 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1910 1944
1911uaddr_faulted: 1945uaddr_faulted:
1912 /*
1913 * We have to r/w *(int __user *)uaddr, and we have to modify it
1914 * atomically. Therefore, if we continue to fault after get_user()
1915 * below, we need to handle the fault ourselves, while still holding
1916 * the mmap_sem. This can occur if the uaddr is under contention as
1917 * we have to drop the mmap_sem in order to call get_user().
1918 */
1919 queue_unlock(&q, hb); 1946 queue_unlock(&q, hb);
1920 1947
1921 ret = get_user(uval, uaddr); 1948 ret = fault_in_user_writeable(uaddr);
1922 if (ret) 1949 if (ret)
1923 goto out_put_key; 1950 goto out_put_key;
1924 1951
@@ -2013,17 +2040,10 @@ out:
2013 return ret; 2040 return ret;
2014 2041
2015pi_faulted: 2042pi_faulted:
2016 /*
2017 * We have to r/w *(int __user *)uaddr, and we have to modify it
2018 * atomically. Therefore, if we continue to fault after get_user()
2019 * below, we need to handle the fault ourselves, while still holding
2020 * the mmap_sem. This can occur if the uaddr is under contention as
2021 * we have to drop the mmap_sem in order to call get_user().
2022 */
2023 spin_unlock(&hb->lock); 2043 spin_unlock(&hb->lock);
2024 put_futex_key(fshared, &key); 2044 put_futex_key(fshared, &key);
2025 2045
2026 ret = get_user(uval, uaddr); 2046 ret = fault_in_user_writeable(uaddr);
2027 if (!ret) 2047 if (!ret)
2028 goto retry; 2048 goto retry;
2029 2049
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee29..235716556bf1 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
180 int cmd = op & FUTEX_CMD_MASK; 180 int cmd = op & FUTEX_CMD_MASK;
181 181
182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
183 cmd == FUTEX_WAIT_BITSET)) { 183 cmd == FUTEX_WAIT_BITSET ||
184 cmd == FUTEX_WAIT_REQUEUE_PI)) {
184 if (get_compat_timespec(&ts, utime)) 185 if (get_compat_timespec(&ts, utime))
185 return -EFAULT; 186 return -EFAULT;
186 if (!timespec_valid(&ts)) 187 if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
191 t = ktime_add_safe(ktime_get(), t); 192 t = ktime_add_safe(ktime_get(), t);
192 tp = &t; 193 tp = &t;
193 } 194 }
194 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) 195 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
196 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
195 val2 = (int) (unsigned long) utime; 197 val2 = (int) (unsigned long) utime;
196 198
197 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 199 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9002958a96e7..49da79ab8486 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
191 } 191 }
192} 192}
193 193
194
195/*
196 * Get the preferred target CPU for NOHZ
197 */
198static int hrtimer_get_target(int this_cpu, int pinned)
199{
200#ifdef CONFIG_NO_HZ
201 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
202 int preferred_cpu = get_nohz_load_balancer();
203
204 if (preferred_cpu >= 0)
205 return preferred_cpu;
206 }
207#endif
208 return this_cpu;
209}
210
211/*
212 * With HIGHRES=y we do not migrate the timer when it is expiring
213 * before the next event on the target cpu because we cannot reprogram
214 * the target cpu hardware and we would cause it to fire late.
215 *
216 * Called with cpu_base->lock of target cpu held.
217 */
218static int
219hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
220{
221#ifdef CONFIG_HIGH_RES_TIMERS
222 ktime_t expires;
223
224 if (!new_base->cpu_base->hres_active)
225 return 0;
226
227 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
228 return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
229#else
230 return 0;
231#endif
232}
233
194/* 234/*
195 * Switch the timer base to the current CPU when possible. 235 * Switch the timer base to the current CPU when possible.
196 */ 236 */
@@ -200,16 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
200{ 240{
201 struct hrtimer_clock_base *new_base; 241 struct hrtimer_clock_base *new_base;
202 struct hrtimer_cpu_base *new_cpu_base; 242 struct hrtimer_cpu_base *new_cpu_base;
203 int cpu, preferred_cpu = -1; 243 int this_cpu = smp_processor_id();
204 244 int cpu = hrtimer_get_target(this_cpu, pinned);
205 cpu = smp_processor_id();
206#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
207 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
208 preferred_cpu = get_nohz_load_balancer();
209 if (preferred_cpu >= 0)
210 cpu = preferred_cpu;
211 }
212#endif
213 245
214again: 246again:
215 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 247 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -217,7 +249,7 @@ again:
217 249
218 if (base != new_base) { 250 if (base != new_base) {
219 /* 251 /*
220 * We are trying to schedule the timer on the local CPU. 252 * We are trying to move timer to new_base.
221 * However we can't change timer's base while it is running, 253 * However we can't change timer's base while it is running,
222 * so we keep it on the same CPU. No hassle vs. reprogramming 254 * so we keep it on the same CPU. No hassle vs. reprogramming
223 * the event source in the high resolution case. The softirq 255 * the event source in the high resolution case. The softirq
@@ -233,38 +265,12 @@ again:
233 spin_unlock(&base->cpu_base->lock); 265 spin_unlock(&base->cpu_base->lock);
234 spin_lock(&new_base->cpu_base->lock); 266 spin_lock(&new_base->cpu_base->lock);
235 267
236 /* Optimized away for NOHZ=n SMP=n */ 268 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
237 if (cpu == preferred_cpu) { 269 cpu = this_cpu;
238 /* Calculate clock monotonic expiry time */ 270 spin_unlock(&new_base->cpu_base->lock);
239#ifdef CONFIG_HIGH_RES_TIMERS 271 spin_lock(&base->cpu_base->lock);
240 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), 272 timer->base = base;
241 new_base->offset); 273 goto again;
242#else
243 ktime_t expires = hrtimer_get_expires(timer);
244#endif
245
246 /*
247 * Get the next event on target cpu from the
248 * clock events layer.
249 * This covers the highres=off nohz=on case as well.
250 */
251 ktime_t next = clockevents_get_next_event(cpu);
252
253 ktime_t delta = ktime_sub(expires, next);
254
255 /*
256 * We do not migrate the timer when it is expiring
257 * before the next event on the target cpu because
258 * we cannot reprogram the target cpu hardware and
259 * we would cause it to fire late.
260 */
261 if (delta.tv64 < 0) {
262 cpu = smp_processor_id();
263 spin_unlock(&new_base->cpu_base->lock);
264 spin_lock(&base->cpu_base->lock);
265 timer->base = base;
266 goto again;
267 }
268 } 274 }
269 timer->base = new_base; 275 timer->base = new_base;
270 } 276 }
@@ -1276,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1276 1282
1277 expires_next.tv64 = KTIME_MAX; 1283 expires_next.tv64 = KTIME_MAX;
1278 1284
1285 spin_lock(&cpu_base->lock);
1286 /*
1287 * We set expires_next to KTIME_MAX here with cpu_base->lock
1288 * held to prevent that a timer is enqueued in our queue via
1289 * the migration code. This does not affect enqueueing of
1290 * timers which run their callback and need to be requeued on
1291 * this CPU.
1292 */
1293 cpu_base->expires_next.tv64 = KTIME_MAX;
1294
1279 base = cpu_base->clock_base; 1295 base = cpu_base->clock_base;
1280 1296
1281 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1297 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1282 ktime_t basenow; 1298 ktime_t basenow;
1283 struct rb_node *node; 1299 struct rb_node *node;
1284 1300
1285 spin_lock(&cpu_base->lock);
1286
1287 basenow = ktime_add(now, base->offset); 1301 basenow = ktime_add(now, base->offset);
1288 1302
1289 while ((node = base->first)) { 1303 while ((node = base->first)) {
@@ -1316,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1316 1330
1317 __run_hrtimer(timer); 1331 __run_hrtimer(timer);
1318 } 1332 }
1319 spin_unlock(&cpu_base->lock);
1320 base++; 1333 base++;
1321 } 1334 }
1322 1335
1336 /*
1337 * Store the new expiry value so the migration code can verify
1338 * against it.
1339 */
1323 cpu_base->expires_next = expires_next; 1340 cpu_base->expires_next = expires_next;
1341 spin_unlock(&cpu_base->lock);
1324 1342
1325 /* Reprogramming necessary ? */ 1343 /* Reprogramming necessary ? */
1326 if (expires_next.tv64 != KTIME_MAX) { 1344 if (expires_next.tv64 != KTIME_MAX) {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 73468253143b..e70ed5592eb9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -42,8 +42,7 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void 45extern void irq_set_thread_affinity(struct irq_desc *desc);
46irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
47 46
48/* 47/*
49 * Debugging printout: 48 * Debugging printout:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 50da67672901..d222515a5a06 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83void 83/**
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84 * irq_set_thread_affinity - Notify irq threads to adjust affinity
85 * @desc: irq descriptor which has affinity changed
86 *
87 * We just set IRQTF_AFFINITY and delegate the affinity setting
88 * to the interrupt thread itself. We can not call
89 * set_cpus_allowed_ptr() here as we hold desc->lock and this
90 * code can be called from hard interrupt context.
91 */
92void irq_set_thread_affinity(struct irq_desc *desc)
85{ 93{
86 struct irqaction *action = desc->action; 94 struct irqaction *action = desc->action;
87 95
88 while (action) { 96 while (action) {
89 if (action->thread) 97 if (action->thread)
90 set_cpus_allowed_ptr(action->thread, cpumask); 98 set_bit(IRQTF_AFFINITY, &action->thread_flags);
91 action = action->next; 99 action = action->next;
92 } 100 }
93} 101}
@@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
112 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
113 if (!desc->chip->set_affinity(irq, cpumask)) { 121 if (!desc->chip->set_affinity(irq, cpumask)) {
114 cpumask_copy(desc->affinity, cpumask); 122 cpumask_copy(desc->affinity, cpumask);
115 irq_set_thread_affinity(desc, cpumask); 123 irq_set_thread_affinity(desc);
116 } 124 }
117 } 125 }
118 else { 126 else {
@@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
122#else 130#else
123 if (!desc->chip->set_affinity(irq, cpumask)) { 131 if (!desc->chip->set_affinity(irq, cpumask)) {
124 cpumask_copy(desc->affinity, cpumask); 132 cpumask_copy(desc->affinity, cpumask);
125 irq_set_thread_affinity(desc, cpumask); 133 irq_set_thread_affinity(desc);
126 } 134 }
127#endif 135#endif
128 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
@@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq)
176 spin_lock_irqsave(&desc->lock, flags); 184 spin_lock_irqsave(&desc->lock, flags);
177 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
178 if (!ret) 186 if (!ret)
179 irq_set_thread_affinity(desc, desc->affinity); 187 irq_set_thread_affinity(desc);
180 spin_unlock_irqrestore(&desc->lock, flags); 188 spin_unlock_irqrestore(&desc->lock, flags);
181 189
182 return ret; 190 return ret;
@@ -443,6 +451,39 @@ static int irq_wait_for_interrupt(struct irqaction *action)
443 return -1; 451 return -1;
444} 452}
445 453
454#ifdef CONFIG_SMP
455/*
456 * Check whether we need to change the affinity of the interrupt thread.
457 */
458static void
459irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
460{
461 cpumask_var_t mask;
462
463 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
464 return;
465
466 /*
467 * In case we are out of memory we set IRQTF_AFFINITY again and
468 * try again next time
469 */
470 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
471 set_bit(IRQTF_AFFINITY, &action->thread_flags);
472 return;
473 }
474
475 spin_lock_irq(&desc->lock);
476 cpumask_copy(mask, desc->affinity);
477 spin_unlock_irq(&desc->lock);
478
479 set_cpus_allowed_ptr(current, mask);
480 free_cpumask_var(mask);
481}
482#else
483static inline void
484irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
485#endif
486
446/* 487/*
447 * Interrupt handler thread 488 * Interrupt handler thread
448 */ 489 */
@@ -458,6 +499,8 @@ static int irq_thread(void *data)
458 499
459 while (!irq_wait_for_interrupt(action)) { 500 while (!irq_wait_for_interrupt(action)) {
460 501
502 irq_thread_check_affinity(desc, action);
503
461 atomic_inc(&desc->threads_active); 504 atomic_inc(&desc->threads_active);
462 505
463 spin_lock_irq(&desc->lock); 506 spin_lock_irq(&desc->lock);
@@ -718,7 +761,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
718{ 761{
719 struct irq_desc *desc = irq_to_desc(irq); 762 struct irq_desc *desc = irq_to_desc(irq);
720 struct irqaction *action, **action_ptr; 763 struct irqaction *action, **action_ptr;
721 struct task_struct *irqthread;
722 unsigned long flags; 764 unsigned long flags;
723 765
724 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); 766 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -766,9 +808,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
766 desc->chip->disable(irq); 808 desc->chip->disable(irq);
767 } 809 }
768 810
769 irqthread = action->thread;
770 action->thread = NULL;
771
772 spin_unlock_irqrestore(&desc->lock, flags); 811 spin_unlock_irqrestore(&desc->lock, flags);
773 812
774 unregister_handler_proc(irq, action); 813 unregister_handler_proc(irq, action);
@@ -776,12 +815,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
776 /* Make sure it's not being used on another CPU: */ 815 /* Make sure it's not being used on another CPU: */
777 synchronize_irq(irq); 816 synchronize_irq(irq);
778 817
779 if (irqthread) {
780 if (!test_bit(IRQTF_DIED, &action->thread_flags))
781 kthread_stop(irqthread);
782 put_task_struct(irqthread);
783 }
784
785#ifdef CONFIG_DEBUG_SHIRQ 818#ifdef CONFIG_DEBUG_SHIRQ
786 /* 819 /*
787 * It's a shared IRQ -- the driver ought to be prepared for an IRQ 820 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -797,6 +830,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
797 local_irq_restore(flags); 830 local_irq_restore(flags);
798 } 831 }
799#endif 832#endif
833
834 if (action->thread) {
835 if (!test_bit(IRQTF_DIED, &action->thread_flags))
836 kthread_stop(action->thread);
837 put_task_struct(action->thread);
838 }
839
800 return action; 840 return action;
801} 841}
802 842
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index cfe767ca1545..fcb6c96f2627 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -45,7 +45,7 @@ void move_masked_irq(int irq)
45 < nr_cpu_ids)) 45 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
47 cpumask_copy(desc->affinity, desc->pending_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc, desc->pending_mask); 48 irq_set_thread_affinity(desc);
49 } 49 }
50 50
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 2f69bee57bf2..3fd30197da2e 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -107,8 +107,8 @@ out_unlock:
107 107
108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
109{ 109{
110 /* those all static, do move them */ 110 /* those static or target node is -1, do not move them */
111 if (desc->irq < NR_IRQS_LEGACY) 111 if (desc->irq < NR_IRQS_LEGACY || node == -1)
112 return desc; 112 return desc;
113 113
114 if (desc->node != node) 114 if (desc->node != node)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ae1c35201cc8..f336e2107f98 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1228 } while (*cur++ == ','); 1228 } while (*cur++ == ',');
1229 1229
1230 if (*crash_size > 0) { 1230 if (*crash_size > 0) {
1231 while (*cur != ' ' && *cur != '@') 1231 while (*cur && *cur != ' ' && *cur != '@')
1232 cur++; 1232 cur++;
1233 if (*cur == '@') { 1233 if (*cur == '@') {
1234 cur++; 1234 cur++;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bfc..385c31a1bdbf 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 27#include <linux/completion.h>
29#include <linux/file.h> 28#include <linux/file.h>
30#include <linux/fdtable.h> 29#include <linux/fdtable.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c0fa54b276d9..0540948e29ab 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void)
237{ 237{
238 struct kprobe_insn_page *kip; 238 struct kprobe_insn_page *kip;
239 struct hlist_node *pos, *next; 239 struct hlist_node *pos, *next;
240 int safety;
241 240
242 /* Ensure no-one is preempted on the garbages */ 241 /* Ensure no-one is preempted on the garbages */
243 mutex_unlock(&kprobe_insn_mutex); 242 if (check_safety())
244 safety = check_safety();
245 mutex_lock(&kprobe_insn_mutex);
246 if (safety != 0)
247 return -EAGAIN; 243 return -EAGAIN;
248 244
249 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 245 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -698,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p)
698 p->addr = addr; 694 p->addr = addr;
699 695
700 preempt_disable(); 696 preempt_disable();
701 if (!__kernel_text_address((unsigned long) p->addr) || 697 if (!kernel_text_address((unsigned long) p->addr) ||
702 in_kprobes_functions((unsigned long) p->addr)) { 698 in_kprobes_functions((unsigned long) p->addr)) {
703 preempt_enable(); 699 preempt_enable();
704 return -EINVAL; 700 return -EINVAL;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9b1a7de26979..eb8751aa0418 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,10 +180,12 @@ EXPORT_SYMBOL(kthread_bind);
180 * @k: thread created by kthread_create(). 180 * @k: thread created by kthread_create().
181 * 181 *
182 * Sets kthread_should_stop() for @k to return true, wakes it, and 182 * Sets kthread_should_stop() for @k to return true, wakes it, and
183 * waits for it to exit. Your threadfn() must not call do_exit() 183 * waits for it to exit. This can also be called after kthread_create()
184 * itself if you use this function! This can also be called after 184 * instead of calling wake_up_process(): the thread will exit without
185 * kthread_create() instead of calling wake_up_process(): the thread 185 * calling threadfn().
186 * will exit without calling threadfn(). 186 *
187 * If threadfn() may call do_exit() itself, the caller must ensure
188 * task_struct can't go away.
187 * 189 *
188 * Returns the result of threadfn(), or %-EINTR if wake_up_process() 190 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
189 * was never called. 191 * was never called.
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa2d2c4..e94caa666dba 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void)
758 &proc_lockdep_stats_operations); 758 &proc_lockdep_stats_operations);
759 759
760#ifdef CONFIG_LOCK_STAT 760#ifdef CONFIG_LOCK_STAT
761 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); 761 proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
762 &proc_lock_stat_operations);
762#endif 763#endif
763 764
764 return 0; 765 return 0;
diff --git a/kernel/module.c b/kernel/module.c
index 38928fcaff2b..fd1411403558 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1068,7 +1068,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1068{ 1068{
1069 const unsigned long *crc; 1069 const unsigned long *crc;
1070 1070
1071 if (!find_symbol("module_layout", NULL, &crc, true, false)) 1071 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1072 &crc, true, false))
1072 BUG(); 1073 BUG();
1073 return check_version(sechdrs, versindex, "module_layout", mod, crc); 1074 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1074} 1075}
@@ -2451,9 +2452,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2451 return ret; 2452 return ret;
2452 } 2453 }
2453 if (ret > 0) { 2454 if (ret > 0) {
2454 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " 2455 printk(KERN_WARNING
2455 "it should follow 0/-E convention\n" 2456"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2456 KERN_WARNING "%s: loading module anyway...\n", 2457"%s: loading module anyway...\n",
2457 __func__, mod->name, ret, 2458 __func__, mod->name, ret,
2458 __func__); 2459 __func__);
2459 dump_stack(); 2460 dump_stack();
diff --git a/kernel/panic.c b/kernel/panic.c
index 984b3ecbd72c..512ab73b0ca3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -301,6 +301,7 @@ int oops_may_print(void)
301 */ 301 */
302void oops_enter(void) 302void oops_enter(void)
303{ 303{
304 tracing_off();
304 /* can't trust the integrity of the kernel anymore: */ 305 /* can't trust the integrity of the kernel anymore: */
305 debug_locks_off(); 306 debug_locks_off();
306 do_oops_enter_exit(); 307 do_oops_enter_exit();
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1a933a221ea4..534e20d14d63 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;
42static atomic_t nr_counters __read_mostly; 42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly; 43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly; 44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
45 46
46/* 47/*
47 * perf counter paranoia level: 48 * perf counter paranoia level:
@@ -87,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); } 88void __weak hw_perf_enable(void) { barrier(); }
88 89
89void __weak hw_perf_counter_setup(int cpu) { barrier(); } 90void __weak hw_perf_counter_setup(int cpu) { barrier(); }
91void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
90 92
91int __weak 93int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader, 94hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -146,6 +148,28 @@ static void put_ctx(struct perf_counter_context *ctx)
146 } 148 }
147} 149}
148 150
151static void unclone_ctx(struct perf_counter_context *ctx)
152{
153 if (ctx->parent_ctx) {
154 put_ctx(ctx->parent_ctx);
155 ctx->parent_ctx = NULL;
156 }
157}
158
159/*
160 * If we inherit counters we want to return the parent counter id
161 * to userspace.
162 */
163static u64 primary_counter_id(struct perf_counter *counter)
164{
165 u64 id = counter->id;
166
167 if (counter->parent)
168 id = counter->parent->id;
169
170 return id;
171}
172
149/* 173/*
150 * Get the perf_counter_context for a task and lock it. 174 * Get the perf_counter_context for a task and lock it.
150 * This has to cope with the fact that until it is locked, 175 * This has to cope with the fact that until it is locked,
@@ -236,6 +260,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
236 260
237 list_add_rcu(&counter->event_entry, &ctx->event_list); 261 list_add_rcu(&counter->event_entry, &ctx->event_list);
238 ctx->nr_counters++; 262 ctx->nr_counters++;
263 if (counter->attr.inherit_stat)
264 ctx->nr_stat++;
239} 265}
240 266
241/* 267/*
@@ -250,6 +276,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
250 if (list_empty(&counter->list_entry)) 276 if (list_empty(&counter->list_entry))
251 return; 277 return;
252 ctx->nr_counters--; 278 ctx->nr_counters--;
279 if (counter->attr.inherit_stat)
280 ctx->nr_stat--;
253 281
254 list_del_init(&counter->list_entry); 282 list_del_init(&counter->list_entry);
255 list_del_rcu(&counter->event_entry); 283 list_del_rcu(&counter->event_entry);
@@ -279,6 +307,10 @@ counter_sched_out(struct perf_counter *counter,
279 return; 307 return;
280 308
281 counter->state = PERF_COUNTER_STATE_INACTIVE; 309 counter->state = PERF_COUNTER_STATE_INACTIVE;
310 if (counter->pending_disable) {
311 counter->pending_disable = 0;
312 counter->state = PERF_COUNTER_STATE_OFF;
313 }
282 counter->tstamp_stopped = ctx->time; 314 counter->tstamp_stopped = ctx->time;
283 counter->pmu->disable(counter); 315 counter->pmu->disable(counter);
284 counter->oncpu = -1; 316 counter->oncpu = -1;
@@ -1006,6 +1038,81 @@ static int context_equiv(struct perf_counter_context *ctx1,
1006 && !ctx1->pin_count && !ctx2->pin_count; 1038 && !ctx1->pin_count && !ctx2->pin_count;
1007} 1039}
1008 1040
1041static void __perf_counter_read(void *counter);
1042
1043static void __perf_counter_sync_stat(struct perf_counter *counter,
1044 struct perf_counter *next_counter)
1045{
1046 u64 value;
1047
1048 if (!counter->attr.inherit_stat)
1049 return;
1050
1051 /*
1052 * Update the counter value, we cannot use perf_counter_read()
1053 * because we're in the middle of a context switch and have IRQs
1054 * disabled, which upsets smp_call_function_single(), however
1055 * we know the counter must be on the current CPU, therefore we
1056 * don't need to use it.
1057 */
1058 switch (counter->state) {
1059 case PERF_COUNTER_STATE_ACTIVE:
1060 __perf_counter_read(counter);
1061 break;
1062
1063 case PERF_COUNTER_STATE_INACTIVE:
1064 update_counter_times(counter);
1065 break;
1066
1067 default:
1068 break;
1069 }
1070
1071 /*
1072 * In order to keep per-task stats reliable we need to flip the counter
1073 * values when we flip the contexts.
1074 */
1075 value = atomic64_read(&next_counter->count);
1076 value = atomic64_xchg(&counter->count, value);
1077 atomic64_set(&next_counter->count, value);
1078
1079 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1080 swap(counter->total_time_running, next_counter->total_time_running);
1081
1082 /*
1083 * Since we swizzled the values, update the user visible data too.
1084 */
1085 perf_counter_update_userpage(counter);
1086 perf_counter_update_userpage(next_counter);
1087}
1088
1089#define list_next_entry(pos, member) \
1090 list_entry(pos->member.next, typeof(*pos), member)
1091
1092static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1093 struct perf_counter_context *next_ctx)
1094{
1095 struct perf_counter *counter, *next_counter;
1096
1097 if (!ctx->nr_stat)
1098 return;
1099
1100 counter = list_first_entry(&ctx->event_list,
1101 struct perf_counter, event_entry);
1102
1103 next_counter = list_first_entry(&next_ctx->event_list,
1104 struct perf_counter, event_entry);
1105
1106 while (&counter->event_entry != &ctx->event_list &&
1107 &next_counter->event_entry != &next_ctx->event_list) {
1108
1109 __perf_counter_sync_stat(counter, next_counter);
1110
1111 counter = list_next_entry(counter, event_entry);
1112 next_counter = list_next_entry(next_counter, event_entry);
1113 }
1114}
1115
1009/* 1116/*
1010 * Called from scheduler to remove the counters of the current task, 1117 * Called from scheduler to remove the counters of the current task,
1011 * with interrupts disabled. 1118 * with interrupts disabled.
@@ -1061,6 +1168,8 @@ void perf_counter_task_sched_out(struct task_struct *task,
1061 ctx->task = next; 1168 ctx->task = next;
1062 next_ctx->task = task; 1169 next_ctx->task = task;
1063 do_switch = 0; 1170 do_switch = 0;
1171
1172 perf_counter_sync_stat(ctx, next_ctx);
1064 } 1173 }
1065 spin_unlock(&next_ctx->lock); 1174 spin_unlock(&next_ctx->lock);
1066 spin_unlock(&ctx->lock); 1175 spin_unlock(&ctx->lock);
@@ -1207,7 +1316,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1207#define MAX_INTERRUPTS (~0ULL) 1316#define MAX_INTERRUPTS (~0ULL)
1208 1317
1209static void perf_log_throttle(struct perf_counter *counter, int enable); 1318static void perf_log_throttle(struct perf_counter *counter, int enable);
1210static void perf_log_period(struct perf_counter *counter, u64 period);
1211 1319
1212static void perf_adjust_period(struct perf_counter *counter, u64 events) 1320static void perf_adjust_period(struct perf_counter *counter, u64 events)
1213{ 1321{
@@ -1226,8 +1334,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events)
1226 if (!sample_period) 1334 if (!sample_period)
1227 sample_period = 1; 1335 sample_period = 1;
1228 1336
1229 perf_log_period(counter, sample_period);
1230
1231 hwc->sample_period = sample_period; 1337 hwc->sample_period = sample_period;
1232} 1338}
1233 1339
@@ -1348,9 +1454,54 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
1348} 1454}
1349 1455
1350/* 1456/*
1457 * Enable all of a task's counters that have been marked enable-on-exec.
1458 * This expects task == current.
1459 */
1460static void perf_counter_enable_on_exec(struct task_struct *task)
1461{
1462 struct perf_counter_context *ctx;
1463 struct perf_counter *counter;
1464 unsigned long flags;
1465 int enabled = 0;
1466
1467 local_irq_save(flags);
1468 ctx = task->perf_counter_ctxp;
1469 if (!ctx || !ctx->nr_counters)
1470 goto out;
1471
1472 __perf_counter_task_sched_out(ctx);
1473
1474 spin_lock(&ctx->lock);
1475
1476 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1477 if (!counter->attr.enable_on_exec)
1478 continue;
1479 counter->attr.enable_on_exec = 0;
1480 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1481 continue;
1482 counter->state = PERF_COUNTER_STATE_INACTIVE;
1483 counter->tstamp_enabled =
1484 ctx->time - counter->total_time_enabled;
1485 enabled = 1;
1486 }
1487
1488 /*
1489 * Unclone this context if we enabled any counter.
1490 */
1491 if (enabled)
1492 unclone_ctx(ctx);
1493
1494 spin_unlock(&ctx->lock);
1495
1496 perf_counter_task_sched_in(task, smp_processor_id());
1497 out:
1498 local_irq_restore(flags);
1499}
1500
1501/*
1351 * Cross CPU call to read the hardware counter 1502 * Cross CPU call to read the hardware counter
1352 */ 1503 */
1353static void __read(void *info) 1504static void __perf_counter_read(void *info)
1354{ 1505{
1355 struct perf_counter *counter = info; 1506 struct perf_counter *counter = info;
1356 struct perf_counter_context *ctx = counter->ctx; 1507 struct perf_counter_context *ctx = counter->ctx;
@@ -1372,7 +1523,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
1372 */ 1523 */
1373 if (counter->state == PERF_COUNTER_STATE_ACTIVE) { 1524 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1374 smp_call_function_single(counter->oncpu, 1525 smp_call_function_single(counter->oncpu,
1375 __read, counter, 1); 1526 __perf_counter_read, counter, 1);
1376 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 1527 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1377 update_counter_times(counter); 1528 update_counter_times(counter);
1378 } 1529 }
@@ -1398,7 +1549,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
1398 1549
1399static struct perf_counter_context *find_get_context(pid_t pid, int cpu) 1550static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1400{ 1551{
1401 struct perf_counter_context *parent_ctx;
1402 struct perf_counter_context *ctx; 1552 struct perf_counter_context *ctx;
1403 struct perf_cpu_context *cpuctx; 1553 struct perf_cpu_context *cpuctx;
1404 struct task_struct *task; 1554 struct task_struct *task;
@@ -1458,11 +1608,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1458 retry: 1608 retry:
1459 ctx = perf_lock_task_context(task, &flags); 1609 ctx = perf_lock_task_context(task, &flags);
1460 if (ctx) { 1610 if (ctx) {
1461 parent_ctx = ctx->parent_ctx; 1611 unclone_ctx(ctx);
1462 if (parent_ctx) {
1463 put_ctx(parent_ctx);
1464 ctx->parent_ctx = NULL; /* no longer a clone */
1465 }
1466 spin_unlock_irqrestore(&ctx->lock, flags); 1612 spin_unlock_irqrestore(&ctx->lock, flags);
1467 } 1613 }
1468 1614
@@ -1508,11 +1654,15 @@ static void free_counter(struct perf_counter *counter)
1508{ 1654{
1509 perf_pending_sync(counter); 1655 perf_pending_sync(counter);
1510 1656
1511 atomic_dec(&nr_counters); 1657 if (!counter->parent) {
1512 if (counter->attr.mmap) 1658 atomic_dec(&nr_counters);
1513 atomic_dec(&nr_mmap_counters); 1659 if (counter->attr.mmap)
1514 if (counter->attr.comm) 1660 atomic_dec(&nr_mmap_counters);
1515 atomic_dec(&nr_comm_counters); 1661 if (counter->attr.comm)
1662 atomic_dec(&nr_comm_counters);
1663 if (counter->attr.task)
1664 atomic_dec(&nr_task_counters);
1665 }
1516 1666
1517 if (counter->destroy) 1667 if (counter->destroy)
1518 counter->destroy(counter); 1668 counter->destroy(counter);
@@ -1546,14 +1696,133 @@ static int perf_release(struct inode *inode, struct file *file)
1546 return 0; 1696 return 0;
1547} 1697}
1548 1698
1699static int perf_counter_read_size(struct perf_counter *counter)
1700{
1701 int entry = sizeof(u64); /* value */
1702 int size = 0;
1703 int nr = 1;
1704
1705 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1706 size += sizeof(u64);
1707
1708 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1709 size += sizeof(u64);
1710
1711 if (counter->attr.read_format & PERF_FORMAT_ID)
1712 entry += sizeof(u64);
1713
1714 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1715 nr += counter->group_leader->nr_siblings;
1716 size += sizeof(u64);
1717 }
1718
1719 size += entry * nr;
1720
1721 return size;
1722}
1723
1724static u64 perf_counter_read_value(struct perf_counter *counter)
1725{
1726 struct perf_counter *child;
1727 u64 total = 0;
1728
1729 total += perf_counter_read(counter);
1730 list_for_each_entry(child, &counter->child_list, child_list)
1731 total += perf_counter_read(child);
1732
1733 return total;
1734}
1735
1736static int perf_counter_read_entry(struct perf_counter *counter,
1737 u64 read_format, char __user *buf)
1738{
1739 int n = 0, count = 0;
1740 u64 values[2];
1741
1742 values[n++] = perf_counter_read_value(counter);
1743 if (read_format & PERF_FORMAT_ID)
1744 values[n++] = primary_counter_id(counter);
1745
1746 count = n * sizeof(u64);
1747
1748 if (copy_to_user(buf, values, count))
1749 return -EFAULT;
1750
1751 return count;
1752}
1753
1754static int perf_counter_read_group(struct perf_counter *counter,
1755 u64 read_format, char __user *buf)
1756{
1757 struct perf_counter *leader = counter->group_leader, *sub;
1758 int n = 0, size = 0, err = -EFAULT;
1759 u64 values[3];
1760
1761 values[n++] = 1 + leader->nr_siblings;
1762 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1763 values[n++] = leader->total_time_enabled +
1764 atomic64_read(&leader->child_total_time_enabled);
1765 }
1766 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1767 values[n++] = leader->total_time_running +
1768 atomic64_read(&leader->child_total_time_running);
1769 }
1770
1771 size = n * sizeof(u64);
1772
1773 if (copy_to_user(buf, values, size))
1774 return -EFAULT;
1775
1776 err = perf_counter_read_entry(leader, read_format, buf + size);
1777 if (err < 0)
1778 return err;
1779
1780 size += err;
1781
1782 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1783 err = perf_counter_read_entry(counter, read_format,
1784 buf + size);
1785 if (err < 0)
1786 return err;
1787
1788 size += err;
1789 }
1790
1791 return size;
1792}
1793
1794static int perf_counter_read_one(struct perf_counter *counter,
1795 u64 read_format, char __user *buf)
1796{
1797 u64 values[4];
1798 int n = 0;
1799
1800 values[n++] = perf_counter_read_value(counter);
1801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1802 values[n++] = counter->total_time_enabled +
1803 atomic64_read(&counter->child_total_time_enabled);
1804 }
1805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1806 values[n++] = counter->total_time_running +
1807 atomic64_read(&counter->child_total_time_running);
1808 }
1809 if (read_format & PERF_FORMAT_ID)
1810 values[n++] = primary_counter_id(counter);
1811
1812 if (copy_to_user(buf, values, n * sizeof(u64)))
1813 return -EFAULT;
1814
1815 return n * sizeof(u64);
1816}
1817
1549/* 1818/*
1550 * Read the performance counter - simple non blocking version for now 1819 * Read the performance counter - simple non blocking version for now
1551 */ 1820 */
1552static ssize_t 1821static ssize_t
1553perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) 1822perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1554{ 1823{
1555 u64 values[4]; 1824 u64 read_format = counter->attr.read_format;
1556 int n; 1825 int ret;
1557 1826
1558 /* 1827 /*
1559 * Return end-of-file for a read on a counter that is in 1828 * Return end-of-file for a read on a counter that is in
@@ -1563,28 +1832,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1563 if (counter->state == PERF_COUNTER_STATE_ERROR) 1832 if (counter->state == PERF_COUNTER_STATE_ERROR)
1564 return 0; 1833 return 0;
1565 1834
1835 if (count < perf_counter_read_size(counter))
1836 return -ENOSPC;
1837
1566 WARN_ON_ONCE(counter->ctx->parent_ctx); 1838 WARN_ON_ONCE(counter->ctx->parent_ctx);
1567 mutex_lock(&counter->child_mutex); 1839 mutex_lock(&counter->child_mutex);
1568 values[0] = perf_counter_read(counter); 1840 if (read_format & PERF_FORMAT_GROUP)
1569 n = 1; 1841 ret = perf_counter_read_group(counter, read_format, buf);
1570 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1842 else
1571 values[n++] = counter->total_time_enabled + 1843 ret = perf_counter_read_one(counter, read_format, buf);
1572 atomic64_read(&counter->child_total_time_enabled);
1573 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1574 values[n++] = counter->total_time_running +
1575 atomic64_read(&counter->child_total_time_running);
1576 if (counter->attr.read_format & PERF_FORMAT_ID)
1577 values[n++] = counter->id;
1578 mutex_unlock(&counter->child_mutex); 1844 mutex_unlock(&counter->child_mutex);
1579 1845
1580 if (count < n * sizeof(u64)) 1846 return ret;
1581 return -EINVAL;
1582 count = n * sizeof(u64);
1583
1584 if (copy_to_user(buf, values, count))
1585 return -EFAULT;
1586
1587 return count;
1588} 1847}
1589 1848
1590static ssize_t 1849static ssize_t
@@ -1681,8 +1940,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1681 1940
1682 counter->attr.sample_freq = value; 1941 counter->attr.sample_freq = value;
1683 } else { 1942 } else {
1684 perf_log_period(counter, value);
1685
1686 counter->attr.sample_period = value; 1943 counter->attr.sample_period = value;
1687 counter->hw.sample_period = value; 1944 counter->hw.sample_period = value;
1688 } 1945 }
@@ -1751,6 +2008,14 @@ int perf_counter_task_disable(void)
1751 return 0; 2008 return 0;
1752} 2009}
1753 2010
2011static int perf_counter_index(struct perf_counter *counter)
2012{
2013 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2014 return 0;
2015
2016 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2017}
2018
1754/* 2019/*
1755 * Callers need to ensure there can be no nesting of this function, otherwise 2020 * Callers need to ensure there can be no nesting of this function, otherwise
1756 * the seqlock logic goes bad. We can not serialize this because the arch 2021 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -1775,11 +2040,17 @@ void perf_counter_update_userpage(struct perf_counter *counter)
1775 preempt_disable(); 2040 preempt_disable();
1776 ++userpg->lock; 2041 ++userpg->lock;
1777 barrier(); 2042 barrier();
1778 userpg->index = counter->hw.idx; 2043 userpg->index = perf_counter_index(counter);
1779 userpg->offset = atomic64_read(&counter->count); 2044 userpg->offset = atomic64_read(&counter->count);
1780 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 2045 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1781 userpg->offset -= atomic64_read(&counter->hw.prev_count); 2046 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1782 2047
2048 userpg->time_enabled = counter->total_time_enabled +
2049 atomic64_read(&counter->child_total_time_enabled);
2050
2051 userpg->time_running = counter->total_time_running +
2052 atomic64_read(&counter->child_total_time_running);
2053
1783 barrier(); 2054 barrier();
1784 ++userpg->lock; 2055 ++userpg->lock;
1785 preempt_enable(); 2056 preempt_enable();
@@ -1876,7 +2147,7 @@ fail:
1876 2147
1877static void perf_mmap_free_page(unsigned long addr) 2148static void perf_mmap_free_page(unsigned long addr)
1878{ 2149{
1879 struct page *page = virt_to_page(addr); 2150 struct page *page = virt_to_page((void *)addr);
1880 2151
1881 page->mapping = NULL; 2152 page->mapping = NULL;
1882 __free_page(page); 2153 __free_page(page);
@@ -2076,7 +2347,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)
2076 2347
2077 if (counter->pending_disable) { 2348 if (counter->pending_disable) {
2078 counter->pending_disable = 0; 2349 counter->pending_disable = 0;
2079 perf_counter_disable(counter); 2350 __perf_counter_disable(counter);
2080 } 2351 }
2081 2352
2082 if (counter->pending_wakeup) { 2353 if (counter->pending_wakeup) {
@@ -2461,7 +2732,80 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2461 return task_pid_nr_ns(p, counter->ns); 2732 return task_pid_nr_ns(p, counter->ns);
2462} 2733}
2463 2734
2464static void perf_counter_output(struct perf_counter *counter, int nmi, 2735static void perf_output_read_one(struct perf_output_handle *handle,
2736 struct perf_counter *counter)
2737{
2738 u64 read_format = counter->attr.read_format;
2739 u64 values[4];
2740 int n = 0;
2741
2742 values[n++] = atomic64_read(&counter->count);
2743 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2744 values[n++] = counter->total_time_enabled +
2745 atomic64_read(&counter->child_total_time_enabled);
2746 }
2747 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2748 values[n++] = counter->total_time_running +
2749 atomic64_read(&counter->child_total_time_running);
2750 }
2751 if (read_format & PERF_FORMAT_ID)
2752 values[n++] = primary_counter_id(counter);
2753
2754 perf_output_copy(handle, values, n * sizeof(u64));
2755}
2756
2757/*
2758 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2759 */
2760static void perf_output_read_group(struct perf_output_handle *handle,
2761 struct perf_counter *counter)
2762{
2763 struct perf_counter *leader = counter->group_leader, *sub;
2764 u64 read_format = counter->attr.read_format;
2765 u64 values[5];
2766 int n = 0;
2767
2768 values[n++] = 1 + leader->nr_siblings;
2769
2770 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2771 values[n++] = leader->total_time_enabled;
2772
2773 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2774 values[n++] = leader->total_time_running;
2775
2776 if (leader != counter)
2777 leader->pmu->read(leader);
2778
2779 values[n++] = atomic64_read(&leader->count);
2780 if (read_format & PERF_FORMAT_ID)
2781 values[n++] = primary_counter_id(leader);
2782
2783 perf_output_copy(handle, values, n * sizeof(u64));
2784
2785 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2786 n = 0;
2787
2788 if (sub != counter)
2789 sub->pmu->read(sub);
2790
2791 values[n++] = atomic64_read(&sub->count);
2792 if (read_format & PERF_FORMAT_ID)
2793 values[n++] = primary_counter_id(sub);
2794
2795 perf_output_copy(handle, values, n * sizeof(u64));
2796 }
2797}
2798
2799static void perf_output_read(struct perf_output_handle *handle,
2800 struct perf_counter *counter)
2801{
2802 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2803 perf_output_read_group(handle, counter);
2804 else
2805 perf_output_read_one(handle, counter);
2806}
2807
2808void perf_counter_output(struct perf_counter *counter, int nmi,
2465 struct perf_sample_data *data) 2809 struct perf_sample_data *data)
2466{ 2810{
2467 int ret; 2811 int ret;
@@ -2472,10 +2816,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2472 struct { 2816 struct {
2473 u32 pid, tid; 2817 u32 pid, tid;
2474 } tid_entry; 2818 } tid_entry;
2475 struct {
2476 u64 id;
2477 u64 counter;
2478 } group_entry;
2479 struct perf_callchain_entry *callchain = NULL; 2819 struct perf_callchain_entry *callchain = NULL;
2480 int callchain_size = 0; 2820 int callchain_size = 0;
2481 u64 time; 2821 u64 time;
@@ -2483,15 +2823,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2483 u32 cpu, reserved; 2823 u32 cpu, reserved;
2484 } cpu_entry; 2824 } cpu_entry;
2485 2825
2486 header.type = 0; 2826 header.type = PERF_EVENT_SAMPLE;
2487 header.size = sizeof(header); 2827 header.size = sizeof(header);
2488 2828
2489 header.misc = PERF_EVENT_MISC_OVERFLOW; 2829 header.misc = 0;
2490 header.misc |= perf_misc_flags(data->regs); 2830 header.misc |= perf_misc_flags(data->regs);
2491 2831
2492 if (sample_type & PERF_SAMPLE_IP) { 2832 if (sample_type & PERF_SAMPLE_IP) {
2493 ip = perf_instruction_pointer(data->regs); 2833 ip = perf_instruction_pointer(data->regs);
2494 header.type |= PERF_SAMPLE_IP;
2495 header.size += sizeof(ip); 2834 header.size += sizeof(ip);
2496 } 2835 }
2497 2836
@@ -2500,7 +2839,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2500 tid_entry.pid = perf_counter_pid(counter, current); 2839 tid_entry.pid = perf_counter_pid(counter, current);
2501 tid_entry.tid = perf_counter_tid(counter, current); 2840 tid_entry.tid = perf_counter_tid(counter, current);
2502 2841
2503 header.type |= PERF_SAMPLE_TID;
2504 header.size += sizeof(tid_entry); 2842 header.size += sizeof(tid_entry);
2505 } 2843 }
2506 2844
@@ -2510,47 +2848,51 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2510 */ 2848 */
2511 time = sched_clock(); 2849 time = sched_clock();
2512 2850
2513 header.type |= PERF_SAMPLE_TIME;
2514 header.size += sizeof(u64); 2851 header.size += sizeof(u64);
2515 } 2852 }
2516 2853
2517 if (sample_type & PERF_SAMPLE_ADDR) { 2854 if (sample_type & PERF_SAMPLE_ADDR)
2518 header.type |= PERF_SAMPLE_ADDR;
2519 header.size += sizeof(u64); 2855 header.size += sizeof(u64);
2520 }
2521 2856
2522 if (sample_type & PERF_SAMPLE_ID) { 2857 if (sample_type & PERF_SAMPLE_ID)
2523 header.type |= PERF_SAMPLE_ID; 2858 header.size += sizeof(u64);
2859
2860 if (sample_type & PERF_SAMPLE_STREAM_ID)
2524 header.size += sizeof(u64); 2861 header.size += sizeof(u64);
2525 }
2526 2862
2527 if (sample_type & PERF_SAMPLE_CPU) { 2863 if (sample_type & PERF_SAMPLE_CPU) {
2528 header.type |= PERF_SAMPLE_CPU;
2529 header.size += sizeof(cpu_entry); 2864 header.size += sizeof(cpu_entry);
2530 2865
2531 cpu_entry.cpu = raw_smp_processor_id(); 2866 cpu_entry.cpu = raw_smp_processor_id();
2867 cpu_entry.reserved = 0;
2532 } 2868 }
2533 2869
2534 if (sample_type & PERF_SAMPLE_PERIOD) { 2870 if (sample_type & PERF_SAMPLE_PERIOD)
2535 header.type |= PERF_SAMPLE_PERIOD;
2536 header.size += sizeof(u64); 2871 header.size += sizeof(u64);
2537 }
2538 2872
2539 if (sample_type & PERF_SAMPLE_GROUP) { 2873 if (sample_type & PERF_SAMPLE_READ)
2540 header.type |= PERF_SAMPLE_GROUP; 2874 header.size += perf_counter_read_size(counter);
2541 header.size += sizeof(u64) +
2542 counter->nr_siblings * sizeof(group_entry);
2543 }
2544 2875
2545 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 2876 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2546 callchain = perf_callchain(data->regs); 2877 callchain = perf_callchain(data->regs);
2547 2878
2548 if (callchain) { 2879 if (callchain) {
2549 callchain_size = (1 + callchain->nr) * sizeof(u64); 2880 callchain_size = (1 + callchain->nr) * sizeof(u64);
2550
2551 header.type |= PERF_SAMPLE_CALLCHAIN;
2552 header.size += callchain_size; 2881 header.size += callchain_size;
2553 } 2882 } else
2883 header.size += sizeof(u64);
2884 }
2885
2886 if (sample_type & PERF_SAMPLE_RAW) {
2887 int size = sizeof(u32);
2888
2889 if (data->raw)
2890 size += data->raw->size;
2891 else
2892 size += sizeof(u32);
2893
2894 WARN_ON_ONCE(size & (sizeof(u64)-1));
2895 header.size += size;
2554 } 2896 }
2555 2897
2556 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2898 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2571,7 +2913,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2571 if (sample_type & PERF_SAMPLE_ADDR) 2913 if (sample_type & PERF_SAMPLE_ADDR)
2572 perf_output_put(&handle, data->addr); 2914 perf_output_put(&handle, data->addr);
2573 2915
2574 if (sample_type & PERF_SAMPLE_ID) 2916 if (sample_type & PERF_SAMPLE_ID) {
2917 u64 id = primary_counter_id(counter);
2918
2919 perf_output_put(&handle, id);
2920 }
2921
2922 if (sample_type & PERF_SAMPLE_STREAM_ID)
2575 perf_output_put(&handle, counter->id); 2923 perf_output_put(&handle, counter->id);
2576 2924
2577 if (sample_type & PERF_SAMPLE_CPU) 2925 if (sample_type & PERF_SAMPLE_CPU)
@@ -2580,76 +2928,125 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2580 if (sample_type & PERF_SAMPLE_PERIOD) 2928 if (sample_type & PERF_SAMPLE_PERIOD)
2581 perf_output_put(&handle, data->period); 2929 perf_output_put(&handle, data->period);
2582 2930
2583 /* 2931 if (sample_type & PERF_SAMPLE_READ)
2584 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 2932 perf_output_read(&handle, counter);
2585 */
2586 if (sample_type & PERF_SAMPLE_GROUP) {
2587 struct perf_counter *leader, *sub;
2588 u64 nr = counter->nr_siblings;
2589 2933
2590 perf_output_put(&handle, nr); 2934 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2591 2935 if (callchain)
2592 leader = counter->group_leader; 2936 perf_output_copy(&handle, callchain, callchain_size);
2593 list_for_each_entry(sub, &leader->sibling_list, list_entry) { 2937 else {
2594 if (sub != counter) 2938 u64 nr = 0;
2595 sub->pmu->read(sub); 2939 perf_output_put(&handle, nr);
2596 2940 }
2597 group_entry.id = sub->id; 2941 }
2598 group_entry.counter = atomic64_read(&sub->count);
2599 2942
2600 perf_output_put(&handle, group_entry); 2943 if (sample_type & PERF_SAMPLE_RAW) {
2944 if (data->raw) {
2945 perf_output_put(&handle, data->raw->size);
2946 perf_output_copy(&handle, data->raw->data, data->raw->size);
2947 } else {
2948 struct {
2949 u32 size;
2950 u32 data;
2951 } raw = {
2952 .size = sizeof(u32),
2953 .data = 0,
2954 };
2955 perf_output_put(&handle, raw);
2601 } 2956 }
2602 } 2957 }
2603 2958
2604 if (callchain) 2959 perf_output_end(&handle);
2605 perf_output_copy(&handle, callchain, callchain_size); 2960}
2961
2962/*
2963 * read event
2964 */
2965
2966struct perf_read_event {
2967 struct perf_event_header header;
2968
2969 u32 pid;
2970 u32 tid;
2971};
2972
2973static void
2974perf_counter_read_event(struct perf_counter *counter,
2975 struct task_struct *task)
2976{
2977 struct perf_output_handle handle;
2978 struct perf_read_event event = {
2979 .header = {
2980 .type = PERF_EVENT_READ,
2981 .misc = 0,
2982 .size = sizeof(event) + perf_counter_read_size(counter),
2983 },
2984 .pid = perf_counter_pid(counter, task),
2985 .tid = perf_counter_tid(counter, task),
2986 };
2987 int ret;
2988
2989 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
2990 if (ret)
2991 return;
2992
2993 perf_output_put(&handle, event);
2994 perf_output_read(&handle, counter);
2606 2995
2607 perf_output_end(&handle); 2996 perf_output_end(&handle);
2608} 2997}
2609 2998
2610/* 2999/*
2611 * fork tracking 3000 * task tracking -- fork/exit
3001 *
3002 * enabled by: attr.comm | attr.mmap | attr.task
2612 */ 3003 */
2613 3004
2614struct perf_fork_event { 3005struct perf_task_event {
2615 struct task_struct *task; 3006 struct task_struct *task;
3007 struct perf_counter_context *task_ctx;
2616 3008
2617 struct { 3009 struct {
2618 struct perf_event_header header; 3010 struct perf_event_header header;
2619 3011
2620 u32 pid; 3012 u32 pid;
2621 u32 ppid; 3013 u32 ppid;
3014 u32 tid;
3015 u32 ptid;
2622 } event; 3016 } event;
2623}; 3017};
2624 3018
2625static void perf_counter_fork_output(struct perf_counter *counter, 3019static void perf_counter_task_output(struct perf_counter *counter,
2626 struct perf_fork_event *fork_event) 3020 struct perf_task_event *task_event)
2627{ 3021{
2628 struct perf_output_handle handle; 3022 struct perf_output_handle handle;
2629 int size = fork_event->event.header.size; 3023 int size = task_event->event.header.size;
2630 struct task_struct *task = fork_event->task; 3024 struct task_struct *task = task_event->task;
2631 int ret = perf_output_begin(&handle, counter, size, 0, 0); 3025 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2632 3026
2633 if (ret) 3027 if (ret)
2634 return; 3028 return;
2635 3029
2636 fork_event->event.pid = perf_counter_pid(counter, task); 3030 task_event->event.pid = perf_counter_pid(counter, task);
2637 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); 3031 task_event->event.ppid = perf_counter_pid(counter, current);
3032
3033 task_event->event.tid = perf_counter_tid(counter, task);
3034 task_event->event.ptid = perf_counter_tid(counter, current);
2638 3035
2639 perf_output_put(&handle, fork_event->event); 3036 perf_output_put(&handle, task_event->event);
2640 perf_output_end(&handle); 3037 perf_output_end(&handle);
2641} 3038}
2642 3039
2643static int perf_counter_fork_match(struct perf_counter *counter) 3040static int perf_counter_task_match(struct perf_counter *counter)
2644{ 3041{
2645 if (counter->attr.comm || counter->attr.mmap) 3042 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
2646 return 1; 3043 return 1;
2647 3044
2648 return 0; 3045 return 0;
2649} 3046}
2650 3047
2651static void perf_counter_fork_ctx(struct perf_counter_context *ctx, 3048static void perf_counter_task_ctx(struct perf_counter_context *ctx,
2652 struct perf_fork_event *fork_event) 3049 struct perf_task_event *task_event)
2653{ 3050{
2654 struct perf_counter *counter; 3051 struct perf_counter *counter;
2655 3052
@@ -2658,51 +3055,62 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2658 3055
2659 rcu_read_lock(); 3056 rcu_read_lock();
2660 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3057 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2661 if (perf_counter_fork_match(counter)) 3058 if (perf_counter_task_match(counter))
2662 perf_counter_fork_output(counter, fork_event); 3059 perf_counter_task_output(counter, task_event);
2663 } 3060 }
2664 rcu_read_unlock(); 3061 rcu_read_unlock();
2665} 3062}
2666 3063
2667static void perf_counter_fork_event(struct perf_fork_event *fork_event) 3064static void perf_counter_task_event(struct perf_task_event *task_event)
2668{ 3065{
2669 struct perf_cpu_context *cpuctx; 3066 struct perf_cpu_context *cpuctx;
2670 struct perf_counter_context *ctx; 3067 struct perf_counter_context *ctx = task_event->task_ctx;
2671 3068
2672 cpuctx = &get_cpu_var(perf_cpu_context); 3069 cpuctx = &get_cpu_var(perf_cpu_context);
2673 perf_counter_fork_ctx(&cpuctx->ctx, fork_event); 3070 perf_counter_task_ctx(&cpuctx->ctx, task_event);
2674 put_cpu_var(perf_cpu_context); 3071 put_cpu_var(perf_cpu_context);
2675 3072
2676 rcu_read_lock(); 3073 rcu_read_lock();
2677 /* 3074 if (!ctx)
2678 * doesn't really matter which of the child contexts the 3075 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
2679 * events ends up in.
2680 */
2681 ctx = rcu_dereference(current->perf_counter_ctxp);
2682 if (ctx) 3076 if (ctx)
2683 perf_counter_fork_ctx(ctx, fork_event); 3077 perf_counter_task_ctx(ctx, task_event);
2684 rcu_read_unlock(); 3078 rcu_read_unlock();
2685} 3079}
2686 3080
2687void perf_counter_fork(struct task_struct *task) 3081static void perf_counter_task(struct task_struct *task,
3082 struct perf_counter_context *task_ctx,
3083 int new)
2688{ 3084{
2689 struct perf_fork_event fork_event; 3085 struct perf_task_event task_event;
2690 3086
2691 if (!atomic_read(&nr_comm_counters) && 3087 if (!atomic_read(&nr_comm_counters) &&
2692 !atomic_read(&nr_mmap_counters)) 3088 !atomic_read(&nr_mmap_counters) &&
3089 !atomic_read(&nr_task_counters))
2693 return; 3090 return;
2694 3091
2695 fork_event = (struct perf_fork_event){ 3092 task_event = (struct perf_task_event){
2696 .task = task, 3093 .task = task,
2697 .event = { 3094 .task_ctx = task_ctx,
3095 .event = {
2698 .header = { 3096 .header = {
2699 .type = PERF_EVENT_FORK, 3097 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
2700 .size = sizeof(fork_event.event), 3098 .misc = 0,
3099 .size = sizeof(task_event.event),
2701 }, 3100 },
3101 /* .pid */
3102 /* .ppid */
3103 /* .tid */
3104 /* .ptid */
2702 }, 3105 },
2703 }; 3106 };
2704 3107
2705 perf_counter_fork_event(&fork_event); 3108 perf_counter_task_event(&task_event);
3109}
3110
3111void perf_counter_fork(struct task_struct *task)
3112{
3113 perf_counter_task(task, NULL, 1);
2706} 3114}
2707 3115
2708/* 3116/*
@@ -2770,8 +3178,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2770 struct perf_cpu_context *cpuctx; 3178 struct perf_cpu_context *cpuctx;
2771 struct perf_counter_context *ctx; 3179 struct perf_counter_context *ctx;
2772 unsigned int size; 3180 unsigned int size;
2773 char *comm = comm_event->task->comm; 3181 char comm[TASK_COMM_LEN];
2774 3182
3183 memset(comm, 0, sizeof(comm));
3184 strncpy(comm, comm_event->task->comm, sizeof(comm));
2775 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3185 size = ALIGN(strlen(comm)+1, sizeof(u64));
2776 3186
2777 comm_event->comm = comm; 3187 comm_event->comm = comm;
@@ -2798,13 +3208,24 @@ void perf_counter_comm(struct task_struct *task)
2798{ 3208{
2799 struct perf_comm_event comm_event; 3209 struct perf_comm_event comm_event;
2800 3210
3211 if (task->perf_counter_ctxp)
3212 perf_counter_enable_on_exec(task);
3213
2801 if (!atomic_read(&nr_comm_counters)) 3214 if (!atomic_read(&nr_comm_counters))
2802 return; 3215 return;
2803 3216
2804 comm_event = (struct perf_comm_event){ 3217 comm_event = (struct perf_comm_event){
2805 .task = task, 3218 .task = task,
3219 /* .comm */
3220 /* .comm_size */
2806 .event = { 3221 .event = {
2807 .header = { .type = PERF_EVENT_COMM, }, 3222 .header = {
3223 .type = PERF_EVENT_COMM,
3224 .misc = 0,
3225 /* .size */
3226 },
3227 /* .pid */
3228 /* .tid */
2808 }, 3229 },
2809 }; 3230 };
2810 3231
@@ -2887,8 +3308,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2887 char *buf = NULL; 3308 char *buf = NULL;
2888 const char *name; 3309 const char *name;
2889 3310
3311 memset(tmp, 0, sizeof(tmp));
3312
2890 if (file) { 3313 if (file) {
2891 buf = kzalloc(PATH_MAX, GFP_KERNEL); 3314 /*
3315 * d_path works from the end of the buffer backwards, so we
3316 * need to add enough zero bytes after the string to handle
3317 * the 64bit alignment we do later.
3318 */
3319 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
2892 if (!buf) { 3320 if (!buf) {
2893 name = strncpy(tmp, "//enomem", sizeof(tmp)); 3321 name = strncpy(tmp, "//enomem", sizeof(tmp));
2894 goto got_name; 3322 goto got_name;
@@ -2899,9 +3327,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2899 goto got_name; 3327 goto got_name;
2900 } 3328 }
2901 } else { 3329 } else {
2902 name = arch_vma_name(mmap_event->vma); 3330 if (arch_vma_name(mmap_event->vma)) {
2903 if (name) 3331 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3332 sizeof(tmp));
2904 goto got_name; 3333 goto got_name;
3334 }
2905 3335
2906 if (!vma->vm_mm) { 3336 if (!vma->vm_mm) {
2907 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3337 name = strncpy(tmp, "[vdso]", sizeof(tmp));
@@ -2946,8 +3376,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2946 3376
2947 mmap_event = (struct perf_mmap_event){ 3377 mmap_event = (struct perf_mmap_event){
2948 .vma = vma, 3378 .vma = vma,
3379 /* .file_name */
3380 /* .file_size */
2949 .event = { 3381 .event = {
2950 .header = { .type = PERF_EVENT_MMAP, }, 3382 .header = {
3383 .type = PERF_EVENT_MMAP,
3384 .misc = 0,
3385 /* .size */
3386 },
3387 /* .pid */
3388 /* .tid */
2951 .start = vma->vm_start, 3389 .start = vma->vm_start,
2952 .len = vma->vm_end - vma->vm_start, 3390 .len = vma->vm_end - vma->vm_start,
2953 .pgoff = vma->vm_pgoff, 3391 .pgoff = vma->vm_pgoff,
@@ -2958,49 +3396,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2958} 3396}
2959 3397
2960/* 3398/*
2961 * Log sample_period changes so that analyzing tools can re-normalize the
2962 * event flow.
2963 */
2964
2965struct freq_event {
2966 struct perf_event_header header;
2967 u64 time;
2968 u64 id;
2969 u64 period;
2970};
2971
2972static void perf_log_period(struct perf_counter *counter, u64 period)
2973{
2974 struct perf_output_handle handle;
2975 struct freq_event event;
2976 int ret;
2977
2978 if (counter->hw.sample_period == period)
2979 return;
2980
2981 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
2982 return;
2983
2984 event = (struct freq_event) {
2985 .header = {
2986 .type = PERF_EVENT_PERIOD,
2987 .misc = 0,
2988 .size = sizeof(event),
2989 },
2990 .time = sched_clock(),
2991 .id = counter->id,
2992 .period = period,
2993 };
2994
2995 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
2996 if (ret)
2997 return;
2998
2999 perf_output_put(&handle, event);
3000 perf_output_end(&handle);
3001}
3002
3003/*
3004 * IRQ throttle logging 3399 * IRQ throttle logging
3005 */ 3400 */
3006 3401
@@ -3013,16 +3408,21 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
3013 struct perf_event_header header; 3408 struct perf_event_header header;
3014 u64 time; 3409 u64 time;
3015 u64 id; 3410 u64 id;
3411 u64 stream_id;
3016 } throttle_event = { 3412 } throttle_event = {
3017 .header = { 3413 .header = {
3018 .type = PERF_EVENT_THROTTLE + 1, 3414 .type = PERF_EVENT_THROTTLE,
3019 .misc = 0, 3415 .misc = 0,
3020 .size = sizeof(throttle_event), 3416 .size = sizeof(throttle_event),
3021 }, 3417 },
3022 .time = sched_clock(), 3418 .time = sched_clock(),
3023 .id = counter->id, 3419 .id = primary_counter_id(counter),
3420 .stream_id = counter->id,
3024 }; 3421 };
3025 3422
3423 if (enable)
3424 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3425
3026 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); 3426 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3027 if (ret) 3427 if (ret)
3028 return; 3428 return;
@@ -3099,125 +3499,111 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
3099 * Generic software counter infrastructure 3499 * Generic software counter infrastructure
3100 */ 3500 */
3101 3501
3102static void perf_swcounter_update(struct perf_counter *counter) 3502/*
3503 * We directly increment counter->count and keep a second value in
3504 * counter->hw.period_left to count intervals. This period counter
3505 * is kept in the range [-sample_period, 0] so that we can use the
3506 * sign as trigger.
3507 */
3508
3509static u64 perf_swcounter_set_period(struct perf_counter *counter)
3103{ 3510{
3104 struct hw_perf_counter *hwc = &counter->hw; 3511 struct hw_perf_counter *hwc = &counter->hw;
3105 u64 prev, now; 3512 u64 period = hwc->last_period;
3106 s64 delta; 3513 u64 nr, offset;
3514 s64 old, val;
3515
3516 hwc->last_period = hwc->sample_period;
3107 3517
3108again: 3518again:
3109 prev = atomic64_read(&hwc->prev_count); 3519 old = val = atomic64_read(&hwc->period_left);
3110 now = atomic64_read(&hwc->count); 3520 if (val < 0)
3111 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) 3521 return 0;
3112 goto again;
3113 3522
3114 delta = now - prev; 3523 nr = div64_u64(period + val, period);
3524 offset = nr * period;
3525 val -= offset;
3526 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3527 goto again;
3115 3528
3116 atomic64_add(delta, &counter->count); 3529 return nr;
3117 atomic64_sub(delta, &hwc->period_left);
3118} 3530}
3119 3531
3120static void perf_swcounter_set_period(struct perf_counter *counter) 3532static void perf_swcounter_overflow(struct perf_counter *counter,
3533 int nmi, struct perf_sample_data *data)
3121{ 3534{
3122 struct hw_perf_counter *hwc = &counter->hw; 3535 struct hw_perf_counter *hwc = &counter->hw;
3123 s64 left = atomic64_read(&hwc->period_left); 3536 u64 overflow;
3124 s64 period = hwc->sample_period;
3125 3537
3126 if (unlikely(left <= -period)) { 3538 data->period = counter->hw.last_period;
3127 left = period; 3539 overflow = perf_swcounter_set_period(counter);
3128 atomic64_set(&hwc->period_left, left);
3129 hwc->last_period = period;
3130 }
3131 3540
3132 if (unlikely(left <= 0)) { 3541 if (hwc->interrupts == MAX_INTERRUPTS)
3133 left += period; 3542 return;
3134 atomic64_add(period, &hwc->period_left);
3135 hwc->last_period = period;
3136 }
3137 3543
3138 atomic64_set(&hwc->prev_count, -left); 3544 for (; overflow; overflow--) {
3139 atomic64_set(&hwc->count, -left); 3545 if (perf_counter_overflow(counter, nmi, data)) {
3546 /*
3547 * We inhibit the overflow from happening when
3548 * hwc->interrupts == MAX_INTERRUPTS.
3549 */
3550 break;
3551 }
3552 }
3140} 3553}
3141 3554
3142static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) 3555static void perf_swcounter_unthrottle(struct perf_counter *counter)
3143{ 3556{
3144 enum hrtimer_restart ret = HRTIMER_RESTART;
3145 struct perf_sample_data data;
3146 struct perf_counter *counter;
3147 u64 period;
3148
3149 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3150 counter->pmu->read(counter);
3151
3152 data.addr = 0;
3153 data.regs = get_irq_regs();
3154 /* 3557 /*
3155 * In case we exclude kernel IPs or are somehow not in interrupt 3558 * Nothing to do, we already reset hwc->interrupts.
3156 * context, provide the next best thing, the user IP.
3157 */ 3559 */
3158 if ((counter->attr.exclude_kernel || !data.regs) && 3560}
3159 !counter->attr.exclude_user)
3160 data.regs = task_pt_regs(current);
3161 3561
3162 if (data.regs) { 3562static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3163 if (perf_counter_overflow(counter, 0, &data)) 3563 int nmi, struct perf_sample_data *data)
3164 ret = HRTIMER_NORESTART; 3564{
3165 } 3565 struct hw_perf_counter *hwc = &counter->hw;
3166 3566
3167 period = max_t(u64, 10000, counter->hw.sample_period); 3567 atomic64_add(nr, &counter->count);
3168 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3169 3568
3170 return ret; 3569 if (!hwc->sample_period)
3171} 3570 return;
3172 3571
3173static void perf_swcounter_overflow(struct perf_counter *counter, 3572 if (!data->regs)
3174 int nmi, struct perf_sample_data *data) 3573 return;
3175{
3176 data->period = counter->hw.last_period;
3177 3574
3178 perf_swcounter_update(counter); 3575 if (!atomic64_add_negative(nr, &hwc->period_left))
3179 perf_swcounter_set_period(counter); 3576 perf_swcounter_overflow(counter, nmi, data);
3180 if (perf_counter_overflow(counter, nmi, data))
3181 /* soft-disable the counter */
3182 ;
3183} 3577}
3184 3578
3185static int perf_swcounter_is_counting(struct perf_counter *counter) 3579static int perf_swcounter_is_counting(struct perf_counter *counter)
3186{ 3580{
3187 struct perf_counter_context *ctx; 3581 /*
3188 unsigned long flags; 3582 * The counter is active, we're good!
3189 int count; 3583 */
3190
3191 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 3584 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3192 return 1; 3585 return 1;
3193 3586
3587 /*
3588 * The counter is off/error, not counting.
3589 */
3194 if (counter->state != PERF_COUNTER_STATE_INACTIVE) 3590 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3195 return 0; 3591 return 0;
3196 3592
3197 /* 3593 /*
3198 * If the counter is inactive, it could be just because 3594 * The counter is inactive, if the context is active
3199 * its task is scheduled out, or because it's in a group 3595 * we're part of a group that didn't make it on the 'pmu',
3200 * which could not go on the PMU. We want to count in 3596 * not counting.
3201 * the first case but not the second. If the context is
3202 * currently active then an inactive software counter must
3203 * be the second case. If it's not currently active then
3204 * we need to know whether the counter was active when the
3205 * context was last active, which we can determine by
3206 * comparing counter->tstamp_stopped with ctx->time.
3207 *
3208 * We are within an RCU read-side critical section,
3209 * which protects the existence of *ctx.
3210 */ 3597 */
3211 ctx = counter->ctx; 3598 if (counter->ctx->is_active)
3212 spin_lock_irqsave(&ctx->lock, flags); 3599 return 0;
3213 count = 1; 3600
3214 /* Re-check state now we have the lock */ 3601 /*
3215 if (counter->state < PERF_COUNTER_STATE_INACTIVE || 3602 * We're inactive and the context is too, this means the
3216 counter->ctx->is_active || 3603 * task is scheduled out, we're counting events that happen
3217 counter->tstamp_stopped < ctx->time) 3604 * to us, like migration events.
3218 count = 0; 3605 */
3219 spin_unlock_irqrestore(&ctx->lock, flags); 3606 return 1;
3220 return count;
3221} 3607}
3222 3608
3223static int perf_swcounter_match(struct perf_counter *counter, 3609static int perf_swcounter_match(struct perf_counter *counter,
@@ -3243,15 +3629,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
3243 return 1; 3629 return 1;
3244} 3630}
3245 3631
3246static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3247 int nmi, struct perf_sample_data *data)
3248{
3249 int neg = atomic64_add_negative(nr, &counter->hw.count);
3250
3251 if (counter->hw.sample_period && !neg && data->regs)
3252 perf_swcounter_overflow(counter, nmi, data);
3253}
3254
3255static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, 3632static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3256 enum perf_type_id type, 3633 enum perf_type_id type,
3257 u32 event, u64 nr, int nmi, 3634 u32 event, u64 nr, int nmi,
@@ -3317,8 +3694,8 @@ out:
3317 put_cpu_var(perf_cpu_context); 3694 put_cpu_var(perf_cpu_context);
3318} 3695}
3319 3696
3320void 3697void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3321perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) 3698 struct pt_regs *regs, u64 addr)
3322{ 3699{
3323 struct perf_sample_data data = { 3700 struct perf_sample_data data = {
3324 .regs = regs, 3701 .regs = regs,
@@ -3330,27 +3707,66 @@ perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
3330 3707
3331static void perf_swcounter_read(struct perf_counter *counter) 3708static void perf_swcounter_read(struct perf_counter *counter)
3332{ 3709{
3333 perf_swcounter_update(counter);
3334} 3710}
3335 3711
3336static int perf_swcounter_enable(struct perf_counter *counter) 3712static int perf_swcounter_enable(struct perf_counter *counter)
3337{ 3713{
3338 perf_swcounter_set_period(counter); 3714 struct hw_perf_counter *hwc = &counter->hw;
3715
3716 if (hwc->sample_period) {
3717 hwc->last_period = hwc->sample_period;
3718 perf_swcounter_set_period(counter);
3719 }
3339 return 0; 3720 return 0;
3340} 3721}
3341 3722
3342static void perf_swcounter_disable(struct perf_counter *counter) 3723static void perf_swcounter_disable(struct perf_counter *counter)
3343{ 3724{
3344 perf_swcounter_update(counter);
3345} 3725}
3346 3726
3347static const struct pmu perf_ops_generic = { 3727static const struct pmu perf_ops_generic = {
3348 .enable = perf_swcounter_enable, 3728 .enable = perf_swcounter_enable,
3349 .disable = perf_swcounter_disable, 3729 .disable = perf_swcounter_disable,
3350 .read = perf_swcounter_read, 3730 .read = perf_swcounter_read,
3731 .unthrottle = perf_swcounter_unthrottle,
3351}; 3732};
3352 3733
3353/* 3734/*
3735 * hrtimer based swcounter callback
3736 */
3737
3738static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3739{
3740 enum hrtimer_restart ret = HRTIMER_RESTART;
3741 struct perf_sample_data data;
3742 struct perf_counter *counter;
3743 u64 period;
3744
3745 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3746 counter->pmu->read(counter);
3747
3748 data.addr = 0;
3749 data.regs = get_irq_regs();
3750 /*
3751 * In case we exclude kernel IPs or are somehow not in interrupt
3752 * context, provide the next best thing, the user IP.
3753 */
3754 if ((counter->attr.exclude_kernel || !data.regs) &&
3755 !counter->attr.exclude_user)
3756 data.regs = task_pt_regs(current);
3757
3758 if (data.regs) {
3759 if (perf_counter_overflow(counter, 0, &data))
3760 ret = HRTIMER_NORESTART;
3761 }
3762
3763 period = max_t(u64, 10000, counter->hw.sample_period);
3764 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3765
3766 return ret;
3767}
3768
3769/*
3354 * Software counter: cpu wall time clock 3770 * Software counter: cpu wall time clock
3355 */ 3771 */
3356 3772
@@ -3467,17 +3883,24 @@ static const struct pmu perf_ops_task_clock = {
3467}; 3883};
3468 3884
3469#ifdef CONFIG_EVENT_PROFILE 3885#ifdef CONFIG_EVENT_PROFILE
3470void perf_tpcounter_event(int event_id) 3886void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3887 int entry_size)
3471{ 3888{
3889 struct perf_raw_record raw = {
3890 .size = entry_size,
3891 .data = record,
3892 };
3893
3472 struct perf_sample_data data = { 3894 struct perf_sample_data data = {
3473 .regs = get_irq_regs(); 3895 .regs = get_irq_regs(),
3474 .addr = 0, 3896 .addr = addr,
3897 .raw = &raw,
3475 }; 3898 };
3476 3899
3477 if (!data.regs) 3900 if (!data.regs)
3478 data.regs = task_pt_regs(current); 3901 data.regs = task_pt_regs(current);
3479 3902
3480 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); 3903 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3481} 3904}
3482EXPORT_SYMBOL_GPL(perf_tpcounter_event); 3905EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3483 3906
@@ -3486,16 +3909,20 @@ extern void ftrace_profile_disable(int);
3486 3909
3487static void tp_perf_counter_destroy(struct perf_counter *counter) 3910static void tp_perf_counter_destroy(struct perf_counter *counter)
3488{ 3911{
3489 ftrace_profile_disable(perf_event_id(&counter->attr)); 3912 ftrace_profile_disable(counter->attr.config);
3490} 3913}
3491 3914
3492static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) 3915static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3493{ 3916{
3494 int event_id = perf_event_id(&counter->attr); 3917 /*
3495 int ret; 3918 * Raw tracepoint data is a severe data leak, only allow root to
3919 * have these.
3920 */
3921 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3922 !capable(CAP_SYS_ADMIN))
3923 return ERR_PTR(-EPERM);
3496 3924
3497 ret = ftrace_profile_enable(event_id); 3925 if (ftrace_profile_enable(counter->attr.config))
3498 if (ret)
3499 return NULL; 3926 return NULL;
3500 3927
3501 counter->destroy = tp_perf_counter_destroy; 3928 counter->destroy = tp_perf_counter_destroy;
@@ -3509,9 +3936,21 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3509} 3936}
3510#endif 3937#endif
3511 3938
3939atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3940
3941static void sw_perf_counter_destroy(struct perf_counter *counter)
3942{
3943 u64 event = counter->attr.config;
3944
3945 WARN_ON(counter->parent);
3946
3947 atomic_dec(&perf_swcounter_enabled[event]);
3948}
3949
3512static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) 3950static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3513{ 3951{
3514 const struct pmu *pmu = NULL; 3952 const struct pmu *pmu = NULL;
3953 u64 event = counter->attr.config;
3515 3954
3516 /* 3955 /*
3517 * Software counters (currently) can't in general distinguish 3956 * Software counters (currently) can't in general distinguish
@@ -3520,7 +3959,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3520 * to be kernel events, and page faults are never hypervisor 3959 * to be kernel events, and page faults are never hypervisor
3521 * events. 3960 * events.
3522 */ 3961 */
3523 switch (counter->attr.config) { 3962 switch (event) {
3524 case PERF_COUNT_SW_CPU_CLOCK: 3963 case PERF_COUNT_SW_CPU_CLOCK:
3525 pmu = &perf_ops_cpu_clock; 3964 pmu = &perf_ops_cpu_clock;
3526 3965
@@ -3541,6 +3980,10 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3541 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 3980 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3542 case PERF_COUNT_SW_CONTEXT_SWITCHES: 3981 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3543 case PERF_COUNT_SW_CPU_MIGRATIONS: 3982 case PERF_COUNT_SW_CPU_MIGRATIONS:
3983 if (!counter->parent) {
3984 atomic_inc(&perf_swcounter_enabled[event]);
3985 counter->destroy = sw_perf_counter_destroy;
3986 }
3544 pmu = &perf_ops_generic; 3987 pmu = &perf_ops_generic;
3545 break; 3988 break;
3546 } 3989 }
@@ -3556,6 +3999,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3556 int cpu, 3999 int cpu,
3557 struct perf_counter_context *ctx, 4000 struct perf_counter_context *ctx,
3558 struct perf_counter *group_leader, 4001 struct perf_counter *group_leader,
4002 struct perf_counter *parent_counter,
3559 gfp_t gfpflags) 4003 gfp_t gfpflags)
3560{ 4004{
3561 const struct pmu *pmu; 4005 const struct pmu *pmu;
@@ -3591,6 +4035,8 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3591 counter->ctx = ctx; 4035 counter->ctx = ctx;
3592 counter->oncpu = -1; 4036 counter->oncpu = -1;
3593 4037
4038 counter->parent = parent_counter;
4039
3594 counter->ns = get_pid_ns(current->nsproxy->pid_ns); 4040 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3595 counter->id = atomic64_inc_return(&perf_counter_id); 4041 counter->id = atomic64_inc_return(&perf_counter_id);
3596 4042
@@ -3609,9 +4055,9 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3609 atomic64_set(&hwc->period_left, hwc->sample_period); 4055 atomic64_set(&hwc->period_left, hwc->sample_period);
3610 4056
3611 /* 4057 /*
3612 * we currently do not support PERF_SAMPLE_GROUP on inherited counters 4058 * we currently do not support PERF_FORMAT_GROUP on inherited counters
3613 */ 4059 */
3614 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) 4060 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
3615 goto done; 4061 goto done;
3616 4062
3617 switch (attr->type) { 4063 switch (attr->type) {
@@ -3648,11 +4094,15 @@ done:
3648 4094
3649 counter->pmu = pmu; 4095 counter->pmu = pmu;
3650 4096
3651 atomic_inc(&nr_counters); 4097 if (!counter->parent) {
3652 if (counter->attr.mmap) 4098 atomic_inc(&nr_counters);
3653 atomic_inc(&nr_mmap_counters); 4099 if (counter->attr.mmap)
3654 if (counter->attr.comm) 4100 atomic_inc(&nr_mmap_counters);
3655 atomic_inc(&nr_comm_counters); 4101 if (counter->attr.comm)
4102 atomic_inc(&nr_comm_counters);
4103 if (counter->attr.task)
4104 atomic_inc(&nr_task_counters);
4105 }
3656 4106
3657 return counter; 4107 return counter;
3658} 4108}
@@ -3815,7 +4265,7 @@ SYSCALL_DEFINE5(perf_counter_open,
3815 } 4265 }
3816 4266
3817 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4267 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3818 GFP_KERNEL); 4268 NULL, GFP_KERNEL);
3819 ret = PTR_ERR(counter); 4269 ret = PTR_ERR(counter);
3820 if (IS_ERR(counter)) 4270 if (IS_ERR(counter))
3821 goto err_put_context; 4271 goto err_put_context;
@@ -3881,7 +4331,8 @@ inherit_counter(struct perf_counter *parent_counter,
3881 4331
3882 child_counter = perf_counter_alloc(&parent_counter->attr, 4332 child_counter = perf_counter_alloc(&parent_counter->attr,
3883 parent_counter->cpu, child_ctx, 4333 parent_counter->cpu, child_ctx,
3884 group_leader, GFP_KERNEL); 4334 group_leader, parent_counter,
4335 GFP_KERNEL);
3885 if (IS_ERR(child_counter)) 4336 if (IS_ERR(child_counter))
3886 return child_counter; 4337 return child_counter;
3887 get_ctx(child_ctx); 4338 get_ctx(child_ctx);
@@ -3904,12 +4355,6 @@ inherit_counter(struct perf_counter *parent_counter,
3904 */ 4355 */
3905 add_counter_to_ctx(child_counter, child_ctx); 4356 add_counter_to_ctx(child_counter, child_ctx);
3906 4357
3907 child_counter->parent = parent_counter;
3908 /*
3909 * inherit into child's child as well:
3910 */
3911 child_counter->attr.inherit = 1;
3912
3913 /* 4358 /*
3914 * Get a reference to the parent filp - we will fput it 4359 * Get a reference to the parent filp - we will fput it
3915 * when the child counter exits. This is safe to do because 4360 * when the child counter exits. This is safe to do because
@@ -3953,10 +4398,14 @@ static int inherit_group(struct perf_counter *parent_counter,
3953} 4398}
3954 4399
3955static void sync_child_counter(struct perf_counter *child_counter, 4400static void sync_child_counter(struct perf_counter *child_counter,
3956 struct perf_counter *parent_counter) 4401 struct task_struct *child)
3957{ 4402{
4403 struct perf_counter *parent_counter = child_counter->parent;
3958 u64 child_val; 4404 u64 child_val;
3959 4405
4406 if (child_counter->attr.inherit_stat)
4407 perf_counter_read_event(child_counter, child);
4408
3960 child_val = atomic64_read(&child_counter->count); 4409 child_val = atomic64_read(&child_counter->count);
3961 4410
3962 /* 4411 /*
@@ -3985,7 +4434,8 @@ static void sync_child_counter(struct perf_counter *child_counter,
3985 4434
3986static void 4435static void
3987__perf_counter_exit_task(struct perf_counter *child_counter, 4436__perf_counter_exit_task(struct perf_counter *child_counter,
3988 struct perf_counter_context *child_ctx) 4437 struct perf_counter_context *child_ctx,
4438 struct task_struct *child)
3989{ 4439{
3990 struct perf_counter *parent_counter; 4440 struct perf_counter *parent_counter;
3991 4441
@@ -3999,7 +4449,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter,
3999 * counters need to be zapped - but otherwise linger. 4449 * counters need to be zapped - but otherwise linger.
4000 */ 4450 */
4001 if (parent_counter) { 4451 if (parent_counter) {
4002 sync_child_counter(child_counter, parent_counter); 4452 sync_child_counter(child_counter, child);
4003 free_counter(child_counter); 4453 free_counter(child_counter);
4004 } 4454 }
4005} 4455}
@@ -4013,8 +4463,10 @@ void perf_counter_exit_task(struct task_struct *child)
4013 struct perf_counter_context *child_ctx; 4463 struct perf_counter_context *child_ctx;
4014 unsigned long flags; 4464 unsigned long flags;
4015 4465
4016 if (likely(!child->perf_counter_ctxp)) 4466 if (likely(!child->perf_counter_ctxp)) {
4467 perf_counter_task(child, NULL, 0);
4017 return; 4468 return;
4469 }
4018 4470
4019 local_irq_save(flags); 4471 local_irq_save(flags);
4020 /* 4472 /*
@@ -4033,17 +4485,20 @@ void perf_counter_exit_task(struct task_struct *child)
4033 */ 4485 */
4034 spin_lock(&child_ctx->lock); 4486 spin_lock(&child_ctx->lock);
4035 child->perf_counter_ctxp = NULL; 4487 child->perf_counter_ctxp = NULL;
4036 if (child_ctx->parent_ctx) { 4488 /*
4037 /* 4489 * If this context is a clone; unclone it so it can't get
4038 * This context is a clone; unclone it so it can't get 4490 * swapped to another process while we're removing all
4039 * swapped to another process while we're removing all 4491 * the counters from it.
4040 * the counters from it. 4492 */
4041 */ 4493 unclone_ctx(child_ctx);
4042 put_ctx(child_ctx->parent_ctx); 4494 spin_unlock_irqrestore(&child_ctx->lock, flags);
4043 child_ctx->parent_ctx = NULL; 4495
4044 } 4496 /*
4045 spin_unlock(&child_ctx->lock); 4497 * Report the task dead after unscheduling the counters so that we
4046 local_irq_restore(flags); 4498 * won't get any samples after PERF_EVENT_EXIT. We can however still
4499 * get a few PERF_EVENT_READ events.
4500 */
4501 perf_counter_task(child, child_ctx, 0);
4047 4502
4048 /* 4503 /*
4049 * We can recurse on the same lock type through: 4504 * We can recurse on the same lock type through:
@@ -4061,7 +4516,7 @@ void perf_counter_exit_task(struct task_struct *child)
4061again: 4516again:
4062 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, 4517 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4063 list_entry) 4518 list_entry)
4064 __perf_counter_exit_task(child_counter, child_ctx); 4519 __perf_counter_exit_task(child_counter, child_ctx, child);
4065 4520
4066 /* 4521 /*
4067 * If the last counter was a group counter, it will have appended all 4522 * If the last counter was a group counter, it will have appended all
@@ -4264,6 +4719,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4264 perf_counter_init_cpu(cpu); 4719 perf_counter_init_cpu(cpu);
4265 break; 4720 break;
4266 4721
4722 case CPU_ONLINE:
4723 case CPU_ONLINE_FROZEN:
4724 hw_perf_counter_setup_online(cpu);
4725 break;
4726
4267 case CPU_DOWN_PREPARE: 4727 case CPU_DOWN_PREPARE:
4268 case CPU_DOWN_PREPARE_FROZEN: 4728 case CPU_DOWN_PREPARE_FROZEN:
4269 perf_counter_exit_cpu(cpu); 4729 perf_counter_exit_cpu(cpu);
@@ -4288,6 +4748,8 @@ void __init perf_counter_init(void)
4288{ 4748{
4289 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 4749 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4290 (void *)(long)smp_processor_id()); 4750 (void *)(long)smp_processor_id());
4751 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4752 (void *)(long)smp_processor_id());
4291 register_cpu_notifier(&perf_cpu_nb); 4753 register_cpu_notifier(&perf_cpu_nb);
4292} 4754}
4293 4755
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0b67b2..e33a21cb9407 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
521} 521}
522void posix_cpu_timers_exit_group(struct task_struct *tsk) 522void posix_cpu_timers_exit_group(struct task_struct *tsk)
523{ 523{
524 struct task_cputime cputime; 524 struct signal_struct *const sig = tsk->signal;
525 525
526 thread_group_cputimer(tsk, &cputime);
527 cleanup_timers(tsk->signal->cpu_timers, 526 cleanup_timers(tsk->signal->cpu_timers,
528 cputime.utime, cputime.stime, cputime.sum_exec_runtime); 527 cputime_add(tsk->utime, sig->utime),
528 cputime_add(tsk->stime, sig->stime),
529 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
529} 530}
530 531
531static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 532static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c7..d089d052c4a9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
202 return -EOPNOTSUPP; 202 return -EOPNOTSUPP;
203} 203}
204 204
205static int no_nsleep(const clockid_t which_clock, int flags,
206 struct timespec *tsave, struct timespec __user *rmtp)
207{
208 return -EOPNOTSUPP;
209}
210
205/* 211/*
206 * Return nonzero if we know a priori this clockid_t value is bogus. 212 * Return nonzero if we know a priori this clockid_t value is bogus.
207 */ 213 */
@@ -254,6 +260,7 @@ static __init int init_posix_timers(void)
254 .clock_get = posix_get_monotonic_raw, 260 .clock_get = posix_get_monotonic_raw,
255 .clock_set = do_posix_clock_nosettime, 261 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create, 262 .timer_create = no_timer_create,
263 .nsleep = no_nsleep,
257 }; 264 };
258 265
259 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 266 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ed97375daae9..bf0014d6a5f0 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,6 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index 69911b5745eb..419250ebec4d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -117,11 +117,12 @@ int __ref profile_init(void)
117 117
118 cpumask_copy(prof_cpu_mask, cpu_possible_mask); 118 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
119 119
120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
121 if (prof_buffer) 121 if (prof_buffer)
122 return 0; 122 return 0;
123 123
124 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); 124 prof_buffer = alloc_pages_exact(buffer_bytes,
125 GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
125 if (prof_buffer) 126 if (prof_buffer)
126 return 0; 127 return 0;
127 128
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61c78b2c07ba..082c320e4dbf 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,8 +181,8 @@ int ptrace_attach(struct task_struct *task)
181 * interference; SUID, SGID and LSM creds get determined differently 181 * interference; SUID, SGID and LSM creds get determined differently
182 * under ptrace. 182 * under ptrace.
183 */ 183 */
184 retval = mutex_lock_interruptible(&task->cred_guard_mutex); 184 retval = -ERESTARTNOINTR;
185 if (retval < 0) 185 if (mutex_lock_interruptible(&task->cred_guard_mutex))
186 goto out; 186 goto out;
187 187
188 task_lock(task); 188 task_lock(task);
diff --git a/kernel/resource.c b/kernel/resource.c
index ac5f3a36923f..78b087221c15 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -787,7 +787,7 @@ static int __init reserve_setup(char *str)
787 static struct resource reserve[MAXRESERVE]; 787 static struct resource reserve[MAXRESERVE];
788 788
789 for (;;) { 789 for (;;) {
790 int io_start, io_num; 790 unsigned int io_start, io_num;
791 int x = reserved; 791 int x = reserved;
792 792
793 if (get_option (&str, &io_start) != 2) 793 if (get_option (&str, &io_start) != 2)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index fcd107a78c5a..29bd4baf9e75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { 1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042
1043 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1044 1043 spin_unlock(&lock->wait_lock);
1045 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1046 return 1; 1045 return 1;
1047 } 1046 }
1048 1047
1049 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1050 1049
1051
1052 if (ret && !waiter->task) { 1050 if (ret && !waiter->task) {
1053 /* 1051 /*
1054 * Reset the return value. We might have 1052 * Reset the return value. We might have
diff --git a/kernel/sched.c b/kernel/sched.c
index 9ae80bec1c1e..cda8b81f8801 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -493,6 +493,7 @@ struct rt_rq {
493#endif 493#endif
494#ifdef CONFIG_SMP 494#ifdef CONFIG_SMP
495 unsigned long rt_nr_migratory; 495 unsigned long rt_nr_migratory;
496 unsigned long rt_nr_total;
496 int overloaded; 497 int overloaded;
497 struct plist_head pushable_tasks; 498 struct plist_head pushable_tasks;
498#endif 499#endif
@@ -2571,15 +2572,37 @@ static void __sched_fork(struct task_struct *p)
2571 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2572 2573
2573#ifdef CONFIG_SCHEDSTATS 2574#ifdef CONFIG_SCHEDSTATS
2574 p->se.wait_start = 0; 2575 p->se.wait_start = 0;
2575 p->se.sum_sleep_runtime = 0; 2576 p->se.wait_max = 0;
2576 p->se.sleep_start = 0; 2577 p->se.wait_count = 0;
2577 p->se.block_start = 0; 2578 p->se.wait_sum = 0;
2578 p->se.sleep_max = 0; 2579
2579 p->se.block_max = 0; 2580 p->se.sleep_start = 0;
2580 p->se.exec_max = 0; 2581 p->se.sleep_max = 0;
2581 p->se.slice_max = 0; 2582 p->se.sum_sleep_runtime = 0;
2582 p->se.wait_max = 0; 2583
2584 p->se.block_start = 0;
2585 p->se.block_max = 0;
2586 p->se.exec_max = 0;
2587 p->se.slice_max = 0;
2588
2589 p->se.nr_migrations_cold = 0;
2590 p->se.nr_failed_migrations_affine = 0;
2591 p->se.nr_failed_migrations_running = 0;
2592 p->se.nr_failed_migrations_hot = 0;
2593 p->se.nr_forced_migrations = 0;
2594 p->se.nr_forced2_migrations = 0;
2595
2596 p->se.nr_wakeups = 0;
2597 p->se.nr_wakeups_sync = 0;
2598 p->se.nr_wakeups_migrate = 0;
2599 p->se.nr_wakeups_local = 0;
2600 p->se.nr_wakeups_remote = 0;
2601 p->se.nr_wakeups_affine = 0;
2602 p->se.nr_wakeups_affine_attempts = 0;
2603 p->se.nr_wakeups_passive = 0;
2604 p->se.nr_wakeups_idle = 0;
2605
2583#endif 2606#endif
2584 2607
2585 INIT_LIST_HEAD(&p->rt.run_list); 2608 INIT_LIST_HEAD(&p->rt.run_list);
@@ -6541,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
6541 return 0; 6564 return 0;
6542} 6565}
6543 6566
6567static inline int should_resched(void)
6568{
6569 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6570}
6571
6544static void __cond_resched(void) 6572static void __cond_resched(void)
6545{ 6573{
6546#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6560,8 +6588,7 @@ static void __cond_resched(void)
6560 6588
6561int __sched _cond_resched(void) 6589int __sched _cond_resched(void)
6562{ 6590{
6563 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 6591 if (should_resched()) {
6564 system_state == SYSTEM_RUNNING) {
6565 __cond_resched(); 6592 __cond_resched();
6566 return 1; 6593 return 1;
6567 } 6594 }
@@ -6579,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
6579 */ 6606 */
6580int cond_resched_lock(spinlock_t *lock) 6607int cond_resched_lock(spinlock_t *lock)
6581{ 6608{
6582 int resched = need_resched() && system_state == SYSTEM_RUNNING; 6609 int resched = should_resched();
6583 int ret = 0; 6610 int ret = 0;
6584 6611
6585 if (spin_needbreak(lock) || resched) { 6612 if (spin_needbreak(lock) || resched) {
6586 spin_unlock(lock); 6613 spin_unlock(lock);
6587 if (resched && need_resched()) 6614 if (resched)
6588 __cond_resched(); 6615 __cond_resched();
6589 else 6616 else
6590 cpu_relax(); 6617 cpu_relax();
@@ -6599,7 +6626,7 @@ int __sched cond_resched_softirq(void)
6599{ 6626{
6600 BUG_ON(!in_softirq()); 6627 BUG_ON(!in_softirq());
6601 6628
6602 if (need_resched() && system_state == SYSTEM_RUNNING) { 6629 if (should_resched()) {
6603 local_bh_enable(); 6630 local_bh_enable();
6604 __cond_resched(); 6631 __cond_resched();
6605 local_bh_disable(); 6632 local_bh_disable();
@@ -7277,6 +7304,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
7277static void calc_global_load_remove(struct rq *rq) 7304static void calc_global_load_remove(struct rq *rq)
7278{ 7305{
7279 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 7306 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7307 rq->calc_load_active = 0;
7280} 7308}
7281#endif /* CONFIG_HOTPLUG_CPU */ 7309#endif /* CONFIG_HOTPLUG_CPU */
7282 7310
@@ -7503,6 +7531,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7503 task_rq_unlock(rq, &flags); 7531 task_rq_unlock(rq, &flags);
7504 get_task_struct(p); 7532 get_task_struct(p);
7505 cpu_rq(cpu)->migration_thread = p; 7533 cpu_rq(cpu)->migration_thread = p;
7534 rq->calc_load_update = calc_load_update;
7506 break; 7535 break;
7507 7536
7508 case CPU_ONLINE: 7537 case CPU_ONLINE:
@@ -7513,8 +7542,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7513 /* Update our root-domain */ 7542 /* Update our root-domain */
7514 rq = cpu_rq(cpu); 7543 rq = cpu_rq(cpu);
7515 spin_lock_irqsave(&rq->lock, flags); 7544 spin_lock_irqsave(&rq->lock, flags);
7516 rq->calc_load_update = calc_load_update;
7517 rq->calc_load_active = 0;
7518 if (rq->rd) { 7545 if (rq->rd) {
7519 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7546 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7520 7547
@@ -9085,7 +9112,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9085#ifdef CONFIG_SMP 9112#ifdef CONFIG_SMP
9086 rt_rq->rt_nr_migratory = 0; 9113 rt_rq->rt_nr_migratory = 0;
9087 rt_rq->overloaded = 0; 9114 rt_rq->overloaded = 0;
9088 plist_head_init(&rq->rt.pushable_tasks, &rq->lock); 9115 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
9089#endif 9116#endif
9090 9117
9091 rt_rq->rt_time = 0; 9118 rt_rq->rt_time = 0;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6c251790dde..d014efbf947a 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 if (lowest_mask) 84 if (lowest_mask) {
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
86
87 /*
88 * We have to ensure that we have at least one bit
89 * still set in the array, since the map could have
90 * been concurrently emptied between the first and
91 * second reads of vec->mask. If we hit this
92 * condition, simply act as though we never hit this
93 * priority level and continue on.
94 */
95 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
96 continue;
97 }
98
86 return 1; 99 return 1;
87 } 100 }
88 101
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ba7fd6e9556f..652e8bdef9aa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
266 return min_vruntime; 266 return min_vruntime;
267} 267}
268 268
269static inline int entity_before(struct sched_entity *a,
270 struct sched_entity *b)
271{
272 return (s64)(a->vruntime - b->vruntime) < 0;
273}
274
269static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 275static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
270{ 276{
271 return se->vruntime - cfs_rq->min_vruntime; 277 return se->vruntime - cfs_rq->min_vruntime;
@@ -605,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
605static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 611static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
606{ 612{
607#ifdef CONFIG_SCHEDSTATS 613#ifdef CONFIG_SCHEDSTATS
614 struct task_struct *tsk = NULL;
615
616 if (entity_is_task(se))
617 tsk = task_of(se);
618
608 if (se->sleep_start) { 619 if (se->sleep_start) {
609 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 620 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
610 struct task_struct *tsk = task_of(se);
611 621
612 if ((s64)delta < 0) 622 if ((s64)delta < 0)
613 delta = 0; 623 delta = 0;
@@ -618,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
618 se->sleep_start = 0; 628 se->sleep_start = 0;
619 se->sum_sleep_runtime += delta; 629 se->sum_sleep_runtime += delta;
620 630
621 account_scheduler_latency(tsk, delta >> 10, 1); 631 if (tsk)
632 account_scheduler_latency(tsk, delta >> 10, 1);
622 } 633 }
623 if (se->block_start) { 634 if (se->block_start) {
624 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 635 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
625 struct task_struct *tsk = task_of(se);
626 636
627 if ((s64)delta < 0) 637 if ((s64)delta < 0)
628 delta = 0; 638 delta = 0;
@@ -633,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 se->block_start = 0; 643 se->block_start = 0;
634 se->sum_sleep_runtime += delta; 644 se->sum_sleep_runtime += delta;
635 645
636 /* 646 if (tsk) {
637 * Blocking time is in units of nanosecs, so shift by 20 to 647 /*
638 * get a milliseconds-range estimation of the amount of 648 * Blocking time is in units of nanosecs, so shift by
639 * time that the task spent sleeping: 649 * 20 to get a milliseconds-range estimation of the
640 */ 650 * amount of time that the task spent sleeping:
641 if (unlikely(prof_on == SLEEP_PROFILING)) { 651 */
642 652 if (unlikely(prof_on == SLEEP_PROFILING)) {
643 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 653 profile_hits(SLEEP_PROFILING,
644 delta >> 20); 654 (void *)get_wchan(tsk),
655 delta >> 20);
656 }
657 account_scheduler_latency(tsk, delta >> 10, 0);
645 } 658 }
646 account_scheduler_latency(tsk, delta >> 10, 0);
647 } 659 }
648#endif 660#endif
649} 661}
@@ -687,7 +699,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
687 * all of which have the same weight. 699 * all of which have the same weight.
688 */ 700 */
689 if (sched_feat(NORMALIZED_SLEEPER) && 701 if (sched_feat(NORMALIZED_SLEEPER) &&
690 task_of(se)->policy != SCHED_IDLE) 702 (!entity_is_task(se) ||
703 task_of(se)->policy != SCHED_IDLE))
691 thresh = calc_delta_fair(thresh, se); 704 thresh = calc_delta_fair(thresh, se);
692 705
693 vruntime -= thresh; 706 vruntime -= thresh;
@@ -1016,7 +1029,7 @@ static void yield_task_fair(struct rq *rq)
1016 /* 1029 /*
1017 * Already in the rightmost position? 1030 * Already in the rightmost position?
1018 */ 1031 */
1019 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) 1032 if (unlikely(!rightmost || entity_before(rightmost, se)))
1020 return; 1033 return;
1021 1034
1022 /* 1035 /*
@@ -1712,7 +1725,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1712 1725
1713 /* 'curr' will be NULL if the child belongs to a different group */ 1726 /* 'curr' will be NULL if the child belongs to a different group */
1714 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && 1727 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1715 curr && curr->vruntime < se->vruntime) { 1728 curr && entity_before(curr, se)) {
1716 /* 1729 /*
1717 * Upon rescheduling, sched_class::put_prev_task() will place 1730 * Upon rescheduling, sched_class::put_prev_task() will place
1718 * 'current' within the tree based on its new key value. 1731 * 'current' within the tree based on its new key value.
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9bf0d2a73045..3918e01994e0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -10,6 +10,8 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
10 10
11#ifdef CONFIG_RT_GROUP_SCHED 11#ifdef CONFIG_RT_GROUP_SCHED
12 12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{ 16{
15 return rt_rq->rq; 17 return rt_rq->rq;
@@ -22,6 +24,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
22 24
23#else /* CONFIG_RT_GROUP_SCHED */ 25#else /* CONFIG_RT_GROUP_SCHED */
24 26
27#define rt_entity_is_task(rt_se) (1)
28
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{ 30{
27 return container_of(rt_rq, struct rq, rt); 31 return container_of(rt_rq, struct rq, rt);
@@ -73,7 +77,7 @@ static inline void rt_clear_overload(struct rq *rq)
73 77
74static void update_rt_migration(struct rt_rq *rt_rq) 78static void update_rt_migration(struct rt_rq *rt_rq)
75{ 79{
76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { 80 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
77 if (!rt_rq->overloaded) { 81 if (!rt_rq->overloaded) {
78 rt_set_overload(rq_of_rt_rq(rt_rq)); 82 rt_set_overload(rq_of_rt_rq(rt_rq));
79 rt_rq->overloaded = 1; 83 rt_rq->overloaded = 1;
@@ -86,6 +90,12 @@ static void update_rt_migration(struct rt_rq *rt_rq)
86 90
87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 91static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{ 92{
93 if (!rt_entity_is_task(rt_se))
94 return;
95
96 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
97
98 rt_rq->rt_nr_total++;
89 if (rt_se->nr_cpus_allowed > 1) 99 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++; 100 rt_rq->rt_nr_migratory++;
91 101
@@ -94,6 +104,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
94 104
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 105static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{ 106{
107 if (!rt_entity_is_task(rt_se))
108 return;
109
110 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
111
112 rt_rq->rt_nr_total--;
97 if (rt_se->nr_cpus_allowed > 1) 113 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--; 114 rt_rq->rt_nr_migratory--;
99 115
diff --git a/kernel/signal.c b/kernel/signal.c
index ccf1ceedaebe..64c5deeaca5d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2454 stack_t oss; 2454 stack_t oss;
2455 int error; 2455 int error;
2456 2456
2457 if (uoss) { 2457 oss.ss_sp = (void __user *) current->sas_ss_sp;
2458 oss.ss_sp = (void __user *) current->sas_ss_sp; 2458 oss.ss_size = current->sas_ss_size;
2459 oss.ss_size = current->sas_ss_size; 2459 oss.ss_flags = sas_ss_flags(sp);
2460 oss.ss_flags = sas_ss_flags(sp);
2461 }
2462 2460
2463 if (uss) { 2461 if (uss) {
2464 void __user *ss_sp; 2462 void __user *ss_sp;
@@ -2466,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2466 int ss_flags; 2464 int ss_flags;
2467 2465
2468 error = -EFAULT; 2466 error = -EFAULT;
2469 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) 2467 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
2470 || __get_user(ss_sp, &uss->ss_sp) 2468 goto out;
2471 || __get_user(ss_flags, &uss->ss_flags) 2469 error = __get_user(ss_sp, &uss->ss_sp) |
2472 || __get_user(ss_size, &uss->ss_size)) 2470 __get_user(ss_flags, &uss->ss_flags) |
2471 __get_user(ss_size, &uss->ss_size);
2472 if (error)
2473 goto out; 2473 goto out;
2474 2474
2475 error = -EPERM; 2475 error = -EPERM;
@@ -2501,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2501 current->sas_ss_size = ss_size; 2501 current->sas_ss_size = ss_size;
2502 } 2502 }
2503 2503
2504 error = 0;
2504 if (uoss) { 2505 if (uoss) {
2505 error = -EFAULT; 2506 error = -EFAULT;
2506 if (copy_to_user(uoss, &oss, sizeof(oss))) 2507 if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
2507 goto out; 2508 goto out;
2509 error = __put_user(oss.ss_sp, &uoss->ss_sp) |
2510 __put_user(oss.ss_size, &uoss->ss_size) |
2511 __put_user(oss.ss_flags, &uoss->ss_flags);
2508 } 2512 }
2509 2513
2510 error = 0;
2511out: 2514out:
2512 return error; 2515 return error;
2513} 2516}
diff --git a/kernel/smp.c b/kernel/smp.c
index ad63d8501207..94188b8ecc33 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
59 59
60#ifdef CONFIG_CPU_HOTPLUG 60#ifdef CONFIG_HOTPLUG_CPU
61 case CPU_UP_CANCELED: 61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN: 62 case CPU_UP_CANCELED_FROZEN:
63 63
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3a94905fa5d2..eb5e131a0485 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
345 softirq_vec[nr].action = action; 345 softirq_vec[nr].action = action;
346} 346}
347 347
348/* Tasklets */ 348/*
349 * Tasklets
350 */
349struct tasklet_head 351struct tasklet_head
350{ 352{
351 struct tasklet_struct *head; 353 struct tasklet_struct *head;
@@ -493,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)
493 495
494EXPORT_SYMBOL(tasklet_kill); 496EXPORT_SYMBOL(tasklet_kill);
495 497
498/*
499 * tasklet_hrtimer
500 */
501
502/*
503 * The trampoline is called when the hrtimer expires. If this is
504 * called from the hrtimer interrupt then we schedule the tasklet as
505 * the timer callback function expects to run in softirq context. If
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{
511 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer);
513
514 if (hrtimer_is_hres_active(timer)) {
515 tasklet_hi_schedule(&ttimer->tasklet);
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519}
520
521/*
522 * Helper function which calls the hrtimer callback from
523 * tasklet/softirq context
524 */
525static void __tasklet_hrtimer_trampoline(unsigned long data)
526{
527 struct tasklet_hrtimer *ttimer = (void *)data;
528 enum hrtimer_restart restart;
529
530 restart = ttimer->function(&ttimer->timer);
531 if (restart != HRTIMER_NORESTART)
532 hrtimer_restart(&ttimer->timer);
533}
534
535/**
536 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
537 * @ttimer: tasklet_hrtimer which is initialized
538 * @function: hrtimer callback function which gets called from softirq context
539 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
540 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
541 */
542void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
543 enum hrtimer_restart (*function)(struct hrtimer *),
544 clockid_t which_clock, enum hrtimer_mode mode)
545{
546 hrtimer_init(&ttimer->timer, which_clock, mode);
547 ttimer->timer.function = __hrtimer_tasklet_trampoline;
548 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
549 (unsigned long)ttimer);
550 ttimer->function = function;
551}
552EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
553
554/*
555 * Remote softirq bits
556 */
557
496DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 558DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
497EXPORT_PER_CPU_SYMBOL(softirq_work_list); 559EXPORT_PER_CPU_SYMBOL(softirq_work_list);
498 560
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62e4ff9968b5..98e02328c67d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -335,7 +335,10 @@ static struct ctl_table kern_table[] = {
335 .data = &sysctl_timer_migration, 335 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int), 336 .maxlen = sizeof(unsigned int),
337 .mode = 0644, 337 .mode = 0644,
338 .proc_handler = &proc_dointvec, 338 .proc_handler = &proc_dointvec_minmax,
339 .strategy = &sysctl_intvec,
340 .extra1 = &zero,
341 .extra2 = &one,
339 }, 342 },
340#endif 343#endif
341 { 344 {
@@ -744,6 +747,14 @@ static struct ctl_table kern_table[] = {
744 .proc_handler = &proc_dointvec, 747 .proc_handler = &proc_dointvec,
745 }, 748 },
746 { 749 {
750 .ctl_name = CTL_UNNUMBERED,
751 .procname = "panic_on_io_nmi",
752 .data = &panic_on_io_nmi,
753 .maxlen = sizeof(int),
754 .mode = 0644,
755 .proc_handler = &proc_dointvec,
756 },
757 {
747 .ctl_name = KERN_BOOTLOADER_TYPE, 758 .ctl_name = KERN_BOOTLOADER_TYPE,
748 .procname = "bootloader_type", 759 .procname = "bootloader_type",
749 .data = &bootloader_type, 760 .data = &bootloader_type,
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ad6dd461119..a6dcd67b041d 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -254,15 +254,4 @@ void clockevents_notify(unsigned long reason, void *arg)
254 spin_unlock(&clockevents_lock); 254 spin_unlock(&clockevents_lock);
255} 255}
256EXPORT_SYMBOL_GPL(clockevents_notify); 256EXPORT_SYMBOL_GPL(clockevents_notify);
257
258ktime_t clockevents_get_next_event(int cpu)
259{
260 struct tick_device *td;
261 struct clock_event_device *dev;
262
263 td = &per_cpu(tick_cpu_device, cpu);
264 dev = td->evtdev;
265
266 return dev->next_event;
267}
268#endif 257#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 592bf584d1d2..7466cb811251 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -513,7 +513,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
513 * Check to make sure we don't switch to a non-highres capable 513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz) 514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */ 515 */
516 if (tick_oneshot_mode_active() && 516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { 517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. " 518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name); 519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
96/* 96/*
97 * Collection status, active/inactive: 97 * Collection status, active/inactive:
98 */ 98 */
99static int __read_mostly active; 99int __read_mostly timer_stats_active;
100 100
101/* 101/*
102 * Beginning/end timestamps of measurement: 102 * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
257 if (!active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
260 entry = tstat_lookup(&input, comm); 260 entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
290 /* 290 /*
291 * If still active then calculate up to now: 291 * If still active then calculate up to now:
292 */ 292 */
293 if (active) 293 if (timer_stats_active)
294 time_stop = ktime_get(); 294 time_stop = ktime_get();
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
368 mutex_lock(&show_mutex); 368 mutex_lock(&show_mutex);
369 switch (ctl[0]) { 369 switch (ctl[0]) {
370 case '0': 370 case '0':
371 if (active) { 371 if (timer_stats_active) {
372 active = 0; 372 timer_stats_active = 0;
373 time_stop = ktime_get(); 373 time_stop = ktime_get();
374 sync_access(); 374 sync_access();
375 } 375 }
376 break; 376 break;
377 case '1': 377 case '1':
378 if (!active) { 378 if (!timer_stats_active) {
379 reset_entries(); 379 reset_entries();
380 time_start = ktime_get(); 380 time_start = ktime_get();
381 smp_mb(); 381 smp_mb();
382 active = 1; 382 timer_stats_active = 1;
383 } 383 }
384 break; 384 break;
385 default: 385 default:
diff --git a/kernel/timer.c b/kernel/timer.c
index 54d3912f8cad..a7f07d5a6241 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
380{ 380{
381 unsigned int flag = 0; 381 unsigned int flag = 0;
382 382
383 if (likely(!timer->start_site))
384 return;
383 if (unlikely(tbase_get_deferrable(timer->base))) 385 if (unlikely(tbase_get_deferrable(timer->base)))
384 flag |= TIMER_STATS_FLAG_DEFERRABLE; 386 flag |= TIMER_STATS_FLAG_DEFERRABLE;
385 387
@@ -712,7 +714,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
712 * networking code - if the timer is re-modified 714 * networking code - if the timer is re-modified
713 * to be the same thing then just return: 715 * to be the same thing then just return:
714 */ 716 */
715 if (timer->expires == expires && timer_pending(timer)) 717 if (timer_pending(timer) && timer->expires == expires)
716 return 1; 718 return 1;
717 719
718 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 720 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1551f47e7669..019f380fd764 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -226,13 +226,13 @@ config BOOT_TRACER
226 the timings of the initcalls and traces key events and the identity 226 the timings of the initcalls and traces key events and the identity
227 of tasks that can cause boot delays, such as context-switches. 227 of tasks that can cause boot delays, such as context-switches.
228 228
229 Its aim is to be parsed by the /scripts/bootgraph.pl tool to 229 Its aim is to be parsed by the scripts/bootgraph.pl tool to
230 produce pretty graphics about boot inefficiencies, giving a visual 230 produce pretty graphics about boot inefficiencies, giving a visual
231 representation of the delays during initcalls - but the raw 231 representation of the delays during initcalls - but the raw
232 /debug/tracing/trace text output is readable too. 232 /debug/tracing/trace text output is readable too.
233 233
234 You must pass in ftrace=initcall to the kernel command line 234 You must pass in initcall_debug and ftrace=initcall to the kernel
235 to enable this on bootup. 235 command line to enable this on bootup.
236 236
237config TRACE_BRANCH_PROFILING 237config TRACE_BRANCH_PROFILING
238 bool 238 bool
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 39af8af6fc30..7a34cb563fec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -22,6 +22,7 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/smp_lock.h>
25#include <linux/time.h> 26#include <linux/time.h>
26#include <linux/uaccess.h> 27#include <linux/uaccess.h>
27 28
@@ -266,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt)
266{ 267{
267 debugfs_remove(bt->msg_file); 268 debugfs_remove(bt->msg_file);
268 debugfs_remove(bt->dropped_file); 269 debugfs_remove(bt->dropped_file);
269 debugfs_remove(bt->dir);
270 relay_close(bt->rchan); 270 relay_close(bt->rchan);
271 debugfs_remove(bt->dir);
271 free_percpu(bt->sequence); 272 free_percpu(bt->sequence);
272 free_percpu(bt->msg_data); 273 free_percpu(bt->msg_data);
273 kfree(bt); 274 kfree(bt);
@@ -377,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
377 378
378static int blk_remove_buf_file_callback(struct dentry *dentry) 379static int blk_remove_buf_file_callback(struct dentry *dentry)
379{ 380{
380 struct dentry *parent = dentry->d_parent;
381 debugfs_remove(dentry); 381 debugfs_remove(dentry);
382 382
383 /*
384 * this will fail for all but the last file, but that is ok. what we
385 * care about is the top level buts->name directory going away, when
386 * the last trace file is gone. Then we don't have to rmdir() that
387 * manually on trace stop, so it nicely solves the issue with
388 * force killing of running traces.
389 */
390
391 debugfs_remove(parent);
392 return 0; 383 return 0;
393} 384}
394 385
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3718d55fb4c3..1e1d23c26308 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -291,7 +291,9 @@ function_stat_next(void *v, int idx)
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); 291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292 292
293 again: 293 again:
294 rec++; 294 if (idx != 0)
295 rec++;
296
295 if ((void *)rec >= (void *)&pg->records[pg->index]) { 297 if ((void *)rec >= (void *)&pg->records[pg->index]) {
296 pg = pg->next; 298 pg = pg->next;
297 if (!pg) 299 if (!pg)
@@ -766,7 +768,7 @@ static struct tracer_stat function_stats __initdata = {
766 .stat_show = function_stat_show 768 .stat_show = function_stat_show
767}; 769};
768 770
769static void ftrace_profile_debugfs(struct dentry *d_tracer) 771static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
770{ 772{
771 struct ftrace_profile_stat *stat; 773 struct ftrace_profile_stat *stat;
772 struct dentry *entry; 774 struct dentry *entry;
@@ -784,7 +786,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
784 * The files created are permanent, if something happens 786 * The files created are permanent, if something happens
785 * we still do not free memory. 787 * we still do not free memory.
786 */ 788 */
787 kfree(stat);
788 WARN(1, 789 WARN(1,
789 "Could not allocate stat file for cpu %d\n", 790 "Could not allocate stat file for cpu %d\n",
790 cpu); 791 cpu);
@@ -811,7 +812,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
811} 812}
812 813
813#else /* CONFIG_FUNCTION_PROFILER */ 814#else /* CONFIG_FUNCTION_PROFILER */
814static void ftrace_profile_debugfs(struct dentry *d_tracer) 815static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
815{ 816{
816} 817}
817#endif /* CONFIG_FUNCTION_PROFILER */ 818#endif /* CONFIG_FUNCTION_PROFILER */
@@ -1417,10 +1418,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1417{ 1418{
1418 struct ftrace_iterator *iter = m->private; 1419 struct ftrace_iterator *iter = m->private;
1419 void *p = NULL; 1420 void *p = NULL;
1421 loff_t l;
1422
1423 if (!(iter->flags & FTRACE_ITER_HASH))
1424 *pos = 0;
1420 1425
1421 iter->flags |= FTRACE_ITER_HASH; 1426 iter->flags |= FTRACE_ITER_HASH;
1422 1427
1423 return t_hash_next(m, p, pos); 1428 iter->hidx = 0;
1429 for (l = 0; l <= *pos; ) {
1430 p = t_hash_next(m, p, &l);
1431 if (!p)
1432 break;
1433 }
1434 return p;
1424} 1435}
1425 1436
1426static int t_hash_show(struct seq_file *m, void *v) 1437static int t_hash_show(struct seq_file *m, void *v)
@@ -1467,8 +1478,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1467 iter->pg = iter->pg->next; 1478 iter->pg = iter->pg->next;
1468 iter->idx = 0; 1479 iter->idx = 0;
1469 goto retry; 1480 goto retry;
1470 } else {
1471 iter->idx = -1;
1472 } 1481 }
1473 } else { 1482 } else {
1474 rec = &iter->pg->records[iter->idx++]; 1483 rec = &iter->pg->records[iter->idx++];
@@ -1497,6 +1506,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1497{ 1506{
1498 struct ftrace_iterator *iter = m->private; 1507 struct ftrace_iterator *iter = m->private;
1499 void *p = NULL; 1508 void *p = NULL;
1509 loff_t l;
1500 1510
1501 mutex_lock(&ftrace_lock); 1511 mutex_lock(&ftrace_lock);
1502 /* 1512 /*
@@ -1508,23 +1518,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1508 if (*pos > 0) 1518 if (*pos > 0)
1509 return t_hash_start(m, pos); 1519 return t_hash_start(m, pos);
1510 iter->flags |= FTRACE_ITER_PRINTALL; 1520 iter->flags |= FTRACE_ITER_PRINTALL;
1511 (*pos)++;
1512 return iter; 1521 return iter;
1513 } 1522 }
1514 1523
1515 if (iter->flags & FTRACE_ITER_HASH) 1524 if (iter->flags & FTRACE_ITER_HASH)
1516 return t_hash_start(m, pos); 1525 return t_hash_start(m, pos);
1517 1526
1518 if (*pos > 0) { 1527 iter->pg = ftrace_pages_start;
1519 if (iter->idx < 0) 1528 iter->idx = 0;
1520 return p; 1529 for (l = 0; l <= *pos; ) {
1521 (*pos)--; 1530 p = t_next(m, p, &l);
1522 iter->idx--; 1531 if (!p)
1532 break;
1523 } 1533 }
1524 1534
1525 p = t_next(m, p, pos); 1535 if (!p && iter->flags & FTRACE_ITER_FILTER)
1526
1527 if (!p)
1528 return t_hash_start(m, pos); 1536 return t_hash_start(m, pos);
1529 1537
1530 return p; 1538 return p;
@@ -1654,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1654 1662
1655 mutex_lock(&ftrace_regex_lock); 1663 mutex_lock(&ftrace_regex_lock);
1656 if ((file->f_mode & FMODE_WRITE) && 1664 if ((file->f_mode & FMODE_WRITE) &&
1657 !(file->f_flags & O_APPEND)) 1665 (file->f_flags & O_TRUNC))
1658 ftrace_filter_reset(enable); 1666 ftrace_filter_reset(enable);
1659 1667
1660 if (file->f_mode & FMODE_READ) { 1668 if (file->f_mode & FMODE_READ) {
@@ -2500,32 +2508,31 @@ int ftrace_graph_count;
2500unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2508unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2501 2509
2502static void * 2510static void *
2503g_next(struct seq_file *m, void *v, loff_t *pos) 2511__g_next(struct seq_file *m, loff_t *pos)
2504{ 2512{
2505 unsigned long *array = m->private; 2513 unsigned long *array = m->private;
2506 int index = *pos;
2507
2508 (*pos)++;
2509 2514
2510 if (index >= ftrace_graph_count) 2515 if (*pos >= ftrace_graph_count)
2511 return NULL; 2516 return NULL;
2517 return &array[*pos];
2518}
2512 2519
2513 return &array[index]; 2520static void *
2521g_next(struct seq_file *m, void *v, loff_t *pos)
2522{
2523 (*pos)++;
2524 return __g_next(m, pos);
2514} 2525}
2515 2526
2516static void *g_start(struct seq_file *m, loff_t *pos) 2527static void *g_start(struct seq_file *m, loff_t *pos)
2517{ 2528{
2518 void *p = NULL;
2519
2520 mutex_lock(&graph_lock); 2529 mutex_lock(&graph_lock);
2521 2530
2522 /* Nothing, tell g_show to print all functions are enabled */ 2531 /* Nothing, tell g_show to print all functions are enabled */
2523 if (!ftrace_graph_count && !*pos) 2532 if (!ftrace_graph_count && !*pos)
2524 return (void *)1; 2533 return (void *)1;
2525 2534
2526 p = g_next(m, p, pos); 2535 return __g_next(m, pos);
2527
2528 return p;
2529} 2536}
2530 2537
2531static void g_stop(struct seq_file *m, void *p) 2538static void g_stop(struct seq_file *m, void *p)
@@ -2570,7 +2577,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2570 2577
2571 mutex_lock(&graph_lock); 2578 mutex_lock(&graph_lock);
2572 if ((file->f_mode & FMODE_WRITE) && 2579 if ((file->f_mode & FMODE_WRITE) &&
2573 !(file->f_flags & O_APPEND)) { 2580 (file->f_flags & O_TRUNC)) {
2574 ftrace_graph_count = 0; 2581 ftrace_graph_count = 0;
2575 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2582 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2576 } 2583 }
@@ -2589,6 +2596,14 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2589} 2596}
2590 2597
2591static int 2598static int
2599ftrace_graph_release(struct inode *inode, struct file *file)
2600{
2601 if (file->f_mode & FMODE_READ)
2602 seq_release(inode, file);
2603 return 0;
2604}
2605
2606static int
2592ftrace_set_func(unsigned long *array, int *idx, char *buffer) 2607ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2593{ 2608{
2594 struct dyn_ftrace *rec; 2609 struct dyn_ftrace *rec;
@@ -2717,9 +2732,10 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2717} 2732}
2718 2733
2719static const struct file_operations ftrace_graph_fops = { 2734static const struct file_operations ftrace_graph_fops = {
2720 .open = ftrace_graph_open, 2735 .open = ftrace_graph_open,
2721 .read = seq_read, 2736 .read = seq_read,
2722 .write = ftrace_graph_write, 2737 .write = ftrace_graph_write,
2738 .release = ftrace_graph_release,
2723}; 2739};
2724#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2740#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2725 2741
@@ -3152,10 +3168,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3152 3168
3153 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3169 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
3154 3170
3155 if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) 3171 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3156 goto out; 3172 goto out;
3157 3173
3158 last_ftrace_enabled = ftrace_enabled; 3174 last_ftrace_enabled = !!ftrace_enabled;
3159 3175
3160 if (ftrace_enabled) { 3176 if (ftrace_enabled) {
3161 3177
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 04dac2638258..a330513d96ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer)
735 735
736 put_online_cpus(); 736 put_online_cpus();
737 737
738 kfree(buffer->buffers);
738 free_cpumask_var(buffer->cpumask); 739 free_cpumask_var(buffer->cpumask);
739 740
740 kfree(buffer); 741 kfree(buffer);
@@ -1563,6 +1564,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1563 return NULL; 1564 return NULL;
1564} 1565}
1565 1566
1567#ifdef CONFIG_TRACING
1568
1566#define TRACE_RECURSIVE_DEPTH 16 1569#define TRACE_RECURSIVE_DEPTH 16
1567 1570
1568static int trace_recursive_lock(void) 1571static int trace_recursive_lock(void)
@@ -1593,6 +1596,13 @@ static void trace_recursive_unlock(void)
1593 current->trace_recursion--; 1596 current->trace_recursion--;
1594} 1597}
1595 1598
1599#else
1600
1601#define trace_recursive_lock() (0)
1602#define trace_recursive_unlock() do { } while (0)
1603
1604#endif
1605
1596static DEFINE_PER_CPU(int, rb_need_resched); 1606static DEFINE_PER_CPU(int, rb_need_resched);
1597 1607
1598/** 1608/**
@@ -1776,7 +1786,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1776 */ 1786 */
1777 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); 1787 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1778 1788
1779 if (!rb_try_to_discard(cpu_buffer, event)) 1789 if (rb_try_to_discard(cpu_buffer, event))
1780 goto out; 1790 goto out;
1781 1791
1782 /* 1792 /*
@@ -2374,7 +2384,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2374 * the box. Return the padding, and we will release 2384 * the box. Return the padding, and we will release
2375 * the current locks, and try again. 2385 * the current locks, and try again.
2376 */ 2386 */
2377 rb_advance_reader(cpu_buffer);
2378 return event; 2387 return event;
2379 2388
2380 case RINGBUF_TYPE_TIME_EXTEND: 2389 case RINGBUF_TYPE_TIME_EXTEND:
@@ -2477,7 +2486,7 @@ static inline int rb_ok_to_lock(void)
2477 * buffer too. A one time deal is all you get from reading 2486 * buffer too. A one time deal is all you get from reading
2478 * the ring buffer from an NMI. 2487 * the ring buffer from an NMI.
2479 */ 2488 */
2480 if (likely(!in_nmi() && !oops_in_progress)) 2489 if (likely(!in_nmi()))
2481 return 1; 2490 return 1;
2482 2491
2483 tracing_off_permanent(); 2492 tracing_off_permanent();
@@ -2510,6 +2519,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2510 if (dolock) 2519 if (dolock)
2511 spin_lock(&cpu_buffer->reader_lock); 2520 spin_lock(&cpu_buffer->reader_lock);
2512 event = rb_buffer_peek(buffer, cpu, ts); 2521 event = rb_buffer_peek(buffer, cpu, ts);
2522 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2523 rb_advance_reader(cpu_buffer);
2513 if (dolock) 2524 if (dolock)
2514 spin_unlock(&cpu_buffer->reader_lock); 2525 spin_unlock(&cpu_buffer->reader_lock);
2515 local_irq_restore(flags); 2526 local_irq_restore(flags);
@@ -2581,12 +2592,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2581 spin_lock(&cpu_buffer->reader_lock); 2592 spin_lock(&cpu_buffer->reader_lock);
2582 2593
2583 event = rb_buffer_peek(buffer, cpu, ts); 2594 event = rb_buffer_peek(buffer, cpu, ts);
2584 if (!event) 2595 if (event)
2585 goto out_unlock; 2596 rb_advance_reader(cpu_buffer);
2586
2587 rb_advance_reader(cpu_buffer);
2588 2597
2589 out_unlock:
2590 if (dolock) 2598 if (dolock)
2591 spin_unlock(&cpu_buffer->reader_lock); 2599 spin_unlock(&cpu_buffer->reader_lock);
2592 local_irq_restore(flags); 2600 local_irq_restore(flags);
@@ -3104,6 +3112,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3104} 3112}
3105EXPORT_SYMBOL_GPL(ring_buffer_read_page); 3113EXPORT_SYMBOL_GPL(ring_buffer_read_page);
3106 3114
3115#ifdef CONFIG_TRACING
3107static ssize_t 3116static ssize_t
3108rb_simple_read(struct file *filp, char __user *ubuf, 3117rb_simple_read(struct file *filp, char __user *ubuf,
3109 size_t cnt, loff_t *ppos) 3118 size_t cnt, loff_t *ppos)
@@ -3171,6 +3180,7 @@ static __init int rb_init_debugfs(void)
3171} 3180}
3172 3181
3173fs_initcall(rb_init_debugfs); 3182fs_initcall(rb_init_debugfs);
3183#endif
3174 3184
3175#ifdef CONFIG_HOTPLUG_CPU 3185#ifdef CONFIG_HOTPLUG_CPU
3176static int rb_cpu_notify(struct notifier_block *self, 3186static int rb_cpu_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 076fa6f0ee48..c22b40f8f576 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
20#include <linux/notifier.h> 21#include <linux/notifier.h>
21#include <linux/irqflags.h> 22#include <linux/irqflags.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
@@ -284,13 +285,12 @@ void trace_wake_up(void)
284static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
285{ 286{
286 unsigned long buf_size; 287 unsigned long buf_size;
287 int ret;
288 288
289 if (!str) 289 if (!str)
290 return 0; 290 return 0;
291 ret = strict_strtoul(str, 0, &buf_size); 291 buf_size = memparse(str, &str);
292 /* nr_entries can not be zero */ 292 /* nr_entries can not be zero */
293 if (ret < 0 || buf_size == 0) 293 if (buf_size == 0)
294 return 0; 294 return 0;
295 trace_buf_size = buf_size; 295 trace_buf_size = buf_size;
296 return 1; 296 return 1;
@@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
848 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 848 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
849 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 849 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
850} 850}
851EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
851 852
852struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 853struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
853 int type, 854 int type,
@@ -2031,7 +2032,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2031 2032
2032 /* If this file was open for write, then erase contents */ 2033 /* If this file was open for write, then erase contents */
2033 if ((file->f_mode & FMODE_WRITE) && 2034 if ((file->f_mode & FMODE_WRITE) &&
2034 !(file->f_flags & O_APPEND)) { 2035 (file->f_flags & O_TRUNC)) {
2035 long cpu = (long) inode->i_private; 2036 long cpu = (long) inode->i_private;
2036 2037
2037 if (cpu == TRACE_PIPE_ALL_CPU) 2038 if (cpu == TRACE_PIPE_ALL_CPU)
@@ -2053,25 +2054,23 @@ static int tracing_open(struct inode *inode, struct file *file)
2053static void * 2054static void *
2054t_next(struct seq_file *m, void *v, loff_t *pos) 2055t_next(struct seq_file *m, void *v, loff_t *pos)
2055{ 2056{
2056 struct tracer *t = m->private; 2057 struct tracer *t = v;
2057 2058
2058 (*pos)++; 2059 (*pos)++;
2059 2060
2060 if (t) 2061 if (t)
2061 t = t->next; 2062 t = t->next;
2062 2063
2063 m->private = t;
2064
2065 return t; 2064 return t;
2066} 2065}
2067 2066
2068static void *t_start(struct seq_file *m, loff_t *pos) 2067static void *t_start(struct seq_file *m, loff_t *pos)
2069{ 2068{
2070 struct tracer *t = m->private; 2069 struct tracer *t;
2071 loff_t l = 0; 2070 loff_t l = 0;
2072 2071
2073 mutex_lock(&trace_types_lock); 2072 mutex_lock(&trace_types_lock);
2074 for (; t && l < *pos; t = t_next(m, t, &l)) 2073 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
2075 ; 2074 ;
2076 2075
2077 return t; 2076 return t;
@@ -2107,18 +2106,10 @@ static struct seq_operations show_traces_seq_ops = {
2107 2106
2108static int show_traces_open(struct inode *inode, struct file *file) 2107static int show_traces_open(struct inode *inode, struct file *file)
2109{ 2108{
2110 int ret;
2111
2112 if (tracing_disabled) 2109 if (tracing_disabled)
2113 return -ENODEV; 2110 return -ENODEV;
2114 2111
2115 ret = seq_open(file, &show_traces_seq_ops); 2112 return seq_open(file, &show_traces_seq_ops);
2116 if (!ret) {
2117 struct seq_file *m = file->private_data;
2118 m->private = trace_types;
2119 }
2120
2121 return ret;
2122} 2113}
2123 2114
2124static ssize_t 2115static ssize_t
@@ -3095,7 +3086,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3095 break; 3086 break;
3096 } 3087 }
3097 3088
3098 trace_consume(iter); 3089 if (ret != TRACE_TYPE_NO_CONSUME)
3090 trace_consume(iter);
3099 rem -= count; 3091 rem -= count;
3100 if (!find_next_entry_inc(iter)) { 3092 if (!find_next_entry_inc(iter)) {
3101 rem = 0; 3093 rem = 0;
@@ -4243,8 +4235,11 @@ static void __ftrace_dump(bool disable_tracing)
4243 iter.pos = -1; 4235 iter.pos = -1;
4244 4236
4245 if (find_next_entry_inc(&iter) != NULL) { 4237 if (find_next_entry_inc(&iter) != NULL) {
4246 print_trace_line(&iter); 4238 int ret;
4247 trace_consume(&iter); 4239
4240 ret = print_trace_line(&iter);
4241 if (ret != TRACE_TYPE_NO_CONSUME)
4242 trace_consume(&iter);
4248 } 4243 }
4249 4244
4250 trace_printk_seq(&iter.seq); 4245 trace_printk_seq(&iter.seq);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6e735d4771f8..8b9f4f6e9559 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
438struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 438struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
439 int *ent_cpu, u64 *ent_ts); 439 int *ent_cpu, u64 *ent_ts);
440 440
441void tracing_generic_entry_update(struct trace_entry *entry,
442 unsigned long flags,
443 int pc);
444
445void default_wait_pipe(struct trace_iterator *iter); 441void default_wait_pipe(struct trace_iterator *iter);
446void poll_wait_pipe(struct trace_iterator *iter); 442void poll_wait_pipe(struct trace_iterator *iter);
447 443
@@ -597,6 +593,7 @@ print_graph_function(struct trace_iterator *iter)
597 593
598extern struct pid *ftrace_pid_trace; 594extern struct pid *ftrace_pid_trace;
599 595
596#ifdef CONFIG_FUNCTION_TRACER
600static inline int ftrace_trace_task(struct task_struct *task) 597static inline int ftrace_trace_task(struct task_struct *task)
601{ 598{
602 if (!ftrace_pid_trace) 599 if (!ftrace_pid_trace)
@@ -604,6 +601,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
604 601
605 return test_tsk_trace_trace(task); 602 return test_tsk_trace_trace(task);
606} 603}
604#else
605static inline int ftrace_trace_task(struct task_struct *task)
606{
607 return 1;
608}
609#endif
607 610
608/* 611/*
609 * trace_iterator_flags is an enumeration that defines bit 612 * trace_iterator_flags is an enumeration that defines bit
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 5b5895afecfe..11ba5bb4ed0a 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id)
14 14
15 mutex_lock(&event_mutex); 15 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 16 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id) { 17 if (event->id == event_id && event->profile_enable) {
18 ret = event->profile_enable(event); 18 ret = event->profile_enable(event);
19 break; 19 break;
20 } 20 }
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 5e32e375134d..6db005e12487 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore, 26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT( 27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func) 28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
29 TRACE_FIELD(int, ret.depth, depth) 32 TRACE_FIELD(int, ret.depth, depth)
30 ), 33 ),
31 TP_RAW_FMT("<-- %lx (%d)") 34 TP_RAW_FMT("<-- %lx (%d)")
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa08be69a1b6..e75276a49cf5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -300,10 +300,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
300 300
301static void *t_start(struct seq_file *m, loff_t *pos) 301static void *t_start(struct seq_file *m, loff_t *pos)
302{ 302{
303 struct ftrace_event_call *call = NULL;
304 loff_t l;
305
303 mutex_lock(&event_mutex); 306 mutex_lock(&event_mutex);
304 if (*pos == 0) 307
305 m->private = ftrace_events.next; 308 m->private = ftrace_events.next;
306 return t_next(m, NULL, pos); 309 for (l = 0; l <= *pos; ) {
310 call = t_next(m, NULL, &l);
311 if (!call)
312 break;
313 }
314 return call;
307} 315}
308 316
309static void * 317static void *
@@ -332,10 +340,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
332 340
333static void *s_start(struct seq_file *m, loff_t *pos) 341static void *s_start(struct seq_file *m, loff_t *pos)
334{ 342{
343 struct ftrace_event_call *call = NULL;
344 loff_t l;
345
335 mutex_lock(&event_mutex); 346 mutex_lock(&event_mutex);
336 if (*pos == 0) 347
337 m->private = ftrace_events.next; 348 m->private = ftrace_events.next;
338 return s_next(m, NULL, pos); 349 for (l = 0; l <= *pos; ) {
350 call = s_next(m, NULL, &l);
351 if (!call)
352 break;
353 }
354 return call;
339} 355}
340 356
341static int t_show(struct seq_file *m, void *v) 357static int t_show(struct seq_file *m, void *v)
@@ -360,7 +376,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
360 const struct seq_operations *seq_ops; 376 const struct seq_operations *seq_ops;
361 377
362 if ((file->f_mode & FMODE_WRITE) && 378 if ((file->f_mode & FMODE_WRITE) &&
363 !(file->f_flags & O_APPEND)) 379 (file->f_flags & O_TRUNC))
364 ftrace_clear_events(); 380 ftrace_clear_events();
365 381
366 seq_ops = inode->i_private; 382 seq_ops = inode->i_private;
@@ -924,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
924 entry = trace_create_file("enable", 0644, call->dir, call, 940 entry = trace_create_file("enable", 0644, call->dir, call,
925 enable); 941 enable);
926 942
927 if (call->id) 943 if (call->id && call->profile_enable)
928 entry = trace_create_file("id", 0444, call->dir, call, 944 entry = trace_create_file("id", 0444, call->dir, call,
929 id); 945 id);
930 946
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 936c621bbf46..f32dc9d1ea7b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
624 return -ENOSPC; 624 return -ENOSPC;
625 } 625 }
626 626
627 filter->preds[filter->n_preds] = pred;
628 filter->n_preds++;
629
630 list_for_each_entry(call, &ftrace_events, list) { 627 list_for_each_entry(call, &ftrace_events, list) {
631 628
632 if (!call->define_fields) 629 if (!call->define_fields)
@@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
643 } 640 }
644 replace_filter_string(call->filter, filter_string); 641 replace_filter_string(call->filter, filter_string);
645 } 642 }
643
644 filter->preds[filter->n_preds] = pred;
645 filter->n_preds++;
646out: 646out:
647 return err; 647 return err;
648} 648}
@@ -1029,12 +1029,17 @@ static int replace_preds(struct event_subsystem *system,
1029 1029
1030 if (elt->op == OP_AND || elt->op == OP_OR) { 1030 if (elt->op == OP_AND || elt->op == OP_OR) {
1031 pred = create_logical_pred(elt->op); 1031 pred = create_logical_pred(elt->op);
1032 if (!pred)
1033 return -ENOMEM;
1032 if (call) { 1034 if (call) {
1033 err = filter_add_pred(ps, call, pred); 1035 err = filter_add_pred(ps, call, pred);
1034 filter_free_pred(pred); 1036 filter_free_pred(pred);
1035 } else 1037 } else {
1036 err = filter_add_subsystem_pred(ps, system, 1038 err = filter_add_subsystem_pred(ps, system,
1037 pred, filter_string); 1039 pred, filter_string);
1040 if (err)
1041 filter_free_pred(pred);
1042 }
1038 if (err) 1043 if (err)
1039 return err; 1044 return err;
1040 1045
@@ -1048,12 +1053,17 @@ static int replace_preds(struct event_subsystem *system,
1048 } 1053 }
1049 1054
1050 pred = create_pred(elt->op, operand1, operand2); 1055 pred = create_pred(elt->op, operand1, operand2);
1056 if (!pred)
1057 return -ENOMEM;
1051 if (call) { 1058 if (call) {
1052 err = filter_add_pred(ps, call, pred); 1059 err = filter_add_pred(ps, call, pred);
1053 filter_free_pred(pred); 1060 filter_free_pred(pred);
1054 } else 1061 } else {
1055 err = filter_add_subsystem_pred(ps, system, pred, 1062 err = filter_add_subsystem_pred(ps, system, pred,
1056 filter_string); 1063 filter_string);
1064 if (err)
1065 filter_free_pred(pred);
1066 }
1057 if (err) 1067 if (err)
1058 return err; 1068 return err;
1059 1069
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 90f134764837..75ef000613c3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -302,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
302 if (count == -1) 302 if (count == -1)
303 seq_printf(m, ":unlimited\n"); 303 seq_printf(m, ":unlimited\n");
304 else 304 else
305 seq_printf(m, ":count=%ld", count); 305 seq_printf(m, ":count=%ld\n", count);
306 seq_putc(m, '\n');
307 306
308 return 0; 307 return 0;
309} 308}
@@ -364,7 +363,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
364 out_reg: 363 out_reg:
365 ret = register_ftrace_function_probe(glob, ops, count); 364 ret = register_ftrace_function_probe(glob, ops, count);
366 365
367 return ret; 366 return ret < 0 ? ret : 0;
368} 367}
369 368
370static struct ftrace_func_command ftrace_traceon_cmd = { 369static struct ftrace_func_command ftrace_traceon_cmd = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d2249abafb53..420ec3487579 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter)
843 843
844 switch (entry->type) { 844 switch (entry->type) {
845 case TRACE_GRAPH_ENT: { 845 case TRACE_GRAPH_ENT: {
846 struct ftrace_graph_ent_entry *field; 846 /*
847 * print_graph_entry() may consume the current event,
848 * thus @field may become invalid, so we need to save it.
849 * sizeof(struct ftrace_graph_ent_entry) is very small,
850 * it can be safely saved at the stack.
851 */
852 struct ftrace_graph_ent_entry *field, saved;
847 trace_assign_type(field, entry); 853 trace_assign_type(field, entry);
848 return print_graph_entry(field, s, iter); 854 saved = *field;
855 return print_graph_entry(&saved, s, iter);
849 } 856 }
850 case TRACE_GRAPH_RET: { 857 case TRACE_GRAPH_RET: {
851 struct ftrace_graph_ret_entry *field; 858 struct ftrace_graph_ret_entry *field;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7938f3ae93e3..e0c2545622e8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 29
30 s->buffer[len] = 0; 30 seq_write(m, s->buffer, len);
31 seq_puts(m, s->buffer);
32 31
33 trace_seq_init(s); 32 trace_seq_init(s);
34} 33}
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 9bece9687b62..687699d365ae 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
155EXPORT_SYMBOL_GPL(__ftrace_vprintk); 155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156 156
157static void * 157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos) 158t_start(struct seq_file *m, loff_t *pos)
159{ 159{
160 const char **fmt = m->private; 160 const char **fmt = __start___trace_bprintk_fmt + *pos;
161 const char **next = fmt;
162
163 (*pos)++;
164 161
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) 162 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL; 163 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt; 164 return fmt;
172} 165}
173 166
174static void *t_start(struct seq_file *m, loff_t *pos) 167static void *t_next(struct seq_file *m, void * v, loff_t *pos)
175{ 168{
176 return t_next(m, NULL, pos); 169 (*pos)++;
170 return t_start(m, pos);
177} 171}
178 172
179static int t_show(struct seq_file *m, void *v) 173static int t_show(struct seq_file *m, void *v)
@@ -182,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)
182 const char *str = *fmt; 176 const char *str = *fmt;
183 int i; 177 int i;
184 178
185 seq_printf(m, "0x%lx : \"", (unsigned long)fmt); 179 seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
186 180
187 /* 181 /*
188 * Tabs and new lines need to be converted. 182 * Tabs and new lines need to be converted.
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
224static int 218static int
225ftrace_formats_open(struct inode *inode, struct file *file) 219ftrace_formats_open(struct inode *inode, struct file *file)
226{ 220{
227 int ret; 221 return seq_open(file, &show_format_seq_ops);
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236} 222}
237 223
238static const struct file_operations ftrace_formats_fops = { 224static const struct file_operations ftrace_formats_fops = {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2d7aebd71dbd..6a2a9d484cd6 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = {
301 301
302static int stack_trace_open(struct inode *inode, struct file *file) 302static int stack_trace_open(struct inode *inode, struct file *file)
303{ 303{
304 int ret; 304 return seq_open(file, &stack_trace_seq_ops);
305
306 ret = seq_open(file, &stack_trace_seq_ops);
307
308 return ret;
309} 305}
310 306
311static const struct file_operations stack_trace_fops = { 307static const struct file_operations stack_trace_fops = {
312 .open = stack_trace_open, 308 .open = stack_trace_open,
313 .read = seq_read, 309 .read = seq_read,
314 .llseek = seq_lseek, 310 .llseek = seq_lseek,
311 .release = seq_release,
315}; 312};
316 313
317int 314int
@@ -326,10 +323,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
326 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 323 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
327 324
328 if (ret || !write || 325 if (ret || !write ||
329 (last_stack_tracer_enabled == stack_tracer_enabled)) 326 (last_stack_tracer_enabled == !!stack_tracer_enabled))
330 goto out; 327 goto out;
331 328
332 last_stack_tracer_enabled = stack_tracer_enabled; 329 last_stack_tracer_enabled = !!stack_tracer_enabled;
333 330
334 if (stack_tracer_enabled) 331 if (stack_tracer_enabled)
335 register_ftrace_function(&trace_ops); 332 register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index c00643733f4c..aea321c82fa0 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -73,7 +73,7 @@ static struct rb_node *release_next(struct rb_node *node)
73 } 73 }
74} 74}
75 75
76static void reset_stat_session(struct stat_session *session) 76static void __reset_stat_session(struct stat_session *session)
77{ 77{
78 struct rb_node *node = session->stat_root.rb_node; 78 struct rb_node *node = session->stat_root.rb_node;
79 79
@@ -83,10 +83,17 @@ static void reset_stat_session(struct stat_session *session)
83 session->stat_root = RB_ROOT; 83 session->stat_root = RB_ROOT;
84} 84}
85 85
86static void reset_stat_session(struct stat_session *session)
87{
88 mutex_lock(&session->stat_mutex);
89 __reset_stat_session(session);
90 mutex_unlock(&session->stat_mutex);
91}
92
86static void destroy_session(struct stat_session *session) 93static void destroy_session(struct stat_session *session)
87{ 94{
88 debugfs_remove(session->file); 95 debugfs_remove(session->file);
89 reset_stat_session(session); 96 __reset_stat_session(session);
90 mutex_destroy(&session->stat_mutex); 97 mutex_destroy(&session->stat_mutex);
91 kfree(session); 98 kfree(session);
92} 99}
@@ -150,7 +157,7 @@ static int stat_seq_init(struct stat_session *session)
150 int i; 157 int i;
151 158
152 mutex_lock(&session->stat_mutex); 159 mutex_lock(&session->stat_mutex);
153 reset_stat_session(session); 160 __reset_stat_session(session);
154 161
155 if (!ts->stat_cmp) 162 if (!ts->stat_cmp)
156 ts->stat_cmp = dummy_cmp; 163 ts->stat_cmp = dummy_cmp;
@@ -183,7 +190,7 @@ exit:
183 return ret; 190 return ret;
184 191
185exit_free_rbtree: 192exit_free_rbtree:
186 reset_stat_session(session); 193 __reset_stat_session(session);
187 mutex_unlock(&session->stat_mutex); 194 mutex_unlock(&session->stat_mutex);
188 return ret; 195 return ret;
189} 196}
@@ -199,17 +206,13 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
199 mutex_lock(&session->stat_mutex); 206 mutex_lock(&session->stat_mutex);
200 207
201 /* If we are in the beginning of the file, print the headers */ 208 /* If we are in the beginning of the file, print the headers */
202 if (!*pos && session->ts->stat_headers) { 209 if (!*pos && session->ts->stat_headers)
203 (*pos)++;
204 return SEQ_START_TOKEN; 210 return SEQ_START_TOKEN;
205 }
206 211
207 node = rb_first(&session->stat_root); 212 node = rb_first(&session->stat_root);
208 for (i = 0; node && i < *pos; i++) 213 for (i = 0; node && i < *pos; i++)
209 node = rb_next(node); 214 node = rb_next(node);
210 215
211 (*pos)++;
212
213 return node; 216 return node;
214} 217}
215 218
@@ -254,16 +257,21 @@ static const struct seq_operations trace_stat_seq_ops = {
254static int tracing_stat_open(struct inode *inode, struct file *file) 257static int tracing_stat_open(struct inode *inode, struct file *file)
255{ 258{
256 int ret; 259 int ret;
257 260 struct seq_file *m;
258 struct stat_session *session = inode->i_private; 261 struct stat_session *session = inode->i_private;
259 262
263 ret = stat_seq_init(session);
264 if (ret)
265 return ret;
266
260 ret = seq_open(file, &trace_stat_seq_ops); 267 ret = seq_open(file, &trace_stat_seq_ops);
261 if (!ret) { 268 if (ret) {
262 struct seq_file *m = file->private_data; 269 reset_stat_session(session);
263 m->private = session; 270 return ret;
264 ret = stat_seq_init(session);
265 } 271 }
266 272
273 m = file->private_data;
274 m->private = session;
267 return ret; 275 return ret;
268} 276}
269 277
@@ -274,11 +282,9 @@ static int tracing_stat_release(struct inode *i, struct file *f)
274{ 282{
275 struct stat_session *session = i->i_private; 283 struct stat_session *session = i->i_private;
276 284
277 mutex_lock(&session->stat_mutex);
278 reset_stat_session(session); 285 reset_stat_session(session);
279 mutex_unlock(&session->stat_mutex);
280 286
281 return 0; 287 return seq_release(i, f);
282} 288}
283 289
284static const struct file_operations tracing_stat_fops = { 290static const struct file_operations tracing_stat_fops = {
diff --git a/kernel/wait.c b/kernel/wait.c
index ea7c3b4275cf..c4bd3d825f35 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void init_waitqueue_head(wait_queue_head_t *q) 13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key);
16 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
17} 18}
18 19
19EXPORT_SYMBOL(init_waitqueue_head); 20EXPORT_SYMBOL(__init_waitqueue_head);
20 21
21void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 22void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
22{ 23{