aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/acct.c6
-rw-r--r--kernel/audit.c146
-rw-r--r--kernel/audit.h43
-rw-r--r--kernel/audit_tree.c66
-rw-r--r--kernel/audit_watch.c543
-rw-r--r--kernel/auditfilter.c518
-rw-r--r--kernel/auditsc.c33
-rw-r--r--kernel/cgroup.c151
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c50
-rw-r--r--kernel/freezer.c7
-rw-r--r--kernel/futex.c74
-rw-r--r--kernel/futex_compat.c6
-rw-r--r--kernel/hrtimer.c110
-rw-r--r--kernel/irq/internals.h3
-rw-r--r--kernel/irq/manage.c82
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/numa_migrate.c4
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kmod.c1
-rw-r--r--kernel/kprobes.c8
-rw-r--r--kernel/kthread.c10
-rw-r--r--kernel/lockdep_proc.c3
-rw-r--r--kernel/module.c19
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/perf_counter.c1106
-rw-r--r--kernel/posix-cpu-timers.c7
-rw-r--r--kernel/posix-timers.c7
-rw-r--r--kernel/power/user.c1
-rw-r--r--kernel/profile.c5
-rw-r--r--kernel/ptrace.c4
-rw-r--r--kernel/rcutree.c3
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/rtmutex.c4
-rw-r--r--kernel/sched.c61
-rw-r--r--kernel/sched_cpupri.c15
-rw-r--r--kernel/sched_fair.c45
-rw-r--r--kernel/sched_rt.c18
-rw-r--r--kernel/signal.c25
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c64
-rw-r--r--kernel/sysctl.c20
-rw-r--r--kernel/time/clockevents.c27
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/tick-broadcast.c7
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/time/timer_stats.c16
-rw-r--r--kernel/timer.c4
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/blktrace.c13
-rw-r--r--kernel/trace/ftrace.c101
-rw-r--r--kernel/trace/ring_buffer.c26
-rw-r--r--kernel/trace/trace.c49
-rw-r--r--kernel/trace/trace.h11
-rw-r--r--kernel/trace/trace_event_profile.c2
-rw-r--r--kernel/trace/trace_event_types.h3
-rw-r--r--kernel/trace/trace_events.c32
-rw-r--r--kernel/trace/trace_events_filter.c20
-rw-r--r--kernel/trace/trace_functions.c5
-rw-r--r--kernel/trace/trace_functions_graph.c11
-rw-r--r--kernel/trace/trace_output.c3
-rw-r--r--kernel/trace/trace_printk.c28
-rw-r--r--kernel/trace/trace_stack.c11
-rw-r--r--kernel/trace/trace_stat.c40
-rw-r--r--kernel/wait.c5
67 files changed, 2318 insertions, 1400 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0a32cb21ec97..2093a691f1c2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,7 +69,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o
69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
74obj-$(CONFIG_GCOV_KERNEL) += gcov/ 74obj-$(CONFIG_GCOV_KERNEL) += gcov/
75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
@@ -96,6 +96,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
96obj-$(CONFIG_FUNCTION_TRACER) += trace/ 96obj-$(CONFIG_FUNCTION_TRACER) += trace/
97obj-$(CONFIG_TRACING) += trace/ 97obj-$(CONFIG_TRACING) += trace/
98obj-$(CONFIG_X86_DS) += trace/ 98obj-$(CONFIG_X86_DS) += trace/
99obj-$(CONFIG_RING_BUFFER) += trace/
99obj-$(CONFIG_SMP) += sched_cpupri.o 100obj-$(CONFIG_SMP) += sched_cpupri.o
100obj-$(CONFIG_SLOW_WORK) += slow-work.o 101obj-$(CONFIG_SLOW_WORK) += slow-work.o
101obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 102obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa31564162..9f3391090b3e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
215static int acct_on(char *name) 215static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt;
218 int error; 219 int error;
219 struct pid_namespace *ns; 220 struct pid_namespace *ns;
220 struct bsd_acct_struct *acct = NULL; 221 struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
256 acct = NULL; 257 acct = NULL;
257 } 258 }
258 259
259 mnt_pin(file->f_path.mnt); 260 mnt = file->f_path.mnt;
261 mnt_pin(mnt);
260 acct_file_reopen(ns->bacct, file, ns); 262 acct_file_reopen(ns->bacct, file, ns);
261 spin_unlock(&acct_lock); 263 spin_unlock(&acct_lock);
262 264
263 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ 265 mntput(mnt); /* it's pinned, now give up active reference */
264 kfree(acct); 266 kfree(acct);
265 267
266 return 0; 268 return 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba9..defc2e6f1e3b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
115/* The netlink socket. */ 115/* The netlink socket. */
116static struct sock *audit_sock; 116static struct sock *audit_sock;
117 117
118/* Inotify handle. */
119struct inotify_handle *audit_ih;
120
121/* Hash for inode-based rules */ 118/* Hash for inode-based rules */
122struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 119struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
123 120
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
136static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 133static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
137 134
138/* Serialize requests from userspace. */ 135/* Serialize requests from userspace. */
139static DEFINE_MUTEX(audit_cmd_mutex); 136DEFINE_MUTEX(audit_cmd_mutex);
140 137
141/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 138/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
142 * audit records. Since printk uses a 1024 byte buffer, this buffer 139 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
375 kfree_skb(skb); 372 kfree_skb(skb);
376} 373}
377 374
375/*
376 * For one reason or another this nlh isn't getting delivered to the userspace
377 * audit daemon, just send it to printk.
378 */
379static void audit_printk_skb(struct sk_buff *skb)
380{
381 struct nlmsghdr *nlh = nlmsg_hdr(skb);
382 char *data = NLMSG_DATA(nlh);
383
384 if (nlh->nlmsg_type != AUDIT_EOE) {
385 if (printk_ratelimit())
386 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
387 else
388 audit_log_lost("printk limit exceeded\n");
389 }
390
391 audit_hold_skb(skb);
392}
393
378static void kauditd_send_skb(struct sk_buff *skb) 394static void kauditd_send_skb(struct sk_buff *skb)
379{ 395{
380 int err; 396 int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
427 if (skb) { 443 if (skb) {
428 if (audit_pid) 444 if (audit_pid)
429 kauditd_send_skb(skb); 445 kauditd_send_skb(skb);
430 else { 446 else
431 if (printk_ratelimit()) 447 audit_printk_skb(skb);
432 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
433 else
434 audit_log_lost("printk limit exceeded\n");
435
436 audit_hold_skb(skb);
437 }
438 } else { 448 } else {
439 DECLARE_WAITQUEUE(wait, current); 449 DECLARE_WAITQUEUE(wait, current);
440 set_current_state(TASK_INTERRUPTIBLE); 450 set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
495 return 0; 505 return 0;
496} 506}
497 507
498#ifdef CONFIG_AUDIT_TREE
499static int prune_tree_thread(void *unused)
500{
501 mutex_lock(&audit_cmd_mutex);
502 audit_prune_trees();
503 mutex_unlock(&audit_cmd_mutex);
504 return 0;
505}
506
507void audit_schedule_prune(void)
508{
509 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
510}
511#endif
512
513struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
514 int multi, void *payload, int size) 509 int multi, void *payload, int size)
515{ 510{
516 struct sk_buff *skb; 511 struct sk_buff *skb;
517 struct nlmsghdr *nlh; 512 struct nlmsghdr *nlh;
518 int len = NLMSG_SPACE(size);
519 void *data; 513 void *data;
520 int flags = multi ? NLM_F_MULTI : 0; 514 int flags = multi ? NLM_F_MULTI : 0;
521 int t = done ? NLMSG_DONE : type; 515 int t = done ? NLMSG_DONE : type;
522 516
523 skb = alloc_skb(len, GFP_KERNEL); 517 skb = nlmsg_new(size, GFP_KERNEL);
524 if (!skb) 518 if (!skb)
525 return NULL; 519 return NULL;
526 520
527 nlh = NLMSG_PUT(skb, pid, seq, t, size); 521 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
528 nlh->nlmsg_flags = flags; 522 data = NLMSG_DATA(nlh);
529 data = NLMSG_DATA(nlh);
530 memcpy(data, payload, size); 523 memcpy(data, payload, size);
531 return skb; 524 return skb;
532 525
533nlmsg_failure: /* Used by NLMSG_PUT */ 526nlmsg_failure: /* Used by NLMSG_NEW */
534 if (skb) 527 if (skb)
535 kfree_skb(skb); 528 kfree_skb(skb);
536 return NULL; 529 return NULL;
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
926} 919}
927 920
928/* 921/*
929 * Get message from skb (based on rtnetlink_rcv_skb). Each message is 922 * Get message from skb. Each message is processed by audit_receive_msg.
930 * processed by audit_receive_msg. Malformed skbs with wrong length are 923 * Malformed skbs with wrong length are discarded silently.
931 * discarded silently.
932 */ 924 */
933static void audit_receive_skb(struct sk_buff *skb) 925static void audit_receive_skb(struct sk_buff *skb)
934{ 926{
935 int err; 927 struct nlmsghdr *nlh;
936 struct nlmsghdr *nlh; 928 /*
937 u32 rlen; 929 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
930 * if the nlmsg_len was not aligned
931 */
932 int len;
933 int err;
938 934
939 while (skb->len >= NLMSG_SPACE(0)) { 935 nlh = nlmsg_hdr(skb);
940 nlh = nlmsg_hdr(skb); 936 len = skb->len;
941 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) 937
942 return; 938 while (NLMSG_OK(nlh, len)) {
943 rlen = NLMSG_ALIGN(nlh->nlmsg_len); 939 err = audit_receive_msg(skb, nlh);
944 if (rlen > skb->len) 940 /* if err or if this message says it wants a response */
945 rlen = skb->len; 941 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
946 if ((err = audit_receive_msg(skb, nlh))) {
947 netlink_ack(skb, nlh, err); 942 netlink_ack(skb, nlh, err);
948 } else if (nlh->nlmsg_flags & NLM_F_ACK) 943
949 netlink_ack(skb, nlh, 0); 944 nlh = NLMSG_NEXT(nlh, len);
950 skb_pull(skb, rlen);
951 } 945 }
952} 946}
953 947
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
959 mutex_unlock(&audit_cmd_mutex); 953 mutex_unlock(&audit_cmd_mutex);
960} 954}
961 955
962#ifdef CONFIG_AUDITSYSCALL
963static const struct inotify_operations audit_inotify_ops = {
964 .handle_event = audit_handle_ievent,
965 .destroy_watch = audit_free_parent,
966};
967#endif
968
969/* Initialize audit support at boot time. */ 956/* Initialize audit support at boot time. */
970static int __init audit_init(void) 957static int __init audit_init(void)
971{ 958{
@@ -991,12 +978,6 @@ static int __init audit_init(void)
991 978
992 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 979 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
993 980
994#ifdef CONFIG_AUDITSYSCALL
995 audit_ih = inotify_init(&audit_inotify_ops);
996 if (IS_ERR(audit_ih))
997 audit_panic("cannot initialize inotify handle");
998#endif
999
1000 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) 981 for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
1001 INIT_LIST_HEAD(&audit_inode_hash[i]); 982 INIT_LIST_HEAD(&audit_inode_hash[i]);
1002 983
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1070 goto err; 1051 goto err;
1071 } 1052 }
1072 1053
1073 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
1074 if (!ab->skb)
1075 goto err;
1076
1077 ab->ctx = ctx; 1054 ab->ctx = ctx;
1078 ab->gfp_mask = gfp_mask; 1055 ab->gfp_mask = gfp_mask;
1079 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); 1056
1080 nlh->nlmsg_type = type; 1057 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1081 nlh->nlmsg_flags = 0; 1058 if (!ab->skb)
1082 nlh->nlmsg_pid = 0; 1059 goto nlmsg_failure;
1083 nlh->nlmsg_seq = 0; 1060
1061 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
1062
1084 return ab; 1063 return ab;
1064
1065nlmsg_failure: /* Used by NLMSG_NEW */
1066 kfree_skb(ab->skb);
1067 ab->skb = NULL;
1085err: 1068err:
1086 audit_buffer_free(ab); 1069 audit_buffer_free(ab);
1087 return NULL; 1070 return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1452 kfree(pathname); 1435 kfree(pathname);
1453} 1436}
1454 1437
1438void audit_log_key(struct audit_buffer *ab, char *key)
1439{
1440 audit_log_format(ab, " key=");
1441 if (key)
1442 audit_log_untrustedstring(ab, key);
1443 else
1444 audit_log_format(ab, "(null)");
1445}
1446
1455/** 1447/**
1456 * audit_log_end - end one audit record 1448 * audit_log_end - end one audit record
1457 * @ab: the audit_buffer 1449 * @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
1475 skb_queue_tail(&audit_skb_queue, ab->skb); 1467 skb_queue_tail(&audit_skb_queue, ab->skb);
1476 wake_up_interruptible(&kauditd_wait); 1468 wake_up_interruptible(&kauditd_wait);
1477 } else { 1469 } else {
1478 if (nlh->nlmsg_type != AUDIT_EOE) { 1470 audit_printk_skb(ab->skb);
1479 if (printk_ratelimit()) {
1480 printk(KERN_NOTICE "type=%d %s\n",
1481 nlh->nlmsg_type,
1482 ab->skb->data + NLMSG_SPACE(0));
1483 } else
1484 audit_log_lost("printk limit exceeded\n");
1485 }
1486 audit_hold_skb(ab->skb);
1487 } 1471 }
1488 ab->skb = NULL; 1472 ab->skb = NULL;
1489 } 1473 }
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661b..208687be4f30 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
53}; 53};
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_parent; 56struct audit_watch;
57
58struct audit_watch {
59 atomic_t count; /* reference count */
60 char *path; /* insertion path */
61 dev_t dev; /* associated superblock device */
62 unsigned long ino; /* associated inode number */
63 struct audit_parent *parent; /* associated parent */
64 struct list_head wlist; /* entry in parent->watches list */
65 struct list_head rules; /* associated rules */
66};
67
68struct audit_tree; 57struct audit_tree;
69struct audit_chunk; 58struct audit_chunk;
70 59
@@ -108,19 +97,28 @@ struct audit_netlink_list {
108 97
109int audit_send_list(void *); 98int audit_send_list(void *);
110 99
111struct inotify_watch;
112/* Inotify handle */
113extern struct inotify_handle *audit_ih;
114
115extern void audit_free_parent(struct inotify_watch *);
116extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
117 const char *, struct inode *);
118extern int selinux_audit_rule_update(void); 100extern int selinux_audit_rule_update(void);
119 101
120extern struct mutex audit_filter_mutex; 102extern struct mutex audit_filter_mutex;
121extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
122extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
123 105
106/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch);
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule);
113extern void audit_remove_watch(struct audit_watch *watch);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch);
118
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
120 struct audit_watch *watch);
121
124#ifdef CONFIG_AUDIT_TREE 122#ifdef CONFIG_AUDIT_TREE
125extern struct audit_chunk *audit_tree_lookup(const struct inode *); 123extern struct audit_chunk *audit_tree_lookup(const struct inode *);
126extern void audit_put_chunk(struct audit_chunk *); 124extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
130extern int audit_remove_tree_rule(struct audit_krule *); 128extern int audit_remove_tree_rule(struct audit_krule *);
131extern void audit_trim_trees(void); 129extern void audit_trim_trees(void);
132extern int audit_tag_tree(char *old, char *new); 130extern int audit_tag_tree(char *old, char *new);
133extern void audit_schedule_prune(void);
134extern void audit_prune_trees(void);
135extern const char *audit_tree_path(struct audit_tree *); 131extern const char *audit_tree_path(struct audit_tree *);
136extern void audit_put_tree(struct audit_tree *); 132extern void audit_put_tree(struct audit_tree *);
133extern void audit_kill_trees(struct list_head *);
137#else 134#else
138#define audit_remove_tree_rule(rule) BUG() 135#define audit_remove_tree_rule(rule) BUG()
139#define audit_add_tree_rule(rule) -EINVAL 136#define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
142#define audit_put_tree(tree) (void)0 139#define audit_put_tree(tree) (void)0
143#define audit_tag_tree(old, new) -EINVAL 140#define audit_tag_tree(old, new) -EINVAL
144#define audit_tree_path(rule) "" /* never called */ 141#define audit_tree_path(rule) "" /* never called */
142#define audit_kill_trees(list) BUG()
145#endif 143#endif
146 144
147extern char *audit_unpack_string(void **, size_t *, size_t); 145extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
160 return 0; 158 return 0;
161} 159}
162extern void audit_filter_inodes(struct task_struct *, struct audit_context *); 160extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
161extern struct list_head *audit_killed_trees(void);
163#else 162#else
164#define audit_signal_info(s,t) AUDIT_DISABLED 163#define audit_signal_info(s,t) AUDIT_DISABLED
165#define audit_filter_inodes(t,c) AUDIT_DISABLED 164#define audit_filter_inodes(t,c) AUDIT_DISABLED
166#endif 165#endif
166
167extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1f6396d76687..2451dc6f3282 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h>
5 6
6struct audit_tree; 7struct audit_tree;
7struct audit_chunk; 8struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
441 if (rule->tree) { 442 if (rule->tree) {
442 /* not a half-baked one */ 443 /* not a half-baked one */
443 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 444 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
444 audit_log_format(ab, "op=remove rule dir="); 445 audit_log_format(ab, "op=");
446 audit_log_string(ab, "remove rule");
447 audit_log_format(ab, " dir=");
445 audit_log_untrustedstring(ab, rule->tree->pathname); 448 audit_log_untrustedstring(ab, rule->tree->pathname);
446 if (rule->filterkey) { 449 audit_log_key(ab, rule->filterkey);
447 audit_log_format(ab, " key=");
448 audit_log_untrustedstring(ab, rule->filterkey);
449 } else
450 audit_log_format(ab, " key=(null)");
451 audit_log_format(ab, " list=%d res=1", rule->listnr); 450 audit_log_format(ab, " list=%d res=1", rule->listnr);
452 audit_log_end(ab); 451 audit_log_end(ab);
453 rule->tree = NULL; 452 rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
519 } 518 }
520} 519}
521 520
521static void audit_schedule_prune(void);
522
522/* called with audit_filter_mutex */ 523/* called with audit_filter_mutex */
523int audit_remove_tree_rule(struct audit_krule *rule) 524int audit_remove_tree_rule(struct audit_krule *rule)
524{ 525{
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
824 825
825/* 826/*
826 * That gets run when evict_chunk() ends up needing to kill audit_tree. 827 * That gets run when evict_chunk() ends up needing to kill audit_tree.
827 * Runs from a separate thread, with audit_cmd_mutex held. 828 * Runs from a separate thread.
828 */ 829 */
829void audit_prune_trees(void) 830static int prune_tree_thread(void *unused)
830{ 831{
832 mutex_lock(&audit_cmd_mutex);
831 mutex_lock(&audit_filter_mutex); 833 mutex_lock(&audit_filter_mutex);
832 834
833 while (!list_empty(&prune_list)) { 835 while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
844 } 846 }
845 847
846 mutex_unlock(&audit_filter_mutex); 848 mutex_unlock(&audit_filter_mutex);
849 mutex_unlock(&audit_cmd_mutex);
850 return 0;
851}
852
853static void audit_schedule_prune(void)
854{
855 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
856}
857
858/*
859 * ... and that one is done if evict_chunk() decides to delay until the end
860 * of syscall. Runs synchronously.
861 */
862void audit_kill_trees(struct list_head *list)
863{
864 mutex_lock(&audit_cmd_mutex);
865 mutex_lock(&audit_filter_mutex);
866
867 while (!list_empty(list)) {
868 struct audit_tree *victim;
869
870 victim = list_entry(list->next, struct audit_tree, list);
871 kill_rules(victim);
872 list_del_init(&victim->list);
873
874 mutex_unlock(&audit_filter_mutex);
875
876 prune_one(victim);
877
878 mutex_lock(&audit_filter_mutex);
879 }
880
881 mutex_unlock(&audit_filter_mutex);
882 mutex_unlock(&audit_cmd_mutex);
847} 883}
848 884
849/* 885/*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
854static void evict_chunk(struct audit_chunk *chunk) 890static void evict_chunk(struct audit_chunk *chunk)
855{ 891{
856 struct audit_tree *owner; 892 struct audit_tree *owner;
893 struct list_head *postponed = audit_killed_trees();
894 int need_prune = 0;
857 int n; 895 int n;
858 896
859 if (chunk->dead) 897 if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
869 owner->root = NULL; 907 owner->root = NULL;
870 list_del_init(&owner->same_root); 908 list_del_init(&owner->same_root);
871 spin_unlock(&hash_lock); 909 spin_unlock(&hash_lock);
872 kill_rules(owner); 910 if (!postponed) {
873 list_move(&owner->list, &prune_list); 911 kill_rules(owner);
874 audit_schedule_prune(); 912 list_move(&owner->list, &prune_list);
913 need_prune = 1;
914 } else {
915 list_move(&owner->list, postponed);
916 }
875 spin_lock(&hash_lock); 917 spin_lock(&hash_lock);
876 } 918 }
877 list_del_rcu(&chunk->hash); 919 list_del_rcu(&chunk->hash);
878 for (n = 0; n < chunk->count; n++) 920 for (n = 0; n < chunk->count; n++)
879 list_del_init(&chunk->owners[n].list); 921 list_del_init(&chunk->owners[n].list);
880 spin_unlock(&hash_lock); 922 spin_unlock(&hash_lock);
923 if (need_prune)
924 audit_schedule_prune();
881 mutex_unlock(&audit_filter_mutex); 925 mutex_unlock(&audit_filter_mutex);
882} 926}
883 927
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 000000000000..0e96dbc60ea9
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
1/* audit_watch.c -- watching inodes
2 *
3 * Copyright 2003-2009 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/mutex.h>
26#include <linux/fs.h>
27#include <linux/namei.h>
28#include <linux/netlink.h>
29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h>
32#include "audit.h"
33
34/*
35 * Reference counting:
36 *
37 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
38 * event. Each audit_watch holds a reference to its associated parent.
39 *
40 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
41 * audit_remove_watch(). Additionally, an audit_watch may exist
42 * temporarily to assist in searching existing filter data. Each
43 * audit_krule holds a reference to its associated watch.
44 */
45
46struct audit_watch {
47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */
50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */
53 struct list_head rules; /* associated rules */
54};
55
56struct audit_parent {
57 struct list_head ilist; /* entry in inotify registration list */
58 struct list_head watches; /* associated watches */
59 struct inotify_watch wdata; /* inotify watch data */
60 unsigned flags; /* status flags */
61};
62
63/* Inotify handle. */
64struct inotify_handle *audit_ih;
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Inotify events we care about. */
78#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
79
80static void audit_free_parent(struct inotify_watch *i_watch)
81{
82 struct audit_parent *parent;
83
84 parent = container_of(i_watch, struct audit_parent, wdata);
85 WARN_ON(!list_empty(&parent->watches));
86 kfree(parent);
87}
88
89void audit_get_watch(struct audit_watch *watch)
90{
91 atomic_inc(&watch->count);
92}
93
94void audit_put_watch(struct audit_watch *watch)
95{
96 if (atomic_dec_and_test(&watch->count)) {
97 WARN_ON(watch->parent);
98 WARN_ON(!list_empty(&watch->rules));
99 kfree(watch->path);
100 kfree(watch);
101 }
102}
103
104void audit_remove_watch(struct audit_watch *watch)
105{
106 list_del(&watch->wlist);
107 put_inotify_watch(&watch->parent->wdata);
108 watch->parent = NULL;
109 audit_put_watch(watch); /* match initial get */
110}
111
112char *audit_watch_path(struct audit_watch *watch)
113{
114 return watch->path;
115}
116
117struct list_head *audit_watch_rules(struct audit_watch *watch)
118{
119 return &watch->rules;
120}
121
122unsigned long audit_watch_inode(struct audit_watch *watch)
123{
124 return watch->ino;
125}
126
127dev_t audit_watch_dev(struct audit_watch *watch)
128{
129 return watch->dev;
130}
131
132/* Initialize a parent watch entry. */
133static struct audit_parent *audit_init_parent(struct nameidata *ndp)
134{
135 struct audit_parent *parent;
136 s32 wd;
137
138 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
139 if (unlikely(!parent))
140 return ERR_PTR(-ENOMEM);
141
142 INIT_LIST_HEAD(&parent->watches);
143 parent->flags = 0;
144
145 inotify_init_watch(&parent->wdata);
146 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
147 get_inotify_watch(&parent->wdata);
148 wd = inotify_add_watch(audit_ih, &parent->wdata,
149 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
150 if (wd < 0) {
151 audit_free_parent(&parent->wdata);
152 return ERR_PTR(wd);
153 }
154
155 return parent;
156}
157
158/* Initialize a watch entry. */
159static struct audit_watch *audit_init_watch(char *path)
160{
161 struct audit_watch *watch;
162
163 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
164 if (unlikely(!watch))
165 return ERR_PTR(-ENOMEM);
166
167 INIT_LIST_HEAD(&watch->rules);
168 atomic_set(&watch->count, 1);
169 watch->path = path;
170 watch->dev = (dev_t)-1;
171 watch->ino = (unsigned long)-1;
172
173 return watch;
174}
175
176/* Translate a watch string to kernel respresentation. */
177int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
178{
179 struct audit_watch *watch;
180
181 if (!audit_ih)
182 return -EOPNOTSUPP;
183
184 if (path[0] != '/' || path[len-1] == '/' ||
185 krule->listnr != AUDIT_FILTER_EXIT ||
186 op != Audit_equal ||
187 krule->inode_f || krule->watch || krule->tree)
188 return -EINVAL;
189
190 watch = audit_init_watch(path);
191 if (IS_ERR(watch))
192 return PTR_ERR(watch);
193
194 audit_get_watch(watch);
195 krule->watch = watch;
196
197 return 0;
198}
199
200/* Duplicate the given audit watch. The new watch's rules list is initialized
201 * to an empty list and wlist is undefined. */
202static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
203{
204 char *path;
205 struct audit_watch *new;
206
207 path = kstrdup(old->path, GFP_KERNEL);
208 if (unlikely(!path))
209 return ERR_PTR(-ENOMEM);
210
211 new = audit_init_watch(path);
212 if (IS_ERR(new)) {
213 kfree(path);
214 goto out;
215 }
216
217 new->dev = old->dev;
218 new->ino = old->ino;
219 get_inotify_watch(&old->parent->wdata);
220 new->parent = old->parent;
221
222out:
223 return new;
224}
225
226static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
227{
228 if (audit_enabled) {
229 struct audit_buffer *ab;
230 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
231 audit_log_format(ab, "auid=%u ses=%u op=",
232 audit_get_loginuid(current),
233 audit_get_sessionid(current));
234 audit_log_string(ab, op);
235 audit_log_format(ab, " path=");
236 audit_log_untrustedstring(ab, w->path);
237 audit_log_key(ab, r->filterkey);
238 audit_log_format(ab, " list=%d res=1", r->listnr);
239 audit_log_end(ab);
240 }
241}
242
243/* Update inode info in audit rules based on filesystem event. */
244static void audit_update_watch(struct audit_parent *parent,
245 const char *dname, dev_t dev,
246 unsigned long ino, unsigned invalidating)
247{
248 struct audit_watch *owatch, *nwatch, *nextw;
249 struct audit_krule *r, *nextr;
250 struct audit_entry *oentry, *nentry;
251
252 mutex_lock(&audit_filter_mutex);
253 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
254 if (audit_compare_dname_path(dname, owatch->path, NULL))
255 continue;
256
257 /* If the update involves invalidating rules, do the inode-based
258 * filtering now, so we don't omit records. */
259 if (invalidating && current->audit_context)
260 audit_filter_inodes(current, current->audit_context);
261
262 nwatch = audit_dupe_watch(owatch);
263 if (IS_ERR(nwatch)) {
264 mutex_unlock(&audit_filter_mutex);
265 audit_panic("error updating watch, skipping");
266 return;
267 }
268 nwatch->dev = dev;
269 nwatch->ino = ino;
270
271 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
272
273 oentry = container_of(r, struct audit_entry, rule);
274 list_del(&oentry->rule.rlist);
275 list_del_rcu(&oentry->list);
276
277 nentry = audit_dupe_rule(&oentry->rule, nwatch);
278 if (IS_ERR(nentry)) {
279 list_del(&oentry->rule.list);
280 audit_panic("error updating watch, removing");
281 } else {
282 int h = audit_hash_ino((u32)ino);
283 list_add(&nentry->rule.rlist, &nwatch->rules);
284 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
285 list_replace(&oentry->rule.list,
286 &nentry->rule.list);
287 }
288
289 audit_watch_log_rule_change(r, owatch, "updated rules");
290
291 call_rcu(&oentry->rcu, audit_free_rule_rcu);
292 }
293
294 audit_remove_watch(owatch);
295 goto add_watch_to_parent; /* event applies to a single watch */
296 }
297 mutex_unlock(&audit_filter_mutex);
298 return;
299
300add_watch_to_parent:
301 list_add(&nwatch->wlist, &parent->watches);
302 mutex_unlock(&audit_filter_mutex);
303 return;
304}
305
306/* Remove all watches & rules associated with a parent that is going away. */
307static void audit_remove_parent_watches(struct audit_parent *parent)
308{
309 struct audit_watch *w, *nextw;
310 struct audit_krule *r, *nextr;
311 struct audit_entry *e;
312
313 mutex_lock(&audit_filter_mutex);
314 parent->flags |= AUDIT_PARENT_INVALID;
315 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
316 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
317 e = container_of(r, struct audit_entry, rule);
318 audit_watch_log_rule_change(r, w, "remove rule");
319 list_del(&r->rlist);
320 list_del(&r->list);
321 list_del_rcu(&e->list);
322 call_rcu(&e->rcu, audit_free_rule_rcu);
323 }
324 audit_remove_watch(w);
325 }
326 mutex_unlock(&audit_filter_mutex);
327}
328
329/* Unregister inotify watches for parents on in_list.
330 * Generates an IN_IGNORED event. */
331void audit_inotify_unregister(struct list_head *in_list)
332{
333 struct audit_parent *p, *n;
334
335 list_for_each_entry_safe(p, n, in_list, ilist) {
336 list_del(&p->ilist);
337 inotify_rm_watch(audit_ih, &p->wdata);
338 /* the unpin matching the pin in audit_do_del_rule() */
339 unpin_inotify_watch(&p->wdata);
340 }
341}
342
343/* Get path information necessary for adding watches. */
344static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
345{
346 struct nameidata *ndparent, *ndwatch;
347 int err;
348
349 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
350 if (unlikely(!ndparent))
351 return -ENOMEM;
352
353 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
354 if (unlikely(!ndwatch)) {
355 kfree(ndparent);
356 return -ENOMEM;
357 }
358
359 err = path_lookup(path, LOOKUP_PARENT, ndparent);
360 if (err) {
361 kfree(ndparent);
362 kfree(ndwatch);
363 return err;
364 }
365
366 err = path_lookup(path, 0, ndwatch);
367 if (err) {
368 kfree(ndwatch);
369 ndwatch = NULL;
370 }
371
372 *ndp = ndparent;
373 *ndw = ndwatch;
374
375 return 0;
376}
377
378/* Release resources used for watch path information. */
379static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
380{
381 if (ndp) {
382 path_put(&ndp->path);
383 kfree(ndp);
384 }
385 if (ndw) {
386 path_put(&ndw->path);
387 kfree(ndw);
388 }
389}
390
391/* Associate the given rule with an existing parent inotify_watch.
392 * Caller must hold audit_filter_mutex. */
393static void audit_add_to_parent(struct audit_krule *krule,
394 struct audit_parent *parent)
395{
396 struct audit_watch *w, *watch = krule->watch;
397 int watch_found = 0;
398
399 list_for_each_entry(w, &parent->watches, wlist) {
400 if (strcmp(watch->path, w->path))
401 continue;
402
403 watch_found = 1;
404
405 /* put krule's and initial refs to temporary watch */
406 audit_put_watch(watch);
407 audit_put_watch(watch);
408
409 audit_get_watch(w);
410 krule->watch = watch = w;
411 break;
412 }
413
414 if (!watch_found) {
415 get_inotify_watch(&parent->wdata);
416 watch->parent = parent;
417
418 list_add(&watch->wlist, &parent->watches);
419 }
420 list_add(&krule->rlist, &watch->rules);
421}
422
423/* Find a matching watch entry, or add this one.
424 * Caller must hold audit_filter_mutex. */
425int audit_add_watch(struct audit_krule *krule)
426{
427 struct audit_watch *watch = krule->watch;
428 struct inotify_watch *i_watch;
429 struct audit_parent *parent;
430 struct nameidata *ndp = NULL, *ndw = NULL;
431 int ret = 0;
432
433 mutex_unlock(&audit_filter_mutex);
434
435 /* Avoid calling path_lookup under audit_filter_mutex. */
436 ret = audit_get_nd(watch->path, &ndp, &ndw);
437 if (ret) {
438 /* caller expects mutex locked */
439 mutex_lock(&audit_filter_mutex);
440 goto error;
441 }
442
443 /* update watch filter fields */
444 if (ndw) {
445 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
446 watch->ino = ndw->path.dentry->d_inode->i_ino;
447 }
448
449 /* The audit_filter_mutex must not be held during inotify calls because
450 * we hold it during inotify event callback processing. If an existing
451 * inotify watch is found, inotify_find_watch() grabs a reference before
452 * returning.
453 */
454 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
455 &i_watch) < 0) {
456 parent = audit_init_parent(ndp);
457 if (IS_ERR(parent)) {
458 /* caller expects mutex locked */
459 mutex_lock(&audit_filter_mutex);
460 ret = PTR_ERR(parent);
461 goto error;
462 }
463 } else
464 parent = container_of(i_watch, struct audit_parent, wdata);
465
466 mutex_lock(&audit_filter_mutex);
467
468 /* parent was moved before we took audit_filter_mutex */
469 if (parent->flags & AUDIT_PARENT_INVALID)
470 ret = -ENOENT;
471 else
472 audit_add_to_parent(krule, parent);
473
474 /* match get in audit_init_parent or inotify_find_watch */
475 put_inotify_watch(&parent->wdata);
476
477error:
478 audit_put_nd(ndp, ndw); /* NULL args OK */
479 return ret;
480
481}
482
483void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
484{
485 struct audit_watch *watch = krule->watch;
486 struct audit_parent *parent = watch->parent;
487
488 list_del(&krule->rlist);
489
490 if (list_empty(&watch->rules)) {
491 audit_remove_watch(watch);
492
493 if (list_empty(&parent->watches)) {
494 /* Put parent on the inotify un-registration
495 * list. Grab a reference before releasing
496 * audit_filter_mutex, to be released in
497 * audit_inotify_unregister().
498 * If filesystem is going away, just leave
499 * the sucker alone, eviction will take
500 * care of it. */
501 if (pin_inotify_watch(&parent->wdata))
502 list_add(&parent->ilist, list);
503 }
504 }
505}
506
507/* Update watch data in audit rules based on inotify events. */
508static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
509 u32 cookie, const char *dname, struct inode *inode)
510{
511 struct audit_parent *parent;
512
513 parent = container_of(i_watch, struct audit_parent, wdata);
514
515 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
516 audit_update_watch(parent, dname, inode->i_sb->s_dev,
517 inode->i_ino, 0);
518 else if (mask & (IN_DELETE|IN_MOVED_FROM))
519 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
520 /* inotify automatically removes the watch and sends IN_IGNORED */
521 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
522 audit_remove_parent_watches(parent);
523 /* inotify does not remove the watch, so remove it manually */
524 else if(mask & IN_MOVE_SELF) {
525 audit_remove_parent_watches(parent);
526 inotify_remove_watch_locked(audit_ih, i_watch);
527 } else if (mask & IN_IGNORED)
528 put_inotify_watch(i_watch);
529}
530
531static const struct inotify_operations audit_inotify_ops = {
532 .handle_event = audit_handle_ievent,
533 .destroy_watch = audit_free_parent,
534};
535
536static int __init audit_watch_init(void)
537{
538 audit_ih = inotify_init(&audit_inotify_ops);
539 if (IS_ERR(audit_ih))
540 audit_panic("cannot initialize inotify handle");
541 return 0;
542}
543subsys_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a02..a70604047f3c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h> 30#include <linux/security.h>
32#include "audit.h" 31#include "audit.h"
33 32
@@ -44,36 +43,6 @@
44 * be written directly provided audit_filter_mutex is held. 43 * be written directly provided audit_filter_mutex is held.
45 */ 44 */
46 45
47/*
48 * Reference counting:
49 *
50 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
51 * event. Each audit_watch holds a reference to its associated parent.
52 *
53 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
54 * audit_remove_watch(). Additionally, an audit_watch may exist
55 * temporarily to assist in searching existing filter data. Each
56 * audit_krule holds a reference to its associated watch.
57 */
58
59struct audit_parent {
60 struct list_head ilist; /* entry in inotify registration list */
61 struct list_head watches; /* associated watches */
62 struct inotify_watch wdata; /* inotify watch data */
63 unsigned flags; /* status flags */
64};
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Audit filter lists, defined in <linux/audit.h> */ 46/* Audit filter lists, defined in <linux/audit.h> */
78struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { 47struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
79 LIST_HEAD_INIT(audit_filter_list[0]), 48 LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
97 66
98DEFINE_MUTEX(audit_filter_mutex); 67DEFINE_MUTEX(audit_filter_mutex);
99 68
100/* Inotify events we care about. */
101#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
102
103void audit_free_parent(struct inotify_watch *i_watch)
104{
105 struct audit_parent *parent;
106
107 parent = container_of(i_watch, struct audit_parent, wdata);
108 WARN_ON(!list_empty(&parent->watches));
109 kfree(parent);
110}
111
112static inline void audit_get_watch(struct audit_watch *watch)
113{
114 atomic_inc(&watch->count);
115}
116
117static void audit_put_watch(struct audit_watch *watch)
118{
119 if (atomic_dec_and_test(&watch->count)) {
120 WARN_ON(watch->parent);
121 WARN_ON(!list_empty(&watch->rules));
122 kfree(watch->path);
123 kfree(watch);
124 }
125}
126
127static void audit_remove_watch(struct audit_watch *watch)
128{
129 list_del(&watch->wlist);
130 put_inotify_watch(&watch->parent->wdata);
131 watch->parent = NULL;
132 audit_put_watch(watch); /* match initial get */
133}
134
135static inline void audit_free_rule(struct audit_entry *e) 69static inline void audit_free_rule(struct audit_entry *e)
136{ 70{
137 int i; 71 int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
156 audit_free_rule(e); 90 audit_free_rule(e);
157} 91}
158 92
159/* Initialize a parent watch entry. */
160static struct audit_parent *audit_init_parent(struct nameidata *ndp)
161{
162 struct audit_parent *parent;
163 s32 wd;
164
165 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
166 if (unlikely(!parent))
167 return ERR_PTR(-ENOMEM);
168
169 INIT_LIST_HEAD(&parent->watches);
170 parent->flags = 0;
171
172 inotify_init_watch(&parent->wdata);
173 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
174 get_inotify_watch(&parent->wdata);
175 wd = inotify_add_watch(audit_ih, &parent->wdata,
176 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
177 if (wd < 0) {
178 audit_free_parent(&parent->wdata);
179 return ERR_PTR(wd);
180 }
181
182 return parent;
183}
184
185/* Initialize a watch entry. */
186static struct audit_watch *audit_init_watch(char *path)
187{
188 struct audit_watch *watch;
189
190 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
191 if (unlikely(!watch))
192 return ERR_PTR(-ENOMEM);
193
194 INIT_LIST_HEAD(&watch->rules);
195 atomic_set(&watch->count, 1);
196 watch->path = path;
197 watch->dev = (dev_t)-1;
198 watch->ino = (unsigned long)-1;
199
200 return watch;
201}
202
203/* Initialize an audit filterlist entry. */ 93/* Initialize an audit filterlist entry. */
204static inline struct audit_entry *audit_init_entry(u32 field_count) 94static inline struct audit_entry *audit_init_entry(u32 field_count)
205{ 95{
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
260 return 0; 150 return 0;
261} 151}
262 152
263/* Translate a watch string to kernel respresentation. */
264static int audit_to_watch(struct audit_krule *krule, char *path, int len,
265 u32 op)
266{
267 struct audit_watch *watch;
268
269 if (!audit_ih)
270 return -EOPNOTSUPP;
271
272 if (path[0] != '/' || path[len-1] == '/' ||
273 krule->listnr != AUDIT_FILTER_EXIT ||
274 op != Audit_equal ||
275 krule->inode_f || krule->watch || krule->tree)
276 return -EINVAL;
277
278 watch = audit_init_watch(path);
279 if (IS_ERR(watch))
280 return PTR_ERR(watch);
281
282 audit_get_watch(watch);
283 krule->watch = watch;
284
285 return 0;
286}
287
288static __u32 *classes[AUDIT_SYSCALL_CLASSES]; 153static __u32 *classes[AUDIT_SYSCALL_CLASSES];
289 154
290int __init audit_register_class(int class, unsigned *list) 155int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
766 break; 631 break;
767 case AUDIT_WATCH: 632 case AUDIT_WATCH:
768 data->buflen += data->values[i] = 633 data->buflen += data->values[i] =
769 audit_pack_string(&bufp, krule->watch->path); 634 audit_pack_string(&bufp,
635 audit_watch_path(krule->watch));
770 break; 636 break;
771 case AUDIT_DIR: 637 case AUDIT_DIR:
772 data->buflen += data->values[i] = 638 data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
818 return 1; 684 return 1;
819 break; 685 break;
820 case AUDIT_WATCH: 686 case AUDIT_WATCH:
821 if (strcmp(a->watch->path, b->watch->path)) 687 if (strcmp(audit_watch_path(a->watch),
688 audit_watch_path(b->watch)))
822 return 1; 689 return 1;
823 break; 690 break;
824 case AUDIT_DIR: 691 case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
844 return 0; 711 return 0;
845} 712}
846 713
847/* Duplicate the given audit watch. The new watch's rules list is initialized
848 * to an empty list and wlist is undefined. */
849static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
850{
851 char *path;
852 struct audit_watch *new;
853
854 path = kstrdup(old->path, GFP_KERNEL);
855 if (unlikely(!path))
856 return ERR_PTR(-ENOMEM);
857
858 new = audit_init_watch(path);
859 if (IS_ERR(new)) {
860 kfree(path);
861 goto out;
862 }
863
864 new->dev = old->dev;
865 new->ino = old->ino;
866 get_inotify_watch(&old->parent->wdata);
867 new->parent = old->parent;
868
869out:
870 return new;
871}
872
873/* Duplicate LSM field information. The lsm_rule is opaque, so must be 714/* Duplicate LSM field information. The lsm_rule is opaque, so must be
874 * re-initialized. */ 715 * re-initialized. */
875static inline int audit_dupe_lsm_field(struct audit_field *df, 716static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
904 * rule with the new rule in the filterlist, then free the old rule. 745 * rule with the new rule in the filterlist, then free the old rule.
905 * The rlist element is undefined; list manipulations are handled apart from 746 * The rlist element is undefined; list manipulations are handled apart from
906 * the initial copy. */ 747 * the initial copy. */
907static struct audit_entry *audit_dupe_rule(struct audit_krule *old, 748struct audit_entry *audit_dupe_rule(struct audit_krule *old,
908 struct audit_watch *watch) 749 struct audit_watch *watch)
909{ 750{
910 u32 fcount = old->field_count; 751 u32 fcount = old->field_count;
911 struct audit_entry *entry; 752 struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
977 return entry; 818 return entry;
978} 819}
979 820
980/* Update inode info in audit rules based on filesystem event. */
981static void audit_update_watch(struct audit_parent *parent,
982 const char *dname, dev_t dev,
983 unsigned long ino, unsigned invalidating)
984{
985 struct audit_watch *owatch, *nwatch, *nextw;
986 struct audit_krule *r, *nextr;
987 struct audit_entry *oentry, *nentry;
988
989 mutex_lock(&audit_filter_mutex);
990 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
991 if (audit_compare_dname_path(dname, owatch->path, NULL))
992 continue;
993
994 /* If the update involves invalidating rules, do the inode-based
995 * filtering now, so we don't omit records. */
996 if (invalidating && current->audit_context)
997 audit_filter_inodes(current, current->audit_context);
998
999 nwatch = audit_dupe_watch(owatch);
1000 if (IS_ERR(nwatch)) {
1001 mutex_unlock(&audit_filter_mutex);
1002 audit_panic("error updating watch, skipping");
1003 return;
1004 }
1005 nwatch->dev = dev;
1006 nwatch->ino = ino;
1007
1008 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
1009
1010 oentry = container_of(r, struct audit_entry, rule);
1011 list_del(&oentry->rule.rlist);
1012 list_del_rcu(&oentry->list);
1013
1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1017 audit_panic("error updating watch, removing");
1018 } else {
1019 int h = audit_hash_ino((u32)ino);
1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1024 }
1025
1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
1027 }
1028
1029 if (audit_enabled) {
1030 struct audit_buffer *ab;
1031 ab = audit_log_start(NULL, GFP_NOFS,
1032 AUDIT_CONFIG_CHANGE);
1033 audit_log_format(ab, "auid=%u ses=%u",
1034 audit_get_loginuid(current),
1035 audit_get_sessionid(current));
1036 audit_log_format(ab,
1037 " op=updated rules specifying path=");
1038 audit_log_untrustedstring(ab, owatch->path);
1039 audit_log_format(ab, " with dev=%u ino=%lu\n",
1040 dev, ino);
1041 audit_log_format(ab, " list=%d res=1", r->listnr);
1042 audit_log_end(ab);
1043 }
1044 audit_remove_watch(owatch);
1045 goto add_watch_to_parent; /* event applies to a single watch */
1046 }
1047 mutex_unlock(&audit_filter_mutex);
1048 return;
1049
1050add_watch_to_parent:
1051 list_add(&nwatch->wlist, &parent->watches);
1052 mutex_unlock(&audit_filter_mutex);
1053 return;
1054}
1055
1056/* Remove all watches & rules associated with a parent that is going away. */
1057static void audit_remove_parent_watches(struct audit_parent *parent)
1058{
1059 struct audit_watch *w, *nextw;
1060 struct audit_krule *r, *nextr;
1061 struct audit_entry *e;
1062
1063 mutex_lock(&audit_filter_mutex);
1064 parent->flags |= AUDIT_PARENT_INVALID;
1065 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
1066 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
1067 e = container_of(r, struct audit_entry, rule);
1068 if (audit_enabled) {
1069 struct audit_buffer *ab;
1070 ab = audit_log_start(NULL, GFP_NOFS,
1071 AUDIT_CONFIG_CHANGE);
1072 audit_log_format(ab, "auid=%u ses=%u",
1073 audit_get_loginuid(current),
1074 audit_get_sessionid(current));
1075 audit_log_format(ab, " op=remove rule path=");
1076 audit_log_untrustedstring(ab, w->path);
1077 if (r->filterkey) {
1078 audit_log_format(ab, " key=");
1079 audit_log_untrustedstring(ab,
1080 r->filterkey);
1081 } else
1082 audit_log_format(ab, " key=(null)");
1083 audit_log_format(ab, " list=%d res=1",
1084 r->listnr);
1085 audit_log_end(ab);
1086 }
1087 list_del(&r->rlist);
1088 list_del(&r->list);
1089 list_del_rcu(&e->list);
1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1091 }
1092 audit_remove_watch(w);
1093 }
1094 mutex_unlock(&audit_filter_mutex);
1095}
1096
1097/* Unregister inotify watches for parents on in_list.
1098 * Generates an IN_IGNORED event. */
1099static void audit_inotify_unregister(struct list_head *in_list)
1100{
1101 struct audit_parent *p, *n;
1102
1103 list_for_each_entry_safe(p, n, in_list, ilist) {
1104 list_del(&p->ilist);
1105 inotify_rm_watch(audit_ih, &p->wdata);
1106 /* the unpin matching the pin in audit_do_del_rule() */
1107 unpin_inotify_watch(&p->wdata);
1108 }
1109}
1110
1111/* Find an existing audit rule. 821/* Find an existing audit rule.
1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 822 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1113static struct audit_entry *audit_find_rule(struct audit_entry *entry, 823static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
1145 return found; 855 return found;
1146} 856}
1147 857
1148/* Get path information necessary for adding watches. */
1149static int audit_get_nd(char *path, struct nameidata **ndp,
1150 struct nameidata **ndw)
1151{
1152 struct nameidata *ndparent, *ndwatch;
1153 int err;
1154
1155 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
1156 if (unlikely(!ndparent))
1157 return -ENOMEM;
1158
1159 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
1160 if (unlikely(!ndwatch)) {
1161 kfree(ndparent);
1162 return -ENOMEM;
1163 }
1164
1165 err = path_lookup(path, LOOKUP_PARENT, ndparent);
1166 if (err) {
1167 kfree(ndparent);
1168 kfree(ndwatch);
1169 return err;
1170 }
1171
1172 err = path_lookup(path, 0, ndwatch);
1173 if (err) {
1174 kfree(ndwatch);
1175 ndwatch = NULL;
1176 }
1177
1178 *ndp = ndparent;
1179 *ndw = ndwatch;
1180
1181 return 0;
1182}
1183
1184/* Release resources used for watch path information. */
1185static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
1186{
1187 if (ndp) {
1188 path_put(&ndp->path);
1189 kfree(ndp);
1190 }
1191 if (ndw) {
1192 path_put(&ndw->path);
1193 kfree(ndw);
1194 }
1195}
1196
1197/* Associate the given rule with an existing parent inotify_watch.
1198 * Caller must hold audit_filter_mutex. */
1199static void audit_add_to_parent(struct audit_krule *krule,
1200 struct audit_parent *parent)
1201{
1202 struct audit_watch *w, *watch = krule->watch;
1203 int watch_found = 0;
1204
1205 list_for_each_entry(w, &parent->watches, wlist) {
1206 if (strcmp(watch->path, w->path))
1207 continue;
1208
1209 watch_found = 1;
1210
1211 /* put krule's and initial refs to temporary watch */
1212 audit_put_watch(watch);
1213 audit_put_watch(watch);
1214
1215 audit_get_watch(w);
1216 krule->watch = watch = w;
1217 break;
1218 }
1219
1220 if (!watch_found) {
1221 get_inotify_watch(&parent->wdata);
1222 watch->parent = parent;
1223
1224 list_add(&watch->wlist, &parent->watches);
1225 }
1226 list_add(&krule->rlist, &watch->rules);
1227}
1228
1229/* Find a matching watch entry, or add this one.
1230 * Caller must hold audit_filter_mutex. */
1231static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1232 struct nameidata *ndw)
1233{
1234 struct audit_watch *watch = krule->watch;
1235 struct inotify_watch *i_watch;
1236 struct audit_parent *parent;
1237 int ret = 0;
1238
1239 /* update watch filter fields */
1240 if (ndw) {
1241 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
1242 watch->ino = ndw->path.dentry->d_inode->i_ino;
1243 }
1244
1245 /* The audit_filter_mutex must not be held during inotify calls because
1246 * we hold it during inotify event callback processing. If an existing
1247 * inotify watch is found, inotify_find_watch() grabs a reference before
1248 * returning.
1249 */
1250 mutex_unlock(&audit_filter_mutex);
1251
1252 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
1253 &i_watch) < 0) {
1254 parent = audit_init_parent(ndp);
1255 if (IS_ERR(parent)) {
1256 /* caller expects mutex locked */
1257 mutex_lock(&audit_filter_mutex);
1258 return PTR_ERR(parent);
1259 }
1260 } else
1261 parent = container_of(i_watch, struct audit_parent, wdata);
1262
1263 mutex_lock(&audit_filter_mutex);
1264
1265 /* parent was moved before we took audit_filter_mutex */
1266 if (parent->flags & AUDIT_PARENT_INVALID)
1267 ret = -ENOENT;
1268 else
1269 audit_add_to_parent(krule, parent);
1270
1271 /* match get in audit_init_parent or inotify_find_watch */
1272 put_inotify_watch(&parent->wdata);
1273 return ret;
1274}
1275
1276static u64 prio_low = ~0ULL/2; 858static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1; 859static u64 prio_high = ~0ULL/2 - 1;
1278 860
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
1282 struct audit_entry *e; 864 struct audit_entry *e;
1283 struct audit_watch *watch = entry->rule.watch; 865 struct audit_watch *watch = entry->rule.watch;
1284 struct audit_tree *tree = entry->rule.tree; 866 struct audit_tree *tree = entry->rule.tree;
1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list; 867 struct list_head *list;
1287 int h, err; 868 int h, err;
1288#ifdef CONFIG_AUDITSYSCALL 869#ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
1296 877
1297 mutex_lock(&audit_filter_mutex); 878 mutex_lock(&audit_filter_mutex);
1298 e = audit_find_rule(entry, &list); 879 e = audit_find_rule(entry, &list);
1299 mutex_unlock(&audit_filter_mutex);
1300 if (e) { 880 if (e) {
881 mutex_unlock(&audit_filter_mutex);
1301 err = -EEXIST; 882 err = -EEXIST;
1302 /* normally audit_add_tree_rule() will free it on failure */ 883 /* normally audit_add_tree_rule() will free it on failure */
1303 if (tree) 884 if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
1305 goto error; 886 goto error;
1306 } 887 }
1307 888
1308 /* Avoid calling path_lookup under audit_filter_mutex. */
1309 if (watch) {
1310 err = audit_get_nd(watch->path, &ndp, &ndw);
1311 if (err)
1312 goto error;
1313 }
1314
1315 mutex_lock(&audit_filter_mutex);
1316 if (watch) { 889 if (watch) {
1317 /* audit_filter_mutex is dropped and re-taken during this call */ 890 /* audit_filter_mutex is dropped and re-taken during this call */
1318 err = audit_add_watch(&entry->rule, ndp, ndw); 891 err = audit_add_watch(&entry->rule);
1319 if (err) { 892 if (err) {
1320 mutex_unlock(&audit_filter_mutex); 893 mutex_unlock(&audit_filter_mutex);
1321 goto error; 894 goto error;
1322 } 895 }
1323 h = audit_hash_ino((u32)watch->ino); 896 /* entry->rule.watch may have changed during audit_add_watch() */
897 watch = entry->rule.watch;
898 h = audit_hash_ino((u32)audit_watch_inode(watch));
1324 list = &audit_inode_hash[h]; 899 list = &audit_inode_hash[h];
1325 } 900 }
1326 if (tree) { 901 if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
1358#endif 933#endif
1359 mutex_unlock(&audit_filter_mutex); 934 mutex_unlock(&audit_filter_mutex);
1360 935
1361 audit_put_nd(ndp, ndw); /* NULL args OK */
1362 return 0; 936 return 0;
1363 937
1364error: 938error:
1365 audit_put_nd(ndp, ndw); /* NULL args OK */
1366 if (watch) 939 if (watch)
1367 audit_put_watch(watch); /* tmp watch, matches initial get */ 940 audit_put_watch(watch); /* tmp watch, matches initial get */
1368 return err; 941 return err;
@@ -1372,7 +945,7 @@ error:
1372static inline int audit_del_rule(struct audit_entry *entry) 945static inline int audit_del_rule(struct audit_entry *entry)
1373{ 946{
1374 struct audit_entry *e; 947 struct audit_entry *e;
1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 948 struct audit_watch *watch = entry->rule.watch;
1376 struct audit_tree *tree = entry->rule.tree; 949 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list; 950 struct list_head *list;
1378 LIST_HEAD(inotify_list); 951 LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1394 goto out; 967 goto out;
1395 } 968 }
1396 969
1397 watch = e->rule.watch; 970 if (e->rule.watch)
1398 if (watch) { 971 audit_remove_watch_rule(&e->rule, &inotify_list);
1399 struct audit_parent *parent = watch->parent;
1400
1401 list_del(&e->rule.rlist);
1402
1403 if (list_empty(&watch->rules)) {
1404 audit_remove_watch(watch);
1405
1406 if (list_empty(&parent->watches)) {
1407 /* Put parent on the inotify un-registration
1408 * list. Grab a reference before releasing
1409 * audit_filter_mutex, to be released in
1410 * audit_inotify_unregister().
1411 * If filesystem is going away, just leave
1412 * the sucker alone, eviction will take
1413 * care of it.
1414 */
1415 if (pin_inotify_watch(&parent->wdata))
1416 list_add(&parent->ilist, &inotify_list);
1417 }
1418 }
1419 }
1420 972
1421 if (e->rule.tree) 973 if (e->rule.tree)
1422 audit_remove_tree_rule(&e->rule); 974 audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1438 audit_inotify_unregister(&inotify_list); 990 audit_inotify_unregister(&inotify_list);
1439 991
1440out: 992out:
1441 if (tmp_watch) 993 if (watch)
1442 audit_put_watch(tmp_watch); /* match initial get */ 994 audit_put_watch(watch); /* match initial get */
1443 if (tree) 995 if (tree)
1444 audit_put_tree(tree); /* that's the temporary one */ 996 audit_put_tree(tree); /* that's the temporary one */
1445 997
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1527 security_release_secctx(ctx, len); 1079 security_release_secctx(ctx, len);
1528 } 1080 }
1529 } 1081 }
1530 audit_log_format(ab, " op=%s rule key=", action); 1082 audit_log_format(ab, " op=");
1531 if (rule->filterkey) 1083 audit_log_string(ab, action);
1532 audit_log_untrustedstring(ab, rule->filterkey); 1084 audit_log_key(ab, rule->filterkey);
1533 else
1534 audit_log_format(ab, "(null)");
1535 audit_log_format(ab, " list=%d res=%d", rule->listnr, res); 1085 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1536 audit_log_end(ab); 1086 audit_log_end(ab);
1537} 1087}
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1595 return PTR_ERR(entry); 1145 return PTR_ERR(entry);
1596 1146
1597 err = audit_add_rule(entry); 1147 err = audit_add_rule(entry);
1598 audit_log_rule_change(loginuid, sessionid, sid, "add", 1148 audit_log_rule_change(loginuid, sessionid, sid, "add rule",
1599 &entry->rule, !err); 1149 &entry->rule, !err);
1600 1150
1601 if (err) 1151 if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1611 return PTR_ERR(entry); 1161 return PTR_ERR(entry);
1612 1162
1613 err = audit_del_rule(entry); 1163 err = audit_del_rule(entry);
1614 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1164 audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
1615 &entry->rule, !err); 1165 &entry->rule, !err);
1616 1166
1617 audit_free_rule(entry); 1167 audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
1793 list_del(&r->list); 1343 list_del(&r->list);
1794 } else { 1344 } else {
1795 if (watch) { 1345 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules); 1346 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1797 list_del(&r->rlist); 1347 list_del(&r->rlist);
1798 } else if (tree) 1348 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist); 1349 list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
1829 1379
1830 return err; 1380 return err;
1831} 1381}
1832
1833/* Update watch data in audit rules based on inotify events. */
1834void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1835 u32 cookie, const char *dname, struct inode *inode)
1836{
1837 struct audit_parent *parent;
1838
1839 parent = container_of(i_watch, struct audit_parent, wdata);
1840
1841 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1842 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1843 inode->i_ino, 0);
1844 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1845 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1846 /* inotify automatically removes the watch and sends IN_IGNORED */
1847 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1848 audit_remove_parent_watches(parent);
1849 /* inotify does not remove the watch, so remove it manually */
1850 else if(mask & IN_MOVE_SELF) {
1851 audit_remove_parent_watches(parent);
1852 inotify_remove_watch_locked(audit_ih, i_watch);
1853 } else if (mask & IN_IGNORED)
1854 put_inotify_watch(i_watch);
1855}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f414..68d3c6a0ecd6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count; 201 int tree_count;
202 struct list_head killed_trees;
202 203
203 int type; 204 int type;
204 union { 205 union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
548 } 549 }
549 break; 550 break;
550 case AUDIT_WATCH: 551 case AUDIT_WATCH:
551 if (name && rule->watch->ino != (unsigned long)-1) 552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
552 result = (name->dev == rule->watch->dev && 553 result = (name->dev == audit_watch_dev(rule->watch) &&
553 name->ino == rule->watch->ino); 554 name->ino == audit_watch_inode(rule->watch));
554 break; 555 break;
555 case AUDIT_DIR: 556 case AUDIT_DIR:
556 if (ctx) 557 if (ctx)
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
853 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) 854 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
854 return NULL; 855 return NULL;
855 audit_zero_context(context, state); 856 audit_zero_context(context, state);
857 INIT_LIST_HEAD(&context->killed_trees);
856 return context; 858 return context;
857} 859}
858 860
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1024{ 1026{
1025 char arg_num_len_buf[12]; 1027 char arg_num_len_buf[12];
1026 const char __user *tmp_p = p; 1028 const char __user *tmp_p = p;
1027 /* how many digits are in arg_num? 3 is the length of " a=" */ 1029 /* how many digits are in arg_num? 5 is the length of ' a=""' */
1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1030 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
1029 size_t len, len_left, to_send; 1031 size_t len, len_left, to_send;
1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1032 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
1031 unsigned int i, has_cntl = 0, too_long = 0; 1033 unsigned int i, has_cntl = 0, too_long = 0;
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1137 if (has_cntl) 1139 if (has_cntl)
1138 audit_log_n_hex(*ab, buf, to_send); 1140 audit_log_n_hex(*ab, buf, to_send);
1139 else 1141 else
1140 audit_log_format(*ab, "\"%s\"", buf); 1142 audit_log_string(*ab, buf);
1141 1143
1142 p += to_send; 1144 p += to_send;
1143 len_left -= to_send; 1145 len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1372 1374
1373 1375
1374 audit_log_task_info(ab, tsk); 1376 audit_log_task_info(ab, tsk);
1375 if (context->filterkey) { 1377 audit_log_key(ab, context->filterkey);
1376 audit_log_format(ab, " key=");
1377 audit_log_untrustedstring(ab, context->filterkey);
1378 } else
1379 audit_log_format(ab, " key=(null)");
1380 audit_log_end(ab); 1378 audit_log_end(ab);
1381 1379
1382 for (aux = context->aux; aux; aux = aux->next) { 1380 for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
1549 /* that can happen only if we are called from do_exit() */ 1547 /* that can happen only if we are called from do_exit() */
1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1548 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1551 audit_log_exit(context, tsk); 1549 audit_log_exit(context, tsk);
1550 if (!list_empty(&context->killed_trees))
1551 audit_kill_trees(&context->killed_trees);
1552 1552
1553 audit_free_context(context); 1553 audit_free_context(context);
1554} 1554}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
1692 context->in_syscall = 0; 1692 context->in_syscall = 0;
1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1694 1694
1695 if (!list_empty(&context->killed_trees))
1696 audit_kill_trees(&context->killed_trees);
1697
1695 if (context->previous) { 1698 if (context->previous) {
1696 struct audit_context *new_context = context->previous; 1699 struct audit_context *new_context = context->previous;
1697 context->previous = NULL; 1700 context->previous = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
2525 audit_log_format(ab, " sig=%ld", signr); 2528 audit_log_format(ab, " sig=%ld", signr);
2526 audit_log_end(ab); 2529 audit_log_end(ab);
2527} 2530}
2531
2532struct list_head *audit_killed_trees(void)
2533{
2534 struct audit_context *ctx = current->audit_context;
2535 if (likely(!ctx || !ctx->in_syscall))
2536 return NULL;
2537 return &ctx->killed_trees;
2538}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3737a682cdf5..b6eadfe30e7b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h> 49#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h>
50 51
51#include <asm/atomic.h> 52#include <asm/atomic.h>
52 53
@@ -734,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
734 * reference to css->refcnt. In general, this refcnt is expected to goes down 735 * reference to css->refcnt. In general, this refcnt is expected to goes down
735 * to zero, soon. 736 * to zero, soon.
736 * 737 *
737 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; 738 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
738 */ 739 */
739DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 740DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
740 741
741static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) 742static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
742{ 743{
743 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 744 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
744 wake_up_all(&cgroup_rmdir_waitq); 745 wake_up_all(&cgroup_rmdir_waitq);
745} 746}
746 747
748void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
749{
750 css_get(css);
751}
752
753void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
754{
755 cgroup_wakeup_rmdir_waiter(css->cgroup);
756 css_put(css);
757}
758
759
747static int rebind_subsystems(struct cgroupfs_root *root, 760static int rebind_subsystems(struct cgroupfs_root *root,
748 unsigned long final_bits) 761 unsigned long final_bits)
749{ 762{
@@ -960,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
960 INIT_LIST_HEAD(&cgrp->children); 973 INIT_LIST_HEAD(&cgrp->children);
961 INIT_LIST_HEAD(&cgrp->css_sets); 974 INIT_LIST_HEAD(&cgrp->css_sets);
962 INIT_LIST_HEAD(&cgrp->release_list); 975 INIT_LIST_HEAD(&cgrp->release_list);
976 INIT_LIST_HEAD(&cgrp->pids_list);
963 init_rwsem(&cgrp->pids_mutex); 977 init_rwsem(&cgrp->pids_mutex);
964} 978}
965static void init_cgroup_root(struct cgroupfs_root *root) 979static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1357,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1357 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1371 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1358 * is no longer empty. 1372 * is no longer empty.
1359 */ 1373 */
1360 cgroup_wakeup_rmdir_waiters(cgrp); 1374 cgroup_wakeup_rmdir_waiter(cgrp);
1361 return 0; 1375 return 0;
1362} 1376}
1363 1377
@@ -2201,12 +2215,30 @@ err:
2201 return ret; 2215 return ret;
2202} 2216}
2203 2217
2218/*
2219 * Cache pids for all threads in the same pid namespace that are
2220 * opening the same "tasks" file.
2221 */
2222struct cgroup_pids {
2223 /* The node in cgrp->pids_list */
2224 struct list_head list;
2225 /* The cgroup those pids belong to */
2226 struct cgroup *cgrp;
2227 /* The namepsace those pids belong to */
2228 struct pid_namespace *ns;
2229 /* Array of process ids in the cgroup */
2230 pid_t *tasks_pids;
2231 /* How many files are using the this tasks_pids array */
2232 int use_count;
2233 /* Length of the current tasks_pids array */
2234 int length;
2235};
2236
2204static int cmppid(const void *a, const void *b) 2237static int cmppid(const void *a, const void *b)
2205{ 2238{
2206 return *(pid_t *)a - *(pid_t *)b; 2239 return *(pid_t *)a - *(pid_t *)b;
2207} 2240}
2208 2241
2209
2210/* 2242/*
2211 * seq_file methods for the "tasks" file. The seq_file position is the 2243 * seq_file methods for the "tasks" file. The seq_file position is the
2212 * next pid to display; the seq_file iterator is a pointer to the pid 2244 * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2221,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2221 * after a seek to the start). Use a binary-search to find the 2253 * after a seek to the start). Use a binary-search to find the
2222 * next pid to display, if any 2254 * next pid to display, if any
2223 */ 2255 */
2224 struct cgroup *cgrp = s->private; 2256 struct cgroup_pids *cp = s->private;
2257 struct cgroup *cgrp = cp->cgrp;
2225 int index = 0, pid = *pos; 2258 int index = 0, pid = *pos;
2226 int *iter; 2259 int *iter;
2227 2260
2228 down_read(&cgrp->pids_mutex); 2261 down_read(&cgrp->pids_mutex);
2229 if (pid) { 2262 if (pid) {
2230 int end = cgrp->pids_length; 2263 int end = cp->length;
2231 2264
2232 while (index < end) { 2265 while (index < end) {
2233 int mid = (index + end) / 2; 2266 int mid = (index + end) / 2;
2234 if (cgrp->tasks_pids[mid] == pid) { 2267 if (cp->tasks_pids[mid] == pid) {
2235 index = mid; 2268 index = mid;
2236 break; 2269 break;
2237 } else if (cgrp->tasks_pids[mid] <= pid) 2270 } else if (cp->tasks_pids[mid] <= pid)
2238 index = mid + 1; 2271 index = mid + 1;
2239 else 2272 else
2240 end = mid; 2273 end = mid;
2241 } 2274 }
2242 } 2275 }
2243 /* If we're off the end of the array, we're done */ 2276 /* If we're off the end of the array, we're done */
2244 if (index >= cgrp->pids_length) 2277 if (index >= cp->length)
2245 return NULL; 2278 return NULL;
2246 /* Update the abstract position to be the actual pid that we found */ 2279 /* Update the abstract position to be the actual pid that we found */
2247 iter = cgrp->tasks_pids + index; 2280 iter = cp->tasks_pids + index;
2248 *pos = *iter; 2281 *pos = *iter;
2249 return iter; 2282 return iter;
2250} 2283}
2251 2284
2252static void cgroup_tasks_stop(struct seq_file *s, void *v) 2285static void cgroup_tasks_stop(struct seq_file *s, void *v)
2253{ 2286{
2254 struct cgroup *cgrp = s->private; 2287 struct cgroup_pids *cp = s->private;
2288 struct cgroup *cgrp = cp->cgrp;
2255 up_read(&cgrp->pids_mutex); 2289 up_read(&cgrp->pids_mutex);
2256} 2290}
2257 2291
2258static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2292static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2259{ 2293{
2260 struct cgroup *cgrp = s->private; 2294 struct cgroup_pids *cp = s->private;
2261 int *p = v; 2295 int *p = v;
2262 int *end = cgrp->tasks_pids + cgrp->pids_length; 2296 int *end = cp->tasks_pids + cp->length;
2263 2297
2264 /* 2298 /*
2265 * Advance to the next pid in the array. If this goes off the 2299 * Advance to the next pid in the array. If this goes off the
@@ -2286,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
2286 .show = cgroup_tasks_show, 2320 .show = cgroup_tasks_show,
2287}; 2321};
2288 2322
2289static void release_cgroup_pid_array(struct cgroup *cgrp) 2323static void release_cgroup_pid_array(struct cgroup_pids *cp)
2290{ 2324{
2325 struct cgroup *cgrp = cp->cgrp;
2326
2291 down_write(&cgrp->pids_mutex); 2327 down_write(&cgrp->pids_mutex);
2292 BUG_ON(!cgrp->pids_use_count); 2328 BUG_ON(!cp->use_count);
2293 if (!--cgrp->pids_use_count) { 2329 if (!--cp->use_count) {
2294 kfree(cgrp->tasks_pids); 2330 list_del(&cp->list);
2295 cgrp->tasks_pids = NULL; 2331 put_pid_ns(cp->ns);
2296 cgrp->pids_length = 0; 2332 kfree(cp->tasks_pids);
2333 kfree(cp);
2297 } 2334 }
2298 up_write(&cgrp->pids_mutex); 2335 up_write(&cgrp->pids_mutex);
2299} 2336}
2300 2337
2301static int cgroup_tasks_release(struct inode *inode, struct file *file) 2338static int cgroup_tasks_release(struct inode *inode, struct file *file)
2302{ 2339{
2303 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2340 struct seq_file *seq;
2341 struct cgroup_pids *cp;
2304 2342
2305 if (!(file->f_mode & FMODE_READ)) 2343 if (!(file->f_mode & FMODE_READ))
2306 return 0; 2344 return 0;
2307 2345
2308 release_cgroup_pid_array(cgrp); 2346 seq = file->private_data;
2347 cp = seq->private;
2348
2349 release_cgroup_pid_array(cp);
2309 return seq_release(inode, file); 2350 return seq_release(inode, file);
2310} 2351}
2311 2352
@@ -2324,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {
2324static int cgroup_tasks_open(struct inode *unused, struct file *file) 2365static int cgroup_tasks_open(struct inode *unused, struct file *file)
2325{ 2366{
2326 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2367 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2368 struct pid_namespace *ns = current->nsproxy->pid_ns;
2369 struct cgroup_pids *cp;
2327 pid_t *pidarray; 2370 pid_t *pidarray;
2328 int npids; 2371 int npids;
2329 int retval; 2372 int retval;
@@ -2350,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2350 * array if necessary 2393 * array if necessary
2351 */ 2394 */
2352 down_write(&cgrp->pids_mutex); 2395 down_write(&cgrp->pids_mutex);
2353 kfree(cgrp->tasks_pids); 2396
2354 cgrp->tasks_pids = pidarray; 2397 list_for_each_entry(cp, &cgrp->pids_list, list) {
2355 cgrp->pids_length = npids; 2398 if (ns == cp->ns)
2356 cgrp->pids_use_count++; 2399 goto found;
2400 }
2401
2402 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2403 if (!cp) {
2404 up_write(&cgrp->pids_mutex);
2405 kfree(pidarray);
2406 return -ENOMEM;
2407 }
2408 cp->cgrp = cgrp;
2409 cp->ns = ns;
2410 get_pid_ns(ns);
2411 list_add(&cp->list, &cgrp->pids_list);
2412found:
2413 kfree(cp->tasks_pids);
2414 cp->tasks_pids = pidarray;
2415 cp->length = npids;
2416 cp->use_count++;
2357 up_write(&cgrp->pids_mutex); 2417 up_write(&cgrp->pids_mutex);
2358 2418
2359 file->f_op = &cgroup_tasks_operations; 2419 file->f_op = &cgroup_tasks_operations;
2360 2420
2361 retval = seq_open(file, &cgroup_tasks_seq_operations); 2421 retval = seq_open(file, &cgroup_tasks_seq_operations);
2362 if (retval) { 2422 if (retval) {
2363 release_cgroup_pid_array(cgrp); 2423 release_cgroup_pid_array(cp);
2364 return retval; 2424 return retval;
2365 } 2425 }
2366 ((struct seq_file *)file->private_data)->private = cgrp; 2426 ((struct seq_file *)file->private_data)->private = cp;
2367 return 0; 2427 return 0;
2368} 2428}
2369 2429
@@ -2696,33 +2756,42 @@ again:
2696 mutex_unlock(&cgroup_mutex); 2756 mutex_unlock(&cgroup_mutex);
2697 2757
2698 /* 2758 /*
2759 * In general, subsystem has no css->refcnt after pre_destroy(). But
2760 * in racy cases, subsystem may have to get css->refcnt after
2761 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
2762 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
2763 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
2764 * and subsystem's reference count handling. Please see css_get/put
2765 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
2766 */
2767 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2768
2769 /*
2699 * Call pre_destroy handlers of subsys. Notify subsystems 2770 * Call pre_destroy handlers of subsys. Notify subsystems
2700 * that rmdir() request comes. 2771 * that rmdir() request comes.
2701 */ 2772 */
2702 ret = cgroup_call_pre_destroy(cgrp); 2773 ret = cgroup_call_pre_destroy(cgrp);
2703 if (ret) 2774 if (ret) {
2775 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2704 return ret; 2776 return ret;
2777 }
2705 2778
2706 mutex_lock(&cgroup_mutex); 2779 mutex_lock(&cgroup_mutex);
2707 parent = cgrp->parent; 2780 parent = cgrp->parent;
2708 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 2781 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2782 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2709 mutex_unlock(&cgroup_mutex); 2783 mutex_unlock(&cgroup_mutex);
2710 return -EBUSY; 2784 return -EBUSY;
2711 } 2785 }
2712 /*
2713 * css_put/get is provided for subsys to grab refcnt to css. In typical
2714 * case, subsystem has no reference after pre_destroy(). But, under
2715 * hierarchy management, some *temporal* refcnt can be hold.
2716 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2717 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2718 * is called when css_put() is called and refcnt goes down to 0.
2719 */
2720 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2721 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 2786 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2722
2723 if (!cgroup_clear_css_refs(cgrp)) { 2787 if (!cgroup_clear_css_refs(cgrp)) {
2724 mutex_unlock(&cgroup_mutex); 2788 mutex_unlock(&cgroup_mutex);
2725 schedule(); 2789 /*
2790 * Because someone may call cgroup_wakeup_rmdir_waiter() before
2791 * prepare_to_wait(), we need to check this flag.
2792 */
2793 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
2794 schedule();
2726 finish_wait(&cgroup_rmdir_waitq, &wait); 2795 finish_wait(&cgroup_rmdir_waitq, &wait);
2727 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 2796 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2728 if (signal_pending(current)) 2797 if (signal_pending(current))
@@ -3294,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
3294 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3363 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3295 check_for_release(cgrp); 3364 check_for_release(cgrp);
3296 } 3365 }
3297 cgroup_wakeup_rmdir_waiters(cgrp); 3366 cgroup_wakeup_rmdir_waiter(cgrp);
3298 } 3367 }
3299 rcu_read_unlock(); 3368 rcu_read_unlock();
3300} 3369}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8d..8ce10043e4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
34 * an ongoing cpu hotplug operation. 34 * an ongoing cpu hotplug operation.
35 */ 35 */
36 int refcount; 36 int refcount;
37} cpu_hotplug; 37} cpu_hotplug = {
38 38 .active_writer = NULL,
39void __init cpu_hotplug_init(void) 39 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
40{ 40 .refcount = 0,
41 cpu_hotplug.active_writer = NULL; 41};
42 mutex_init(&cpu_hotplug.lock);
43 cpu_hotplug.refcount = 0;
44}
45 42
46#ifdef CONFIG_HOTPLUG_CPU 43#ifdef CONFIG_HOTPLUG_CPU
47 44
diff --git a/kernel/exit.c b/kernel/exit.c
index 628d41f0dd54..869dc221733e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h> 15#include <linux/iocontext.h>
17#include <linux/key.h> 16#include <linux/key.h>
18#include <linux/security.h> 17#include <linux/security.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 467746b3f0aa..e6c04d462ab2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/mnt_namespace.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/mempolicy.h> 21#include <linux/mempolicy.h>
23#include <linux/sem.h> 22#include <linux/sem.h>
@@ -568,18 +567,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
568 * the value intact in a core dump, and to save the unnecessary 567 * the value intact in a core dump, and to save the unnecessary
569 * trouble otherwise. Userland only wants this done for a sys_exit. 568 * trouble otherwise. Userland only wants this done for a sys_exit.
570 */ 569 */
571 if (tsk->clear_child_tid 570 if (tsk->clear_child_tid) {
572 && !(tsk->flags & PF_SIGNALED) 571 if (!(tsk->flags & PF_SIGNALED) &&
573 && atomic_read(&mm->mm_users) > 1) { 572 atomic_read(&mm->mm_users) > 1) {
574 u32 __user * tidptr = tsk->clear_child_tid; 573 /*
574 * We don't check the error code - if userspace has
575 * not set up a proper pointer then tough luck.
576 */
577 put_user(0, tsk->clear_child_tid);
578 sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
579 1, NULL, NULL, 0);
580 }
575 tsk->clear_child_tid = NULL; 581 tsk->clear_child_tid = NULL;
576
577 /*
578 * We don't check the error code - if userspace has
579 * not set up a proper pointer then tough luck.
580 */
581 put_user(0, tidptr);
582 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
583 } 582 }
584} 583}
585 584
@@ -816,11 +815,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
816{ 815{
817 struct signal_struct *sig; 816 struct signal_struct *sig;
818 817
819 if (clone_flags & CLONE_THREAD) { 818 if (clone_flags & CLONE_THREAD)
820 atomic_inc(&current->signal->count);
821 atomic_inc(&current->signal->live);
822 return 0; 819 return 0;
823 }
824 820
825 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 821 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
826 tsk->signal = sig; 822 tsk->signal = sig;
@@ -878,16 +874,6 @@ void __cleanup_signal(struct signal_struct *sig)
878 kmem_cache_free(signal_cachep, sig); 874 kmem_cache_free(signal_cachep, sig);
879} 875}
880 876
881static void cleanup_signal(struct task_struct *tsk)
882{
883 struct signal_struct *sig = tsk->signal;
884
885 atomic_dec(&sig->live);
886
887 if (atomic_dec_and_test(&sig->count))
888 __cleanup_signal(sig);
889}
890
891static void copy_flags(unsigned long clone_flags, struct task_struct *p) 877static void copy_flags(unsigned long clone_flags, struct task_struct *p)
892{ 878{
893 unsigned long new_flags = p->flags; 879 unsigned long new_flags = p->flags;
@@ -1240,6 +1226,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1240 } 1226 }
1241 1227
1242 if (clone_flags & CLONE_THREAD) { 1228 if (clone_flags & CLONE_THREAD) {
1229 atomic_inc(&current->signal->count);
1230 atomic_inc(&current->signal->live);
1243 p->group_leader = current->group_leader; 1231 p->group_leader = current->group_leader;
1244 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1232 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1245 } 1233 }
@@ -1269,6 +1257,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1269 write_unlock_irq(&tasklist_lock); 1257 write_unlock_irq(&tasklist_lock);
1270 proc_fork_connector(p); 1258 proc_fork_connector(p);
1271 cgroup_post_fork(p); 1259 cgroup_post_fork(p);
1260 perf_counter_fork(p);
1272 return p; 1261 return p;
1273 1262
1274bad_fork_free_pid: 1263bad_fork_free_pid:
@@ -1282,7 +1271,8 @@ bad_fork_cleanup_mm:
1282 if (p->mm) 1271 if (p->mm)
1283 mmput(p->mm); 1272 mmput(p->mm);
1284bad_fork_cleanup_signal: 1273bad_fork_cleanup_signal:
1285 cleanup_signal(p); 1274 if (!(clone_flags & CLONE_THREAD))
1275 __cleanup_signal(p->signal);
1286bad_fork_cleanup_sighand: 1276bad_fork_cleanup_sighand:
1287 __cleanup_sighand(p->sighand); 1277 __cleanup_sighand(p->sighand);
1288bad_fork_cleanup_fs: 1278bad_fork_cleanup_fs:
@@ -1408,12 +1398,6 @@ long do_fork(unsigned long clone_flags,
1408 if (clone_flags & CLONE_VFORK) { 1398 if (clone_flags & CLONE_VFORK) {
1409 p->vfork_done = &vfork; 1399 p->vfork_done = &vfork;
1410 init_completion(&vfork); 1400 init_completion(&vfork);
1411 } else if (!(clone_flags & CLONE_VM)) {
1412 /*
1413 * vfork will do an exec which will call
1414 * set_task_comm()
1415 */
1416 perf_counter_fork(p);
1417 } 1401 }
1418 1402
1419 audit_finish_fork(p); 1403 audit_finish_fork(p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf7083..bd1d42b17cb2 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
44 recalc_sigpending(); /* We sent fake signal, clean it up */ 44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock); 45 spin_unlock_irq(&current->sighand->siglock);
46 46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
47 for (;;) { 50 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE); 51 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current)) 52 if (!frozen(current))
50 break; 53 break;
51 schedule(); 54 schedule();
52 } 55 }
56
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
53 pr_debug("%s left refrigerator\n", current->comm); 60 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save); 61 __set_current_state(save);
55} 62}
diff --git a/kernel/futex.c b/kernel/futex.c
index 80b5ce716596..e18cfbdc7190 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -247,6 +247,7 @@ again:
247 if (err < 0) 247 if (err < 0)
248 return err; 248 return err;
249 249
250 page = compound_head(page);
250 lock_page(page); 251 lock_page(page);
251 if (!page->mapping) { 252 if (!page->mapping) {
252 unlock_page(page); 253 unlock_page(page);
@@ -284,6 +285,25 @@ void put_futex_key(int fshared, union futex_key *key)
284 drop_futex_key_refs(key); 285 drop_futex_key_refs(key);
285} 286}
286 287
288/*
289 * fault_in_user_writeable - fault in user address and verify RW access
290 * @uaddr: pointer to faulting user space address
291 *
292 * Slow path to fixup the fault we just took in the atomic write
293 * access to @uaddr.
294 *
295 * We have no generic implementation of a non destructive write to the
296 * user address. We know that we faulted in the atomic pagefault
297 * disabled section so we can as well avoid the #PF overhead by
298 * calling get_user_pages() right away.
299 */
300static int fault_in_user_writeable(u32 __user *uaddr)
301{
302 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
303 1, 1, 0, NULL, NULL);
304 return ret < 0 ? ret : 0;
305}
306
287/** 307/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex 308 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in 309 * @hb: the hash bucket the futex_q's reside in
@@ -896,7 +916,6 @@ retry:
896retry_private: 916retry_private:
897 op_ret = futex_atomic_op_inuser(op, uaddr2); 917 op_ret = futex_atomic_op_inuser(op, uaddr2);
898 if (unlikely(op_ret < 0)) { 918 if (unlikely(op_ret < 0)) {
899 u32 dummy;
900 919
901 double_unlock_hb(hb1, hb2); 920 double_unlock_hb(hb1, hb2);
902 921
@@ -914,7 +933,7 @@ retry_private:
914 goto out_put_keys; 933 goto out_put_keys;
915 } 934 }
916 935
917 ret = get_user(dummy, uaddr2); 936 ret = fault_in_user_writeable(uaddr2);
918 if (ret) 937 if (ret)
919 goto out_put_keys; 938 goto out_put_keys;
920 939
@@ -991,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
991 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1010 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
992 * q: the futex_q 1011 * q: the futex_q
993 * key: the key of the requeue target futex 1012 * key: the key of the requeue target futex
1013 * hb: the hash_bucket of the requeue target futex
994 * 1014 *
995 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1015 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
996 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1016 * target futex if it is uncontended or via a lock steal. Set the futex_q key
997 * to the requeue target futex so the waiter can detect the wakeup on the right 1017 * to the requeue target futex so the waiter can detect the wakeup on the right
998 * futex, but remove it from the hb and NULL the rt_waiter so it can detect 1018 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
999 * atomic lock acquisition. Must be called with the q->lock_ptr held. 1019 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1020 * to protect access to the pi_state to fixup the owner later. Must be called
1021 * with both q->lock_ptr and hb->lock held.
1000 */ 1022 */
1001static inline 1023static inline
1002void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) 1024void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1025 struct futex_hash_bucket *hb)
1003{ 1026{
1004 drop_futex_key_refs(&q->key); 1027 drop_futex_key_refs(&q->key);
1005 get_futex_key_refs(key); 1028 get_futex_key_refs(key);
@@ -1011,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
1011 WARN_ON(!q->rt_waiter); 1034 WARN_ON(!q->rt_waiter);
1012 q->rt_waiter = NULL; 1035 q->rt_waiter = NULL;
1013 1036
1037 q->lock_ptr = &hb->lock;
1038#ifdef CONFIG_DEBUG_PI_LIST
1039 q->list.plist.lock = &hb->lock;
1040#endif
1041
1014 wake_up_state(q->task, TASK_NORMAL); 1042 wake_up_state(q->task, TASK_NORMAL);
1015} 1043}
1016 1044
@@ -1069,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1069 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1097 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1070 set_waiters); 1098 set_waiters);
1071 if (ret == 1) 1099 if (ret == 1)
1072 requeue_pi_wake_futex(top_waiter, key2); 1100 requeue_pi_wake_futex(top_waiter, key2, hb2);
1073 1101
1074 return ret; 1102 return ret;
1075} 1103}
@@ -1204,7 +1232,7 @@ retry_private:
1204 double_unlock_hb(hb1, hb2); 1232 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2); 1233 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1); 1234 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2); 1235 ret = fault_in_user_writeable(uaddr2);
1208 if (!ret) 1236 if (!ret)
1209 goto retry; 1237 goto retry;
1210 goto out; 1238 goto out;
@@ -1228,8 +1256,15 @@ retry_private:
1228 if (!match_futex(&this->key, &key1)) 1256 if (!match_futex(&this->key, &key1))
1229 continue; 1257 continue;
1230 1258
1231 WARN_ON(!requeue_pi && this->rt_waiter); 1259 /*
1232 WARN_ON(requeue_pi && !this->rt_waiter); 1260 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1261 * be paired with each other and no other futex ops.
1262 */
1263 if ((requeue_pi && !this->rt_waiter) ||
1264 (!requeue_pi && this->rt_waiter)) {
1265 ret = -EINVAL;
1266 break;
1267 }
1233 1268
1234 /* 1269 /*
1235 * Wake nr_wake waiters. For requeue_pi, if we acquired the 1270 * Wake nr_wake waiters. For requeue_pi, if we acquired the
@@ -1254,7 +1289,7 @@ retry_private:
1254 this->task, 1); 1289 this->task, 1);
1255 if (ret == 1) { 1290 if (ret == 1) {
1256 /* We got the lock. */ 1291 /* We got the lock. */
1257 requeue_pi_wake_futex(this, &key2); 1292 requeue_pi_wake_futex(this, &key2, hb2);
1258 continue; 1293 continue;
1259 } else if (ret) { 1294 } else if (ret) {
1260 /* -EDEADLK */ 1295 /* -EDEADLK */
@@ -1482,7 +1517,7 @@ retry:
1482handle_fault: 1517handle_fault:
1483 spin_unlock(q->lock_ptr); 1518 spin_unlock(q->lock_ptr);
1484 1519
1485 ret = get_user(uval, uaddr); 1520 ret = fault_in_user_writeable(uaddr);
1486 1521
1487 spin_lock(q->lock_ptr); 1522 spin_lock(q->lock_ptr);
1488 1523
@@ -1807,7 +1842,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1807{ 1842{
1808 struct hrtimer_sleeper timeout, *to = NULL; 1843 struct hrtimer_sleeper timeout, *to = NULL;
1809 struct futex_hash_bucket *hb; 1844 struct futex_hash_bucket *hb;
1810 u32 uval;
1811 struct futex_q q; 1845 struct futex_q q;
1812 int res, ret; 1846 int res, ret;
1813 1847
@@ -1909,16 +1943,9 @@ out:
1909 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1943 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1910 1944
1911uaddr_faulted: 1945uaddr_faulted:
1912 /*
1913 * We have to r/w *(int __user *)uaddr, and we have to modify it
1914 * atomically. Therefore, if we continue to fault after get_user()
1915 * below, we need to handle the fault ourselves, while still holding
1916 * the mmap_sem. This can occur if the uaddr is under contention as
1917 * we have to drop the mmap_sem in order to call get_user().
1918 */
1919 queue_unlock(&q, hb); 1946 queue_unlock(&q, hb);
1920 1947
1921 ret = get_user(uval, uaddr); 1948 ret = fault_in_user_writeable(uaddr);
1922 if (ret) 1949 if (ret)
1923 goto out_put_key; 1950 goto out_put_key;
1924 1951
@@ -2013,17 +2040,10 @@ out:
2013 return ret; 2040 return ret;
2014 2041
2015pi_faulted: 2042pi_faulted:
2016 /*
2017 * We have to r/w *(int __user *)uaddr, and we have to modify it
2018 * atomically. Therefore, if we continue to fault after get_user()
2019 * below, we need to handle the fault ourselves, while still holding
2020 * the mmap_sem. This can occur if the uaddr is under contention as
2021 * we have to drop the mmap_sem in order to call get_user().
2022 */
2023 spin_unlock(&hb->lock); 2043 spin_unlock(&hb->lock);
2024 put_futex_key(fshared, &key); 2044 put_futex_key(fshared, &key);
2025 2045
2026 ret = get_user(uval, uaddr); 2046 ret = fault_in_user_writeable(uaddr);
2027 if (!ret) 2047 if (!ret)
2028 goto retry; 2048 goto retry;
2029 2049
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee29..235716556bf1 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
180 int cmd = op & FUTEX_CMD_MASK; 180 int cmd = op & FUTEX_CMD_MASK;
181 181
182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
183 cmd == FUTEX_WAIT_BITSET)) { 183 cmd == FUTEX_WAIT_BITSET ||
184 cmd == FUTEX_WAIT_REQUEUE_PI)) {
184 if (get_compat_timespec(&ts, utime)) 185 if (get_compat_timespec(&ts, utime))
185 return -EFAULT; 186 return -EFAULT;
186 if (!timespec_valid(&ts)) 187 if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
191 t = ktime_add_safe(ktime_get(), t); 192 t = ktime_add_safe(ktime_get(), t);
192 tp = &t; 193 tp = &t;
193 } 194 }
194 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) 195 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
196 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
195 val2 = (int) (unsigned long) utime; 197 val2 = (int) (unsigned long) utime;
196 198
197 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 199 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9002958a96e7..49da79ab8486 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
191 } 191 }
192} 192}
193 193
194
195/*
196 * Get the preferred target CPU for NOHZ
197 */
198static int hrtimer_get_target(int this_cpu, int pinned)
199{
200#ifdef CONFIG_NO_HZ
201 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
202 int preferred_cpu = get_nohz_load_balancer();
203
204 if (preferred_cpu >= 0)
205 return preferred_cpu;
206 }
207#endif
208 return this_cpu;
209}
210
211/*
212 * With HIGHRES=y we do not migrate the timer when it is expiring
213 * before the next event on the target cpu because we cannot reprogram
214 * the target cpu hardware and we would cause it to fire late.
215 *
216 * Called with cpu_base->lock of target cpu held.
217 */
218static int
219hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
220{
221#ifdef CONFIG_HIGH_RES_TIMERS
222 ktime_t expires;
223
224 if (!new_base->cpu_base->hres_active)
225 return 0;
226
227 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
228 return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
229#else
230 return 0;
231#endif
232}
233
194/* 234/*
195 * Switch the timer base to the current CPU when possible. 235 * Switch the timer base to the current CPU when possible.
196 */ 236 */
@@ -200,16 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
200{ 240{
201 struct hrtimer_clock_base *new_base; 241 struct hrtimer_clock_base *new_base;
202 struct hrtimer_cpu_base *new_cpu_base; 242 struct hrtimer_cpu_base *new_cpu_base;
203 int cpu, preferred_cpu = -1; 243 int this_cpu = smp_processor_id();
204 244 int cpu = hrtimer_get_target(this_cpu, pinned);
205 cpu = smp_processor_id();
206#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
207 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
208 preferred_cpu = get_nohz_load_balancer();
209 if (preferred_cpu >= 0)
210 cpu = preferred_cpu;
211 }
212#endif
213 245
214again: 246again:
215 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 247 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -217,7 +249,7 @@ again:
217 249
218 if (base != new_base) { 250 if (base != new_base) {
219 /* 251 /*
220 * We are trying to schedule the timer on the local CPU. 252 * We are trying to move timer to new_base.
221 * However we can't change timer's base while it is running, 253 * However we can't change timer's base while it is running,
222 * so we keep it on the same CPU. No hassle vs. reprogramming 254 * so we keep it on the same CPU. No hassle vs. reprogramming
223 * the event source in the high resolution case. The softirq 255 * the event source in the high resolution case. The softirq
@@ -233,38 +265,12 @@ again:
233 spin_unlock(&base->cpu_base->lock); 265 spin_unlock(&base->cpu_base->lock);
234 spin_lock(&new_base->cpu_base->lock); 266 spin_lock(&new_base->cpu_base->lock);
235 267
236 /* Optimized away for NOHZ=n SMP=n */ 268 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
237 if (cpu == preferred_cpu) { 269 cpu = this_cpu;
238 /* Calculate clock monotonic expiry time */ 270 spin_unlock(&new_base->cpu_base->lock);
239#ifdef CONFIG_HIGH_RES_TIMERS 271 spin_lock(&base->cpu_base->lock);
240 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), 272 timer->base = base;
241 new_base->offset); 273 goto again;
242#else
243 ktime_t expires = hrtimer_get_expires(timer);
244#endif
245
246 /*
247 * Get the next event on target cpu from the
248 * clock events layer.
249 * This covers the highres=off nohz=on case as well.
250 */
251 ktime_t next = clockevents_get_next_event(cpu);
252
253 ktime_t delta = ktime_sub(expires, next);
254
255 /*
256 * We do not migrate the timer when it is expiring
257 * before the next event on the target cpu because
258 * we cannot reprogram the target cpu hardware and
259 * we would cause it to fire late.
260 */
261 if (delta.tv64 < 0) {
262 cpu = smp_processor_id();
263 spin_unlock(&new_base->cpu_base->lock);
264 spin_lock(&base->cpu_base->lock);
265 timer->base = base;
266 goto again;
267 }
268 } 274 }
269 timer->base = new_base; 275 timer->base = new_base;
270 } 276 }
@@ -1276,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1276 1282
1277 expires_next.tv64 = KTIME_MAX; 1283 expires_next.tv64 = KTIME_MAX;
1278 1284
1285 spin_lock(&cpu_base->lock);
1286 /*
1287 * We set expires_next to KTIME_MAX here with cpu_base->lock
1288 * held to prevent that a timer is enqueued in our queue via
1289 * the migration code. This does not affect enqueueing of
1290 * timers which run their callback and need to be requeued on
1291 * this CPU.
1292 */
1293 cpu_base->expires_next.tv64 = KTIME_MAX;
1294
1279 base = cpu_base->clock_base; 1295 base = cpu_base->clock_base;
1280 1296
1281 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1297 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1282 ktime_t basenow; 1298 ktime_t basenow;
1283 struct rb_node *node; 1299 struct rb_node *node;
1284 1300
1285 spin_lock(&cpu_base->lock);
1286
1287 basenow = ktime_add(now, base->offset); 1301 basenow = ktime_add(now, base->offset);
1288 1302
1289 while ((node = base->first)) { 1303 while ((node = base->first)) {
@@ -1316,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1316 1330
1317 __run_hrtimer(timer); 1331 __run_hrtimer(timer);
1318 } 1332 }
1319 spin_unlock(&cpu_base->lock);
1320 base++; 1333 base++;
1321 } 1334 }
1322 1335
1336 /*
1337 * Store the new expiry value so the migration code can verify
1338 * against it.
1339 */
1323 cpu_base->expires_next = expires_next; 1340 cpu_base->expires_next = expires_next;
1341 spin_unlock(&cpu_base->lock);
1324 1342
1325 /* Reprogramming necessary ? */ 1343 /* Reprogramming necessary ? */
1326 if (expires_next.tv64 != KTIME_MAX) { 1344 if (expires_next.tv64 != KTIME_MAX) {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 73468253143b..e70ed5592eb9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -42,8 +42,7 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void 45extern void irq_set_thread_affinity(struct irq_desc *desc);
46irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
47 46
48/* 47/*
49 * Debugging printout: 48 * Debugging printout:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 50da67672901..0ec9ed831737 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83void 83/**
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84 * irq_set_thread_affinity - Notify irq threads to adjust affinity
85 * @desc: irq descriptor which has affitnity changed
86 *
87 * We just set IRQTF_AFFINITY and delegate the affinity setting
88 * to the interrupt thread itself. We can not call
89 * set_cpus_allowed_ptr() here as we hold desc->lock and this
90 * code can be called from hard interrupt context.
91 */
92void irq_set_thread_affinity(struct irq_desc *desc)
85{ 93{
86 struct irqaction *action = desc->action; 94 struct irqaction *action = desc->action;
87 95
88 while (action) { 96 while (action) {
89 if (action->thread) 97 if (action->thread)
90 set_cpus_allowed_ptr(action->thread, cpumask); 98 set_bit(IRQTF_AFFINITY, &action->thread_flags);
91 action = action->next; 99 action = action->next;
92 } 100 }
93} 101}
@@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
112 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
113 if (!desc->chip->set_affinity(irq, cpumask)) { 121 if (!desc->chip->set_affinity(irq, cpumask)) {
114 cpumask_copy(desc->affinity, cpumask); 122 cpumask_copy(desc->affinity, cpumask);
115 irq_set_thread_affinity(desc, cpumask); 123 irq_set_thread_affinity(desc);
116 } 124 }
117 } 125 }
118 else { 126 else {
@@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
122#else 130#else
123 if (!desc->chip->set_affinity(irq, cpumask)) { 131 if (!desc->chip->set_affinity(irq, cpumask)) {
124 cpumask_copy(desc->affinity, cpumask); 132 cpumask_copy(desc->affinity, cpumask);
125 irq_set_thread_affinity(desc, cpumask); 133 irq_set_thread_affinity(desc);
126 } 134 }
127#endif 135#endif
128 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
@@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq)
176 spin_lock_irqsave(&desc->lock, flags); 184 spin_lock_irqsave(&desc->lock, flags);
177 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
178 if (!ret) 186 if (!ret)
179 irq_set_thread_affinity(desc, desc->affinity); 187 irq_set_thread_affinity(desc);
180 spin_unlock_irqrestore(&desc->lock, flags); 188 spin_unlock_irqrestore(&desc->lock, flags);
181 189
182 return ret; 190 return ret;
@@ -443,6 +451,39 @@ static int irq_wait_for_interrupt(struct irqaction *action)
443 return -1; 451 return -1;
444} 452}
445 453
454#ifdef CONFIG_SMP
455/*
456 * Check whether we need to change the affinity of the interrupt thread.
457 */
458static void
459irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
460{
461 cpumask_var_t mask;
462
463 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
464 return;
465
466 /*
467 * In case we are out of memory we set IRQTF_AFFINITY again and
468 * try again next time
469 */
470 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
471 set_bit(IRQTF_AFFINITY, &action->thread_flags);
472 return;
473 }
474
475 spin_lock_irq(&desc->lock);
476 cpumask_copy(mask, desc->affinity);
477 spin_unlock_irq(&desc->lock);
478
479 set_cpus_allowed_ptr(current, mask);
480 free_cpumask_var(mask);
481}
482#else
483static inline void
484irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
485#endif
486
446/* 487/*
447 * Interrupt handler thread 488 * Interrupt handler thread
448 */ 489 */
@@ -458,6 +499,8 @@ static int irq_thread(void *data)
458 499
459 while (!irq_wait_for_interrupt(action)) { 500 while (!irq_wait_for_interrupt(action)) {
460 501
502 irq_thread_check_affinity(desc, action);
503
461 atomic_inc(&desc->threads_active); 504 atomic_inc(&desc->threads_active);
462 505
463 spin_lock_irq(&desc->lock); 506 spin_lock_irq(&desc->lock);
@@ -564,7 +607,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
564 */ 607 */
565 get_task_struct(t); 608 get_task_struct(t);
566 new->thread = t; 609 new->thread = t;
567 wake_up_process(t);
568 } 610 }
569 611
570 /* 612 /*
@@ -647,6 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
647 (int)(new->flags & IRQF_TRIGGER_MASK)); 689 (int)(new->flags & IRQF_TRIGGER_MASK));
648 } 690 }
649 691
692 new->irq = irq;
650 *old_ptr = new; 693 *old_ptr = new;
651 694
652 /* Reset broken irq detection when installing new handler */ 695 /* Reset broken irq detection when installing new handler */
@@ -664,7 +707,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
664 707
665 spin_unlock_irqrestore(&desc->lock, flags); 708 spin_unlock_irqrestore(&desc->lock, flags);
666 709
667 new->irq = irq; 710 /*
711 * Strictly no need to wake it up, but hung_task complains
712 * when no hard interrupt wakes the thread up.
713 */
714 if (new->thread)
715 wake_up_process(new->thread);
716
668 register_irq_proc(irq, desc); 717 register_irq_proc(irq, desc);
669 new->dir = NULL; 718 new->dir = NULL;
670 register_handler_proc(irq, new); 719 register_handler_proc(irq, new);
@@ -718,7 +767,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
718{ 767{
719 struct irq_desc *desc = irq_to_desc(irq); 768 struct irq_desc *desc = irq_to_desc(irq);
720 struct irqaction *action, **action_ptr; 769 struct irqaction *action, **action_ptr;
721 struct task_struct *irqthread;
722 unsigned long flags; 770 unsigned long flags;
723 771
724 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); 772 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -766,9 +814,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
766 desc->chip->disable(irq); 814 desc->chip->disable(irq);
767 } 815 }
768 816
769 irqthread = action->thread;
770 action->thread = NULL;
771
772 spin_unlock_irqrestore(&desc->lock, flags); 817 spin_unlock_irqrestore(&desc->lock, flags);
773 818
774 unregister_handler_proc(irq, action); 819 unregister_handler_proc(irq, action);
@@ -776,12 +821,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
776 /* Make sure it's not being used on another CPU: */ 821 /* Make sure it's not being used on another CPU: */
777 synchronize_irq(irq); 822 synchronize_irq(irq);
778 823
779 if (irqthread) {
780 if (!test_bit(IRQTF_DIED, &action->thread_flags))
781 kthread_stop(irqthread);
782 put_task_struct(irqthread);
783 }
784
785#ifdef CONFIG_DEBUG_SHIRQ 824#ifdef CONFIG_DEBUG_SHIRQ
786 /* 825 /*
787 * It's a shared IRQ -- the driver ought to be prepared for an IRQ 826 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -797,6 +836,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
797 local_irq_restore(flags); 836 local_irq_restore(flags);
798 } 837 }
799#endif 838#endif
839
840 if (action->thread) {
841 if (!test_bit(IRQTF_DIED, &action->thread_flags))
842 kthread_stop(action->thread);
843 put_task_struct(action->thread);
844 }
845
800 return action; 846 return action;
801} 847}
802 848
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index cfe767ca1545..fcb6c96f2627 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -45,7 +45,7 @@ void move_masked_irq(int irq)
45 < nr_cpu_ids)) 45 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
47 cpumask_copy(desc->affinity, desc->pending_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc, desc->pending_mask); 48 irq_set_thread_affinity(desc);
49 } 49 }
50 50
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 2f69bee57bf2..3fd30197da2e 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -107,8 +107,8 @@ out_unlock:
107 107
108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
109{ 109{
110 /* those all static, do move them */ 110 /* those static or target node is -1, do not move them */
111 if (desc->irq < NR_IRQS_LEGACY) 111 if (desc->irq < NR_IRQS_LEGACY || node == -1)
112 return desc; 112 return desc;
113 113
114 if (desc->node != node) 114 if (desc->node != node)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ae1c35201cc8..f336e2107f98 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1228 } while (*cur++ == ','); 1228 } while (*cur++ == ',');
1229 1229
1230 if (*crash_size > 0) { 1230 if (*crash_size > 0) {
1231 while (*cur != ' ' && *cur != '@') 1231 while (*cur && *cur != ' ' && *cur != '@')
1232 cur++; 1232 cur++;
1233 if (*cur == '@') { 1233 if (*cur == '@') {
1234 cur++; 1234 cur++;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bfc..385c31a1bdbf 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 27#include <linux/completion.h>
29#include <linux/file.h> 28#include <linux/file.h>
30#include <linux/fdtable.h> 29#include <linux/fdtable.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c0fa54b276d9..0540948e29ab 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void)
237{ 237{
238 struct kprobe_insn_page *kip; 238 struct kprobe_insn_page *kip;
239 struct hlist_node *pos, *next; 239 struct hlist_node *pos, *next;
240 int safety;
241 240
242 /* Ensure no-one is preepmted on the garbages */ 241 /* Ensure no-one is preepmted on the garbages */
243 mutex_unlock(&kprobe_insn_mutex); 242 if (check_safety())
244 safety = check_safety();
245 mutex_lock(&kprobe_insn_mutex);
246 if (safety != 0)
247 return -EAGAIN; 243 return -EAGAIN;
248 244
249 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 245 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -698,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p)
698 p->addr = addr; 694 p->addr = addr;
699 695
700 preempt_disable(); 696 preempt_disable();
701 if (!__kernel_text_address((unsigned long) p->addr) || 697 if (!kernel_text_address((unsigned long) p->addr) ||
702 in_kprobes_functions((unsigned long) p->addr)) { 698 in_kprobes_functions((unsigned long) p->addr)) {
703 preempt_enable(); 699 preempt_enable();
704 return -EINVAL; 700 return -EINVAL;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9b1a7de26979..eb8751aa0418 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,10 +180,12 @@ EXPORT_SYMBOL(kthread_bind);
180 * @k: thread created by kthread_create(). 180 * @k: thread created by kthread_create().
181 * 181 *
182 * Sets kthread_should_stop() for @k to return true, wakes it, and 182 * Sets kthread_should_stop() for @k to return true, wakes it, and
183 * waits for it to exit. Your threadfn() must not call do_exit() 183 * waits for it to exit. This can also be called after kthread_create()
184 * itself if you use this function! This can also be called after 184 * instead of calling wake_up_process(): the thread will exit without
185 * kthread_create() instead of calling wake_up_process(): the thread 185 * calling threadfn().
186 * will exit without calling threadfn(). 186 *
187 * If threadfn() may call do_exit() itself, the caller must ensure
188 * task_struct can't go away.
187 * 189 *
188 * Returns the result of threadfn(), or %-EINTR if wake_up_process() 190 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
189 * was never called. 191 * was never called.
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa2d2c4..e94caa666dba 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void)
758 &proc_lockdep_stats_operations); 758 &proc_lockdep_stats_operations);
759 759
760#ifdef CONFIG_LOCK_STAT 760#ifdef CONFIG_LOCK_STAT
761 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); 761 proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
762 &proc_lock_stat_operations);
762#endif 763#endif
763 764
764 return 0; 765 return 0;
diff --git a/kernel/module.c b/kernel/module.c
index 38928fcaff2b..2d537186191f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -909,16 +909,18 @@ void __symbol_put(const char *symbol)
909} 909}
910EXPORT_SYMBOL(__symbol_put); 910EXPORT_SYMBOL(__symbol_put);
911 911
912/* Note this assumes addr is a function, which it currently always is. */
912void symbol_put_addr(void *addr) 913void symbol_put_addr(void *addr)
913{ 914{
914 struct module *modaddr; 915 struct module *modaddr;
916 unsigned long a = (unsigned long)dereference_function_descriptor(addr);
915 917
916 if (core_kernel_text((unsigned long)addr)) 918 if (core_kernel_text(a))
917 return; 919 return;
918 920
919 /* module_text_address is safe here: we're supposed to have reference 921 /* module_text_address is safe here: we're supposed to have reference
920 * to module from symbol_get, so it can't go away. */ 922 * to module from symbol_get, so it can't go away. */
921 modaddr = __module_text_address((unsigned long)addr); 923 modaddr = __module_text_address(a);
922 BUG_ON(!modaddr); 924 BUG_ON(!modaddr);
923 module_put(modaddr); 925 module_put(modaddr);
924} 926}
@@ -1068,7 +1070,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1068{ 1070{
1069 const unsigned long *crc; 1071 const unsigned long *crc;
1070 1072
1071 if (!find_symbol("module_layout", NULL, &crc, true, false)) 1073 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1074 &crc, true, false))
1072 BUG(); 1075 BUG();
1073 return check_version(sechdrs, versindex, "module_layout", mod, crc); 1076 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1074} 1077}
@@ -1271,6 +1274,10 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1271 struct module_notes_attrs *notes_attrs; 1274 struct module_notes_attrs *notes_attrs;
1272 struct bin_attribute *nattr; 1275 struct bin_attribute *nattr;
1273 1276
1277 /* failed to create section attributes, so can't create notes */
1278 if (!mod->sect_attrs)
1279 return;
1280
1274 /* Count notes sections and allocate structures. */ 1281 /* Count notes sections and allocate structures. */
1275 notes = 0; 1282 notes = 0;
1276 for (i = 0; i < nsect; i++) 1283 for (i = 0; i < nsect; i++)
@@ -2451,9 +2458,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2451 return ret; 2458 return ret;
2452 } 2459 }
2453 if (ret > 0) { 2460 if (ret > 0) {
2454 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " 2461 printk(KERN_WARNING
2455 "it should follow 0/-E convention\n" 2462"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2456 KERN_WARNING "%s: loading module anyway...\n", 2463"%s: loading module anyway...\n",
2457 __func__, mod->name, ret, 2464 __func__, mod->name, ret,
2458 __func__); 2465 __func__);
2459 dump_stack(); 2466 dump_stack();
diff --git a/kernel/panic.c b/kernel/panic.c
index 984b3ecbd72c..512ab73b0ca3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -301,6 +301,7 @@ int oops_may_print(void)
301 */ 301 */
302void oops_enter(void) 302void oops_enter(void)
303{ 303{
304 tracing_off();
304 /* can't trust the integrity of the kernel anymore: */ 305 /* can't trust the integrity of the kernel anymore: */
305 debug_locks_off(); 306 debug_locks_off();
306 do_oops_enter_exit(); 307 do_oops_enter_exit();
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1a933a221ea4..d7cbc579fc80 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;
42static atomic_t nr_counters __read_mostly; 42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly; 43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly; 44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
45 46
46/* 47/*
47 * perf counter paranoia level: 48 * perf counter paranoia level:
@@ -49,7 +50,7 @@ static atomic_t nr_comm_counters __read_mostly;
49 * 1 - disallow cpu counters to unpriv 50 * 1 - disallow cpu counters to unpriv
50 * 2 - disallow kernel profiling to unpriv 51 * 2 - disallow kernel profiling to unpriv
51 */ 52 */
52int sysctl_perf_counter_paranoid __read_mostly; 53int sysctl_perf_counter_paranoid __read_mostly = 1;
53 54
54static inline bool perf_paranoid_cpu(void) 55static inline bool perf_paranoid_cpu(void)
55{ 56{
@@ -87,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); } 88void __weak hw_perf_enable(void) { barrier(); }
88 89
89void __weak hw_perf_counter_setup(int cpu) { barrier(); } 90void __weak hw_perf_counter_setup(int cpu) { barrier(); }
91void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
90 92
91int __weak 93int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader, 94hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -146,6 +148,28 @@ static void put_ctx(struct perf_counter_context *ctx)
146 } 148 }
147} 149}
148 150
151static void unclone_ctx(struct perf_counter_context *ctx)
152{
153 if (ctx->parent_ctx) {
154 put_ctx(ctx->parent_ctx);
155 ctx->parent_ctx = NULL;
156 }
157}
158
159/*
160 * If we inherit counters we want to return the parent counter id
161 * to userspace.
162 */
163static u64 primary_counter_id(struct perf_counter *counter)
164{
165 u64 id = counter->id;
166
167 if (counter->parent)
168 id = counter->parent->id;
169
170 return id;
171}
172
149/* 173/*
150 * Get the perf_counter_context for a task and lock it. 174 * Get the perf_counter_context for a task and lock it.
151 * This has to cope with with the fact that until it is locked, 175 * This has to cope with with the fact that until it is locked,
@@ -236,6 +260,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
236 260
237 list_add_rcu(&counter->event_entry, &ctx->event_list); 261 list_add_rcu(&counter->event_entry, &ctx->event_list);
238 ctx->nr_counters++; 262 ctx->nr_counters++;
263 if (counter->attr.inherit_stat)
264 ctx->nr_stat++;
239} 265}
240 266
241/* 267/*
@@ -250,6 +276,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
250 if (list_empty(&counter->list_entry)) 276 if (list_empty(&counter->list_entry))
251 return; 277 return;
252 ctx->nr_counters--; 278 ctx->nr_counters--;
279 if (counter->attr.inherit_stat)
280 ctx->nr_stat--;
253 281
254 list_del_init(&counter->list_entry); 282 list_del_init(&counter->list_entry);
255 list_del_rcu(&counter->event_entry); 283 list_del_rcu(&counter->event_entry);
@@ -279,6 +307,10 @@ counter_sched_out(struct perf_counter *counter,
279 return; 307 return;
280 308
281 counter->state = PERF_COUNTER_STATE_INACTIVE; 309 counter->state = PERF_COUNTER_STATE_INACTIVE;
310 if (counter->pending_disable) {
311 counter->pending_disable = 0;
312 counter->state = PERF_COUNTER_STATE_OFF;
313 }
282 counter->tstamp_stopped = ctx->time; 314 counter->tstamp_stopped = ctx->time;
283 counter->pmu->disable(counter); 315 counter->pmu->disable(counter);
284 counter->oncpu = -1; 316 counter->oncpu = -1;
@@ -1006,6 +1038,81 @@ static int context_equiv(struct perf_counter_context *ctx1,
1006 && !ctx1->pin_count && !ctx2->pin_count; 1038 && !ctx1->pin_count && !ctx2->pin_count;
1007} 1039}
1008 1040
1041static void __perf_counter_read(void *counter);
1042
1043static void __perf_counter_sync_stat(struct perf_counter *counter,
1044 struct perf_counter *next_counter)
1045{
1046 u64 value;
1047
1048 if (!counter->attr.inherit_stat)
1049 return;
1050
1051 /*
1052 * Update the counter value, we cannot use perf_counter_read()
1053 * because we're in the middle of a context switch and have IRQs
1054 * disabled, which upsets smp_call_function_single(), however
1055 * we know the counter must be on the current CPU, therefore we
1056 * don't need to use it.
1057 */
1058 switch (counter->state) {
1059 case PERF_COUNTER_STATE_ACTIVE:
1060 __perf_counter_read(counter);
1061 break;
1062
1063 case PERF_COUNTER_STATE_INACTIVE:
1064 update_counter_times(counter);
1065 break;
1066
1067 default:
1068 break;
1069 }
1070
1071 /*
1072 * In order to keep per-task stats reliable we need to flip the counter
1073 * values when we flip the contexts.
1074 */
1075 value = atomic64_read(&next_counter->count);
1076 value = atomic64_xchg(&counter->count, value);
1077 atomic64_set(&next_counter->count, value);
1078
1079 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1080 swap(counter->total_time_running, next_counter->total_time_running);
1081
1082 /*
1083 * Since we swizzled the values, update the user visible data too.
1084 */
1085 perf_counter_update_userpage(counter);
1086 perf_counter_update_userpage(next_counter);
1087}
1088
1089#define list_next_entry(pos, member) \
1090 list_entry(pos->member.next, typeof(*pos), member)
1091
1092static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1093 struct perf_counter_context *next_ctx)
1094{
1095 struct perf_counter *counter, *next_counter;
1096
1097 if (!ctx->nr_stat)
1098 return;
1099
1100 counter = list_first_entry(&ctx->event_list,
1101 struct perf_counter, event_entry);
1102
1103 next_counter = list_first_entry(&next_ctx->event_list,
1104 struct perf_counter, event_entry);
1105
1106 while (&counter->event_entry != &ctx->event_list &&
1107 &next_counter->event_entry != &next_ctx->event_list) {
1108
1109 __perf_counter_sync_stat(counter, next_counter);
1110
1111 counter = list_next_entry(counter, event_entry);
1112 next_counter = list_next_entry(next_counter, event_entry);
1113 }
1114}
1115
1009/* 1116/*
1010 * Called from scheduler to remove the counters of the current task, 1117 * Called from scheduler to remove the counters of the current task,
1011 * with interrupts disabled. 1118 * with interrupts disabled.
@@ -1061,6 +1168,8 @@ void perf_counter_task_sched_out(struct task_struct *task,
1061 ctx->task = next; 1168 ctx->task = next;
1062 next_ctx->task = task; 1169 next_ctx->task = task;
1063 do_switch = 0; 1170 do_switch = 0;
1171
1172 perf_counter_sync_stat(ctx, next_ctx);
1064 } 1173 }
1065 spin_unlock(&next_ctx->lock); 1174 spin_unlock(&next_ctx->lock);
1066 spin_unlock(&ctx->lock); 1175 spin_unlock(&ctx->lock);
@@ -1207,7 +1316,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1207#define MAX_INTERRUPTS (~0ULL) 1316#define MAX_INTERRUPTS (~0ULL)
1208 1317
1209static void perf_log_throttle(struct perf_counter *counter, int enable); 1318static void perf_log_throttle(struct perf_counter *counter, int enable);
1210static void perf_log_period(struct perf_counter *counter, u64 period);
1211 1319
1212static void perf_adjust_period(struct perf_counter *counter, u64 events) 1320static void perf_adjust_period(struct perf_counter *counter, u64 events)
1213{ 1321{
@@ -1226,8 +1334,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events)
1226 if (!sample_period) 1334 if (!sample_period)
1227 sample_period = 1; 1335 sample_period = 1;
1228 1336
1229 perf_log_period(counter, sample_period);
1230
1231 hwc->sample_period = sample_period; 1337 hwc->sample_period = sample_period;
1232} 1338}
1233 1339
@@ -1348,14 +1454,70 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
1348} 1454}
1349 1455
1350/* 1456/*
1457 * Enable all of a task's counters that have been marked enable-on-exec.
1458 * This expects task == current.
1459 */
1460static void perf_counter_enable_on_exec(struct task_struct *task)
1461{
1462 struct perf_counter_context *ctx;
1463 struct perf_counter *counter;
1464 unsigned long flags;
1465 int enabled = 0;
1466
1467 local_irq_save(flags);
1468 ctx = task->perf_counter_ctxp;
1469 if (!ctx || !ctx->nr_counters)
1470 goto out;
1471
1472 __perf_counter_task_sched_out(ctx);
1473
1474 spin_lock(&ctx->lock);
1475
1476 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1477 if (!counter->attr.enable_on_exec)
1478 continue;
1479 counter->attr.enable_on_exec = 0;
1480 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1481 continue;
1482 counter->state = PERF_COUNTER_STATE_INACTIVE;
1483 counter->tstamp_enabled =
1484 ctx->time - counter->total_time_enabled;
1485 enabled = 1;
1486 }
1487
1488 /*
1489 * Unclone this context if we enabled any counter.
1490 */
1491 if (enabled)
1492 unclone_ctx(ctx);
1493
1494 spin_unlock(&ctx->lock);
1495
1496 perf_counter_task_sched_in(task, smp_processor_id());
1497 out:
1498 local_irq_restore(flags);
1499}
1500
1501/*
1351 * Cross CPU call to read the hardware counter 1502 * Cross CPU call to read the hardware counter
1352 */ 1503 */
1353static void __read(void *info) 1504static void __perf_counter_read(void *info)
1354{ 1505{
1506 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1355 struct perf_counter *counter = info; 1507 struct perf_counter *counter = info;
1356 struct perf_counter_context *ctx = counter->ctx; 1508 struct perf_counter_context *ctx = counter->ctx;
1357 unsigned long flags; 1509 unsigned long flags;
1358 1510
1511 /*
1512 * If this is a task context, we need to check whether it is
1513 * the current task context of this cpu. If not it has been
1514 * scheduled out before the smp call arrived. In that case
1515 * counter->count would have been updated to a recent sample
1516 * when the counter was scheduled out.
1517 */
1518 if (ctx->task && cpuctx->task_ctx != ctx)
1519 return;
1520
1359 local_irq_save(flags); 1521 local_irq_save(flags);
1360 if (ctx->is_active) 1522 if (ctx->is_active)
1361 update_context_time(ctx); 1523 update_context_time(ctx);
@@ -1372,7 +1534,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
1372 */ 1534 */
1373 if (counter->state == PERF_COUNTER_STATE_ACTIVE) { 1535 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1374 smp_call_function_single(counter->oncpu, 1536 smp_call_function_single(counter->oncpu,
1375 __read, counter, 1); 1537 __perf_counter_read, counter, 1);
1376 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 1538 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1377 update_counter_times(counter); 1539 update_counter_times(counter);
1378 } 1540 }
@@ -1398,7 +1560,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
1398 1560
1399static struct perf_counter_context *find_get_context(pid_t pid, int cpu) 1561static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1400{ 1562{
1401 struct perf_counter_context *parent_ctx;
1402 struct perf_counter_context *ctx; 1563 struct perf_counter_context *ctx;
1403 struct perf_cpu_context *cpuctx; 1564 struct perf_cpu_context *cpuctx;
1404 struct task_struct *task; 1565 struct task_struct *task;
@@ -1458,11 +1619,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1458 retry: 1619 retry:
1459 ctx = perf_lock_task_context(task, &flags); 1620 ctx = perf_lock_task_context(task, &flags);
1460 if (ctx) { 1621 if (ctx) {
1461 parent_ctx = ctx->parent_ctx; 1622 unclone_ctx(ctx);
1462 if (parent_ctx) {
1463 put_ctx(parent_ctx);
1464 ctx->parent_ctx = NULL; /* no longer a clone */
1465 }
1466 spin_unlock_irqrestore(&ctx->lock, flags); 1623 spin_unlock_irqrestore(&ctx->lock, flags);
1467 } 1624 }
1468 1625
@@ -1508,11 +1665,15 @@ static void free_counter(struct perf_counter *counter)
1508{ 1665{
1509 perf_pending_sync(counter); 1666 perf_pending_sync(counter);
1510 1667
1511 atomic_dec(&nr_counters); 1668 if (!counter->parent) {
1512 if (counter->attr.mmap) 1669 atomic_dec(&nr_counters);
1513 atomic_dec(&nr_mmap_counters); 1670 if (counter->attr.mmap)
1514 if (counter->attr.comm) 1671 atomic_dec(&nr_mmap_counters);
1515 atomic_dec(&nr_comm_counters); 1672 if (counter->attr.comm)
1673 atomic_dec(&nr_comm_counters);
1674 if (counter->attr.task)
1675 atomic_dec(&nr_task_counters);
1676 }
1516 1677
1517 if (counter->destroy) 1678 if (counter->destroy)
1518 counter->destroy(counter); 1679 counter->destroy(counter);
@@ -1546,14 +1707,133 @@ static int perf_release(struct inode *inode, struct file *file)
1546 return 0; 1707 return 0;
1547} 1708}
1548 1709
1710static int perf_counter_read_size(struct perf_counter *counter)
1711{
1712 int entry = sizeof(u64); /* value */
1713 int size = 0;
1714 int nr = 1;
1715
1716 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1717 size += sizeof(u64);
1718
1719 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1720 size += sizeof(u64);
1721
1722 if (counter->attr.read_format & PERF_FORMAT_ID)
1723 entry += sizeof(u64);
1724
1725 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1726 nr += counter->group_leader->nr_siblings;
1727 size += sizeof(u64);
1728 }
1729
1730 size += entry * nr;
1731
1732 return size;
1733}
1734
1735static u64 perf_counter_read_value(struct perf_counter *counter)
1736{
1737 struct perf_counter *child;
1738 u64 total = 0;
1739
1740 total += perf_counter_read(counter);
1741 list_for_each_entry(child, &counter->child_list, child_list)
1742 total += perf_counter_read(child);
1743
1744 return total;
1745}
1746
1747static int perf_counter_read_entry(struct perf_counter *counter,
1748 u64 read_format, char __user *buf)
1749{
1750 int n = 0, count = 0;
1751 u64 values[2];
1752
1753 values[n++] = perf_counter_read_value(counter);
1754 if (read_format & PERF_FORMAT_ID)
1755 values[n++] = primary_counter_id(counter);
1756
1757 count = n * sizeof(u64);
1758
1759 if (copy_to_user(buf, values, count))
1760 return -EFAULT;
1761
1762 return count;
1763}
1764
1765static int perf_counter_read_group(struct perf_counter *counter,
1766 u64 read_format, char __user *buf)
1767{
1768 struct perf_counter *leader = counter->group_leader, *sub;
1769 int n = 0, size = 0, err = -EFAULT;
1770 u64 values[3];
1771
1772 values[n++] = 1 + leader->nr_siblings;
1773 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1774 values[n++] = leader->total_time_enabled +
1775 atomic64_read(&leader->child_total_time_enabled);
1776 }
1777 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1778 values[n++] = leader->total_time_running +
1779 atomic64_read(&leader->child_total_time_running);
1780 }
1781
1782 size = n * sizeof(u64);
1783
1784 if (copy_to_user(buf, values, size))
1785 return -EFAULT;
1786
1787 err = perf_counter_read_entry(leader, read_format, buf + size);
1788 if (err < 0)
1789 return err;
1790
1791 size += err;
1792
1793 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1794 err = perf_counter_read_entry(sub, read_format,
1795 buf + size);
1796 if (err < 0)
1797 return err;
1798
1799 size += err;
1800 }
1801
1802 return size;
1803}
1804
1805static int perf_counter_read_one(struct perf_counter *counter,
1806 u64 read_format, char __user *buf)
1807{
1808 u64 values[4];
1809 int n = 0;
1810
1811 values[n++] = perf_counter_read_value(counter);
1812 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1813 values[n++] = counter->total_time_enabled +
1814 atomic64_read(&counter->child_total_time_enabled);
1815 }
1816 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1817 values[n++] = counter->total_time_running +
1818 atomic64_read(&counter->child_total_time_running);
1819 }
1820 if (read_format & PERF_FORMAT_ID)
1821 values[n++] = primary_counter_id(counter);
1822
1823 if (copy_to_user(buf, values, n * sizeof(u64)))
1824 return -EFAULT;
1825
1826 return n * sizeof(u64);
1827}
1828
1549/* 1829/*
1550 * Read the performance counter - simple non blocking version for now 1830 * Read the performance counter - simple non blocking version for now
1551 */ 1831 */
1552static ssize_t 1832static ssize_t
1553perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) 1833perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1554{ 1834{
1555 u64 values[4]; 1835 u64 read_format = counter->attr.read_format;
1556 int n; 1836 int ret;
1557 1837
1558 /* 1838 /*
1559 * Return end-of-file for a read on a counter that is in 1839 * Return end-of-file for a read on a counter that is in
@@ -1563,28 +1843,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1563 if (counter->state == PERF_COUNTER_STATE_ERROR) 1843 if (counter->state == PERF_COUNTER_STATE_ERROR)
1564 return 0; 1844 return 0;
1565 1845
1846 if (count < perf_counter_read_size(counter))
1847 return -ENOSPC;
1848
1566 WARN_ON_ONCE(counter->ctx->parent_ctx); 1849 WARN_ON_ONCE(counter->ctx->parent_ctx);
1567 mutex_lock(&counter->child_mutex); 1850 mutex_lock(&counter->child_mutex);
1568 values[0] = perf_counter_read(counter); 1851 if (read_format & PERF_FORMAT_GROUP)
1569 n = 1; 1852 ret = perf_counter_read_group(counter, read_format, buf);
1570 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1853 else
1571 values[n++] = counter->total_time_enabled + 1854 ret = perf_counter_read_one(counter, read_format, buf);
1572 atomic64_read(&counter->child_total_time_enabled);
1573 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1574 values[n++] = counter->total_time_running +
1575 atomic64_read(&counter->child_total_time_running);
1576 if (counter->attr.read_format & PERF_FORMAT_ID)
1577 values[n++] = counter->id;
1578 mutex_unlock(&counter->child_mutex); 1855 mutex_unlock(&counter->child_mutex);
1579 1856
1580 if (count < n * sizeof(u64)) 1857 return ret;
1581 return -EINVAL;
1582 count = n * sizeof(u64);
1583
1584 if (copy_to_user(buf, values, count))
1585 return -EFAULT;
1586
1587 return count;
1588} 1858}
1589 1859
1590static ssize_t 1860static ssize_t
@@ -1681,8 +1951,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1681 1951
1682 counter->attr.sample_freq = value; 1952 counter->attr.sample_freq = value;
1683 } else { 1953 } else {
1684 perf_log_period(counter, value);
1685
1686 counter->attr.sample_period = value; 1954 counter->attr.sample_period = value;
1687 counter->hw.sample_period = value; 1955 counter->hw.sample_period = value;
1688 } 1956 }
@@ -1751,6 +2019,18 @@ int perf_counter_task_disable(void)
1751 return 0; 2019 return 0;
1752} 2020}
1753 2021
2022#ifndef PERF_COUNTER_INDEX_OFFSET
2023# define PERF_COUNTER_INDEX_OFFSET 0
2024#endif
2025
2026static int perf_counter_index(struct perf_counter *counter)
2027{
2028 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2029 return 0;
2030
2031 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2032}
2033
1754/* 2034/*
1755 * Callers need to ensure there can be no nesting of this function, otherwise 2035 * Callers need to ensure there can be no nesting of this function, otherwise
1756 * the seqlock logic goes bad. We can not serialize this because the arch 2036 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -1775,11 +2055,17 @@ void perf_counter_update_userpage(struct perf_counter *counter)
1775 preempt_disable(); 2055 preempt_disable();
1776 ++userpg->lock; 2056 ++userpg->lock;
1777 barrier(); 2057 barrier();
1778 userpg->index = counter->hw.idx; 2058 userpg->index = perf_counter_index(counter);
1779 userpg->offset = atomic64_read(&counter->count); 2059 userpg->offset = atomic64_read(&counter->count);
1780 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 2060 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1781 userpg->offset -= atomic64_read(&counter->hw.prev_count); 2061 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1782 2062
2063 userpg->time_enabled = counter->total_time_enabled +
2064 atomic64_read(&counter->child_total_time_enabled);
2065
2066 userpg->time_running = counter->total_time_running +
2067 atomic64_read(&counter->child_total_time_running);
2068
1783 barrier(); 2069 barrier();
1784 ++userpg->lock; 2070 ++userpg->lock;
1785 preempt_enable(); 2071 preempt_enable();
@@ -1876,7 +2162,7 @@ fail:
1876 2162
1877static void perf_mmap_free_page(unsigned long addr) 2163static void perf_mmap_free_page(unsigned long addr)
1878{ 2164{
1879 struct page *page = virt_to_page(addr); 2165 struct page *page = virt_to_page((void *)addr);
1880 2166
1881 page->mapping = NULL; 2167 page->mapping = NULL;
1882 __free_page(page); 2168 __free_page(page);
@@ -2076,7 +2362,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)
2076 2362
2077 if (counter->pending_disable) { 2363 if (counter->pending_disable) {
2078 counter->pending_disable = 0; 2364 counter->pending_disable = 0;
2079 perf_counter_disable(counter); 2365 __perf_counter_disable(counter);
2080 } 2366 }
2081 2367
2082 if (counter->pending_wakeup) { 2368 if (counter->pending_wakeup) {
@@ -2461,7 +2747,80 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2461 return task_pid_nr_ns(p, counter->ns); 2747 return task_pid_nr_ns(p, counter->ns);
2462} 2748}
2463 2749
2464static void perf_counter_output(struct perf_counter *counter, int nmi, 2750static void perf_output_read_one(struct perf_output_handle *handle,
2751 struct perf_counter *counter)
2752{
2753 u64 read_format = counter->attr.read_format;
2754 u64 values[4];
2755 int n = 0;
2756
2757 values[n++] = atomic64_read(&counter->count);
2758 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2759 values[n++] = counter->total_time_enabled +
2760 atomic64_read(&counter->child_total_time_enabled);
2761 }
2762 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2763 values[n++] = counter->total_time_running +
2764 atomic64_read(&counter->child_total_time_running);
2765 }
2766 if (read_format & PERF_FORMAT_ID)
2767 values[n++] = primary_counter_id(counter);
2768
2769 perf_output_copy(handle, values, n * sizeof(u64));
2770}
2771
2772/*
2773 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2774 */
2775static void perf_output_read_group(struct perf_output_handle *handle,
2776 struct perf_counter *counter)
2777{
2778 struct perf_counter *leader = counter->group_leader, *sub;
2779 u64 read_format = counter->attr.read_format;
2780 u64 values[5];
2781 int n = 0;
2782
2783 values[n++] = 1 + leader->nr_siblings;
2784
2785 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2786 values[n++] = leader->total_time_enabled;
2787
2788 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2789 values[n++] = leader->total_time_running;
2790
2791 if (leader != counter)
2792 leader->pmu->read(leader);
2793
2794 values[n++] = atomic64_read(&leader->count);
2795 if (read_format & PERF_FORMAT_ID)
2796 values[n++] = primary_counter_id(leader);
2797
2798 perf_output_copy(handle, values, n * sizeof(u64));
2799
2800 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2801 n = 0;
2802
2803 if (sub != counter)
2804 sub->pmu->read(sub);
2805
2806 values[n++] = atomic64_read(&sub->count);
2807 if (read_format & PERF_FORMAT_ID)
2808 values[n++] = primary_counter_id(sub);
2809
2810 perf_output_copy(handle, values, n * sizeof(u64));
2811 }
2812}
2813
2814static void perf_output_read(struct perf_output_handle *handle,
2815 struct perf_counter *counter)
2816{
2817 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2818 perf_output_read_group(handle, counter);
2819 else
2820 perf_output_read_one(handle, counter);
2821}
2822
2823void perf_counter_output(struct perf_counter *counter, int nmi,
2465 struct perf_sample_data *data) 2824 struct perf_sample_data *data)
2466{ 2825{
2467 int ret; 2826 int ret;
@@ -2472,10 +2831,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2472 struct { 2831 struct {
2473 u32 pid, tid; 2832 u32 pid, tid;
2474 } tid_entry; 2833 } tid_entry;
2475 struct {
2476 u64 id;
2477 u64 counter;
2478 } group_entry;
2479 struct perf_callchain_entry *callchain = NULL; 2834 struct perf_callchain_entry *callchain = NULL;
2480 int callchain_size = 0; 2835 int callchain_size = 0;
2481 u64 time; 2836 u64 time;
@@ -2483,15 +2838,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2483 u32 cpu, reserved; 2838 u32 cpu, reserved;
2484 } cpu_entry; 2839 } cpu_entry;
2485 2840
2486 header.type = 0; 2841 header.type = PERF_EVENT_SAMPLE;
2487 header.size = sizeof(header); 2842 header.size = sizeof(header);
2488 2843
2489 header.misc = PERF_EVENT_MISC_OVERFLOW; 2844 header.misc = 0;
2490 header.misc |= perf_misc_flags(data->regs); 2845 header.misc |= perf_misc_flags(data->regs);
2491 2846
2492 if (sample_type & PERF_SAMPLE_IP) { 2847 if (sample_type & PERF_SAMPLE_IP) {
2493 ip = perf_instruction_pointer(data->regs); 2848 ip = perf_instruction_pointer(data->regs);
2494 header.type |= PERF_SAMPLE_IP;
2495 header.size += sizeof(ip); 2849 header.size += sizeof(ip);
2496 } 2850 }
2497 2851
@@ -2500,7 +2854,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2500 tid_entry.pid = perf_counter_pid(counter, current); 2854 tid_entry.pid = perf_counter_pid(counter, current);
2501 tid_entry.tid = perf_counter_tid(counter, current); 2855 tid_entry.tid = perf_counter_tid(counter, current);
2502 2856
2503 header.type |= PERF_SAMPLE_TID;
2504 header.size += sizeof(tid_entry); 2857 header.size += sizeof(tid_entry);
2505 } 2858 }
2506 2859
@@ -2510,47 +2863,51 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2510 */ 2863 */
2511 time = sched_clock(); 2864 time = sched_clock();
2512 2865
2513 header.type |= PERF_SAMPLE_TIME;
2514 header.size += sizeof(u64); 2866 header.size += sizeof(u64);
2515 } 2867 }
2516 2868
2517 if (sample_type & PERF_SAMPLE_ADDR) { 2869 if (sample_type & PERF_SAMPLE_ADDR)
2518 header.type |= PERF_SAMPLE_ADDR;
2519 header.size += sizeof(u64); 2870 header.size += sizeof(u64);
2520 }
2521 2871
2522 if (sample_type & PERF_SAMPLE_ID) { 2872 if (sample_type & PERF_SAMPLE_ID)
2523 header.type |= PERF_SAMPLE_ID; 2873 header.size += sizeof(u64);
2874
2875 if (sample_type & PERF_SAMPLE_STREAM_ID)
2524 header.size += sizeof(u64); 2876 header.size += sizeof(u64);
2525 }
2526 2877
2527 if (sample_type & PERF_SAMPLE_CPU) { 2878 if (sample_type & PERF_SAMPLE_CPU) {
2528 header.type |= PERF_SAMPLE_CPU;
2529 header.size += sizeof(cpu_entry); 2879 header.size += sizeof(cpu_entry);
2530 2880
2531 cpu_entry.cpu = raw_smp_processor_id(); 2881 cpu_entry.cpu = raw_smp_processor_id();
2882 cpu_entry.reserved = 0;
2532 } 2883 }
2533 2884
2534 if (sample_type & PERF_SAMPLE_PERIOD) { 2885 if (sample_type & PERF_SAMPLE_PERIOD)
2535 header.type |= PERF_SAMPLE_PERIOD;
2536 header.size += sizeof(u64); 2886 header.size += sizeof(u64);
2537 }
2538 2887
2539 if (sample_type & PERF_SAMPLE_GROUP) { 2888 if (sample_type & PERF_SAMPLE_READ)
2540 header.type |= PERF_SAMPLE_GROUP; 2889 header.size += perf_counter_read_size(counter);
2541 header.size += sizeof(u64) +
2542 counter->nr_siblings * sizeof(group_entry);
2543 }
2544 2890
2545 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 2891 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2546 callchain = perf_callchain(data->regs); 2892 callchain = perf_callchain(data->regs);
2547 2893
2548 if (callchain) { 2894 if (callchain) {
2549 callchain_size = (1 + callchain->nr) * sizeof(u64); 2895 callchain_size = (1 + callchain->nr) * sizeof(u64);
2550
2551 header.type |= PERF_SAMPLE_CALLCHAIN;
2552 header.size += callchain_size; 2896 header.size += callchain_size;
2553 } 2897 } else
2898 header.size += sizeof(u64);
2899 }
2900
2901 if (sample_type & PERF_SAMPLE_RAW) {
2902 int size = sizeof(u32);
2903
2904 if (data->raw)
2905 size += data->raw->size;
2906 else
2907 size += sizeof(u32);
2908
2909 WARN_ON_ONCE(size & (sizeof(u64)-1));
2910 header.size += size;
2554 } 2911 }
2555 2912
2556 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2913 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2571,7 +2928,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2571 if (sample_type & PERF_SAMPLE_ADDR) 2928 if (sample_type & PERF_SAMPLE_ADDR)
2572 perf_output_put(&handle, data->addr); 2929 perf_output_put(&handle, data->addr);
2573 2930
2574 if (sample_type & PERF_SAMPLE_ID) 2931 if (sample_type & PERF_SAMPLE_ID) {
2932 u64 id = primary_counter_id(counter);
2933
2934 perf_output_put(&handle, id);
2935 }
2936
2937 if (sample_type & PERF_SAMPLE_STREAM_ID)
2575 perf_output_put(&handle, counter->id); 2938 perf_output_put(&handle, counter->id);
2576 2939
2577 if (sample_type & PERF_SAMPLE_CPU) 2940 if (sample_type & PERF_SAMPLE_CPU)
@@ -2580,76 +2943,125 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2580 if (sample_type & PERF_SAMPLE_PERIOD) 2943 if (sample_type & PERF_SAMPLE_PERIOD)
2581 perf_output_put(&handle, data->period); 2944 perf_output_put(&handle, data->period);
2582 2945
2583 /* 2946 if (sample_type & PERF_SAMPLE_READ)
2584 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 2947 perf_output_read(&handle, counter);
2585 */
2586 if (sample_type & PERF_SAMPLE_GROUP) {
2587 struct perf_counter *leader, *sub;
2588 u64 nr = counter->nr_siblings;
2589
2590 perf_output_put(&handle, nr);
2591 2948
2592 leader = counter->group_leader; 2949 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2593 list_for_each_entry(sub, &leader->sibling_list, list_entry) { 2950 if (callchain)
2594 if (sub != counter) 2951 perf_output_copy(&handle, callchain, callchain_size);
2595 sub->pmu->read(sub); 2952 else {
2596 2953 u64 nr = 0;
2597 group_entry.id = sub->id; 2954 perf_output_put(&handle, nr);
2598 group_entry.counter = atomic64_read(&sub->count); 2955 }
2956 }
2599 2957
2600 perf_output_put(&handle, group_entry); 2958 if (sample_type & PERF_SAMPLE_RAW) {
2959 if (data->raw) {
2960 perf_output_put(&handle, data->raw->size);
2961 perf_output_copy(&handle, data->raw->data, data->raw->size);
2962 } else {
2963 struct {
2964 u32 size;
2965 u32 data;
2966 } raw = {
2967 .size = sizeof(u32),
2968 .data = 0,
2969 };
2970 perf_output_put(&handle, raw);
2601 } 2971 }
2602 } 2972 }
2603 2973
2604 if (callchain) 2974 perf_output_end(&handle);
2605 perf_output_copy(&handle, callchain, callchain_size); 2975}
2976
2977/*
2978 * read event
2979 */
2980
2981struct perf_read_event {
2982 struct perf_event_header header;
2983
2984 u32 pid;
2985 u32 tid;
2986};
2987
2988static void
2989perf_counter_read_event(struct perf_counter *counter,
2990 struct task_struct *task)
2991{
2992 struct perf_output_handle handle;
2993 struct perf_read_event event = {
2994 .header = {
2995 .type = PERF_EVENT_READ,
2996 .misc = 0,
2997 .size = sizeof(event) + perf_counter_read_size(counter),
2998 },
2999 .pid = perf_counter_pid(counter, task),
3000 .tid = perf_counter_tid(counter, task),
3001 };
3002 int ret;
3003
3004 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3005 if (ret)
3006 return;
3007
3008 perf_output_put(&handle, event);
3009 perf_output_read(&handle, counter);
2606 3010
2607 perf_output_end(&handle); 3011 perf_output_end(&handle);
2608} 3012}
2609 3013
2610/* 3014/*
2611 * fork tracking 3015 * task tracking -- fork/exit
3016 *
3017 * enabled by: attr.comm | attr.mmap | attr.task
2612 */ 3018 */
2613 3019
2614struct perf_fork_event { 3020struct perf_task_event {
2615 struct task_struct *task; 3021 struct task_struct *task;
3022 struct perf_counter_context *task_ctx;
2616 3023
2617 struct { 3024 struct {
2618 struct perf_event_header header; 3025 struct perf_event_header header;
2619 3026
2620 u32 pid; 3027 u32 pid;
2621 u32 ppid; 3028 u32 ppid;
3029 u32 tid;
3030 u32 ptid;
2622 } event; 3031 } event;
2623}; 3032};
2624 3033
2625static void perf_counter_fork_output(struct perf_counter *counter, 3034static void perf_counter_task_output(struct perf_counter *counter,
2626 struct perf_fork_event *fork_event) 3035 struct perf_task_event *task_event)
2627{ 3036{
2628 struct perf_output_handle handle; 3037 struct perf_output_handle handle;
2629 int size = fork_event->event.header.size; 3038 int size = task_event->event.header.size;
2630 struct task_struct *task = fork_event->task; 3039 struct task_struct *task = task_event->task;
2631 int ret = perf_output_begin(&handle, counter, size, 0, 0); 3040 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2632 3041
2633 if (ret) 3042 if (ret)
2634 return; 3043 return;
2635 3044
2636 fork_event->event.pid = perf_counter_pid(counter, task); 3045 task_event->event.pid = perf_counter_pid(counter, task);
2637 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); 3046 task_event->event.ppid = perf_counter_pid(counter, current);
3047
3048 task_event->event.tid = perf_counter_tid(counter, task);
3049 task_event->event.ptid = perf_counter_tid(counter, current);
2638 3050
2639 perf_output_put(&handle, fork_event->event); 3051 perf_output_put(&handle, task_event->event);
2640 perf_output_end(&handle); 3052 perf_output_end(&handle);
2641} 3053}
2642 3054
2643static int perf_counter_fork_match(struct perf_counter *counter) 3055static int perf_counter_task_match(struct perf_counter *counter)
2644{ 3056{
2645 if (counter->attr.comm || counter->attr.mmap) 3057 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
2646 return 1; 3058 return 1;
2647 3059
2648 return 0; 3060 return 0;
2649} 3061}
2650 3062
2651static void perf_counter_fork_ctx(struct perf_counter_context *ctx, 3063static void perf_counter_task_ctx(struct perf_counter_context *ctx,
2652 struct perf_fork_event *fork_event) 3064 struct perf_task_event *task_event)
2653{ 3065{
2654 struct perf_counter *counter; 3066 struct perf_counter *counter;
2655 3067
@@ -2658,51 +3070,62 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2658 3070
2659 rcu_read_lock(); 3071 rcu_read_lock();
2660 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3072 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2661 if (perf_counter_fork_match(counter)) 3073 if (perf_counter_task_match(counter))
2662 perf_counter_fork_output(counter, fork_event); 3074 perf_counter_task_output(counter, task_event);
2663 } 3075 }
2664 rcu_read_unlock(); 3076 rcu_read_unlock();
2665} 3077}
2666 3078
2667static void perf_counter_fork_event(struct perf_fork_event *fork_event) 3079static void perf_counter_task_event(struct perf_task_event *task_event)
2668{ 3080{
2669 struct perf_cpu_context *cpuctx; 3081 struct perf_cpu_context *cpuctx;
2670 struct perf_counter_context *ctx; 3082 struct perf_counter_context *ctx = task_event->task_ctx;
2671 3083
2672 cpuctx = &get_cpu_var(perf_cpu_context); 3084 cpuctx = &get_cpu_var(perf_cpu_context);
2673 perf_counter_fork_ctx(&cpuctx->ctx, fork_event); 3085 perf_counter_task_ctx(&cpuctx->ctx, task_event);
2674 put_cpu_var(perf_cpu_context); 3086 put_cpu_var(perf_cpu_context);
2675 3087
2676 rcu_read_lock(); 3088 rcu_read_lock();
2677 /* 3089 if (!ctx)
2678 * doesn't really matter which of the child contexts the 3090 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
2679 * events ends up in.
2680 */
2681 ctx = rcu_dereference(current->perf_counter_ctxp);
2682 if (ctx) 3091 if (ctx)
2683 perf_counter_fork_ctx(ctx, fork_event); 3092 perf_counter_task_ctx(ctx, task_event);
2684 rcu_read_unlock(); 3093 rcu_read_unlock();
2685} 3094}
2686 3095
2687void perf_counter_fork(struct task_struct *task) 3096static void perf_counter_task(struct task_struct *task,
3097 struct perf_counter_context *task_ctx,
3098 int new)
2688{ 3099{
2689 struct perf_fork_event fork_event; 3100 struct perf_task_event task_event;
2690 3101
2691 if (!atomic_read(&nr_comm_counters) && 3102 if (!atomic_read(&nr_comm_counters) &&
2692 !atomic_read(&nr_mmap_counters)) 3103 !atomic_read(&nr_mmap_counters) &&
3104 !atomic_read(&nr_task_counters))
2693 return; 3105 return;
2694 3106
2695 fork_event = (struct perf_fork_event){ 3107 task_event = (struct perf_task_event){
2696 .task = task, 3108 .task = task,
2697 .event = { 3109 .task_ctx = task_ctx,
3110 .event = {
2698 .header = { 3111 .header = {
2699 .type = PERF_EVENT_FORK, 3112 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
2700 .size = sizeof(fork_event.event), 3113 .misc = 0,
3114 .size = sizeof(task_event.event),
2701 }, 3115 },
3116 /* .pid */
3117 /* .ppid */
3118 /* .tid */
3119 /* .ptid */
2702 }, 3120 },
2703 }; 3121 };
2704 3122
2705 perf_counter_fork_event(&fork_event); 3123 perf_counter_task_event(&task_event);
3124}
3125
3126void perf_counter_fork(struct task_struct *task)
3127{
3128 perf_counter_task(task, NULL, 1);
2706} 3129}
2707 3130
2708/* 3131/*
@@ -2770,8 +3193,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2770 struct perf_cpu_context *cpuctx; 3193 struct perf_cpu_context *cpuctx;
2771 struct perf_counter_context *ctx; 3194 struct perf_counter_context *ctx;
2772 unsigned int size; 3195 unsigned int size;
2773 char *comm = comm_event->task->comm; 3196 char comm[TASK_COMM_LEN];
2774 3197
3198 memset(comm, 0, sizeof(comm));
3199 strncpy(comm, comm_event->task->comm, sizeof(comm));
2775 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3200 size = ALIGN(strlen(comm)+1, sizeof(u64));
2776 3201
2777 comm_event->comm = comm; 3202 comm_event->comm = comm;
@@ -2798,13 +3223,24 @@ void perf_counter_comm(struct task_struct *task)
2798{ 3223{
2799 struct perf_comm_event comm_event; 3224 struct perf_comm_event comm_event;
2800 3225
3226 if (task->perf_counter_ctxp)
3227 perf_counter_enable_on_exec(task);
3228
2801 if (!atomic_read(&nr_comm_counters)) 3229 if (!atomic_read(&nr_comm_counters))
2802 return; 3230 return;
2803 3231
2804 comm_event = (struct perf_comm_event){ 3232 comm_event = (struct perf_comm_event){
2805 .task = task, 3233 .task = task,
3234 /* .comm */
3235 /* .comm_size */
2806 .event = { 3236 .event = {
2807 .header = { .type = PERF_EVENT_COMM, }, 3237 .header = {
3238 .type = PERF_EVENT_COMM,
3239 .misc = 0,
3240 /* .size */
3241 },
3242 /* .pid */
3243 /* .tid */
2808 }, 3244 },
2809 }; 3245 };
2810 3246
@@ -2887,8 +3323,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2887 char *buf = NULL; 3323 char *buf = NULL;
2888 const char *name; 3324 const char *name;
2889 3325
3326 memset(tmp, 0, sizeof(tmp));
3327
2890 if (file) { 3328 if (file) {
2891 buf = kzalloc(PATH_MAX, GFP_KERNEL); 3329 /*
3330 * d_path works from the end of the buffer backwards, so we
3331 * need to add enough zero bytes after the string to handle
3332 * the 64bit alignment we do later.
3333 */
3334 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
2892 if (!buf) { 3335 if (!buf) {
2893 name = strncpy(tmp, "//enomem", sizeof(tmp)); 3336 name = strncpy(tmp, "//enomem", sizeof(tmp));
2894 goto got_name; 3337 goto got_name;
@@ -2899,9 +3342,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2899 goto got_name; 3342 goto got_name;
2900 } 3343 }
2901 } else { 3344 } else {
2902 name = arch_vma_name(mmap_event->vma); 3345 if (arch_vma_name(mmap_event->vma)) {
2903 if (name) 3346 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3347 sizeof(tmp));
2904 goto got_name; 3348 goto got_name;
3349 }
2905 3350
2906 if (!vma->vm_mm) { 3351 if (!vma->vm_mm) {
2907 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3352 name = strncpy(tmp, "[vdso]", sizeof(tmp));
@@ -2946,8 +3391,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2946 3391
2947 mmap_event = (struct perf_mmap_event){ 3392 mmap_event = (struct perf_mmap_event){
2948 .vma = vma, 3393 .vma = vma,
3394 /* .file_name */
3395 /* .file_size */
2949 .event = { 3396 .event = {
2950 .header = { .type = PERF_EVENT_MMAP, }, 3397 .header = {
3398 .type = PERF_EVENT_MMAP,
3399 .misc = 0,
3400 /* .size */
3401 },
3402 /* .pid */
3403 /* .tid */
2951 .start = vma->vm_start, 3404 .start = vma->vm_start,
2952 .len = vma->vm_end - vma->vm_start, 3405 .len = vma->vm_end - vma->vm_start,
2953 .pgoff = vma->vm_pgoff, 3406 .pgoff = vma->vm_pgoff,
@@ -2958,49 +3411,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2958} 3411}
2959 3412
2960/* 3413/*
2961 * Log sample_period changes so that analyzing tools can re-normalize the
2962 * event flow.
2963 */
2964
2965struct freq_event {
2966 struct perf_event_header header;
2967 u64 time;
2968 u64 id;
2969 u64 period;
2970};
2971
2972static void perf_log_period(struct perf_counter *counter, u64 period)
2973{
2974 struct perf_output_handle handle;
2975 struct freq_event event;
2976 int ret;
2977
2978 if (counter->hw.sample_period == period)
2979 return;
2980
2981 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
2982 return;
2983
2984 event = (struct freq_event) {
2985 .header = {
2986 .type = PERF_EVENT_PERIOD,
2987 .misc = 0,
2988 .size = sizeof(event),
2989 },
2990 .time = sched_clock(),
2991 .id = counter->id,
2992 .period = period,
2993 };
2994
2995 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
2996 if (ret)
2997 return;
2998
2999 perf_output_put(&handle, event);
3000 perf_output_end(&handle);
3001}
3002
3003/*
3004 * IRQ throttle logging 3414 * IRQ throttle logging
3005 */ 3415 */
3006 3416
@@ -3013,16 +3423,21 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
3013 struct perf_event_header header; 3423 struct perf_event_header header;
3014 u64 time; 3424 u64 time;
3015 u64 id; 3425 u64 id;
3426 u64 stream_id;
3016 } throttle_event = { 3427 } throttle_event = {
3017 .header = { 3428 .header = {
3018 .type = PERF_EVENT_THROTTLE + 1, 3429 .type = PERF_EVENT_THROTTLE,
3019 .misc = 0, 3430 .misc = 0,
3020 .size = sizeof(throttle_event), 3431 .size = sizeof(throttle_event),
3021 }, 3432 },
3022 .time = sched_clock(), 3433 .time = sched_clock(),
3023 .id = counter->id, 3434 .id = primary_counter_id(counter),
3435 .stream_id = counter->id,
3024 }; 3436 };
3025 3437
3438 if (enable)
3439 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3440
3026 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); 3441 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3027 if (ret) 3442 if (ret)
3028 return; 3443 return;
@@ -3099,125 +3514,111 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
3099 * Generic software counter infrastructure 3514 * Generic software counter infrastructure
3100 */ 3515 */
3101 3516
3102static void perf_swcounter_update(struct perf_counter *counter) 3517/*
3518 * We directly increment counter->count and keep a second value in
3519 * counter->hw.period_left to count intervals. This period counter
3520 * is kept in the range [-sample_period, 0] so that we can use the
3521 * sign as trigger.
3522 */
3523
3524static u64 perf_swcounter_set_period(struct perf_counter *counter)
3103{ 3525{
3104 struct hw_perf_counter *hwc = &counter->hw; 3526 struct hw_perf_counter *hwc = &counter->hw;
3105 u64 prev, now; 3527 u64 period = hwc->last_period;
3106 s64 delta; 3528 u64 nr, offset;
3529 s64 old, val;
3530
3531 hwc->last_period = hwc->sample_period;
3107 3532
3108again: 3533again:
3109 prev = atomic64_read(&hwc->prev_count); 3534 old = val = atomic64_read(&hwc->period_left);
3110 now = atomic64_read(&hwc->count); 3535 if (val < 0)
3111 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) 3536 return 0;
3112 goto again;
3113 3537
3114 delta = now - prev; 3538 nr = div64_u64(period + val, period);
3539 offset = nr * period;
3540 val -= offset;
3541 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3542 goto again;
3115 3543
3116 atomic64_add(delta, &counter->count); 3544 return nr;
3117 atomic64_sub(delta, &hwc->period_left);
3118} 3545}
3119 3546
3120static void perf_swcounter_set_period(struct perf_counter *counter) 3547static void perf_swcounter_overflow(struct perf_counter *counter,
3548 int nmi, struct perf_sample_data *data)
3121{ 3549{
3122 struct hw_perf_counter *hwc = &counter->hw; 3550 struct hw_perf_counter *hwc = &counter->hw;
3123 s64 left = atomic64_read(&hwc->period_left); 3551 u64 overflow;
3124 s64 period = hwc->sample_period;
3125 3552
3126 if (unlikely(left <= -period)) { 3553 data->period = counter->hw.last_period;
3127 left = period; 3554 overflow = perf_swcounter_set_period(counter);
3128 atomic64_set(&hwc->period_left, left);
3129 hwc->last_period = period;
3130 }
3131 3555
3132 if (unlikely(left <= 0)) { 3556 if (hwc->interrupts == MAX_INTERRUPTS)
3133 left += period; 3557 return;
3134 atomic64_add(period, &hwc->period_left);
3135 hwc->last_period = period;
3136 }
3137 3558
3138 atomic64_set(&hwc->prev_count, -left); 3559 for (; overflow; overflow--) {
3139 atomic64_set(&hwc->count, -left); 3560 if (perf_counter_overflow(counter, nmi, data)) {
3561 /*
3562 * We inhibit the overflow from happening when
3563 * hwc->interrupts == MAX_INTERRUPTS.
3564 */
3565 break;
3566 }
3567 }
3140} 3568}
3141 3569
3142static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) 3570static void perf_swcounter_unthrottle(struct perf_counter *counter)
3143{ 3571{
3144 enum hrtimer_restart ret = HRTIMER_RESTART;
3145 struct perf_sample_data data;
3146 struct perf_counter *counter;
3147 u64 period;
3148
3149 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3150 counter->pmu->read(counter);
3151
3152 data.addr = 0;
3153 data.regs = get_irq_regs();
3154 /* 3572 /*
3155 * In case we exclude kernel IPs or are somehow not in interrupt 3573 * Nothing to do, we already reset hwc->interrupts.
3156 * context, provide the next best thing, the user IP.
3157 */ 3574 */
3158 if ((counter->attr.exclude_kernel || !data.regs) && 3575}
3159 !counter->attr.exclude_user)
3160 data.regs = task_pt_regs(current);
3161 3576
3162 if (data.regs) { 3577static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3163 if (perf_counter_overflow(counter, 0, &data)) 3578 int nmi, struct perf_sample_data *data)
3164 ret = HRTIMER_NORESTART; 3579{
3165 } 3580 struct hw_perf_counter *hwc = &counter->hw;
3166 3581
3167 period = max_t(u64, 10000, counter->hw.sample_period); 3582 atomic64_add(nr, &counter->count);
3168 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3169 3583
3170 return ret; 3584 if (!hwc->sample_period)
3171} 3585 return;
3172 3586
3173static void perf_swcounter_overflow(struct perf_counter *counter, 3587 if (!data->regs)
3174 int nmi, struct perf_sample_data *data) 3588 return;
3175{
3176 data->period = counter->hw.last_period;
3177 3589
3178 perf_swcounter_update(counter); 3590 if (!atomic64_add_negative(nr, &hwc->period_left))
3179 perf_swcounter_set_period(counter); 3591 perf_swcounter_overflow(counter, nmi, data);
3180 if (perf_counter_overflow(counter, nmi, data))
3181 /* soft-disable the counter */
3182 ;
3183} 3592}
3184 3593
3185static int perf_swcounter_is_counting(struct perf_counter *counter) 3594static int perf_swcounter_is_counting(struct perf_counter *counter)
3186{ 3595{
3187 struct perf_counter_context *ctx; 3596 /*
3188 unsigned long flags; 3597 * The counter is active, we're good!
3189 int count; 3598 */
3190
3191 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 3599 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3192 return 1; 3600 return 1;
3193 3601
3602 /*
3603 * The counter is off/error, not counting.
3604 */
3194 if (counter->state != PERF_COUNTER_STATE_INACTIVE) 3605 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3195 return 0; 3606 return 0;
3196 3607
3197 /* 3608 /*
3198 * If the counter is inactive, it could be just because 3609 * The counter is inactive, if the context is active
3199 * its task is scheduled out, or because it's in a group 3610 * we're part of a group that didn't make it on the 'pmu',
3200 * which could not go on the PMU. We want to count in 3611 * not counting.
3201 * the first case but not the second. If the context is
3202 * currently active then an inactive software counter must
3203 * be the second case. If it's not currently active then
3204 * we need to know whether the counter was active when the
3205 * context was last active, which we can determine by
3206 * comparing counter->tstamp_stopped with ctx->time.
3207 *
3208 * We are within an RCU read-side critical section,
3209 * which protects the existence of *ctx.
3210 */ 3612 */
3211 ctx = counter->ctx; 3613 if (counter->ctx->is_active)
3212 spin_lock_irqsave(&ctx->lock, flags); 3614 return 0;
3213 count = 1; 3615
3214 /* Re-check state now we have the lock */ 3616 /*
3215 if (counter->state < PERF_COUNTER_STATE_INACTIVE || 3617 * We're inactive and the context is too, this means the
3216 counter->ctx->is_active || 3618 * task is scheduled out, we're counting events that happen
3217 counter->tstamp_stopped < ctx->time) 3619 * to us, like migration events.
3218 count = 0; 3620 */
3219 spin_unlock_irqrestore(&ctx->lock, flags); 3621 return 1;
3220 return count;
3221} 3622}
3222 3623
3223static int perf_swcounter_match(struct perf_counter *counter, 3624static int perf_swcounter_match(struct perf_counter *counter,
@@ -3243,15 +3644,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
3243 return 1; 3644 return 1;
3244} 3645}
3245 3646
3246static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3247 int nmi, struct perf_sample_data *data)
3248{
3249 int neg = atomic64_add_negative(nr, &counter->hw.count);
3250
3251 if (counter->hw.sample_period && !neg && data->regs)
3252 perf_swcounter_overflow(counter, nmi, data);
3253}
3254
3255static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, 3647static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3256 enum perf_type_id type, 3648 enum perf_type_id type,
3257 u32 event, u64 nr, int nmi, 3649 u32 event, u64 nr, int nmi,
@@ -3317,8 +3709,8 @@ out:
3317 put_cpu_var(perf_cpu_context); 3709 put_cpu_var(perf_cpu_context);
3318} 3710}
3319 3711
3320void 3712void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3321perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) 3713 struct pt_regs *regs, u64 addr)
3322{ 3714{
3323 struct perf_sample_data data = { 3715 struct perf_sample_data data = {
3324 .regs = regs, 3716 .regs = regs,
@@ -3330,27 +3722,66 @@ perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
3330 3722
3331static void perf_swcounter_read(struct perf_counter *counter) 3723static void perf_swcounter_read(struct perf_counter *counter)
3332{ 3724{
3333 perf_swcounter_update(counter);
3334} 3725}
3335 3726
3336static int perf_swcounter_enable(struct perf_counter *counter) 3727static int perf_swcounter_enable(struct perf_counter *counter)
3337{ 3728{
3338 perf_swcounter_set_period(counter); 3729 struct hw_perf_counter *hwc = &counter->hw;
3730
3731 if (hwc->sample_period) {
3732 hwc->last_period = hwc->sample_period;
3733 perf_swcounter_set_period(counter);
3734 }
3339 return 0; 3735 return 0;
3340} 3736}
3341 3737
3342static void perf_swcounter_disable(struct perf_counter *counter) 3738static void perf_swcounter_disable(struct perf_counter *counter)
3343{ 3739{
3344 perf_swcounter_update(counter);
3345} 3740}
3346 3741
3347static const struct pmu perf_ops_generic = { 3742static const struct pmu perf_ops_generic = {
3348 .enable = perf_swcounter_enable, 3743 .enable = perf_swcounter_enable,
3349 .disable = perf_swcounter_disable, 3744 .disable = perf_swcounter_disable,
3350 .read = perf_swcounter_read, 3745 .read = perf_swcounter_read,
3746 .unthrottle = perf_swcounter_unthrottle,
3351}; 3747};
3352 3748
3353/* 3749/*
3750 * hrtimer based swcounter callback
3751 */
3752
3753static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3754{
3755 enum hrtimer_restart ret = HRTIMER_RESTART;
3756 struct perf_sample_data data;
3757 struct perf_counter *counter;
3758 u64 period;
3759
3760 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3761 counter->pmu->read(counter);
3762
3763 data.addr = 0;
3764 data.regs = get_irq_regs();
3765 /*
3766 * In case we exclude kernel IPs or are somehow not in interrupt
3767 * context, provide the next best thing, the user IP.
3768 */
3769 if ((counter->attr.exclude_kernel || !data.regs) &&
3770 !counter->attr.exclude_user)
3771 data.regs = task_pt_regs(current);
3772
3773 if (data.regs) {
3774 if (perf_counter_overflow(counter, 0, &data))
3775 ret = HRTIMER_NORESTART;
3776 }
3777
3778 period = max_t(u64, 10000, counter->hw.sample_period);
3779 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3780
3781 return ret;
3782}
3783
3784/*
3354 * Software counter: cpu wall time clock 3785 * Software counter: cpu wall time clock
3355 */ 3786 */
3356 3787
@@ -3467,17 +3898,24 @@ static const struct pmu perf_ops_task_clock = {
3467}; 3898};
3468 3899
3469#ifdef CONFIG_EVENT_PROFILE 3900#ifdef CONFIG_EVENT_PROFILE
3470void perf_tpcounter_event(int event_id) 3901void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3902 int entry_size)
3471{ 3903{
3904 struct perf_raw_record raw = {
3905 .size = entry_size,
3906 .data = record,
3907 };
3908
3472 struct perf_sample_data data = { 3909 struct perf_sample_data data = {
3473 .regs = get_irq_regs(); 3910 .regs = get_irq_regs(),
3474 .addr = 0, 3911 .addr = addr,
3912 .raw = &raw,
3475 }; 3913 };
3476 3914
3477 if (!data.regs) 3915 if (!data.regs)
3478 data.regs = task_pt_regs(current); 3916 data.regs = task_pt_regs(current);
3479 3917
3480 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); 3918 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3481} 3919}
3482EXPORT_SYMBOL_GPL(perf_tpcounter_event); 3920EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3483 3921
@@ -3486,16 +3924,20 @@ extern void ftrace_profile_disable(int);
3486 3924
3487static void tp_perf_counter_destroy(struct perf_counter *counter) 3925static void tp_perf_counter_destroy(struct perf_counter *counter)
3488{ 3926{
3489 ftrace_profile_disable(perf_event_id(&counter->attr)); 3927 ftrace_profile_disable(counter->attr.config);
3490} 3928}
3491 3929
3492static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) 3930static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3493{ 3931{
3494 int event_id = perf_event_id(&counter->attr); 3932 /*
3495 int ret; 3933 * Raw tracepoint data is a severe data leak, only allow root to
3934 * have these.
3935 */
3936 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3937 !capable(CAP_SYS_ADMIN))
3938 return ERR_PTR(-EPERM);
3496 3939
3497 ret = ftrace_profile_enable(event_id); 3940 if (ftrace_profile_enable(counter->attr.config))
3498 if (ret)
3499 return NULL; 3941 return NULL;
3500 3942
3501 counter->destroy = tp_perf_counter_destroy; 3943 counter->destroy = tp_perf_counter_destroy;
@@ -3509,9 +3951,21 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3509} 3951}
3510#endif 3952#endif
3511 3953
3954atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3955
3956static void sw_perf_counter_destroy(struct perf_counter *counter)
3957{
3958 u64 event = counter->attr.config;
3959
3960 WARN_ON(counter->parent);
3961
3962 atomic_dec(&perf_swcounter_enabled[event]);
3963}
3964
3512static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) 3965static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3513{ 3966{
3514 const struct pmu *pmu = NULL; 3967 const struct pmu *pmu = NULL;
3968 u64 event = counter->attr.config;
3515 3969
3516 /* 3970 /*
3517 * Software counters (currently) can't in general distinguish 3971 * Software counters (currently) can't in general distinguish
@@ -3520,7 +3974,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3520 * to be kernel events, and page faults are never hypervisor 3974 * to be kernel events, and page faults are never hypervisor
3521 * events. 3975 * events.
3522 */ 3976 */
3523 switch (counter->attr.config) { 3977 switch (event) {
3524 case PERF_COUNT_SW_CPU_CLOCK: 3978 case PERF_COUNT_SW_CPU_CLOCK:
3525 pmu = &perf_ops_cpu_clock; 3979 pmu = &perf_ops_cpu_clock;
3526 3980
@@ -3541,6 +3995,10 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3541 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 3995 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3542 case PERF_COUNT_SW_CONTEXT_SWITCHES: 3996 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3543 case PERF_COUNT_SW_CPU_MIGRATIONS: 3997 case PERF_COUNT_SW_CPU_MIGRATIONS:
3998 if (!counter->parent) {
3999 atomic_inc(&perf_swcounter_enabled[event]);
4000 counter->destroy = sw_perf_counter_destroy;
4001 }
3544 pmu = &perf_ops_generic; 4002 pmu = &perf_ops_generic;
3545 break; 4003 break;
3546 } 4004 }
@@ -3556,6 +4014,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3556 int cpu, 4014 int cpu,
3557 struct perf_counter_context *ctx, 4015 struct perf_counter_context *ctx,
3558 struct perf_counter *group_leader, 4016 struct perf_counter *group_leader,
4017 struct perf_counter *parent_counter,
3559 gfp_t gfpflags) 4018 gfp_t gfpflags)
3560{ 4019{
3561 const struct pmu *pmu; 4020 const struct pmu *pmu;
@@ -3591,6 +4050,8 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3591 counter->ctx = ctx; 4050 counter->ctx = ctx;
3592 counter->oncpu = -1; 4051 counter->oncpu = -1;
3593 4052
4053 counter->parent = parent_counter;
4054
3594 counter->ns = get_pid_ns(current->nsproxy->pid_ns); 4055 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3595 counter->id = atomic64_inc_return(&perf_counter_id); 4056 counter->id = atomic64_inc_return(&perf_counter_id);
3596 4057
@@ -3605,13 +4066,14 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3605 hwc->sample_period = attr->sample_period; 4066 hwc->sample_period = attr->sample_period;
3606 if (attr->freq && attr->sample_freq) 4067 if (attr->freq && attr->sample_freq)
3607 hwc->sample_period = 1; 4068 hwc->sample_period = 1;
4069 hwc->last_period = hwc->sample_period;
3608 4070
3609 atomic64_set(&hwc->period_left, hwc->sample_period); 4071 atomic64_set(&hwc->period_left, hwc->sample_period);
3610 4072
3611 /* 4073 /*
3612 * we currently do not support PERF_SAMPLE_GROUP on inherited counters 4074 * we currently do not support PERF_FORMAT_GROUP on inherited counters
3613 */ 4075 */
3614 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) 4076 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
3615 goto done; 4077 goto done;
3616 4078
3617 switch (attr->type) { 4079 switch (attr->type) {
@@ -3648,11 +4110,15 @@ done:
3648 4110
3649 counter->pmu = pmu; 4111 counter->pmu = pmu;
3650 4112
3651 atomic_inc(&nr_counters); 4113 if (!counter->parent) {
3652 if (counter->attr.mmap) 4114 atomic_inc(&nr_counters);
3653 atomic_inc(&nr_mmap_counters); 4115 if (counter->attr.mmap)
3654 if (counter->attr.comm) 4116 atomic_inc(&nr_mmap_counters);
3655 atomic_inc(&nr_comm_counters); 4117 if (counter->attr.comm)
4118 atomic_inc(&nr_comm_counters);
4119 if (counter->attr.task)
4120 atomic_inc(&nr_task_counters);
4121 }
3656 4122
3657 return counter; 4123 return counter;
3658} 4124}
@@ -3815,7 +4281,7 @@ SYSCALL_DEFINE5(perf_counter_open,
3815 } 4281 }
3816 4282
3817 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4283 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3818 GFP_KERNEL); 4284 NULL, GFP_KERNEL);
3819 ret = PTR_ERR(counter); 4285 ret = PTR_ERR(counter);
3820 if (IS_ERR(counter)) 4286 if (IS_ERR(counter))
3821 goto err_put_context; 4287 goto err_put_context;
@@ -3881,7 +4347,8 @@ inherit_counter(struct perf_counter *parent_counter,
3881 4347
3882 child_counter = perf_counter_alloc(&parent_counter->attr, 4348 child_counter = perf_counter_alloc(&parent_counter->attr,
3883 parent_counter->cpu, child_ctx, 4349 parent_counter->cpu, child_ctx,
3884 group_leader, GFP_KERNEL); 4350 group_leader, parent_counter,
4351 GFP_KERNEL);
3885 if (IS_ERR(child_counter)) 4352 if (IS_ERR(child_counter))
3886 return child_counter; 4353 return child_counter;
3887 get_ctx(child_ctx); 4354 get_ctx(child_ctx);
@@ -3904,12 +4371,6 @@ inherit_counter(struct perf_counter *parent_counter,
3904 */ 4371 */
3905 add_counter_to_ctx(child_counter, child_ctx); 4372 add_counter_to_ctx(child_counter, child_ctx);
3906 4373
3907 child_counter->parent = parent_counter;
3908 /*
3909 * inherit into child's child as well:
3910 */
3911 child_counter->attr.inherit = 1;
3912
3913 /* 4374 /*
3914 * Get a reference to the parent filp - we will fput it 4375 * Get a reference to the parent filp - we will fput it
3915 * when the child counter exits. This is safe to do because 4376 * when the child counter exits. This is safe to do because
@@ -3953,10 +4414,14 @@ static int inherit_group(struct perf_counter *parent_counter,
3953} 4414}
3954 4415
3955static void sync_child_counter(struct perf_counter *child_counter, 4416static void sync_child_counter(struct perf_counter *child_counter,
3956 struct perf_counter *parent_counter) 4417 struct task_struct *child)
3957{ 4418{
4419 struct perf_counter *parent_counter = child_counter->parent;
3958 u64 child_val; 4420 u64 child_val;
3959 4421
4422 if (child_counter->attr.inherit_stat)
4423 perf_counter_read_event(child_counter, child);
4424
3960 child_val = atomic64_read(&child_counter->count); 4425 child_val = atomic64_read(&child_counter->count);
3961 4426
3962 /* 4427 /*
@@ -3985,7 +4450,8 @@ static void sync_child_counter(struct perf_counter *child_counter,
3985 4450
3986static void 4451static void
3987__perf_counter_exit_task(struct perf_counter *child_counter, 4452__perf_counter_exit_task(struct perf_counter *child_counter,
3988 struct perf_counter_context *child_ctx) 4453 struct perf_counter_context *child_ctx,
4454 struct task_struct *child)
3989{ 4455{
3990 struct perf_counter *parent_counter; 4456 struct perf_counter *parent_counter;
3991 4457
@@ -3999,7 +4465,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter,
3999 * counters need to be zapped - but otherwise linger. 4465 * counters need to be zapped - but otherwise linger.
4000 */ 4466 */
4001 if (parent_counter) { 4467 if (parent_counter) {
4002 sync_child_counter(child_counter, parent_counter); 4468 sync_child_counter(child_counter, child);
4003 free_counter(child_counter); 4469 free_counter(child_counter);
4004 } 4470 }
4005} 4471}
@@ -4013,8 +4479,10 @@ void perf_counter_exit_task(struct task_struct *child)
4013 struct perf_counter_context *child_ctx; 4479 struct perf_counter_context *child_ctx;
4014 unsigned long flags; 4480 unsigned long flags;
4015 4481
4016 if (likely(!child->perf_counter_ctxp)) 4482 if (likely(!child->perf_counter_ctxp)) {
4483 perf_counter_task(child, NULL, 0);
4017 return; 4484 return;
4485 }
4018 4486
4019 local_irq_save(flags); 4487 local_irq_save(flags);
4020 /* 4488 /*
@@ -4033,17 +4501,20 @@ void perf_counter_exit_task(struct task_struct *child)
4033 */ 4501 */
4034 spin_lock(&child_ctx->lock); 4502 spin_lock(&child_ctx->lock);
4035 child->perf_counter_ctxp = NULL; 4503 child->perf_counter_ctxp = NULL;
4036 if (child_ctx->parent_ctx) { 4504 /*
4037 /* 4505 * If this context is a clone; unclone it so it can't get
4038 * This context is a clone; unclone it so it can't get 4506 * swapped to another process while we're removing all
4039 * swapped to another process while we're removing all 4507 * the counters from it.
4040 * the counters from it. 4508 */
4041 */ 4509 unclone_ctx(child_ctx);
4042 put_ctx(child_ctx->parent_ctx); 4510 spin_unlock_irqrestore(&child_ctx->lock, flags);
4043 child_ctx->parent_ctx = NULL; 4511
4044 } 4512 /*
4045 spin_unlock(&child_ctx->lock); 4513 * Report the task dead after unscheduling the counters so that we
4046 local_irq_restore(flags); 4514 * won't get any samples after PERF_EVENT_EXIT. We can however still
4515 * get a few PERF_EVENT_READ events.
4516 */
4517 perf_counter_task(child, child_ctx, 0);
4047 4518
4048 /* 4519 /*
4049 * We can recurse on the same lock type through: 4520 * We can recurse on the same lock type through:
@@ -4061,7 +4532,7 @@ void perf_counter_exit_task(struct task_struct *child)
4061again: 4532again:
4062 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, 4533 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4063 list_entry) 4534 list_entry)
4064 __perf_counter_exit_task(child_counter, child_ctx); 4535 __perf_counter_exit_task(child_counter, child_ctx, child);
4065 4536
4066 /* 4537 /*
4067 * If the last counter was a group counter, it will have appended all 4538 * If the last counter was a group counter, it will have appended all
@@ -4264,6 +4735,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4264 perf_counter_init_cpu(cpu); 4735 perf_counter_init_cpu(cpu);
4265 break; 4736 break;
4266 4737
4738 case CPU_ONLINE:
4739 case CPU_ONLINE_FROZEN:
4740 hw_perf_counter_setup_online(cpu);
4741 break;
4742
4267 case CPU_DOWN_PREPARE: 4743 case CPU_DOWN_PREPARE:
4268 case CPU_DOWN_PREPARE_FROZEN: 4744 case CPU_DOWN_PREPARE_FROZEN:
4269 perf_counter_exit_cpu(cpu); 4745 perf_counter_exit_cpu(cpu);
@@ -4288,6 +4764,8 @@ void __init perf_counter_init(void)
4288{ 4764{
4289 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 4765 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4290 (void *)(long)smp_processor_id()); 4766 (void *)(long)smp_processor_id());
4767 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4768 (void *)(long)smp_processor_id());
4291 register_cpu_notifier(&perf_cpu_nb); 4769 register_cpu_notifier(&perf_cpu_nb);
4292} 4770}
4293 4771
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0b67b2..e33a21cb9407 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
521} 521}
522void posix_cpu_timers_exit_group(struct task_struct *tsk) 522void posix_cpu_timers_exit_group(struct task_struct *tsk)
523{ 523{
524 struct task_cputime cputime; 524 struct signal_struct *const sig = tsk->signal;
525 525
526 thread_group_cputimer(tsk, &cputime);
527 cleanup_timers(tsk->signal->cpu_timers, 526 cleanup_timers(tsk->signal->cpu_timers,
528 cputime.utime, cputime.stime, cputime.sum_exec_runtime); 527 cputime_add(tsk->utime, sig->utime),
528 cputime_add(tsk->stime, sig->stime),
529 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
529} 530}
530 531
531static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 532static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c7..d089d052c4a9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
202 return -EOPNOTSUPP; 202 return -EOPNOTSUPP;
203} 203}
204 204
205static int no_nsleep(const clockid_t which_clock, int flags,
206 struct timespec *tsave, struct timespec __user *rmtp)
207{
208 return -EOPNOTSUPP;
209}
210
205/* 211/*
206 * Return nonzero if we know a priori this clockid_t value is bogus. 212 * Return nonzero if we know a priori this clockid_t value is bogus.
207 */ 213 */
@@ -254,6 +260,7 @@ static __init int init_posix_timers(void)
254 .clock_get = posix_get_monotonic_raw, 260 .clock_get = posix_get_monotonic_raw,
255 .clock_set = do_posix_clock_nosettime, 261 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create, 262 .timer_create = no_timer_create,
263 .nsleep = no_nsleep,
257 }; 264 };
258 265
259 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 266 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ed97375daae9..bf0014d6a5f0 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,6 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index 69911b5745eb..419250ebec4d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -117,11 +117,12 @@ int __ref profile_init(void)
117 117
118 cpumask_copy(prof_cpu_mask, cpu_possible_mask); 118 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
119 119
120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
121 if (prof_buffer) 121 if (prof_buffer)
122 return 0; 122 return 0;
123 123
124 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); 124 prof_buffer = alloc_pages_exact(buffer_bytes,
125 GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
125 if (prof_buffer) 126 if (prof_buffer)
126 return 0; 127 return 0;
127 128
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61c78b2c07ba..082c320e4dbf 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,8 +181,8 @@ int ptrace_attach(struct task_struct *task)
181 * interference; SUID, SGID and LSM creds get determined differently 181 * interference; SUID, SGID and LSM creds get determined differently
182 * under ptrace. 182 * under ptrace.
183 */ 183 */
184 retval = mutex_lock_interruptible(&task->cred_guard_mutex); 184 retval = -ERESTARTNOINTR;
185 if (retval < 0) 185 if (mutex_lock_interruptible(&task->cred_guard_mutex))
186 goto out; 186 goto out;
187 187
188 task_lock(task); 188 task_lock(task);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0dccfbba6d26..7717b95c2027 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1533,7 +1533,7 @@ void __init __rcu_init(void)
1533 int j; 1533 int j;
1534 struct rcu_node *rnp; 1534 struct rcu_node *rnp;
1535 1535
1536 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); 1536 printk(KERN_INFO "Hierarchical RCU implementation.\n");
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
@@ -1546,7 +1546,6 @@ void __init __rcu_init(void)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); 1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */ 1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb); 1548 register_cpu_notifier(&rcu_nb);
1549 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1550} 1549}
1551 1550
1552module_param(blimit, int, 0); 1551module_param(blimit, int, 0);
diff --git a/kernel/resource.c b/kernel/resource.c
index ac5f3a36923f..78b087221c15 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -787,7 +787,7 @@ static int __init reserve_setup(char *str)
787 static struct resource reserve[MAXRESERVE]; 787 static struct resource reserve[MAXRESERVE];
788 788
789 for (;;) { 789 for (;;) {
790 int io_start, io_num; 790 unsigned int io_start, io_num;
791 int x = reserved; 791 int x = reserved;
792 792
793 if (get_option (&str, &io_start) != 2) 793 if (get_option (&str, &io_start) != 2)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index fcd107a78c5a..29bd4baf9e75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { 1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042
1043 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1044 1043 spin_unlock(&lock->wait_lock);
1045 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1046 return 1; 1045 return 1;
1047 } 1046 }
1048 1047
1049 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1050 1049
1051
1052 if (ret && !waiter->task) { 1050 if (ret && !waiter->task) {
1053 /* 1051 /*
1054 * Reset the return value. We might have 1052 * Reset the return value. We might have
diff --git a/kernel/sched.c b/kernel/sched.c
index 7c9098d186e6..1b59e265273b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -493,6 +493,7 @@ struct rt_rq {
493#endif 493#endif
494#ifdef CONFIG_SMP 494#ifdef CONFIG_SMP
495 unsigned long rt_nr_migratory; 495 unsigned long rt_nr_migratory;
496 unsigned long rt_nr_total;
496 int overloaded; 497 int overloaded;
497 struct plist_head pushable_tasks; 498 struct plist_head pushable_tasks;
498#endif 499#endif
@@ -2571,15 +2572,37 @@ static void __sched_fork(struct task_struct *p)
2571 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2572 2573
2573#ifdef CONFIG_SCHEDSTATS 2574#ifdef CONFIG_SCHEDSTATS
2574 p->se.wait_start = 0; 2575 p->se.wait_start = 0;
2575 p->se.sum_sleep_runtime = 0; 2576 p->se.wait_max = 0;
2576 p->se.sleep_start = 0; 2577 p->se.wait_count = 0;
2577 p->se.block_start = 0; 2578 p->se.wait_sum = 0;
2578 p->se.sleep_max = 0; 2579
2579 p->se.block_max = 0; 2580 p->se.sleep_start = 0;
2580 p->se.exec_max = 0; 2581 p->se.sleep_max = 0;
2581 p->se.slice_max = 0; 2582 p->se.sum_sleep_runtime = 0;
2582 p->se.wait_max = 0; 2583
2584 p->se.block_start = 0;
2585 p->se.block_max = 0;
2586 p->se.exec_max = 0;
2587 p->se.slice_max = 0;
2588
2589 p->se.nr_migrations_cold = 0;
2590 p->se.nr_failed_migrations_affine = 0;
2591 p->se.nr_failed_migrations_running = 0;
2592 p->se.nr_failed_migrations_hot = 0;
2593 p->se.nr_forced_migrations = 0;
2594 p->se.nr_forced2_migrations = 0;
2595
2596 p->se.nr_wakeups = 0;
2597 p->se.nr_wakeups_sync = 0;
2598 p->se.nr_wakeups_migrate = 0;
2599 p->se.nr_wakeups_local = 0;
2600 p->se.nr_wakeups_remote = 0;
2601 p->se.nr_wakeups_affine = 0;
2602 p->se.nr_wakeups_affine_attempts = 0;
2603 p->se.nr_wakeups_passive = 0;
2604 p->se.nr_wakeups_idle = 0;
2605
2583#endif 2606#endif
2584 2607
2585 INIT_LIST_HEAD(&p->rt.run_list); 2608 INIT_LIST_HEAD(&p->rt.run_list);
@@ -6541,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
6541 return 0; 6564 return 0;
6542} 6565}
6543 6566
6567static inline int should_resched(void)
6568{
6569 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6570}
6571
6544static void __cond_resched(void) 6572static void __cond_resched(void)
6545{ 6573{
6546#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6560,8 +6588,7 @@ static void __cond_resched(void)
6560 6588
6561int __sched _cond_resched(void) 6589int __sched _cond_resched(void)
6562{ 6590{
6563 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 6591 if (should_resched()) {
6564 system_state == SYSTEM_RUNNING) {
6565 __cond_resched(); 6592 __cond_resched();
6566 return 1; 6593 return 1;
6567 } 6594 }
@@ -6579,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
6579 */ 6606 */
6580int cond_resched_lock(spinlock_t *lock) 6607int cond_resched_lock(spinlock_t *lock)
6581{ 6608{
6582 int resched = need_resched() && system_state == SYSTEM_RUNNING; 6609 int resched = should_resched();
6583 int ret = 0; 6610 int ret = 0;
6584 6611
6585 if (spin_needbreak(lock) || resched) { 6612 if (spin_needbreak(lock) || resched) {
6586 spin_unlock(lock); 6613 spin_unlock(lock);
6587 if (resched && need_resched()) 6614 if (resched)
6588 __cond_resched(); 6615 __cond_resched();
6589 else 6616 else
6590 cpu_relax(); 6617 cpu_relax();
@@ -6599,7 +6626,7 @@ int __sched cond_resched_softirq(void)
6599{ 6626{
6600 BUG_ON(!in_softirq()); 6627 BUG_ON(!in_softirq());
6601 6628
6602 if (need_resched() && system_state == SYSTEM_RUNNING) { 6629 if (should_resched()) {
6603 local_bh_enable(); 6630 local_bh_enable();
6604 __cond_resched(); 6631 __cond_resched();
6605 local_bh_disable(); 6632 local_bh_disable();
@@ -7262,6 +7289,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
7262static void calc_global_load_remove(struct rq *rq) 7289static void calc_global_load_remove(struct rq *rq)
7263{ 7290{
7264 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 7291 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7292 rq->calc_load_active = 0;
7265} 7293}
7266#endif /* CONFIG_HOTPLUG_CPU */ 7294#endif /* CONFIG_HOTPLUG_CPU */
7267 7295
@@ -7488,6 +7516,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7488 task_rq_unlock(rq, &flags); 7516 task_rq_unlock(rq, &flags);
7489 get_task_struct(p); 7517 get_task_struct(p);
7490 cpu_rq(cpu)->migration_thread = p; 7518 cpu_rq(cpu)->migration_thread = p;
7519 rq->calc_load_update = calc_load_update;
7491 break; 7520 break;
7492 7521
7493 case CPU_ONLINE: 7522 case CPU_ONLINE:
@@ -7498,8 +7527,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7498 /* Update our root-domain */ 7527 /* Update our root-domain */
7499 rq = cpu_rq(cpu); 7528 rq = cpu_rq(cpu);
7500 spin_lock_irqsave(&rq->lock, flags); 7529 spin_lock_irqsave(&rq->lock, flags);
7501 rq->calc_load_update = calc_load_update;
7502 rq->calc_load_active = 0;
7503 if (rq->rd) { 7530 if (rq->rd) {
7504 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7531 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7505 7532
@@ -9070,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9070#ifdef CONFIG_SMP 9097#ifdef CONFIG_SMP
9071 rt_rq->rt_nr_migratory = 0; 9098 rt_rq->rt_nr_migratory = 0;
9072 rt_rq->overloaded = 0; 9099 rt_rq->overloaded = 0;
9073 plist_head_init(&rq->rt.pushable_tasks, &rq->lock); 9100 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
9074#endif 9101#endif
9075 9102
9076 rt_rq->rt_time = 0; 9103 rt_rq->rt_time = 0;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6c251790dde..d014efbf947a 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 if (lowest_mask) 84 if (lowest_mask) {
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
86
87 /*
88 * We have to ensure that we have at least one bit
89 * still set in the array, since the map could have
90 * been concurrently emptied between the first and
91 * second reads of vec->mask. If we hit this
92 * condition, simply act as though we never hit this
93 * priority level and continue on.
94 */
95 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
96 continue;
97 }
98
86 return 1; 99 return 1;
87 } 100 }
88 101
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ba7fd6e9556f..652e8bdef9aa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
266 return min_vruntime; 266 return min_vruntime;
267} 267}
268 268
269static inline int entity_before(struct sched_entity *a,
270 struct sched_entity *b)
271{
272 return (s64)(a->vruntime - b->vruntime) < 0;
273}
274
269static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 275static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
270{ 276{
271 return se->vruntime - cfs_rq->min_vruntime; 277 return se->vruntime - cfs_rq->min_vruntime;
@@ -605,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
605static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 611static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
606{ 612{
607#ifdef CONFIG_SCHEDSTATS 613#ifdef CONFIG_SCHEDSTATS
614 struct task_struct *tsk = NULL;
615
616 if (entity_is_task(se))
617 tsk = task_of(se);
618
608 if (se->sleep_start) { 619 if (se->sleep_start) {
609 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 620 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
610 struct task_struct *tsk = task_of(se);
611 621
612 if ((s64)delta < 0) 622 if ((s64)delta < 0)
613 delta = 0; 623 delta = 0;
@@ -618,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
618 se->sleep_start = 0; 628 se->sleep_start = 0;
619 se->sum_sleep_runtime += delta; 629 se->sum_sleep_runtime += delta;
620 630
621 account_scheduler_latency(tsk, delta >> 10, 1); 631 if (tsk)
632 account_scheduler_latency(tsk, delta >> 10, 1);
622 } 633 }
623 if (se->block_start) { 634 if (se->block_start) {
624 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 635 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
625 struct task_struct *tsk = task_of(se);
626 636
627 if ((s64)delta < 0) 637 if ((s64)delta < 0)
628 delta = 0; 638 delta = 0;
@@ -633,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 se->block_start = 0; 643 se->block_start = 0;
634 se->sum_sleep_runtime += delta; 644 se->sum_sleep_runtime += delta;
635 645
636 /* 646 if (tsk) {
637 * Blocking time is in units of nanosecs, so shift by 20 to 647 /*
638 * get a milliseconds-range estimation of the amount of 648 * Blocking time is in units of nanosecs, so shift by
639 * time that the task spent sleeping: 649 * 20 to get a milliseconds-range estimation of the
640 */ 650 * amount of time that the task spent sleeping:
641 if (unlikely(prof_on == SLEEP_PROFILING)) { 651 */
642 652 if (unlikely(prof_on == SLEEP_PROFILING)) {
643 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 653 profile_hits(SLEEP_PROFILING,
644 delta >> 20); 654 (void *)get_wchan(tsk),
655 delta >> 20);
656 }
657 account_scheduler_latency(tsk, delta >> 10, 0);
645 } 658 }
646 account_scheduler_latency(tsk, delta >> 10, 0);
647 } 659 }
648#endif 660#endif
649} 661}
@@ -687,7 +699,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
687 * all of which have the same weight. 699 * all of which have the same weight.
688 */ 700 */
689 if (sched_feat(NORMALIZED_SLEEPER) && 701 if (sched_feat(NORMALIZED_SLEEPER) &&
690 task_of(se)->policy != SCHED_IDLE) 702 (!entity_is_task(se) ||
703 task_of(se)->policy != SCHED_IDLE))
691 thresh = calc_delta_fair(thresh, se); 704 thresh = calc_delta_fair(thresh, se);
692 705
693 vruntime -= thresh; 706 vruntime -= thresh;
@@ -1016,7 +1029,7 @@ static void yield_task_fair(struct rq *rq)
1016 /* 1029 /*
1017 * Already in the rightmost position? 1030 * Already in the rightmost position?
1018 */ 1031 */
1019 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) 1032 if (unlikely(!rightmost || entity_before(rightmost, se)))
1020 return; 1033 return;
1021 1034
1022 /* 1035 /*
@@ -1712,7 +1725,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1712 1725
1713 /* 'curr' will be NULL if the child belongs to a different group */ 1726 /* 'curr' will be NULL if the child belongs to a different group */
1714 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && 1727 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1715 curr && curr->vruntime < se->vruntime) { 1728 curr && entity_before(curr, se)) {
1716 /* 1729 /*
1717 * Upon rescheduling, sched_class::put_prev_task() will place 1730 * Upon rescheduling, sched_class::put_prev_task() will place
1718 * 'current' within the tree based on its new key value. 1731 * 'current' within the tree based on its new key value.
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9bf0d2a73045..3918e01994e0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -10,6 +10,8 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
10 10
11#ifdef CONFIG_RT_GROUP_SCHED 11#ifdef CONFIG_RT_GROUP_SCHED
12 12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{ 16{
15 return rt_rq->rq; 17 return rt_rq->rq;
@@ -22,6 +24,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
22 24
23#else /* CONFIG_RT_GROUP_SCHED */ 25#else /* CONFIG_RT_GROUP_SCHED */
24 26
27#define rt_entity_is_task(rt_se) (1)
28
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{ 30{
27 return container_of(rt_rq, struct rq, rt); 31 return container_of(rt_rq, struct rq, rt);
@@ -73,7 +77,7 @@ static inline void rt_clear_overload(struct rq *rq)
73 77
74static void update_rt_migration(struct rt_rq *rt_rq) 78static void update_rt_migration(struct rt_rq *rt_rq)
75{ 79{
76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { 80 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
77 if (!rt_rq->overloaded) { 81 if (!rt_rq->overloaded) {
78 rt_set_overload(rq_of_rt_rq(rt_rq)); 82 rt_set_overload(rq_of_rt_rq(rt_rq));
79 rt_rq->overloaded = 1; 83 rt_rq->overloaded = 1;
@@ -86,6 +90,12 @@ static void update_rt_migration(struct rt_rq *rt_rq)
86 90
87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 91static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{ 92{
93 if (!rt_entity_is_task(rt_se))
94 return;
95
96 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
97
98 rt_rq->rt_nr_total++;
89 if (rt_se->nr_cpus_allowed > 1) 99 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++; 100 rt_rq->rt_nr_migratory++;
91 101
@@ -94,6 +104,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
94 104
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 105static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{ 106{
107 if (!rt_entity_is_task(rt_se))
108 return;
109
110 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
111
112 rt_rq->rt_nr_total--;
97 if (rt_se->nr_cpus_allowed > 1) 113 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--; 114 rt_rq->rt_nr_migratory--;
99 115
diff --git a/kernel/signal.c b/kernel/signal.c
index ccf1ceedaebe..64c5deeaca5d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2454 stack_t oss; 2454 stack_t oss;
2455 int error; 2455 int error;
2456 2456
2457 if (uoss) { 2457 oss.ss_sp = (void __user *) current->sas_ss_sp;
2458 oss.ss_sp = (void __user *) current->sas_ss_sp; 2458 oss.ss_size = current->sas_ss_size;
2459 oss.ss_size = current->sas_ss_size; 2459 oss.ss_flags = sas_ss_flags(sp);
2460 oss.ss_flags = sas_ss_flags(sp);
2461 }
2462 2460
2463 if (uss) { 2461 if (uss) {
2464 void __user *ss_sp; 2462 void __user *ss_sp;
@@ -2466,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2466 int ss_flags; 2464 int ss_flags;
2467 2465
2468 error = -EFAULT; 2466 error = -EFAULT;
2469 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) 2467 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
2470 || __get_user(ss_sp, &uss->ss_sp) 2468 goto out;
2471 || __get_user(ss_flags, &uss->ss_flags) 2469 error = __get_user(ss_sp, &uss->ss_sp) |
2472 || __get_user(ss_size, &uss->ss_size)) 2470 __get_user(ss_flags, &uss->ss_flags) |
2471 __get_user(ss_size, &uss->ss_size);
2472 if (error)
2473 goto out; 2473 goto out;
2474 2474
2475 error = -EPERM; 2475 error = -EPERM;
@@ -2501,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2501 current->sas_ss_size = ss_size; 2501 current->sas_ss_size = ss_size;
2502 } 2502 }
2503 2503
2504 error = 0;
2504 if (uoss) { 2505 if (uoss) {
2505 error = -EFAULT; 2506 error = -EFAULT;
2506 if (copy_to_user(uoss, &oss, sizeof(oss))) 2507 if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
2507 goto out; 2508 goto out;
2509 error = __put_user(oss.ss_sp, &uoss->ss_sp) |
2510 __put_user(oss.ss_size, &uoss->ss_size) |
2511 __put_user(oss.ss_flags, &uoss->ss_flags);
2508 } 2512 }
2509 2513
2510 error = 0;
2511out: 2514out:
2512 return error; 2515 return error;
2513} 2516}
diff --git a/kernel/smp.c b/kernel/smp.c
index ad63d8501207..94188b8ecc33 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
59 59
60#ifdef CONFIG_CPU_HOTPLUG 60#ifdef CONFIG_HOTPLUG_CPU
61 case CPU_UP_CANCELED: 61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN: 62 case CPU_UP_CANCELED_FROZEN:
63 63
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3a94905fa5d2..eb5e131a0485 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
345 softirq_vec[nr].action = action; 345 softirq_vec[nr].action = action;
346} 346}
347 347
348/* Tasklets */ 348/*
349 * Tasklets
350 */
349struct tasklet_head 351struct tasklet_head
350{ 352{
351 struct tasklet_struct *head; 353 struct tasklet_struct *head;
@@ -493,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)
493 495
494EXPORT_SYMBOL(tasklet_kill); 496EXPORT_SYMBOL(tasklet_kill);
495 497
498/*
499 * tasklet_hrtimer
500 */
501
502/*
503 * The trampoline is called when the hrtimer expires. If this is
504 * called from the hrtimer interrupt then we schedule the tasklet as
505 * the timer callback function expects to run in softirq context. If
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{
511 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer);
513
514 if (hrtimer_is_hres_active(timer)) {
515 tasklet_hi_schedule(&ttimer->tasklet);
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519}
520
521/*
522 * Helper function which calls the hrtimer callback from
523 * tasklet/softirq context
524 */
525static void __tasklet_hrtimer_trampoline(unsigned long data)
526{
527 struct tasklet_hrtimer *ttimer = (void *)data;
528 enum hrtimer_restart restart;
529
530 restart = ttimer->function(&ttimer->timer);
531 if (restart != HRTIMER_NORESTART)
532 hrtimer_restart(&ttimer->timer);
533}
534
535/**
536 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
537 * @ttimer: tasklet_hrtimer which is initialized
538 * @function: hrtimer callback funtion which gets called from softirq context
539 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
540 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
541 */
542void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
543 enum hrtimer_restart (*function)(struct hrtimer *),
544 clockid_t which_clock, enum hrtimer_mode mode)
545{
546 hrtimer_init(&ttimer->timer, which_clock, mode);
547 ttimer->timer.function = __hrtimer_tasklet_trampoline;
548 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
549 (unsigned long)ttimer);
550 ttimer->function = function;
551}
552EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
553
554/*
555 * Remote softirq bits
556 */
557
496DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 558DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
497EXPORT_PER_CPU_SYMBOL(softirq_work_list); 559EXPORT_PER_CPU_SYMBOL(softirq_work_list);
498 560
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62e4ff9968b5..58be76017fd0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,6 +49,7 @@
49#include <linux/acpi.h> 49#include <linux/acpi.h>
50#include <linux/reboot.h> 50#include <linux/reboot.h>
51#include <linux/ftrace.h> 51#include <linux/ftrace.h>
52#include <linux/security.h>
52#include <linux/slow-work.h> 53#include <linux/slow-work.h>
53#include <linux/perf_counter.h> 54#include <linux/perf_counter.h>
54 55
@@ -335,7 +336,10 @@ static struct ctl_table kern_table[] = {
335 .data = &sysctl_timer_migration, 336 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int), 337 .maxlen = sizeof(unsigned int),
337 .mode = 0644, 338 .mode = 0644,
338 .proc_handler = &proc_dointvec, 339 .proc_handler = &proc_dointvec_minmax,
340 .strategy = &sysctl_intvec,
341 .extra1 = &zero,
342 .extra2 = &one,
339 }, 343 },
340#endif 344#endif
341 { 345 {
@@ -744,6 +748,14 @@ static struct ctl_table kern_table[] = {
744 .proc_handler = &proc_dointvec, 748 .proc_handler = &proc_dointvec,
745 }, 749 },
746 { 750 {
751 .ctl_name = CTL_UNNUMBERED,
752 .procname = "panic_on_io_nmi",
753 .data = &panic_on_io_nmi,
754 .maxlen = sizeof(int),
755 .mode = 0644,
756 .proc_handler = &proc_dointvec,
757 },
758 {
747 .ctl_name = KERN_BOOTLOADER_TYPE, 759 .ctl_name = KERN_BOOTLOADER_TYPE,
748 .procname = "bootloader_type", 760 .procname = "bootloader_type",
749 .data = &bootloader_type, 761 .data = &bootloader_type,
@@ -1295,10 +1307,10 @@ static struct ctl_table vm_table[] = {
1295 { 1307 {
1296 .ctl_name = CTL_UNNUMBERED, 1308 .ctl_name = CTL_UNNUMBERED,
1297 .procname = "mmap_min_addr", 1309 .procname = "mmap_min_addr",
1298 .data = &mmap_min_addr, 1310 .data = &dac_mmap_min_addr,
1299 .maxlen = sizeof(unsigned long), 1311 .maxlen = sizeof(unsigned long),
1300 .mode = 0644, 1312 .mode = 0644,
1301 .proc_handler = &proc_doulongvec_minmax, 1313 .proc_handler = &mmap_min_addr_handler,
1302 }, 1314 },
1303#ifdef CONFIG_NUMA 1315#ifdef CONFIG_NUMA
1304 { 1316 {
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ad6dd461119..620b58abdc32 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -137,11 +137,12 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
137 */ 137 */
138int clockevents_register_notifier(struct notifier_block *nb) 138int clockevents_register_notifier(struct notifier_block *nb)
139{ 139{
140 unsigned long flags;
140 int ret; 141 int ret;
141 142
142 spin_lock(&clockevents_lock); 143 spin_lock_irqsave(&clockevents_lock, flags);
143 ret = raw_notifier_chain_register(&clockevents_chain, nb); 144 ret = raw_notifier_chain_register(&clockevents_chain, nb);
144 spin_unlock(&clockevents_lock); 145 spin_unlock_irqrestore(&clockevents_lock, flags);
145 146
146 return ret; 147 return ret;
147} 148}
@@ -178,16 +179,18 @@ static void clockevents_notify_released(void)
178 */ 179 */
179void clockevents_register_device(struct clock_event_device *dev) 180void clockevents_register_device(struct clock_event_device *dev)
180{ 181{
182 unsigned long flags;
183
181 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
182 BUG_ON(!dev->cpumask); 185 BUG_ON(!dev->cpumask);
183 186
184 spin_lock(&clockevents_lock); 187 spin_lock_irqsave(&clockevents_lock, flags);
185 188
186 list_add(&dev->list, &clockevent_devices); 189 list_add(&dev->list, &clockevent_devices);
187 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
188 clockevents_notify_released(); 191 clockevents_notify_released();
189 192
190 spin_unlock(&clockevents_lock); 193 spin_unlock_irqrestore(&clockevents_lock, flags);
191} 194}
192EXPORT_SYMBOL_GPL(clockevents_register_device); 195EXPORT_SYMBOL_GPL(clockevents_register_device);
193 196
@@ -235,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old,
235void clockevents_notify(unsigned long reason, void *arg) 238void clockevents_notify(unsigned long reason, void *arg)
236{ 239{
237 struct list_head *node, *tmp; 240 struct list_head *node, *tmp;
241 unsigned long flags;
238 242
239 spin_lock(&clockevents_lock); 243 spin_lock_irqsave(&clockevents_lock, flags);
240 clockevents_do_notify(reason, arg); 244 clockevents_do_notify(reason, arg);
241 245
242 switch (reason) { 246 switch (reason) {
@@ -251,18 +255,7 @@ void clockevents_notify(unsigned long reason, void *arg)
251 default: 255 default:
252 break; 256 break;
253 } 257 }
254 spin_unlock(&clockevents_lock); 258 spin_unlock_irqrestore(&clockevents_lock, flags);
255} 259}
256EXPORT_SYMBOL_GPL(clockevents_notify); 260EXPORT_SYMBOL_GPL(clockevents_notify);
257
258ktime_t clockevents_get_next_event(int cpu)
259{
260 struct tick_device *td;
261 struct clock_event_device *dev;
262
263 td = &per_cpu(tick_cpu_device, cpu);
264 dev = td->evtdev;
265
266 return dev->next_event;
267}
268#endif 261#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 592bf584d1d2..7466cb811251 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -513,7 +513,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
513 * Check to make sure we don't switch to a non-highres capable 513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz) 514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */ 515 */
516 if (tick_oneshot_mode_active() && 516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { 517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. " 518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name); 519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 877dbedc3118..c2ec25087a35 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -205,11 +205,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
205 * Powerstate information: The system enters/leaves a state, where 205 * Powerstate information: The system enters/leaves a state, where
206 * affected devices might stop 206 * affected devices might stop
207 */ 207 */
208static void tick_do_broadcast_on_off(void *why) 208static void tick_do_broadcast_on_off(unsigned long *reason)
209{ 209{
210 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
211 struct tick_device *td; 211 struct tick_device *td;
212 unsigned long flags, *reason = why; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -276,8 +276,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for " 276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
277 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
278 else 278 else
279 smp_call_function_single(*oncpu, tick_do_broadcast_on_off, 279 tick_do_broadcast_on_off(&reason);
280 &reason, 1);
281} 280}
282 281
283/* 282/*
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a999b92a1277..fddd69d16e03 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -286,7 +286,7 @@ static int __init init_timer_list_procfs(void)
286{ 286{
287 struct proc_dir_entry *pe; 287 struct proc_dir_entry *pe;
288 288
289 pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); 289 pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
290 if (!pe) 290 if (!pe)
291 return -ENOMEM; 291 return -ENOMEM;
292 return 0; 292 return 0;
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
96/* 96/*
97 * Collection status, active/inactive: 97 * Collection status, active/inactive:
98 */ 98 */
99static int __read_mostly active; 99int __read_mostly timer_stats_active;
100 100
101/* 101/*
102 * Beginning/end timestamps of measurement: 102 * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
257 if (!active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
260 entry = tstat_lookup(&input, comm); 260 entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
290 /* 290 /*
291 * If still active then calculate up to now: 291 * If still active then calculate up to now:
292 */ 292 */
293 if (active) 293 if (timer_stats_active)
294 time_stop = ktime_get(); 294 time_stop = ktime_get();
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
368 mutex_lock(&show_mutex); 368 mutex_lock(&show_mutex);
369 switch (ctl[0]) { 369 switch (ctl[0]) {
370 case '0': 370 case '0':
371 if (active) { 371 if (timer_stats_active) {
372 active = 0; 372 timer_stats_active = 0;
373 time_stop = ktime_get(); 373 time_stop = ktime_get();
374 sync_access(); 374 sync_access();
375 } 375 }
376 break; 376 break;
377 case '1': 377 case '1':
378 if (!active) { 378 if (!timer_stats_active) {
379 reset_entries(); 379 reset_entries();
380 time_start = ktime_get(); 380 time_start = ktime_get();
381 smp_mb(); 381 smp_mb();
382 active = 1; 382 timer_stats_active = 1;
383 } 383 }
384 break; 384 break;
385 default: 385 default:
diff --git a/kernel/timer.c b/kernel/timer.c
index 54d3912f8cad..a7f07d5a6241 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
380{ 380{
381 unsigned int flag = 0; 381 unsigned int flag = 0;
382 382
383 if (likely(!timer->start_site))
384 return;
383 if (unlikely(tbase_get_deferrable(timer->base))) 385 if (unlikely(tbase_get_deferrable(timer->base)))
384 flag |= TIMER_STATS_FLAG_DEFERRABLE; 386 flag |= TIMER_STATS_FLAG_DEFERRABLE;
385 387
@@ -712,7 +714,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
712 * networking code - if the timer is re-modified 714 * networking code - if the timer is re-modified
713 * to be the same thing then just return: 715 * to be the same thing then just return:
714 */ 716 */
715 if (timer->expires == expires && timer_pending(timer)) 717 if (timer_pending(timer) && timer->expires == expires)
716 return 1; 718 return 1;
717 719
718 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 720 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1551f47e7669..019f380fd764 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -226,13 +226,13 @@ config BOOT_TRACER
226 the timings of the initcalls and traces key events and the identity 226 the timings of the initcalls and traces key events and the identity
227 of tasks that can cause boot delays, such as context-switches. 227 of tasks that can cause boot delays, such as context-switches.
228 228
229 Its aim is to be parsed by the /scripts/bootgraph.pl tool to 229 Its aim is to be parsed by the scripts/bootgraph.pl tool to
230 produce pretty graphics about boot inefficiencies, giving a visual 230 produce pretty graphics about boot inefficiencies, giving a visual
231 representation of the delays during initcalls - but the raw 231 representation of the delays during initcalls - but the raw
232 /debug/tracing/trace text output is readable too. 232 /debug/tracing/trace text output is readable too.
233 233
234 You must pass in ftrace=initcall to the kernel command line 234 You must pass in initcall_debug and ftrace=initcall to the kernel
235 to enable this on bootup. 235 command line to enable this on bootup.
236 236
237config TRACE_BRANCH_PROFILING 237config TRACE_BRANCH_PROFILING
238 bool 238 bool
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 39af8af6fc30..7a34cb563fec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -22,6 +22,7 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/smp_lock.h>
25#include <linux/time.h> 26#include <linux/time.h>
26#include <linux/uaccess.h> 27#include <linux/uaccess.h>
27 28
@@ -266,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt)
266{ 267{
267 debugfs_remove(bt->msg_file); 268 debugfs_remove(bt->msg_file);
268 debugfs_remove(bt->dropped_file); 269 debugfs_remove(bt->dropped_file);
269 debugfs_remove(bt->dir);
270 relay_close(bt->rchan); 270 relay_close(bt->rchan);
271 debugfs_remove(bt->dir);
271 free_percpu(bt->sequence); 272 free_percpu(bt->sequence);
272 free_percpu(bt->msg_data); 273 free_percpu(bt->msg_data);
273 kfree(bt); 274 kfree(bt);
@@ -377,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
377 378
378static int blk_remove_buf_file_callback(struct dentry *dentry) 379static int blk_remove_buf_file_callback(struct dentry *dentry)
379{ 380{
380 struct dentry *parent = dentry->d_parent;
381 debugfs_remove(dentry); 381 debugfs_remove(dentry);
382 382
383 /*
384 * this will fail for all but the last file, but that is ok. what we
385 * care about is the top level buts->name directory going away, when
386 * the last trace file is gone. Then we don't have to rmdir() that
387 * manually on trace stop, so it nicely solves the issue with
388 * force killing of running traces.
389 */
390
391 debugfs_remove(parent);
392 return 0; 383 return 0;
393} 384}
394 385
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3718d55fb4c3..25edd5cc5935 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -291,7 +291,9 @@ function_stat_next(void *v, int idx)
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); 291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292 292
293 again: 293 again:
294 rec++; 294 if (idx != 0)
295 rec++;
296
295 if ((void *)rec >= (void *)&pg->records[pg->index]) { 297 if ((void *)rec >= (void *)&pg->records[pg->index]) {
296 pg = pg->next; 298 pg = pg->next;
297 if (!pg) 299 if (!pg)
@@ -766,7 +768,7 @@ static struct tracer_stat function_stats __initdata = {
766 .stat_show = function_stat_show 768 .stat_show = function_stat_show
767}; 769};
768 770
769static void ftrace_profile_debugfs(struct dentry *d_tracer) 771static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
770{ 772{
771 struct ftrace_profile_stat *stat; 773 struct ftrace_profile_stat *stat;
772 struct dentry *entry; 774 struct dentry *entry;
@@ -784,7 +786,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
784 * The files created are permanent, if something happens 786 * The files created are permanent, if something happens
785 * we still do not free memory. 787 * we still do not free memory.
786 */ 788 */
787 kfree(stat);
788 WARN(1, 789 WARN(1,
789 "Could not allocate stat file for cpu %d\n", 790 "Could not allocate stat file for cpu %d\n",
790 cpu); 791 cpu);
@@ -811,7 +812,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
811} 812}
812 813
813#else /* CONFIG_FUNCTION_PROFILER */ 814#else /* CONFIG_FUNCTION_PROFILER */
814static void ftrace_profile_debugfs(struct dentry *d_tracer) 815static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
815{ 816{
816} 817}
817#endif /* CONFIG_FUNCTION_PROFILER */ 818#endif /* CONFIG_FUNCTION_PROFILER */
@@ -1417,10 +1418,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1417{ 1418{
1418 struct ftrace_iterator *iter = m->private; 1419 struct ftrace_iterator *iter = m->private;
1419 void *p = NULL; 1420 void *p = NULL;
1421 loff_t l;
1422
1423 if (!(iter->flags & FTRACE_ITER_HASH))
1424 *pos = 0;
1420 1425
1421 iter->flags |= FTRACE_ITER_HASH; 1426 iter->flags |= FTRACE_ITER_HASH;
1422 1427
1423 return t_hash_next(m, p, pos); 1428 iter->hidx = 0;
1429 for (l = 0; l <= *pos; ) {
1430 p = t_hash_next(m, p, &l);
1431 if (!p)
1432 break;
1433 }
1434 return p;
1424} 1435}
1425 1436
1426static int t_hash_show(struct seq_file *m, void *v) 1437static int t_hash_show(struct seq_file *m, void *v)
@@ -1467,8 +1478,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1467 iter->pg = iter->pg->next; 1478 iter->pg = iter->pg->next;
1468 iter->idx = 0; 1479 iter->idx = 0;
1469 goto retry; 1480 goto retry;
1470 } else {
1471 iter->idx = -1;
1472 } 1481 }
1473 } else { 1482 } else {
1474 rec = &iter->pg->records[iter->idx++]; 1483 rec = &iter->pg->records[iter->idx++];
@@ -1497,6 +1506,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1497{ 1506{
1498 struct ftrace_iterator *iter = m->private; 1507 struct ftrace_iterator *iter = m->private;
1499 void *p = NULL; 1508 void *p = NULL;
1509 loff_t l;
1500 1510
1501 mutex_lock(&ftrace_lock); 1511 mutex_lock(&ftrace_lock);
1502 /* 1512 /*
@@ -1508,23 +1518,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1508 if (*pos > 0) 1518 if (*pos > 0)
1509 return t_hash_start(m, pos); 1519 return t_hash_start(m, pos);
1510 iter->flags |= FTRACE_ITER_PRINTALL; 1520 iter->flags |= FTRACE_ITER_PRINTALL;
1511 (*pos)++;
1512 return iter; 1521 return iter;
1513 } 1522 }
1514 1523
1515 if (iter->flags & FTRACE_ITER_HASH) 1524 if (iter->flags & FTRACE_ITER_HASH)
1516 return t_hash_start(m, pos); 1525 return t_hash_start(m, pos);
1517 1526
1518 if (*pos > 0) { 1527 iter->pg = ftrace_pages_start;
1519 if (iter->idx < 0) 1528 iter->idx = 0;
1520 return p; 1529 for (l = 0; l <= *pos; ) {
1521 (*pos)--; 1530 p = t_next(m, p, &l);
1522 iter->idx--; 1531 if (!p)
1532 break;
1523 } 1533 }
1524 1534
1525 p = t_next(m, p, pos); 1535 if (!p && iter->flags & FTRACE_ITER_FILTER)
1526
1527 if (!p)
1528 return t_hash_start(m, pos); 1536 return t_hash_start(m, pos);
1529 1537
1530 return p; 1538 return p;
@@ -1654,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1654 1662
1655 mutex_lock(&ftrace_regex_lock); 1663 mutex_lock(&ftrace_regex_lock);
1656 if ((file->f_mode & FMODE_WRITE) && 1664 if ((file->f_mode & FMODE_WRITE) &&
1657 !(file->f_flags & O_APPEND)) 1665 (file->f_flags & O_TRUNC))
1658 ftrace_filter_reset(enable); 1666 ftrace_filter_reset(enable);
1659 1667
1660 if (file->f_mode & FMODE_READ) { 1668 if (file->f_mode & FMODE_READ) {
@@ -2270,7 +2278,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2270 read++; 2278 read++;
2271 cnt--; 2279 cnt--;
2272 2280
2273 if (!(iter->flags & ~FTRACE_ITER_CONT)) { 2281 /*
2282 * If the parser haven't finished with the last write,
2283 * continue reading the user input without skipping spaces.
2284 */
2285 if (!(iter->flags & FTRACE_ITER_CONT)) {
2274 /* skip white space */ 2286 /* skip white space */
2275 while (cnt && isspace(ch)) { 2287 while (cnt && isspace(ch)) {
2276 ret = get_user(ch, ubuf++); 2288 ret = get_user(ch, ubuf++);
@@ -2280,8 +2292,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2280 cnt--; 2292 cnt--;
2281 } 2293 }
2282 2294
2295 /* only spaces were written */
2283 if (isspace(ch)) { 2296 if (isspace(ch)) {
2284 file->f_pos += read; 2297 *ppos += read;
2285 ret = read; 2298 ret = read;
2286 goto out; 2299 goto out;
2287 } 2300 }
@@ -2311,12 +2324,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2311 if (ret) 2324 if (ret)
2312 goto out; 2325 goto out;
2313 iter->buffer_idx = 0; 2326 iter->buffer_idx = 0;
2314 } else 2327 } else {
2315 iter->flags |= FTRACE_ITER_CONT; 2328 iter->flags |= FTRACE_ITER_CONT;
2329 iter->buffer[iter->buffer_idx++] = ch;
2330 }
2316 2331
2317 2332 *ppos += read;
2318 file->f_pos += read;
2319
2320 ret = read; 2333 ret = read;
2321 out: 2334 out:
2322 mutex_unlock(&ftrace_regex_lock); 2335 mutex_unlock(&ftrace_regex_lock);
@@ -2500,32 +2513,31 @@ int ftrace_graph_count;
2500unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2513unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2501 2514
2502static void * 2515static void *
2503g_next(struct seq_file *m, void *v, loff_t *pos) 2516__g_next(struct seq_file *m, loff_t *pos)
2504{ 2517{
2505 unsigned long *array = m->private; 2518 unsigned long *array = m->private;
2506 int index = *pos;
2507 2519
2508 (*pos)++; 2520 if (*pos >= ftrace_graph_count)
2509
2510 if (index >= ftrace_graph_count)
2511 return NULL; 2521 return NULL;
2522 return &array[*pos];
2523}
2512 2524
2513 return &array[index]; 2525static void *
2526g_next(struct seq_file *m, void *v, loff_t *pos)
2527{
2528 (*pos)++;
2529 return __g_next(m, pos);
2514} 2530}
2515 2531
2516static void *g_start(struct seq_file *m, loff_t *pos) 2532static void *g_start(struct seq_file *m, loff_t *pos)
2517{ 2533{
2518 void *p = NULL;
2519
2520 mutex_lock(&graph_lock); 2534 mutex_lock(&graph_lock);
2521 2535
2522 /* Nothing, tell g_show to print all functions are enabled */ 2536 /* Nothing, tell g_show to print all functions are enabled */
2523 if (!ftrace_graph_count && !*pos) 2537 if (!ftrace_graph_count && !*pos)
2524 return (void *)1; 2538 return (void *)1;
2525 2539
2526 p = g_next(m, p, pos); 2540 return __g_next(m, pos);
2527
2528 return p;
2529} 2541}
2530 2542
2531static void g_stop(struct seq_file *m, void *p) 2543static void g_stop(struct seq_file *m, void *p)
@@ -2570,7 +2582,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2570 2582
2571 mutex_lock(&graph_lock); 2583 mutex_lock(&graph_lock);
2572 if ((file->f_mode & FMODE_WRITE) && 2584 if ((file->f_mode & FMODE_WRITE) &&
2573 !(file->f_flags & O_APPEND)) { 2585 (file->f_flags & O_TRUNC)) {
2574 ftrace_graph_count = 0; 2586 ftrace_graph_count = 0;
2575 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2587 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2576 } 2588 }
@@ -2589,6 +2601,14 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2589} 2601}
2590 2602
2591static int 2603static int
2604ftrace_graph_release(struct inode *inode, struct file *file)
2605{
2606 if (file->f_mode & FMODE_READ)
2607 seq_release(inode, file);
2608 return 0;
2609}
2610
2611static int
2592ftrace_set_func(unsigned long *array, int *idx, char *buffer) 2612ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2593{ 2613{
2594 struct dyn_ftrace *rec; 2614 struct dyn_ftrace *rec;
@@ -2717,9 +2737,10 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2717} 2737}
2718 2738
2719static const struct file_operations ftrace_graph_fops = { 2739static const struct file_operations ftrace_graph_fops = {
2720 .open = ftrace_graph_open, 2740 .open = ftrace_graph_open,
2721 .read = seq_read, 2741 .read = seq_read,
2722 .write = ftrace_graph_write, 2742 .write = ftrace_graph_write,
2743 .release = ftrace_graph_release,
2723}; 2744};
2724#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2745#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2725 2746
@@ -3152,10 +3173,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3152 3173
3153 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3174 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
3154 3175
3155 if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) 3176 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3156 goto out; 3177 goto out;
3157 3178
3158 last_ftrace_enabled = ftrace_enabled; 3179 last_ftrace_enabled = !!ftrace_enabled;
3159 3180
3160 if (ftrace_enabled) { 3181 if (ftrace_enabled) {
3161 3182
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 04dac2638258..a330513d96ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer)
735 735
736 put_online_cpus(); 736 put_online_cpus();
737 737
738 kfree(buffer->buffers);
738 free_cpumask_var(buffer->cpumask); 739 free_cpumask_var(buffer->cpumask);
739 740
740 kfree(buffer); 741 kfree(buffer);
@@ -1563,6 +1564,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1563 return NULL; 1564 return NULL;
1564} 1565}
1565 1566
1567#ifdef CONFIG_TRACING
1568
1566#define TRACE_RECURSIVE_DEPTH 16 1569#define TRACE_RECURSIVE_DEPTH 16
1567 1570
1568static int trace_recursive_lock(void) 1571static int trace_recursive_lock(void)
@@ -1593,6 +1596,13 @@ static void trace_recursive_unlock(void)
1593 current->trace_recursion--; 1596 current->trace_recursion--;
1594} 1597}
1595 1598
1599#else
1600
1601#define trace_recursive_lock() (0)
1602#define trace_recursive_unlock() do { } while (0)
1603
1604#endif
1605
1596static DEFINE_PER_CPU(int, rb_need_resched); 1606static DEFINE_PER_CPU(int, rb_need_resched);
1597 1607
1598/** 1608/**
@@ -1776,7 +1786,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1776 */ 1786 */
1777 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); 1787 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1778 1788
1779 if (!rb_try_to_discard(cpu_buffer, event)) 1789 if (rb_try_to_discard(cpu_buffer, event))
1780 goto out; 1790 goto out;
1781 1791
1782 /* 1792 /*
@@ -2374,7 +2384,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2374 * the box. Return the padding, and we will release 2384 * the box. Return the padding, and we will release
2375 * the current locks, and try again. 2385 * the current locks, and try again.
2376 */ 2386 */
2377 rb_advance_reader(cpu_buffer);
2378 return event; 2387 return event;
2379 2388
2380 case RINGBUF_TYPE_TIME_EXTEND: 2389 case RINGBUF_TYPE_TIME_EXTEND:
@@ -2477,7 +2486,7 @@ static inline int rb_ok_to_lock(void)
2477 * buffer too. A one time deal is all you get from reading 2486 * buffer too. A one time deal is all you get from reading
2478 * the ring buffer from an NMI. 2487 * the ring buffer from an NMI.
2479 */ 2488 */
2480 if (likely(!in_nmi() && !oops_in_progress)) 2489 if (likely(!in_nmi()))
2481 return 1; 2490 return 1;
2482 2491
2483 tracing_off_permanent(); 2492 tracing_off_permanent();
@@ -2510,6 +2519,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2510 if (dolock) 2519 if (dolock)
2511 spin_lock(&cpu_buffer->reader_lock); 2520 spin_lock(&cpu_buffer->reader_lock);
2512 event = rb_buffer_peek(buffer, cpu, ts); 2521 event = rb_buffer_peek(buffer, cpu, ts);
2522 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2523 rb_advance_reader(cpu_buffer);
2513 if (dolock) 2524 if (dolock)
2514 spin_unlock(&cpu_buffer->reader_lock); 2525 spin_unlock(&cpu_buffer->reader_lock);
2515 local_irq_restore(flags); 2526 local_irq_restore(flags);
@@ -2581,12 +2592,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2581 spin_lock(&cpu_buffer->reader_lock); 2592 spin_lock(&cpu_buffer->reader_lock);
2582 2593
2583 event = rb_buffer_peek(buffer, cpu, ts); 2594 event = rb_buffer_peek(buffer, cpu, ts);
2584 if (!event) 2595 if (event)
2585 goto out_unlock; 2596 rb_advance_reader(cpu_buffer);
2586
2587 rb_advance_reader(cpu_buffer);
2588 2597
2589 out_unlock:
2590 if (dolock) 2598 if (dolock)
2591 spin_unlock(&cpu_buffer->reader_lock); 2599 spin_unlock(&cpu_buffer->reader_lock);
2592 local_irq_restore(flags); 2600 local_irq_restore(flags);
@@ -3104,6 +3112,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3104} 3112}
3105EXPORT_SYMBOL_GPL(ring_buffer_read_page); 3113EXPORT_SYMBOL_GPL(ring_buffer_read_page);
3106 3114
3115#ifdef CONFIG_TRACING
3107static ssize_t 3116static ssize_t
3108rb_simple_read(struct file *filp, char __user *ubuf, 3117rb_simple_read(struct file *filp, char __user *ubuf,
3109 size_t cnt, loff_t *ppos) 3118 size_t cnt, loff_t *ppos)
@@ -3171,6 +3180,7 @@ static __init int rb_init_debugfs(void)
3171} 3180}
3172 3181
3173fs_initcall(rb_init_debugfs); 3182fs_initcall(rb_init_debugfs);
3183#endif
3174 3184
3175#ifdef CONFIG_HOTPLUG_CPU 3185#ifdef CONFIG_HOTPLUG_CPU
3176static int rb_cpu_notify(struct notifier_block *self, 3186static int rb_cpu_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 076fa6f0ee48..8c358395d338 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
20#include <linux/notifier.h> 21#include <linux/notifier.h>
21#include <linux/irqflags.h> 22#include <linux/irqflags.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
@@ -284,13 +285,12 @@ void trace_wake_up(void)
284static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
285{ 286{
286 unsigned long buf_size; 287 unsigned long buf_size;
287 int ret;
288 288
289 if (!str) 289 if (!str)
290 return 0; 290 return 0;
291 ret = strict_strtoul(str, 0, &buf_size); 291 buf_size = memparse(str, &str);
292 /* nr_entries can not be zero */ 292 /* nr_entries can not be zero */
293 if (ret < 0 || buf_size == 0) 293 if (buf_size == 0)
294 return 0; 294 return 0;
295 trace_buf_size = buf_size; 295 trace_buf_size = buf_size;
296 return 1; 296 return 1;
@@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
848 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 848 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
849 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 849 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
850} 850}
851EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
851 852
852struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 853struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
853 int type, 854 int type,
@@ -2031,7 +2032,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2031 2032
2032 /* If this file was open for write, then erase contents */ 2033 /* If this file was open for write, then erase contents */
2033 if ((file->f_mode & FMODE_WRITE) && 2034 if ((file->f_mode & FMODE_WRITE) &&
2034 !(file->f_flags & O_APPEND)) { 2035 (file->f_flags & O_TRUNC)) {
2035 long cpu = (long) inode->i_private; 2036 long cpu = (long) inode->i_private;
2036 2037
2037 if (cpu == TRACE_PIPE_ALL_CPU) 2038 if (cpu == TRACE_PIPE_ALL_CPU)
@@ -2053,25 +2054,23 @@ static int tracing_open(struct inode *inode, struct file *file)
2053static void * 2054static void *
2054t_next(struct seq_file *m, void *v, loff_t *pos) 2055t_next(struct seq_file *m, void *v, loff_t *pos)
2055{ 2056{
2056 struct tracer *t = m->private; 2057 struct tracer *t = v;
2057 2058
2058 (*pos)++; 2059 (*pos)++;
2059 2060
2060 if (t) 2061 if (t)
2061 t = t->next; 2062 t = t->next;
2062 2063
2063 m->private = t;
2064
2065 return t; 2064 return t;
2066} 2065}
2067 2066
2068static void *t_start(struct seq_file *m, loff_t *pos) 2067static void *t_start(struct seq_file *m, loff_t *pos)
2069{ 2068{
2070 struct tracer *t = m->private; 2069 struct tracer *t;
2071 loff_t l = 0; 2070 loff_t l = 0;
2072 2071
2073 mutex_lock(&trace_types_lock); 2072 mutex_lock(&trace_types_lock);
2074 for (; t && l < *pos; t = t_next(m, t, &l)) 2073 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
2075 ; 2074 ;
2076 2075
2077 return t; 2076 return t;
@@ -2107,18 +2106,10 @@ static struct seq_operations show_traces_seq_ops = {
2107 2106
2108static int show_traces_open(struct inode *inode, struct file *file) 2107static int show_traces_open(struct inode *inode, struct file *file)
2109{ 2108{
2110 int ret;
2111
2112 if (tracing_disabled) 2109 if (tracing_disabled)
2113 return -ENODEV; 2110 return -ENODEV;
2114 2111
2115 ret = seq_open(file, &show_traces_seq_ops); 2112 return seq_open(file, &show_traces_seq_ops);
2116 if (!ret) {
2117 struct seq_file *m = file->private_data;
2118 m->private = trace_types;
2119 }
2120
2121 return ret;
2122} 2113}
2123 2114
2124static ssize_t 2115static ssize_t
@@ -3095,7 +3086,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3095 break; 3086 break;
3096 } 3087 }
3097 3088
3098 trace_consume(iter); 3089 if (ret != TRACE_TYPE_NO_CONSUME)
3090 trace_consume(iter);
3099 rem -= count; 3091 rem -= count;
3100 if (!find_next_entry_inc(iter)) { 3092 if (!find_next_entry_inc(iter)) {
3101 rem = 0; 3093 rem = 0;
@@ -3904,17 +3896,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
3904 if (ret < 0) 3896 if (ret < 0)
3905 return ret; 3897 return ret;
3906 3898
3907 switch (val) { 3899 if (val != 0 && val != 1)
3908 case 0:
3909 trace_flags &= ~(1 << index);
3910 break;
3911 case 1:
3912 trace_flags |= 1 << index;
3913 break;
3914
3915 default:
3916 return -EINVAL; 3900 return -EINVAL;
3917 } 3901 set_tracer_flags(1 << index, val);
3918 3902
3919 *ppos += cnt; 3903 *ppos += cnt;
3920 3904
@@ -4243,8 +4227,11 @@ static void __ftrace_dump(bool disable_tracing)
4243 iter.pos = -1; 4227 iter.pos = -1;
4244 4228
4245 if (find_next_entry_inc(&iter) != NULL) { 4229 if (find_next_entry_inc(&iter) != NULL) {
4246 print_trace_line(&iter); 4230 int ret;
4247 trace_consume(&iter); 4231
4232 ret = print_trace_line(&iter);
4233 if (ret != TRACE_TYPE_NO_CONSUME)
4234 trace_consume(&iter);
4248 } 4235 }
4249 4236
4250 trace_printk_seq(&iter.seq); 4237 trace_printk_seq(&iter.seq);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6e735d4771f8..8b9f4f6e9559 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
438struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 438struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
439 int *ent_cpu, u64 *ent_ts); 439 int *ent_cpu, u64 *ent_ts);
440 440
441void tracing_generic_entry_update(struct trace_entry *entry,
442 unsigned long flags,
443 int pc);
444
445void default_wait_pipe(struct trace_iterator *iter); 441void default_wait_pipe(struct trace_iterator *iter);
446void poll_wait_pipe(struct trace_iterator *iter); 442void poll_wait_pipe(struct trace_iterator *iter);
447 443
@@ -597,6 +593,7 @@ print_graph_function(struct trace_iterator *iter)
597 593
598extern struct pid *ftrace_pid_trace; 594extern struct pid *ftrace_pid_trace;
599 595
596#ifdef CONFIG_FUNCTION_TRACER
600static inline int ftrace_trace_task(struct task_struct *task) 597static inline int ftrace_trace_task(struct task_struct *task)
601{ 598{
602 if (!ftrace_pid_trace) 599 if (!ftrace_pid_trace)
@@ -604,6 +601,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
604 601
605 return test_tsk_trace_trace(task); 602 return test_tsk_trace_trace(task);
606} 603}
604#else
605static inline int ftrace_trace_task(struct task_struct *task)
606{
607 return 1;
608}
609#endif
607 610
608/* 611/*
609 * trace_iterator_flags is an enumeration that defines bit 612 * trace_iterator_flags is an enumeration that defines bit
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 5b5895afecfe..11ba5bb4ed0a 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id)
14 14
15 mutex_lock(&event_mutex); 15 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 16 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id) { 17 if (event->id == event_id && event->profile_enable) {
18 ret = event->profile_enable(event); 18 ret = event->profile_enable(event);
19 break; 19 break;
20 } 20 }
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 5e32e375134d..6db005e12487 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore, 26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT( 27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func) 28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
29 TRACE_FIELD(int, ret.depth, depth) 32 TRACE_FIELD(int, ret.depth, depth)
30 ), 33 ),
31 TP_RAW_FMT("<-- %lx (%d)") 34 TP_RAW_FMT("<-- %lx (%d)")
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa08be69a1b6..e75276a49cf5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -300,10 +300,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
300 300
301static void *t_start(struct seq_file *m, loff_t *pos) 301static void *t_start(struct seq_file *m, loff_t *pos)
302{ 302{
303 struct ftrace_event_call *call = NULL;
304 loff_t l;
305
303 mutex_lock(&event_mutex); 306 mutex_lock(&event_mutex);
304 if (*pos == 0) 307
305 m->private = ftrace_events.next; 308 m->private = ftrace_events.next;
306 return t_next(m, NULL, pos); 309 for (l = 0; l <= *pos; ) {
310 call = t_next(m, NULL, &l);
311 if (!call)
312 break;
313 }
314 return call;
307} 315}
308 316
309static void * 317static void *
@@ -332,10 +340,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
332 340
333static void *s_start(struct seq_file *m, loff_t *pos) 341static void *s_start(struct seq_file *m, loff_t *pos)
334{ 342{
343 struct ftrace_event_call *call = NULL;
344 loff_t l;
345
335 mutex_lock(&event_mutex); 346 mutex_lock(&event_mutex);
336 if (*pos == 0) 347
337 m->private = ftrace_events.next; 348 m->private = ftrace_events.next;
338 return s_next(m, NULL, pos); 349 for (l = 0; l <= *pos; ) {
350 call = s_next(m, NULL, &l);
351 if (!call)
352 break;
353 }
354 return call;
339} 355}
340 356
341static int t_show(struct seq_file *m, void *v) 357static int t_show(struct seq_file *m, void *v)
@@ -360,7 +376,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
360 const struct seq_operations *seq_ops; 376 const struct seq_operations *seq_ops;
361 377
362 if ((file->f_mode & FMODE_WRITE) && 378 if ((file->f_mode & FMODE_WRITE) &&
363 !(file->f_flags & O_APPEND)) 379 (file->f_flags & O_TRUNC))
364 ftrace_clear_events(); 380 ftrace_clear_events();
365 381
366 seq_ops = inode->i_private; 382 seq_ops = inode->i_private;
@@ -924,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
924 entry = trace_create_file("enable", 0644, call->dir, call, 940 entry = trace_create_file("enable", 0644, call->dir, call,
925 enable); 941 enable);
926 942
927 if (call->id) 943 if (call->id && call->profile_enable)
928 entry = trace_create_file("id", 0444, call->dir, call, 944 entry = trace_create_file("id", 0444, call->dir, call,
929 id); 945 id);
930 946
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 936c621bbf46..f32dc9d1ea7b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
624 return -ENOSPC; 624 return -ENOSPC;
625 } 625 }
626 626
627 filter->preds[filter->n_preds] = pred;
628 filter->n_preds++;
629
630 list_for_each_entry(call, &ftrace_events, list) { 627 list_for_each_entry(call, &ftrace_events, list) {
631 628
632 if (!call->define_fields) 629 if (!call->define_fields)
@@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
643 } 640 }
644 replace_filter_string(call->filter, filter_string); 641 replace_filter_string(call->filter, filter_string);
645 } 642 }
643
644 filter->preds[filter->n_preds] = pred;
645 filter->n_preds++;
646out: 646out:
647 return err; 647 return err;
648} 648}
@@ -1029,12 +1029,17 @@ static int replace_preds(struct event_subsystem *system,
1029 1029
1030 if (elt->op == OP_AND || elt->op == OP_OR) { 1030 if (elt->op == OP_AND || elt->op == OP_OR) {
1031 pred = create_logical_pred(elt->op); 1031 pred = create_logical_pred(elt->op);
1032 if (!pred)
1033 return -ENOMEM;
1032 if (call) { 1034 if (call) {
1033 err = filter_add_pred(ps, call, pred); 1035 err = filter_add_pred(ps, call, pred);
1034 filter_free_pred(pred); 1036 filter_free_pred(pred);
1035 } else 1037 } else {
1036 err = filter_add_subsystem_pred(ps, system, 1038 err = filter_add_subsystem_pred(ps, system,
1037 pred, filter_string); 1039 pred, filter_string);
1040 if (err)
1041 filter_free_pred(pred);
1042 }
1038 if (err) 1043 if (err)
1039 return err; 1044 return err;
1040 1045
@@ -1048,12 +1053,17 @@ static int replace_preds(struct event_subsystem *system,
1048 } 1053 }
1049 1054
1050 pred = create_pred(elt->op, operand1, operand2); 1055 pred = create_pred(elt->op, operand1, operand2);
1056 if (!pred)
1057 return -ENOMEM;
1051 if (call) { 1058 if (call) {
1052 err = filter_add_pred(ps, call, pred); 1059 err = filter_add_pred(ps, call, pred);
1053 filter_free_pred(pred); 1060 filter_free_pred(pred);
1054 } else 1061 } else {
1055 err = filter_add_subsystem_pred(ps, system, pred, 1062 err = filter_add_subsystem_pred(ps, system, pred,
1056 filter_string); 1063 filter_string);
1064 if (err)
1065 filter_free_pred(pred);
1066 }
1057 if (err) 1067 if (err)
1058 return err; 1068 return err;
1059 1069
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 90f134764837..75ef000613c3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -302,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
302 if (count == -1) 302 if (count == -1)
303 seq_printf(m, ":unlimited\n"); 303 seq_printf(m, ":unlimited\n");
304 else 304 else
305 seq_printf(m, ":count=%ld", count); 305 seq_printf(m, ":count=%ld\n", count);
306 seq_putc(m, '\n');
307 306
308 return 0; 307 return 0;
309} 308}
@@ -364,7 +363,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
364 out_reg: 363 out_reg:
365 ret = register_ftrace_function_probe(glob, ops, count); 364 ret = register_ftrace_function_probe(glob, ops, count);
366 365
367 return ret; 366 return ret < 0 ? ret : 0;
368} 367}
369 368
370static struct ftrace_func_command ftrace_traceon_cmd = { 369static struct ftrace_func_command ftrace_traceon_cmd = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d2249abafb53..420ec3487579 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter)
843 843
844 switch (entry->type) { 844 switch (entry->type) {
845 case TRACE_GRAPH_ENT: { 845 case TRACE_GRAPH_ENT: {
846 struct ftrace_graph_ent_entry *field; 846 /*
847 * print_graph_entry() may consume the current event,
848 * thus @field may become invalid, so we need to save it.
849 * sizeof(struct ftrace_graph_ent_entry) is very small,
850 * it can be safely saved at the stack.
851 */
852 struct ftrace_graph_ent_entry *field, saved;
847 trace_assign_type(field, entry); 853 trace_assign_type(field, entry);
848 return print_graph_entry(field, s, iter); 854 saved = *field;
855 return print_graph_entry(&saved, s, iter);
849 } 856 }
850 case TRACE_GRAPH_RET: { 857 case TRACE_GRAPH_RET: {
851 struct ftrace_graph_ret_entry *field; 858 struct ftrace_graph_ret_entry *field;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7938f3ae93e3..e0c2545622e8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 29
30 s->buffer[len] = 0; 30 seq_write(m, s->buffer, len);
31 seq_puts(m, s->buffer);
32 31
33 trace_seq_init(s); 32 trace_seq_init(s);
34} 33}
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 9bece9687b62..687699d365ae 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
155EXPORT_SYMBOL_GPL(__ftrace_vprintk); 155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156 156
157static void * 157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos) 158t_start(struct seq_file *m, loff_t *pos)
159{ 159{
160 const char **fmt = m->private; 160 const char **fmt = __start___trace_bprintk_fmt + *pos;
161 const char **next = fmt;
162
163 (*pos)++;
164 161
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) 162 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL; 163 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt; 164 return fmt;
172} 165}
173 166
174static void *t_start(struct seq_file *m, loff_t *pos) 167static void *t_next(struct seq_file *m, void * v, loff_t *pos)
175{ 168{
176 return t_next(m, NULL, pos); 169 (*pos)++;
170 return t_start(m, pos);
177} 171}
178 172
179static int t_show(struct seq_file *m, void *v) 173static int t_show(struct seq_file *m, void *v)
@@ -182,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)
182 const char *str = *fmt; 176 const char *str = *fmt;
183 int i; 177 int i;
184 178
185 seq_printf(m, "0x%lx : \"", (unsigned long)fmt); 179 seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
186 180
187 /* 181 /*
188 * Tabs and new lines need to be converted. 182 * Tabs and new lines need to be converted.
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
224static int 218static int
225ftrace_formats_open(struct inode *inode, struct file *file) 219ftrace_formats_open(struct inode *inode, struct file *file)
226{ 220{
227 int ret; 221 return seq_open(file, &show_format_seq_ops);
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236} 222}
237 223
238static const struct file_operations ftrace_formats_fops = { 224static const struct file_operations ftrace_formats_fops = {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2d7aebd71dbd..6a2a9d484cd6 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = {
301 301
302static int stack_trace_open(struct inode *inode, struct file *file) 302static int stack_trace_open(struct inode *inode, struct file *file)
303{ 303{
304 int ret; 304 return seq_open(file, &stack_trace_seq_ops);
305
306 ret = seq_open(file, &stack_trace_seq_ops);
307
308 return ret;
309} 305}
310 306
311static const struct file_operations stack_trace_fops = { 307static const struct file_operations stack_trace_fops = {
312 .open = stack_trace_open, 308 .open = stack_trace_open,
313 .read = seq_read, 309 .read = seq_read,
314 .llseek = seq_lseek, 310 .llseek = seq_lseek,
311 .release = seq_release,
315}; 312};
316 313
317int 314int
@@ -326,10 +323,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
326 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 323 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
327 324
328 if (ret || !write || 325 if (ret || !write ||
329 (last_stack_tracer_enabled == stack_tracer_enabled)) 326 (last_stack_tracer_enabled == !!stack_tracer_enabled))
330 goto out; 327 goto out;
331 328
332 last_stack_tracer_enabled = stack_tracer_enabled; 329 last_stack_tracer_enabled = !!stack_tracer_enabled;
333 330
334 if (stack_tracer_enabled) 331 if (stack_tracer_enabled)
335 register_ftrace_function(&trace_ops); 332 register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index c00643733f4c..aea321c82fa0 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -73,7 +73,7 @@ static struct rb_node *release_next(struct rb_node *node)
73 } 73 }
74} 74}
75 75
76static void reset_stat_session(struct stat_session *session) 76static void __reset_stat_session(struct stat_session *session)
77{ 77{
78 struct rb_node *node = session->stat_root.rb_node; 78 struct rb_node *node = session->stat_root.rb_node;
79 79
@@ -83,10 +83,17 @@ static void reset_stat_session(struct stat_session *session)
83 session->stat_root = RB_ROOT; 83 session->stat_root = RB_ROOT;
84} 84}
85 85
86static void reset_stat_session(struct stat_session *session)
87{
88 mutex_lock(&session->stat_mutex);
89 __reset_stat_session(session);
90 mutex_unlock(&session->stat_mutex);
91}
92
86static void destroy_session(struct stat_session *session) 93static void destroy_session(struct stat_session *session)
87{ 94{
88 debugfs_remove(session->file); 95 debugfs_remove(session->file);
89 reset_stat_session(session); 96 __reset_stat_session(session);
90 mutex_destroy(&session->stat_mutex); 97 mutex_destroy(&session->stat_mutex);
91 kfree(session); 98 kfree(session);
92} 99}
@@ -150,7 +157,7 @@ static int stat_seq_init(struct stat_session *session)
150 int i; 157 int i;
151 158
152 mutex_lock(&session->stat_mutex); 159 mutex_lock(&session->stat_mutex);
153 reset_stat_session(session); 160 __reset_stat_session(session);
154 161
155 if (!ts->stat_cmp) 162 if (!ts->stat_cmp)
156 ts->stat_cmp = dummy_cmp; 163 ts->stat_cmp = dummy_cmp;
@@ -183,7 +190,7 @@ exit:
183 return ret; 190 return ret;
184 191
185exit_free_rbtree: 192exit_free_rbtree:
186 reset_stat_session(session); 193 __reset_stat_session(session);
187 mutex_unlock(&session->stat_mutex); 194 mutex_unlock(&session->stat_mutex);
188 return ret; 195 return ret;
189} 196}
@@ -199,17 +206,13 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
199 mutex_lock(&session->stat_mutex); 206 mutex_lock(&session->stat_mutex);
200 207
201 /* If we are in the beginning of the file, print the headers */ 208 /* If we are in the beginning of the file, print the headers */
202 if (!*pos && session->ts->stat_headers) { 209 if (!*pos && session->ts->stat_headers)
203 (*pos)++;
204 return SEQ_START_TOKEN; 210 return SEQ_START_TOKEN;
205 }
206 211
207 node = rb_first(&session->stat_root); 212 node = rb_first(&session->stat_root);
208 for (i = 0; node && i < *pos; i++) 213 for (i = 0; node && i < *pos; i++)
209 node = rb_next(node); 214 node = rb_next(node);
210 215
211 (*pos)++;
212
213 return node; 216 return node;
214} 217}
215 218
@@ -254,16 +257,21 @@ static const struct seq_operations trace_stat_seq_ops = {
254static int tracing_stat_open(struct inode *inode, struct file *file) 257static int tracing_stat_open(struct inode *inode, struct file *file)
255{ 258{
256 int ret; 259 int ret;
257 260 struct seq_file *m;
258 struct stat_session *session = inode->i_private; 261 struct stat_session *session = inode->i_private;
259 262
263 ret = stat_seq_init(session);
264 if (ret)
265 return ret;
266
260 ret = seq_open(file, &trace_stat_seq_ops); 267 ret = seq_open(file, &trace_stat_seq_ops);
261 if (!ret) { 268 if (ret) {
262 struct seq_file *m = file->private_data; 269 reset_stat_session(session);
263 m->private = session; 270 return ret;
264 ret = stat_seq_init(session);
265 } 271 }
266 272
273 m = file->private_data;
274 m->private = session;
267 return ret; 275 return ret;
268} 276}
269 277
@@ -274,11 +282,9 @@ static int tracing_stat_release(struct inode *i, struct file *f)
274{ 282{
275 struct stat_session *session = i->i_private; 283 struct stat_session *session = i->i_private;
276 284
277 mutex_lock(&session->stat_mutex);
278 reset_stat_session(session); 285 reset_stat_session(session);
279 mutex_unlock(&session->stat_mutex);
280 286
281 return 0; 287 return seq_release(i, f);
282} 288}
283 289
284static const struct file_operations tracing_stat_fops = { 290static const struct file_operations tracing_stat_fops = {
diff --git a/kernel/wait.c b/kernel/wait.c
index ea7c3b4275cf..c4bd3d825f35 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void init_waitqueue_head(wait_queue_head_t *q) 13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key);
16 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
17} 18}
18 19
19EXPORT_SYMBOL(init_waitqueue_head); 20EXPORT_SYMBOL(__init_waitqueue_head);
20 21
21void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 22void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
22{ 23{