aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorJames Morris <jmorris@namei.org>2009-06-29 19:10:35 -0400
committerJames Morris <jmorris@namei.org>2009-06-29 19:10:35 -0400
commitac7242142b03421c96b0a2f8d99f146d075614c2 (patch)
treeb0b2ead65858c7a343d38affed86fe815e37e7e9 /kernel
parent89c86576ecde504da1eeb4f4882b2189ac2f9c4a (diff)
parent2bfdd79eaa0043346e773ba5f6cfd811ea31b73d (diff)
Merge branch 'master' into next
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/audit.c146
-rw-r--r--kernel/audit.h43
-rw-r--r--kernel/audit_tree.c66
-rw-r--r--kernel/audit_watch.c543
-rw-r--r--kernel/auditfilter.c518
-rw-r--r--kernel/auditsc.c33
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/futex.c45
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/perf_counter.c312
-rw-r--r--kernel/sched.c5
-rw-r--r--kernel/sched_cpupri.c2
-rw-r--r--kernel/sched_debug.c6
-rw-r--r--kernel/sched_fair.c3
-rw-r--r--kernel/sysctl.c13
-rw-r--r--kernel/time/tick-sched.c12
-rw-r--r--kernel/time/timer_stats.c16
-rw-r--r--kernel/timer.c2
-rw-r--r--kernel/trace/Kconfig8
-rw-r--r--kernel/trace/ftrace.c63
-rw-r--r--kernel/trace/kmemtrace.c2
-rw-r--r--kernel/trace/ring_buffer.c322
-rw-r--r--kernel/trace/ring_buffer_benchmark.c45
-rw-r--r--kernel/trace/trace.c31
-rw-r--r--kernel/trace/trace.h7
-rw-r--r--kernel/trace/trace_events.c28
-rw-r--r--kernel/trace/trace_events_filter.c37
-rw-r--r--kernel/trace/trace_functions.c11
-rw-r--r--kernel/trace/trace_functions_graph.c36
-rw-r--r--kernel/trace/trace_printk.c26
-rw-r--r--kernel/trace/trace_stat.c6
33 files changed, 1362 insertions, 1050 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0a32cb21ec97..2093a691f1c2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,7 +69,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o
69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
74obj-$(CONFIG_GCOV_KERNEL) += gcov/ 74obj-$(CONFIG_GCOV_KERNEL) += gcov/
75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
@@ -96,6 +96,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
96obj-$(CONFIG_FUNCTION_TRACER) += trace/ 96obj-$(CONFIG_FUNCTION_TRACER) += trace/
97obj-$(CONFIG_TRACING) += trace/ 97obj-$(CONFIG_TRACING) += trace/
98obj-$(CONFIG_X86_DS) += trace/ 98obj-$(CONFIG_X86_DS) += trace/
99obj-$(CONFIG_RING_BUFFER) += trace/
99obj-$(CONFIG_SMP) += sched_cpupri.o 100obj-$(CONFIG_SMP) += sched_cpupri.o
100obj-$(CONFIG_SLOW_WORK) += slow-work.o 101obj-$(CONFIG_SLOW_WORK) += slow-work.o
101obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 102obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba9..defc2e6f1e3b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
115/* The netlink socket. */ 115/* The netlink socket. */
116static struct sock *audit_sock; 116static struct sock *audit_sock;
117 117
118/* Inotify handle. */
119struct inotify_handle *audit_ih;
120
121/* Hash for inode-based rules */ 118/* Hash for inode-based rules */
122struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 119struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
123 120
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
136static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 133static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
137 134
138/* Serialize requests from userspace. */ 135/* Serialize requests from userspace. */
139static DEFINE_MUTEX(audit_cmd_mutex); 136DEFINE_MUTEX(audit_cmd_mutex);
140 137
141/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 138/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
142 * audit records. Since printk uses a 1024 byte buffer, this buffer 139 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
375 kfree_skb(skb); 372 kfree_skb(skb);
376} 373}
377 374
375/*
376 * For one reason or another this nlh isn't getting delivered to the userspace
377 * audit daemon, just send it to printk.
378 */
379static void audit_printk_skb(struct sk_buff *skb)
380{
381 struct nlmsghdr *nlh = nlmsg_hdr(skb);
382 char *data = NLMSG_DATA(nlh);
383
384 if (nlh->nlmsg_type != AUDIT_EOE) {
385 if (printk_ratelimit())
386 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
387 else
388 audit_log_lost("printk limit exceeded\n");
389 }
390
391 audit_hold_skb(skb);
392}
393
378static void kauditd_send_skb(struct sk_buff *skb) 394static void kauditd_send_skb(struct sk_buff *skb)
379{ 395{
380 int err; 396 int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
427 if (skb) { 443 if (skb) {
428 if (audit_pid) 444 if (audit_pid)
429 kauditd_send_skb(skb); 445 kauditd_send_skb(skb);
430 else { 446 else
431 if (printk_ratelimit()) 447 audit_printk_skb(skb);
432 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
433 else
434 audit_log_lost("printk limit exceeded\n");
435
436 audit_hold_skb(skb);
437 }
438 } else { 448 } else {
439 DECLARE_WAITQUEUE(wait, current); 449 DECLARE_WAITQUEUE(wait, current);
440 set_current_state(TASK_INTERRUPTIBLE); 450 set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
495 return 0; 505 return 0;
496} 506}
497 507
498#ifdef CONFIG_AUDIT_TREE
499static int prune_tree_thread(void *unused)
500{
501 mutex_lock(&audit_cmd_mutex);
502 audit_prune_trees();
503 mutex_unlock(&audit_cmd_mutex);
504 return 0;
505}
506
507void audit_schedule_prune(void)
508{
509 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
510}
511#endif
512
513struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
514 int multi, void *payload, int size) 509 int multi, void *payload, int size)
515{ 510{
516 struct sk_buff *skb; 511 struct sk_buff *skb;
517 struct nlmsghdr *nlh; 512 struct nlmsghdr *nlh;
518 int len = NLMSG_SPACE(size);
519 void *data; 513 void *data;
520 int flags = multi ? NLM_F_MULTI : 0; 514 int flags = multi ? NLM_F_MULTI : 0;
521 int t = done ? NLMSG_DONE : type; 515 int t = done ? NLMSG_DONE : type;
522 516
523 skb = alloc_skb(len, GFP_KERNEL); 517 skb = nlmsg_new(size, GFP_KERNEL);
524 if (!skb) 518 if (!skb)
525 return NULL; 519 return NULL;
526 520
527 nlh = NLMSG_PUT(skb, pid, seq, t, size); 521 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
528 nlh->nlmsg_flags = flags; 522 data = NLMSG_DATA(nlh);
529 data = NLMSG_DATA(nlh);
530 memcpy(data, payload, size); 523 memcpy(data, payload, size);
531 return skb; 524 return skb;
532 525
533nlmsg_failure: /* Used by NLMSG_PUT */ 526nlmsg_failure: /* Used by NLMSG_NEW */
534 if (skb) 527 if (skb)
535 kfree_skb(skb); 528 kfree_skb(skb);
536 return NULL; 529 return NULL;
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
926} 919}
927 920
928/* 921/*
929 * Get message from skb (based on rtnetlink_rcv_skb). Each message is 922 * Get message from skb. Each message is processed by audit_receive_msg.
930 * processed by audit_receive_msg. Malformed skbs with wrong length are 923 * Malformed skbs with wrong length are discarded silently.
931 * discarded silently.
932 */ 924 */
933static void audit_receive_skb(struct sk_buff *skb) 925static void audit_receive_skb(struct sk_buff *skb)
934{ 926{
935 int err; 927 struct nlmsghdr *nlh;
936 struct nlmsghdr *nlh; 928 /*
937 u32 rlen; 929 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
930 * if the nlmsg_len was not aligned
931 */
932 int len;
933 int err;
938 934
939 while (skb->len >= NLMSG_SPACE(0)) { 935 nlh = nlmsg_hdr(skb);
940 nlh = nlmsg_hdr(skb); 936 len = skb->len;
941 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) 937
942 return; 938 while (NLMSG_OK(nlh, len)) {
943 rlen = NLMSG_ALIGN(nlh->nlmsg_len); 939 err = audit_receive_msg(skb, nlh);
944 if (rlen > skb->len) 940 /* if err or if this message says it wants a response */
945 rlen = skb->len; 941 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
946 if ((err = audit_receive_msg(skb, nlh))) {
947 netlink_ack(skb, nlh, err); 942 netlink_ack(skb, nlh, err);
948 } else if (nlh->nlmsg_flags & NLM_F_ACK) 943
949 netlink_ack(skb, nlh, 0); 944 nlh = NLMSG_NEXT(nlh, len);
950 skb_pull(skb, rlen);
951 } 945 }
952} 946}
953 947
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
959 mutex_unlock(&audit_cmd_mutex); 953 mutex_unlock(&audit_cmd_mutex);
960} 954}
961 955
962#ifdef CONFIG_AUDITSYSCALL
963static const struct inotify_operations audit_inotify_ops = {
964 .handle_event = audit_handle_ievent,
965 .destroy_watch = audit_free_parent,
966};
967#endif
968
969/* Initialize audit support at boot time. */ 956/* Initialize audit support at boot time. */
970static int __init audit_init(void) 957static int __init audit_init(void)
971{ 958{
@@ -991,12 +978,6 @@ static int __init audit_init(void)
991 978
992 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 979 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
993 980
994#ifdef CONFIG_AUDITSYSCALL
995 audit_ih = inotify_init(&audit_inotify_ops);
996 if (IS_ERR(audit_ih))
997 audit_panic("cannot initialize inotify handle");
998#endif
999
1000 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) 981 for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
1001 INIT_LIST_HEAD(&audit_inode_hash[i]); 982 INIT_LIST_HEAD(&audit_inode_hash[i]);
1002 983
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1070 goto err; 1051 goto err;
1071 } 1052 }
1072 1053
1073 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
1074 if (!ab->skb)
1075 goto err;
1076
1077 ab->ctx = ctx; 1054 ab->ctx = ctx;
1078 ab->gfp_mask = gfp_mask; 1055 ab->gfp_mask = gfp_mask;
1079 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); 1056
1080 nlh->nlmsg_type = type; 1057 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1081 nlh->nlmsg_flags = 0; 1058 if (!ab->skb)
1082 nlh->nlmsg_pid = 0; 1059 goto nlmsg_failure;
1083 nlh->nlmsg_seq = 0; 1060
1061 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
1062
1084 return ab; 1063 return ab;
1064
1065nlmsg_failure: /* Used by NLMSG_NEW */
1066 kfree_skb(ab->skb);
1067 ab->skb = NULL;
1085err: 1068err:
1086 audit_buffer_free(ab); 1069 audit_buffer_free(ab);
1087 return NULL; 1070 return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1452 kfree(pathname); 1435 kfree(pathname);
1453} 1436}
1454 1437
1438void audit_log_key(struct audit_buffer *ab, char *key)
1439{
1440 audit_log_format(ab, " key=");
1441 if (key)
1442 audit_log_untrustedstring(ab, key);
1443 else
1444 audit_log_format(ab, "(null)");
1445}
1446
1455/** 1447/**
1456 * audit_log_end - end one audit record 1448 * audit_log_end - end one audit record
1457 * @ab: the audit_buffer 1449 * @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
1475 skb_queue_tail(&audit_skb_queue, ab->skb); 1467 skb_queue_tail(&audit_skb_queue, ab->skb);
1476 wake_up_interruptible(&kauditd_wait); 1468 wake_up_interruptible(&kauditd_wait);
1477 } else { 1469 } else {
1478 if (nlh->nlmsg_type != AUDIT_EOE) { 1470 audit_printk_skb(ab->skb);
1479 if (printk_ratelimit()) {
1480 printk(KERN_NOTICE "type=%d %s\n",
1481 nlh->nlmsg_type,
1482 ab->skb->data + NLMSG_SPACE(0));
1483 } else
1484 audit_log_lost("printk limit exceeded\n");
1485 }
1486 audit_hold_skb(ab->skb);
1487 } 1471 }
1488 ab->skb = NULL; 1472 ab->skb = NULL;
1489 } 1473 }
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661b..208687be4f30 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
53}; 53};
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_parent; 56struct audit_watch;
57
58struct audit_watch {
59 atomic_t count; /* reference count */
60 char *path; /* insertion path */
61 dev_t dev; /* associated superblock device */
62 unsigned long ino; /* associated inode number */
63 struct audit_parent *parent; /* associated parent */
64 struct list_head wlist; /* entry in parent->watches list */
65 struct list_head rules; /* associated rules */
66};
67
68struct audit_tree; 57struct audit_tree;
69struct audit_chunk; 58struct audit_chunk;
70 59
@@ -108,19 +97,28 @@ struct audit_netlink_list {
108 97
109int audit_send_list(void *); 98int audit_send_list(void *);
110 99
111struct inotify_watch;
112/* Inotify handle */
113extern struct inotify_handle *audit_ih;
114
115extern void audit_free_parent(struct inotify_watch *);
116extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
117 const char *, struct inode *);
118extern int selinux_audit_rule_update(void); 100extern int selinux_audit_rule_update(void);
119 101
120extern struct mutex audit_filter_mutex; 102extern struct mutex audit_filter_mutex;
121extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
122extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
123 105
106/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch);
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule);
113extern void audit_remove_watch(struct audit_watch *watch);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch);
118
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
120 struct audit_watch *watch);
121
124#ifdef CONFIG_AUDIT_TREE 122#ifdef CONFIG_AUDIT_TREE
125extern struct audit_chunk *audit_tree_lookup(const struct inode *); 123extern struct audit_chunk *audit_tree_lookup(const struct inode *);
126extern void audit_put_chunk(struct audit_chunk *); 124extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
130extern int audit_remove_tree_rule(struct audit_krule *); 128extern int audit_remove_tree_rule(struct audit_krule *);
131extern void audit_trim_trees(void); 129extern void audit_trim_trees(void);
132extern int audit_tag_tree(char *old, char *new); 130extern int audit_tag_tree(char *old, char *new);
133extern void audit_schedule_prune(void);
134extern void audit_prune_trees(void);
135extern const char *audit_tree_path(struct audit_tree *); 131extern const char *audit_tree_path(struct audit_tree *);
136extern void audit_put_tree(struct audit_tree *); 132extern void audit_put_tree(struct audit_tree *);
133extern void audit_kill_trees(struct list_head *);
137#else 134#else
138#define audit_remove_tree_rule(rule) BUG() 135#define audit_remove_tree_rule(rule) BUG()
139#define audit_add_tree_rule(rule) -EINVAL 136#define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
142#define audit_put_tree(tree) (void)0 139#define audit_put_tree(tree) (void)0
143#define audit_tag_tree(old, new) -EINVAL 140#define audit_tag_tree(old, new) -EINVAL
144#define audit_tree_path(rule) "" /* never called */ 141#define audit_tree_path(rule) "" /* never called */
142#define audit_kill_trees(list) BUG()
145#endif 143#endif
146 144
147extern char *audit_unpack_string(void **, size_t *, size_t); 145extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
160 return 0; 158 return 0;
161} 159}
162extern void audit_filter_inodes(struct task_struct *, struct audit_context *); 160extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
161extern struct list_head *audit_killed_trees(void);
163#else 162#else
164#define audit_signal_info(s,t) AUDIT_DISABLED 163#define audit_signal_info(s,t) AUDIT_DISABLED
165#define audit_filter_inodes(t,c) AUDIT_DISABLED 164#define audit_filter_inodes(t,c) AUDIT_DISABLED
166#endif 165#endif
166
167extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1f6396d76687..2451dc6f3282 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h>
5 6
6struct audit_tree; 7struct audit_tree;
7struct audit_chunk; 8struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
441 if (rule->tree) { 442 if (rule->tree) {
442 /* not a half-baked one */ 443 /* not a half-baked one */
443 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 444 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
444 audit_log_format(ab, "op=remove rule dir="); 445 audit_log_format(ab, "op=");
446 audit_log_string(ab, "remove rule");
447 audit_log_format(ab, " dir=");
445 audit_log_untrustedstring(ab, rule->tree->pathname); 448 audit_log_untrustedstring(ab, rule->tree->pathname);
446 if (rule->filterkey) { 449 audit_log_key(ab, rule->filterkey);
447 audit_log_format(ab, " key=");
448 audit_log_untrustedstring(ab, rule->filterkey);
449 } else
450 audit_log_format(ab, " key=(null)");
451 audit_log_format(ab, " list=%d res=1", rule->listnr); 450 audit_log_format(ab, " list=%d res=1", rule->listnr);
452 audit_log_end(ab); 451 audit_log_end(ab);
453 rule->tree = NULL; 452 rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
519 } 518 }
520} 519}
521 520
521static void audit_schedule_prune(void);
522
522/* called with audit_filter_mutex */ 523/* called with audit_filter_mutex */
523int audit_remove_tree_rule(struct audit_krule *rule) 524int audit_remove_tree_rule(struct audit_krule *rule)
524{ 525{
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
824 825
825/* 826/*
826 * That gets run when evict_chunk() ends up needing to kill audit_tree. 827 * That gets run when evict_chunk() ends up needing to kill audit_tree.
827 * Runs from a separate thread, with audit_cmd_mutex held. 828 * Runs from a separate thread.
828 */ 829 */
829void audit_prune_trees(void) 830static int prune_tree_thread(void *unused)
830{ 831{
832 mutex_lock(&audit_cmd_mutex);
831 mutex_lock(&audit_filter_mutex); 833 mutex_lock(&audit_filter_mutex);
832 834
833 while (!list_empty(&prune_list)) { 835 while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
844 } 846 }
845 847
846 mutex_unlock(&audit_filter_mutex); 848 mutex_unlock(&audit_filter_mutex);
849 mutex_unlock(&audit_cmd_mutex);
850 return 0;
851}
852
853static void audit_schedule_prune(void)
854{
855 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
856}
857
858/*
859 * ... and that one is done if evict_chunk() decides to delay until the end
860 * of syscall. Runs synchronously.
861 */
862void audit_kill_trees(struct list_head *list)
863{
864 mutex_lock(&audit_cmd_mutex);
865 mutex_lock(&audit_filter_mutex);
866
867 while (!list_empty(list)) {
868 struct audit_tree *victim;
869
870 victim = list_entry(list->next, struct audit_tree, list);
871 kill_rules(victim);
872 list_del_init(&victim->list);
873
874 mutex_unlock(&audit_filter_mutex);
875
876 prune_one(victim);
877
878 mutex_lock(&audit_filter_mutex);
879 }
880
881 mutex_unlock(&audit_filter_mutex);
882 mutex_unlock(&audit_cmd_mutex);
847} 883}
848 884
849/* 885/*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
854static void evict_chunk(struct audit_chunk *chunk) 890static void evict_chunk(struct audit_chunk *chunk)
855{ 891{
856 struct audit_tree *owner; 892 struct audit_tree *owner;
893 struct list_head *postponed = audit_killed_trees();
894 int need_prune = 0;
857 int n; 895 int n;
858 896
859 if (chunk->dead) 897 if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
869 owner->root = NULL; 907 owner->root = NULL;
870 list_del_init(&owner->same_root); 908 list_del_init(&owner->same_root);
871 spin_unlock(&hash_lock); 909 spin_unlock(&hash_lock);
872 kill_rules(owner); 910 if (!postponed) {
873 list_move(&owner->list, &prune_list); 911 kill_rules(owner);
874 audit_schedule_prune(); 912 list_move(&owner->list, &prune_list);
913 need_prune = 1;
914 } else {
915 list_move(&owner->list, postponed);
916 }
875 spin_lock(&hash_lock); 917 spin_lock(&hash_lock);
876 } 918 }
877 list_del_rcu(&chunk->hash); 919 list_del_rcu(&chunk->hash);
878 for (n = 0; n < chunk->count; n++) 920 for (n = 0; n < chunk->count; n++)
879 list_del_init(&chunk->owners[n].list); 921 list_del_init(&chunk->owners[n].list);
880 spin_unlock(&hash_lock); 922 spin_unlock(&hash_lock);
923 if (need_prune)
924 audit_schedule_prune();
881 mutex_unlock(&audit_filter_mutex); 925 mutex_unlock(&audit_filter_mutex);
882} 926}
883 927
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 000000000000..0e96dbc60ea9
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
1/* audit_watch.c -- watching inodes
2 *
3 * Copyright 2003-2009 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/mutex.h>
26#include <linux/fs.h>
27#include <linux/namei.h>
28#include <linux/netlink.h>
29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h>
32#include "audit.h"
33
34/*
35 * Reference counting:
36 *
37 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
38 * event. Each audit_watch holds a reference to its associated parent.
39 *
40 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
41 * audit_remove_watch(). Additionally, an audit_watch may exist
42 * temporarily to assist in searching existing filter data. Each
43 * audit_krule holds a reference to its associated watch.
44 */
45
46struct audit_watch {
47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */
50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */
53 struct list_head rules; /* associated rules */
54};
55
56struct audit_parent {
57 struct list_head ilist; /* entry in inotify registration list */
58 struct list_head watches; /* associated watches */
59 struct inotify_watch wdata; /* inotify watch data */
60 unsigned flags; /* status flags */
61};
62
63/* Inotify handle. */
64struct inotify_handle *audit_ih;
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Inotify events we care about. */
78#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
79
80static void audit_free_parent(struct inotify_watch *i_watch)
81{
82 struct audit_parent *parent;
83
84 parent = container_of(i_watch, struct audit_parent, wdata);
85 WARN_ON(!list_empty(&parent->watches));
86 kfree(parent);
87}
88
89void audit_get_watch(struct audit_watch *watch)
90{
91 atomic_inc(&watch->count);
92}
93
94void audit_put_watch(struct audit_watch *watch)
95{
96 if (atomic_dec_and_test(&watch->count)) {
97 WARN_ON(watch->parent);
98 WARN_ON(!list_empty(&watch->rules));
99 kfree(watch->path);
100 kfree(watch);
101 }
102}
103
104void audit_remove_watch(struct audit_watch *watch)
105{
106 list_del(&watch->wlist);
107 put_inotify_watch(&watch->parent->wdata);
108 watch->parent = NULL;
109 audit_put_watch(watch); /* match initial get */
110}
111
112char *audit_watch_path(struct audit_watch *watch)
113{
114 return watch->path;
115}
116
117struct list_head *audit_watch_rules(struct audit_watch *watch)
118{
119 return &watch->rules;
120}
121
122unsigned long audit_watch_inode(struct audit_watch *watch)
123{
124 return watch->ino;
125}
126
127dev_t audit_watch_dev(struct audit_watch *watch)
128{
129 return watch->dev;
130}
131
132/* Initialize a parent watch entry. */
133static struct audit_parent *audit_init_parent(struct nameidata *ndp)
134{
135 struct audit_parent *parent;
136 s32 wd;
137
138 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
139 if (unlikely(!parent))
140 return ERR_PTR(-ENOMEM);
141
142 INIT_LIST_HEAD(&parent->watches);
143 parent->flags = 0;
144
145 inotify_init_watch(&parent->wdata);
146 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
147 get_inotify_watch(&parent->wdata);
148 wd = inotify_add_watch(audit_ih, &parent->wdata,
149 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
150 if (wd < 0) {
151 audit_free_parent(&parent->wdata);
152 return ERR_PTR(wd);
153 }
154
155 return parent;
156}
157
158/* Initialize a watch entry. */
159static struct audit_watch *audit_init_watch(char *path)
160{
161 struct audit_watch *watch;
162
163 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
164 if (unlikely(!watch))
165 return ERR_PTR(-ENOMEM);
166
167 INIT_LIST_HEAD(&watch->rules);
168 atomic_set(&watch->count, 1);
169 watch->path = path;
170 watch->dev = (dev_t)-1;
171 watch->ino = (unsigned long)-1;
172
173 return watch;
174}
175
176/* Translate a watch string to kernel respresentation. */
177int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
178{
179 struct audit_watch *watch;
180
181 if (!audit_ih)
182 return -EOPNOTSUPP;
183
184 if (path[0] != '/' || path[len-1] == '/' ||
185 krule->listnr != AUDIT_FILTER_EXIT ||
186 op != Audit_equal ||
187 krule->inode_f || krule->watch || krule->tree)
188 return -EINVAL;
189
190 watch = audit_init_watch(path);
191 if (IS_ERR(watch))
192 return PTR_ERR(watch);
193
194 audit_get_watch(watch);
195 krule->watch = watch;
196
197 return 0;
198}
199
200/* Duplicate the given audit watch. The new watch's rules list is initialized
201 * to an empty list and wlist is undefined. */
202static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
203{
204 char *path;
205 struct audit_watch *new;
206
207 path = kstrdup(old->path, GFP_KERNEL);
208 if (unlikely(!path))
209 return ERR_PTR(-ENOMEM);
210
211 new = audit_init_watch(path);
212 if (IS_ERR(new)) {
213 kfree(path);
214 goto out;
215 }
216
217 new->dev = old->dev;
218 new->ino = old->ino;
219 get_inotify_watch(&old->parent->wdata);
220 new->parent = old->parent;
221
222out:
223 return new;
224}
225
226static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
227{
228 if (audit_enabled) {
229 struct audit_buffer *ab;
230 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
231 audit_log_format(ab, "auid=%u ses=%u op=",
232 audit_get_loginuid(current),
233 audit_get_sessionid(current));
234 audit_log_string(ab, op);
235 audit_log_format(ab, " path=");
236 audit_log_untrustedstring(ab, w->path);
237 audit_log_key(ab, r->filterkey);
238 audit_log_format(ab, " list=%d res=1", r->listnr);
239 audit_log_end(ab);
240 }
241}
242
243/* Update inode info in audit rules based on filesystem event. */
244static void audit_update_watch(struct audit_parent *parent,
245 const char *dname, dev_t dev,
246 unsigned long ino, unsigned invalidating)
247{
248 struct audit_watch *owatch, *nwatch, *nextw;
249 struct audit_krule *r, *nextr;
250 struct audit_entry *oentry, *nentry;
251
252 mutex_lock(&audit_filter_mutex);
253 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
254 if (audit_compare_dname_path(dname, owatch->path, NULL))
255 continue;
256
257 /* If the update involves invalidating rules, do the inode-based
258 * filtering now, so we don't omit records. */
259 if (invalidating && current->audit_context)
260 audit_filter_inodes(current, current->audit_context);
261
262 nwatch = audit_dupe_watch(owatch);
263 if (IS_ERR(nwatch)) {
264 mutex_unlock(&audit_filter_mutex);
265 audit_panic("error updating watch, skipping");
266 return;
267 }
268 nwatch->dev = dev;
269 nwatch->ino = ino;
270
271 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
272
273 oentry = container_of(r, struct audit_entry, rule);
274 list_del(&oentry->rule.rlist);
275 list_del_rcu(&oentry->list);
276
277 nentry = audit_dupe_rule(&oentry->rule, nwatch);
278 if (IS_ERR(nentry)) {
279 list_del(&oentry->rule.list);
280 audit_panic("error updating watch, removing");
281 } else {
282 int h = audit_hash_ino((u32)ino);
283 list_add(&nentry->rule.rlist, &nwatch->rules);
284 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
285 list_replace(&oentry->rule.list,
286 &nentry->rule.list);
287 }
288
289 audit_watch_log_rule_change(r, owatch, "updated rules");
290
291 call_rcu(&oentry->rcu, audit_free_rule_rcu);
292 }
293
294 audit_remove_watch(owatch);
295 goto add_watch_to_parent; /* event applies to a single watch */
296 }
297 mutex_unlock(&audit_filter_mutex);
298 return;
299
300add_watch_to_parent:
301 list_add(&nwatch->wlist, &parent->watches);
302 mutex_unlock(&audit_filter_mutex);
303 return;
304}
305
306/* Remove all watches & rules associated with a parent that is going away. */
307static void audit_remove_parent_watches(struct audit_parent *parent)
308{
309 struct audit_watch *w, *nextw;
310 struct audit_krule *r, *nextr;
311 struct audit_entry *e;
312
313 mutex_lock(&audit_filter_mutex);
314 parent->flags |= AUDIT_PARENT_INVALID;
315 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
316 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
317 e = container_of(r, struct audit_entry, rule);
318 audit_watch_log_rule_change(r, w, "remove rule");
319 list_del(&r->rlist);
320 list_del(&r->list);
321 list_del_rcu(&e->list);
322 call_rcu(&e->rcu, audit_free_rule_rcu);
323 }
324 audit_remove_watch(w);
325 }
326 mutex_unlock(&audit_filter_mutex);
327}
328
329/* Unregister inotify watches for parents on in_list.
330 * Generates an IN_IGNORED event. */
/* NOTE(review): this makes inotify calls, so it is presumably invoked
 * without audit_filter_mutex held (see the locking comment in
 * audit_add_watch()) -- confirm at call sites. */
331void audit_inotify_unregister(struct list_head *in_list)
332{
333	struct audit_parent *p, *n;
334
335	list_for_each_entry_safe(p, n, in_list, ilist) {
		/* Parent was queued on in_list by audit_remove_watch_rule();
		 * take it off before the final inotify teardown. */
336		list_del(&p->ilist);
337		inotify_rm_watch(audit_ih, &p->wdata);
338		/* the unpin matching the pin in audit_do_del_rule() */
339		unpin_inotify_watch(&p->wdata);
340	}
341}
342
343/* Get path information necessary for adding watches. */
/* On success the caller owns *ndp (and *ndw when non-NULL) and must
 * release both with audit_put_nd(). */
344static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
345{
346	struct nameidata *ndparent, *ndwatch;
347	int err;
348
349	ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
350	if (unlikely(!ndparent))
351		return -ENOMEM;
352
353	ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
354	if (unlikely(!ndwatch)) {
355		kfree(ndparent);
356		return -ENOMEM;
357	}
358
	/* Resolve the parent directory of @path; failure here is fatal,
	 * since a watch must be attached to a valid parent. */
359	err = path_lookup(path, LOOKUP_PARENT, ndparent);
360	if (err) {
361		kfree(ndparent);
362		kfree(ndwatch);
363		return err;
364	}
365
	/* Resolve the watched path itself.  This may fail (e.g. the file
	 * does not exist yet); that is not an error -- report *ndw as
	 * NULL and still return success. */
366	err = path_lookup(path, 0, ndwatch);
367	if (err) {
368		kfree(ndwatch);
369		ndwatch = NULL;
370	}
371
372	*ndp = ndparent;
373	*ndw = ndwatch;
374
375	return 0;
376}
377
378/* Release resources used for watch path information. */
/* Counterpart of audit_get_nd(); either pointer may be NULL. */
379static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
380{
381	if (ndp) {
		/* Drop the path reference taken by path_lookup(), then the
		 * nameidata allocation itself. */
382		path_put(&ndp->path);
383		kfree(ndp);
384	}
385	if (ndw) {
386		path_put(&ndw->path);
387		kfree(ndw);
388	}
389}
390
391/* Associate the given rule with an existing parent inotify_watch.
392 * Caller must hold audit_filter_mutex. */
/* krule->watch arrives as a temporary watch holding two references
 * (the krule's and the initial one).  If a watch for the same path
 * already exists under @parent, the rule is repointed at it and the
 * temporary watch is released; otherwise the temporary watch becomes
 * the permanent one. */
393static void audit_add_to_parent(struct audit_krule *krule,
394				struct audit_parent *parent)
395{
396	struct audit_watch *w, *watch = krule->watch;
397	int watch_found = 0;
398
399	list_for_each_entry(w, &parent->watches, wlist) {
400		if (strcmp(watch->path, w->path))
401			continue;
402
403		watch_found = 1;
404
405		/* put krule's and initial refs to temporary watch */
406		audit_put_watch(watch);
407		audit_put_watch(watch);
408
		/* Reuse the already-registered watch: take a reference for
		 * the rule and repoint it. */
409		audit_get_watch(w);
410		krule->watch = watch = w;
411		break;
412	}
413
414	if (!watch_found) {
		/* No watch for this path yet -- keep the temporary watch.
		 * It takes its own reference on the parent. */
415		get_inotify_watch(&parent->wdata);
416		watch->parent = parent;
417
418		list_add(&watch->wlist, &parent->watches);
419	}
420	list_add(&krule->rlist, &watch->rules);
421}
422
423/* Find a matching watch entry, or add this one.
424 * Caller must hold audit_filter_mutex. */
/* The mutex is dropped immediately (path_lookup and inotify calls must
 * not run under it) and is re-taken on every path before returning, so
 * the caller's locking expectation is preserved. */
425int audit_add_watch(struct audit_krule *krule)
426{
427	struct audit_watch *watch = krule->watch;
428	struct inotify_watch *i_watch;
429	struct audit_parent *parent;
430	struct nameidata *ndp = NULL, *ndw = NULL;
431	int ret = 0;
432
433	mutex_unlock(&audit_filter_mutex);
434
435	/* Avoid calling path_lookup under audit_filter_mutex. */
436	ret = audit_get_nd(watch->path, &ndp, &ndw);
437	if (ret) {
438		/* caller expects mutex locked */
439		mutex_lock(&audit_filter_mutex);
440		goto error;
441	}
442
443	/* update watch filter fields */
	/* ndw is NULL when the watched path does not (yet) exist; in that
	 * case dev/ino keep their sentinel values. */
444	if (ndw) {
445		watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
446		watch->ino = ndw->path.dentry->d_inode->i_ino;
447	}
448
449	/* The audit_filter_mutex must not be held during inotify calls because
450	 * we hold it during inotify event callback processing. If an existing
451	 * inotify watch is found, inotify_find_watch() grabs a reference before
452	 * returning.
453	 */
454	if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
455			       &i_watch) < 0) {
		/* No inotify watch on the parent directory yet: create and
		 * register a new audit_parent for it. */
456		parent = audit_init_parent(ndp);
457		if (IS_ERR(parent)) {
458			/* caller expects mutex locked */
459			mutex_lock(&audit_filter_mutex);
460			ret = PTR_ERR(parent);
461			goto error;
462		}
463	} else
464		parent = container_of(i_watch, struct audit_parent, wdata);
465
466	mutex_lock(&audit_filter_mutex);
467
468	/* parent was moved before we took audit_filter_mutex */
469	if (parent->flags & AUDIT_PARENT_INVALID)
470		ret = -ENOENT;
471	else
472		audit_add_to_parent(krule, parent);
473
474	/* match get in audit_init_parent or inotify_find_watch */
475	put_inotify_watch(&parent->wdata);
476
477error:
478	audit_put_nd(ndp, ndw);		/* NULL args OK */
479	return ret;
480
481}
482
/* Detach @krule from its watch.  If that leaves the watch without rules
 * the watch is removed too, and if that in turn leaves the parent
 * without watches the parent is queued on @list for the caller to pass
 * to audit_inotify_unregister().
 * NOTE(review): presumably called with audit_filter_mutex held (the
 * comment below refers to "releasing audit_filter_mutex") -- confirm
 * at call sites. */
483void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
484{
485	struct audit_watch *watch = krule->watch;
486	struct audit_parent *parent = watch->parent;
487
488	list_del(&krule->rlist);
489
490	if (list_empty(&watch->rules)) {
		/* Last rule on this watch: tear the watch down as well. */
491		audit_remove_watch(watch);
492
493		if (list_empty(&parent->watches)) {
494			/* Put parent on the inotify un-registration
495			 * list. Grab a reference before releasing
496			 * audit_filter_mutex, to be released in
497			 * audit_inotify_unregister().
498			 * If filesystem is going away, just leave
499			 * the sucker alone, eviction will take
500			 * care of it. */
501			if (pin_inotify_watch(&parent->wdata))
502				list_add(&parent->ilist, list);
503		}
504	}
505}
506
507/* Update watch data in audit rules based on inotify events. */
/* inotify callback for audit's handle: @mask selects the action, @dname
 * names the affected child (for create/delete/move-in/move-out events),
 * @inode is the child's inode when available. */
508static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
509			 u32 cookie, const char *dname, struct inode *inode)
510{
511	struct audit_parent *parent;
512
513	parent = container_of(i_watch, struct audit_parent, wdata);
514
	/* Child appeared: refresh dev/ino in rules watching this name. */
515	if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
516		audit_update_watch(parent, dname, inode->i_sb->s_dev,
517				   inode->i_ino, 0);
	/* Child gone: invalidate dev/ino with the -1 sentinels. */
518	else if (mask & (IN_DELETE|IN_MOVED_FROM))
519		audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
520	/* inotify automatically removes the watch and sends IN_IGNORED */
521	else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
522		audit_remove_parent_watches(parent);
523	/* inotify does not remove the watch, so remove it manually */
524	else if(mask & IN_MOVE_SELF) {
525		audit_remove_parent_watches(parent);
526		inotify_remove_watch_locked(audit_ih, i_watch);
527	} else if (mask & IN_IGNORED)
		/* Final event for this watch: drop the reference held on
		 * the parent's inotify watch. */
528		put_inotify_watch(i_watch);
529}
530
/* Callbacks the inotify core invokes for audit's watch handle. */
531static const struct inotify_operations audit_inotify_ops = {
532	.handle_event	= audit_handle_ievent,
533	.destroy_watch	= audit_free_parent,
534};
535
/* Boot-time setup: create the shared inotify handle (audit_ih) used by
 * all audit watches. */
536static int __init audit_watch_init(void)
537{
538	audit_ih = inotify_init(&audit_inotify_ops);
539	if (IS_ERR(audit_ih))
		/* Failure is reported via audit_panic(); we still return 0,
		 * leaving audit_ih as an ERR_PTR. */
540		audit_panic("cannot initialize inotify handle");
541	return 0;
542}
543subsys_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a02..a70604047f3c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h> 30#include <linux/security.h>
32#include "audit.h" 31#include "audit.h"
33 32
@@ -44,36 +43,6 @@
44 * be written directly provided audit_filter_mutex is held. 43 * be written directly provided audit_filter_mutex is held.
45 */ 44 */
46 45
47/*
48 * Reference counting:
49 *
50 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
51 * event. Each audit_watch holds a reference to its associated parent.
52 *
53 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
54 * audit_remove_watch(). Additionally, an audit_watch may exist
55 * temporarily to assist in searching existing filter data. Each
56 * audit_krule holds a reference to its associated watch.
57 */
58
59struct audit_parent {
60 struct list_head ilist; /* entry in inotify registration list */
61 struct list_head watches; /* associated watches */
62 struct inotify_watch wdata; /* inotify watch data */
63 unsigned flags; /* status flags */
64};
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Audit filter lists, defined in <linux/audit.h> */ 46/* Audit filter lists, defined in <linux/audit.h> */
78struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { 47struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
79 LIST_HEAD_INIT(audit_filter_list[0]), 48 LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
97 66
98DEFINE_MUTEX(audit_filter_mutex); 67DEFINE_MUTEX(audit_filter_mutex);
99 68
100/* Inotify events we care about. */
101#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
102
103void audit_free_parent(struct inotify_watch *i_watch)
104{
105 struct audit_parent *parent;
106
107 parent = container_of(i_watch, struct audit_parent, wdata);
108 WARN_ON(!list_empty(&parent->watches));
109 kfree(parent);
110}
111
112static inline void audit_get_watch(struct audit_watch *watch)
113{
114 atomic_inc(&watch->count);
115}
116
117static void audit_put_watch(struct audit_watch *watch)
118{
119 if (atomic_dec_and_test(&watch->count)) {
120 WARN_ON(watch->parent);
121 WARN_ON(!list_empty(&watch->rules));
122 kfree(watch->path);
123 kfree(watch);
124 }
125}
126
127static void audit_remove_watch(struct audit_watch *watch)
128{
129 list_del(&watch->wlist);
130 put_inotify_watch(&watch->parent->wdata);
131 watch->parent = NULL;
132 audit_put_watch(watch); /* match initial get */
133}
134
135static inline void audit_free_rule(struct audit_entry *e) 69static inline void audit_free_rule(struct audit_entry *e)
136{ 70{
137 int i; 71 int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
156 audit_free_rule(e); 90 audit_free_rule(e);
157} 91}
158 92
159/* Initialize a parent watch entry. */
160static struct audit_parent *audit_init_parent(struct nameidata *ndp)
161{
162 struct audit_parent *parent;
163 s32 wd;
164
165 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
166 if (unlikely(!parent))
167 return ERR_PTR(-ENOMEM);
168
169 INIT_LIST_HEAD(&parent->watches);
170 parent->flags = 0;
171
172 inotify_init_watch(&parent->wdata);
173 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
174 get_inotify_watch(&parent->wdata);
175 wd = inotify_add_watch(audit_ih, &parent->wdata,
176 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
177 if (wd < 0) {
178 audit_free_parent(&parent->wdata);
179 return ERR_PTR(wd);
180 }
181
182 return parent;
183}
184
185/* Initialize a watch entry. */
186static struct audit_watch *audit_init_watch(char *path)
187{
188 struct audit_watch *watch;
189
190 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
191 if (unlikely(!watch))
192 return ERR_PTR(-ENOMEM);
193
194 INIT_LIST_HEAD(&watch->rules);
195 atomic_set(&watch->count, 1);
196 watch->path = path;
197 watch->dev = (dev_t)-1;
198 watch->ino = (unsigned long)-1;
199
200 return watch;
201}
202
203/* Initialize an audit filterlist entry. */ 93/* Initialize an audit filterlist entry. */
204static inline struct audit_entry *audit_init_entry(u32 field_count) 94static inline struct audit_entry *audit_init_entry(u32 field_count)
205{ 95{
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
260 return 0; 150 return 0;
261} 151}
262 152
263/* Translate a watch string to kernel respresentation. */
264static int audit_to_watch(struct audit_krule *krule, char *path, int len,
265 u32 op)
266{
267 struct audit_watch *watch;
268
269 if (!audit_ih)
270 return -EOPNOTSUPP;
271
272 if (path[0] != '/' || path[len-1] == '/' ||
273 krule->listnr != AUDIT_FILTER_EXIT ||
274 op != Audit_equal ||
275 krule->inode_f || krule->watch || krule->tree)
276 return -EINVAL;
277
278 watch = audit_init_watch(path);
279 if (IS_ERR(watch))
280 return PTR_ERR(watch);
281
282 audit_get_watch(watch);
283 krule->watch = watch;
284
285 return 0;
286}
287
288static __u32 *classes[AUDIT_SYSCALL_CLASSES]; 153static __u32 *classes[AUDIT_SYSCALL_CLASSES];
289 154
290int __init audit_register_class(int class, unsigned *list) 155int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
766 break; 631 break;
767 case AUDIT_WATCH: 632 case AUDIT_WATCH:
768 data->buflen += data->values[i] = 633 data->buflen += data->values[i] =
769 audit_pack_string(&bufp, krule->watch->path); 634 audit_pack_string(&bufp,
635 audit_watch_path(krule->watch));
770 break; 636 break;
771 case AUDIT_DIR: 637 case AUDIT_DIR:
772 data->buflen += data->values[i] = 638 data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
818 return 1; 684 return 1;
819 break; 685 break;
820 case AUDIT_WATCH: 686 case AUDIT_WATCH:
821 if (strcmp(a->watch->path, b->watch->path)) 687 if (strcmp(audit_watch_path(a->watch),
688 audit_watch_path(b->watch)))
822 return 1; 689 return 1;
823 break; 690 break;
824 case AUDIT_DIR: 691 case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
844 return 0; 711 return 0;
845} 712}
846 713
847/* Duplicate the given audit watch. The new watch's rules list is initialized
848 * to an empty list and wlist is undefined. */
849static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
850{
851 char *path;
852 struct audit_watch *new;
853
854 path = kstrdup(old->path, GFP_KERNEL);
855 if (unlikely(!path))
856 return ERR_PTR(-ENOMEM);
857
858 new = audit_init_watch(path);
859 if (IS_ERR(new)) {
860 kfree(path);
861 goto out;
862 }
863
864 new->dev = old->dev;
865 new->ino = old->ino;
866 get_inotify_watch(&old->parent->wdata);
867 new->parent = old->parent;
868
869out:
870 return new;
871}
872
873/* Duplicate LSM field information. The lsm_rule is opaque, so must be 714/* Duplicate LSM field information. The lsm_rule is opaque, so must be
874 * re-initialized. */ 715 * re-initialized. */
875static inline int audit_dupe_lsm_field(struct audit_field *df, 716static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
904 * rule with the new rule in the filterlist, then free the old rule. 745 * rule with the new rule in the filterlist, then free the old rule.
905 * The rlist element is undefined; list manipulations are handled apart from 746 * The rlist element is undefined; list manipulations are handled apart from
906 * the initial copy. */ 747 * the initial copy. */
907static struct audit_entry *audit_dupe_rule(struct audit_krule *old, 748struct audit_entry *audit_dupe_rule(struct audit_krule *old,
908 struct audit_watch *watch) 749 struct audit_watch *watch)
909{ 750{
910 u32 fcount = old->field_count; 751 u32 fcount = old->field_count;
911 struct audit_entry *entry; 752 struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
977 return entry; 818 return entry;
978} 819}
979 820
980/* Update inode info in audit rules based on filesystem event. */
981static void audit_update_watch(struct audit_parent *parent,
982 const char *dname, dev_t dev,
983 unsigned long ino, unsigned invalidating)
984{
985 struct audit_watch *owatch, *nwatch, *nextw;
986 struct audit_krule *r, *nextr;
987 struct audit_entry *oentry, *nentry;
988
989 mutex_lock(&audit_filter_mutex);
990 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
991 if (audit_compare_dname_path(dname, owatch->path, NULL))
992 continue;
993
994 /* If the update involves invalidating rules, do the inode-based
995 * filtering now, so we don't omit records. */
996 if (invalidating && current->audit_context)
997 audit_filter_inodes(current, current->audit_context);
998
999 nwatch = audit_dupe_watch(owatch);
1000 if (IS_ERR(nwatch)) {
1001 mutex_unlock(&audit_filter_mutex);
1002 audit_panic("error updating watch, skipping");
1003 return;
1004 }
1005 nwatch->dev = dev;
1006 nwatch->ino = ino;
1007
1008 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
1009
1010 oentry = container_of(r, struct audit_entry, rule);
1011 list_del(&oentry->rule.rlist);
1012 list_del_rcu(&oentry->list);
1013
1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1017 audit_panic("error updating watch, removing");
1018 } else {
1019 int h = audit_hash_ino((u32)ino);
1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1024 }
1025
1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
1027 }
1028
1029 if (audit_enabled) {
1030 struct audit_buffer *ab;
1031 ab = audit_log_start(NULL, GFP_NOFS,
1032 AUDIT_CONFIG_CHANGE);
1033 audit_log_format(ab, "auid=%u ses=%u",
1034 audit_get_loginuid(current),
1035 audit_get_sessionid(current));
1036 audit_log_format(ab,
1037 " op=updated rules specifying path=");
1038 audit_log_untrustedstring(ab, owatch->path);
1039 audit_log_format(ab, " with dev=%u ino=%lu\n",
1040 dev, ino);
1041 audit_log_format(ab, " list=%d res=1", r->listnr);
1042 audit_log_end(ab);
1043 }
1044 audit_remove_watch(owatch);
1045 goto add_watch_to_parent; /* event applies to a single watch */
1046 }
1047 mutex_unlock(&audit_filter_mutex);
1048 return;
1049
1050add_watch_to_parent:
1051 list_add(&nwatch->wlist, &parent->watches);
1052 mutex_unlock(&audit_filter_mutex);
1053 return;
1054}
1055
1056/* Remove all watches & rules associated with a parent that is going away. */
1057static void audit_remove_parent_watches(struct audit_parent *parent)
1058{
1059 struct audit_watch *w, *nextw;
1060 struct audit_krule *r, *nextr;
1061 struct audit_entry *e;
1062
1063 mutex_lock(&audit_filter_mutex);
1064 parent->flags |= AUDIT_PARENT_INVALID;
1065 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
1066 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
1067 e = container_of(r, struct audit_entry, rule);
1068 if (audit_enabled) {
1069 struct audit_buffer *ab;
1070 ab = audit_log_start(NULL, GFP_NOFS,
1071 AUDIT_CONFIG_CHANGE);
1072 audit_log_format(ab, "auid=%u ses=%u",
1073 audit_get_loginuid(current),
1074 audit_get_sessionid(current));
1075 audit_log_format(ab, " op=remove rule path=");
1076 audit_log_untrustedstring(ab, w->path);
1077 if (r->filterkey) {
1078 audit_log_format(ab, " key=");
1079 audit_log_untrustedstring(ab,
1080 r->filterkey);
1081 } else
1082 audit_log_format(ab, " key=(null)");
1083 audit_log_format(ab, " list=%d res=1",
1084 r->listnr);
1085 audit_log_end(ab);
1086 }
1087 list_del(&r->rlist);
1088 list_del(&r->list);
1089 list_del_rcu(&e->list);
1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1091 }
1092 audit_remove_watch(w);
1093 }
1094 mutex_unlock(&audit_filter_mutex);
1095}
1096
1097/* Unregister inotify watches for parents on in_list.
1098 * Generates an IN_IGNORED event. */
1099static void audit_inotify_unregister(struct list_head *in_list)
1100{
1101 struct audit_parent *p, *n;
1102
1103 list_for_each_entry_safe(p, n, in_list, ilist) {
1104 list_del(&p->ilist);
1105 inotify_rm_watch(audit_ih, &p->wdata);
1106 /* the unpin matching the pin in audit_do_del_rule() */
1107 unpin_inotify_watch(&p->wdata);
1108 }
1109}
1110
1111/* Find an existing audit rule. 821/* Find an existing audit rule.
1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 822 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1113static struct audit_entry *audit_find_rule(struct audit_entry *entry, 823static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
1145 return found; 855 return found;
1146} 856}
1147 857
1148/* Get path information necessary for adding watches. */
1149static int audit_get_nd(char *path, struct nameidata **ndp,
1150 struct nameidata **ndw)
1151{
1152 struct nameidata *ndparent, *ndwatch;
1153 int err;
1154
1155 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
1156 if (unlikely(!ndparent))
1157 return -ENOMEM;
1158
1159 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
1160 if (unlikely(!ndwatch)) {
1161 kfree(ndparent);
1162 return -ENOMEM;
1163 }
1164
1165 err = path_lookup(path, LOOKUP_PARENT, ndparent);
1166 if (err) {
1167 kfree(ndparent);
1168 kfree(ndwatch);
1169 return err;
1170 }
1171
1172 err = path_lookup(path, 0, ndwatch);
1173 if (err) {
1174 kfree(ndwatch);
1175 ndwatch = NULL;
1176 }
1177
1178 *ndp = ndparent;
1179 *ndw = ndwatch;
1180
1181 return 0;
1182}
1183
1184/* Release resources used for watch path information. */
1185static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
1186{
1187 if (ndp) {
1188 path_put(&ndp->path);
1189 kfree(ndp);
1190 }
1191 if (ndw) {
1192 path_put(&ndw->path);
1193 kfree(ndw);
1194 }
1195}
1196
1197/* Associate the given rule with an existing parent inotify_watch.
1198 * Caller must hold audit_filter_mutex. */
1199static void audit_add_to_parent(struct audit_krule *krule,
1200 struct audit_parent *parent)
1201{
1202 struct audit_watch *w, *watch = krule->watch;
1203 int watch_found = 0;
1204
1205 list_for_each_entry(w, &parent->watches, wlist) {
1206 if (strcmp(watch->path, w->path))
1207 continue;
1208
1209 watch_found = 1;
1210
1211 /* put krule's and initial refs to temporary watch */
1212 audit_put_watch(watch);
1213 audit_put_watch(watch);
1214
1215 audit_get_watch(w);
1216 krule->watch = watch = w;
1217 break;
1218 }
1219
1220 if (!watch_found) {
1221 get_inotify_watch(&parent->wdata);
1222 watch->parent = parent;
1223
1224 list_add(&watch->wlist, &parent->watches);
1225 }
1226 list_add(&krule->rlist, &watch->rules);
1227}
1228
1229/* Find a matching watch entry, or add this one.
1230 * Caller must hold audit_filter_mutex. */
1231static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1232 struct nameidata *ndw)
1233{
1234 struct audit_watch *watch = krule->watch;
1235 struct inotify_watch *i_watch;
1236 struct audit_parent *parent;
1237 int ret = 0;
1238
1239 /* update watch filter fields */
1240 if (ndw) {
1241 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
1242 watch->ino = ndw->path.dentry->d_inode->i_ino;
1243 }
1244
1245 /* The audit_filter_mutex must not be held during inotify calls because
1246 * we hold it during inotify event callback processing. If an existing
1247 * inotify watch is found, inotify_find_watch() grabs a reference before
1248 * returning.
1249 */
1250 mutex_unlock(&audit_filter_mutex);
1251
1252 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
1253 &i_watch) < 0) {
1254 parent = audit_init_parent(ndp);
1255 if (IS_ERR(parent)) {
1256 /* caller expects mutex locked */
1257 mutex_lock(&audit_filter_mutex);
1258 return PTR_ERR(parent);
1259 }
1260 } else
1261 parent = container_of(i_watch, struct audit_parent, wdata);
1262
1263 mutex_lock(&audit_filter_mutex);
1264
1265 /* parent was moved before we took audit_filter_mutex */
1266 if (parent->flags & AUDIT_PARENT_INVALID)
1267 ret = -ENOENT;
1268 else
1269 audit_add_to_parent(krule, parent);
1270
1271 /* match get in audit_init_parent or inotify_find_watch */
1272 put_inotify_watch(&parent->wdata);
1273 return ret;
1274}
1275
1276static u64 prio_low = ~0ULL/2; 858static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1; 859static u64 prio_high = ~0ULL/2 - 1;
1278 860
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
1282 struct audit_entry *e; 864 struct audit_entry *e;
1283 struct audit_watch *watch = entry->rule.watch; 865 struct audit_watch *watch = entry->rule.watch;
1284 struct audit_tree *tree = entry->rule.tree; 866 struct audit_tree *tree = entry->rule.tree;
1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list; 867 struct list_head *list;
1287 int h, err; 868 int h, err;
1288#ifdef CONFIG_AUDITSYSCALL 869#ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
1296 877
1297 mutex_lock(&audit_filter_mutex); 878 mutex_lock(&audit_filter_mutex);
1298 e = audit_find_rule(entry, &list); 879 e = audit_find_rule(entry, &list);
1299 mutex_unlock(&audit_filter_mutex);
1300 if (e) { 880 if (e) {
881 mutex_unlock(&audit_filter_mutex);
1301 err = -EEXIST; 882 err = -EEXIST;
1302 /* normally audit_add_tree_rule() will free it on failure */ 883 /* normally audit_add_tree_rule() will free it on failure */
1303 if (tree) 884 if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
1305 goto error; 886 goto error;
1306 } 887 }
1307 888
1308 /* Avoid calling path_lookup under audit_filter_mutex. */
1309 if (watch) {
1310 err = audit_get_nd(watch->path, &ndp, &ndw);
1311 if (err)
1312 goto error;
1313 }
1314
1315 mutex_lock(&audit_filter_mutex);
1316 if (watch) { 889 if (watch) {
1317 /* audit_filter_mutex is dropped and re-taken during this call */ 890 /* audit_filter_mutex is dropped and re-taken during this call */
1318 err = audit_add_watch(&entry->rule, ndp, ndw); 891 err = audit_add_watch(&entry->rule);
1319 if (err) { 892 if (err) {
1320 mutex_unlock(&audit_filter_mutex); 893 mutex_unlock(&audit_filter_mutex);
1321 goto error; 894 goto error;
1322 } 895 }
1323 h = audit_hash_ino((u32)watch->ino); 896 /* entry->rule.watch may have changed during audit_add_watch() */
897 watch = entry->rule.watch;
898 h = audit_hash_ino((u32)audit_watch_inode(watch));
1324 list = &audit_inode_hash[h]; 899 list = &audit_inode_hash[h];
1325 } 900 }
1326 if (tree) { 901 if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
1358#endif 933#endif
1359 mutex_unlock(&audit_filter_mutex); 934 mutex_unlock(&audit_filter_mutex);
1360 935
1361 audit_put_nd(ndp, ndw); /* NULL args OK */
1362 return 0; 936 return 0;
1363 937
1364error: 938error:
1365 audit_put_nd(ndp, ndw); /* NULL args OK */
1366 if (watch) 939 if (watch)
1367 audit_put_watch(watch); /* tmp watch, matches initial get */ 940 audit_put_watch(watch); /* tmp watch, matches initial get */
1368 return err; 941 return err;
@@ -1372,7 +945,7 @@ error:
1372static inline int audit_del_rule(struct audit_entry *entry) 945static inline int audit_del_rule(struct audit_entry *entry)
1373{ 946{
1374 struct audit_entry *e; 947 struct audit_entry *e;
1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 948 struct audit_watch *watch = entry->rule.watch;
1376 struct audit_tree *tree = entry->rule.tree; 949 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list; 950 struct list_head *list;
1378 LIST_HEAD(inotify_list); 951 LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1394 goto out; 967 goto out;
1395 } 968 }
1396 969
1397 watch = e->rule.watch; 970 if (e->rule.watch)
1398 if (watch) { 971 audit_remove_watch_rule(&e->rule, &inotify_list);
1399 struct audit_parent *parent = watch->parent;
1400
1401 list_del(&e->rule.rlist);
1402
1403 if (list_empty(&watch->rules)) {
1404 audit_remove_watch(watch);
1405
1406 if (list_empty(&parent->watches)) {
1407 /* Put parent on the inotify un-registration
1408 * list. Grab a reference before releasing
1409 * audit_filter_mutex, to be released in
1410 * audit_inotify_unregister().
1411 * If filesystem is going away, just leave
1412 * the sucker alone, eviction will take
1413 * care of it.
1414 */
1415 if (pin_inotify_watch(&parent->wdata))
1416 list_add(&parent->ilist, &inotify_list);
1417 }
1418 }
1419 }
1420 972
1421 if (e->rule.tree) 973 if (e->rule.tree)
1422 audit_remove_tree_rule(&e->rule); 974 audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1438 audit_inotify_unregister(&inotify_list); 990 audit_inotify_unregister(&inotify_list);
1439 991
1440out: 992out:
1441 if (tmp_watch) 993 if (watch)
1442 audit_put_watch(tmp_watch); /* match initial get */ 994 audit_put_watch(watch); /* match initial get */
1443 if (tree) 995 if (tree)
1444 audit_put_tree(tree); /* that's the temporary one */ 996 audit_put_tree(tree); /* that's the temporary one */
1445 997
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1527 security_release_secctx(ctx, len); 1079 security_release_secctx(ctx, len);
1528 } 1080 }
1529 } 1081 }
1530 audit_log_format(ab, " op=%s rule key=", action); 1082 audit_log_format(ab, " op=");
1531 if (rule->filterkey) 1083 audit_log_string(ab, action);
1532 audit_log_untrustedstring(ab, rule->filterkey); 1084 audit_log_key(ab, rule->filterkey);
1533 else
1534 audit_log_format(ab, "(null)");
1535 audit_log_format(ab, " list=%d res=%d", rule->listnr, res); 1085 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1536 audit_log_end(ab); 1086 audit_log_end(ab);
1537} 1087}
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1595 return PTR_ERR(entry); 1145 return PTR_ERR(entry);
1596 1146
1597 err = audit_add_rule(entry); 1147 err = audit_add_rule(entry);
1598 audit_log_rule_change(loginuid, sessionid, sid, "add", 1148 audit_log_rule_change(loginuid, sessionid, sid, "add rule",
1599 &entry->rule, !err); 1149 &entry->rule, !err);
1600 1150
1601 if (err) 1151 if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1611 return PTR_ERR(entry); 1161 return PTR_ERR(entry);
1612 1162
1613 err = audit_del_rule(entry); 1163 err = audit_del_rule(entry);
1614 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1164 audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
1615 &entry->rule, !err); 1165 &entry->rule, !err);
1616 1166
1617 audit_free_rule(entry); 1167 audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
1793 list_del(&r->list); 1343 list_del(&r->list);
1794 } else { 1344 } else {
1795 if (watch) { 1345 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules); 1346 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1797 list_del(&r->rlist); 1347 list_del(&r->rlist);
1798 } else if (tree) 1348 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist); 1349 list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
1829 1379
1830 return err; 1380 return err;
1831} 1381}
1832
1833/* Update watch data in audit rules based on inotify events. */
1834void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1835 u32 cookie, const char *dname, struct inode *inode)
1836{
1837 struct audit_parent *parent;
1838
1839 parent = container_of(i_watch, struct audit_parent, wdata);
1840
1841 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1842 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1843 inode->i_ino, 0);
1844 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1845 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1846 /* inotify automatically removes the watch and sends IN_IGNORED */
1847 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1848 audit_remove_parent_watches(parent);
1849 /* inotify does not remove the watch, so remove it manually */
1850 else if(mask & IN_MOVE_SELF) {
1851 audit_remove_parent_watches(parent);
1852 inotify_remove_watch_locked(audit_ih, i_watch);
1853 } else if (mask & IN_IGNORED)
1854 put_inotify_watch(i_watch);
1855}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f414..68d3c6a0ecd6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count; 201 int tree_count;
202 struct list_head killed_trees;
202 203
203 int type; 204 int type;
204 union { 205 union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
548 } 549 }
549 break; 550 break;
550 case AUDIT_WATCH: 551 case AUDIT_WATCH:
551 if (name && rule->watch->ino != (unsigned long)-1) 552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
552 result = (name->dev == rule->watch->dev && 553 result = (name->dev == audit_watch_dev(rule->watch) &&
553 name->ino == rule->watch->ino); 554 name->ino == audit_watch_inode(rule->watch));
554 break; 555 break;
555 case AUDIT_DIR: 556 case AUDIT_DIR:
556 if (ctx) 557 if (ctx)
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
853 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) 854 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
854 return NULL; 855 return NULL;
855 audit_zero_context(context, state); 856 audit_zero_context(context, state);
857 INIT_LIST_HEAD(&context->killed_trees);
856 return context; 858 return context;
857} 859}
858 860
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1024{ 1026{
1025 char arg_num_len_buf[12]; 1027 char arg_num_len_buf[12];
1026 const char __user *tmp_p = p; 1028 const char __user *tmp_p = p;
1027 /* how many digits are in arg_num? 3 is the length of " a=" */ 1029 /* how many digits are in arg_num? 5 is the length of ' a=""' */
1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1030 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
1029 size_t len, len_left, to_send; 1031 size_t len, len_left, to_send;
1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1032 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
1031 unsigned int i, has_cntl = 0, too_long = 0; 1033 unsigned int i, has_cntl = 0, too_long = 0;
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1137 if (has_cntl) 1139 if (has_cntl)
1138 audit_log_n_hex(*ab, buf, to_send); 1140 audit_log_n_hex(*ab, buf, to_send);
1139 else 1141 else
1140 audit_log_format(*ab, "\"%s\"", buf); 1142 audit_log_string(*ab, buf);
1141 1143
1142 p += to_send; 1144 p += to_send;
1143 len_left -= to_send; 1145 len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1372 1374
1373 1375
1374 audit_log_task_info(ab, tsk); 1376 audit_log_task_info(ab, tsk);
1375 if (context->filterkey) { 1377 audit_log_key(ab, context->filterkey);
1376 audit_log_format(ab, " key=");
1377 audit_log_untrustedstring(ab, context->filterkey);
1378 } else
1379 audit_log_format(ab, " key=(null)");
1380 audit_log_end(ab); 1378 audit_log_end(ab);
1381 1379
1382 for (aux = context->aux; aux; aux = aux->next) { 1380 for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
1549 /* that can happen only if we are called from do_exit() */ 1547 /* that can happen only if we are called from do_exit() */
1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1548 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1551 audit_log_exit(context, tsk); 1549 audit_log_exit(context, tsk);
1550 if (!list_empty(&context->killed_trees))
1551 audit_kill_trees(&context->killed_trees);
1552 1552
1553 audit_free_context(context); 1553 audit_free_context(context);
1554} 1554}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
1692 context->in_syscall = 0; 1692 context->in_syscall = 0;
1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1694 1694
1695 if (!list_empty(&context->killed_trees))
1696 audit_kill_trees(&context->killed_trees);
1697
1695 if (context->previous) { 1698 if (context->previous) {
1696 struct audit_context *new_context = context->previous; 1699 struct audit_context *new_context = context->previous;
1697 context->previous = NULL; 1700 context->previous = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
2525 audit_log_format(ab, " sig=%ld", signr); 2528 audit_log_format(ab, " sig=%ld", signr);
2526 audit_log_end(ab); 2529 audit_log_end(ab);
2527} 2530}
2531
2532struct list_head *audit_killed_trees(void)
2533{
2534 struct audit_context *ctx = current->audit_context;
2535 if (likely(!ctx || !ctx->in_syscall))
2536 return NULL;
2537 return &ctx->killed_trees;
2538}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8d..8ce10043e4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
34 * an ongoing cpu hotplug operation. 34 * an ongoing cpu hotplug operation.
35 */ 35 */
36 int refcount; 36 int refcount;
37} cpu_hotplug; 37} cpu_hotplug = {
38 38 .active_writer = NULL,
39void __init cpu_hotplug_init(void) 39 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
40{ 40 .refcount = 0,
41 cpu_hotplug.active_writer = NULL; 41};
42 mutex_init(&cpu_hotplug.lock);
43 cpu_hotplug.refcount = 0;
44}
45 42
46#ifdef CONFIG_HOTPLUG_CPU 43#ifdef CONFIG_HOTPLUG_CPU
47 44
diff --git a/kernel/exit.c b/kernel/exit.c
index 13ae64001fec..628d41f0dd54 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1197,8 +1197,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1197 } 1197 }
1198 1198
1199 traced = ptrace_reparented(p); 1199 traced = ptrace_reparented(p);
1200 1200 /*
1201 if (likely(!traced)) { 1201 * It can be ptraced but not reparented, check
1202 * !task_detached() to filter out sub-threads.
1203 */
1204 if (likely(!traced) && likely(!task_detached(p))) {
1202 struct signal_struct *psig; 1205 struct signal_struct *psig;
1203 struct signal_struct *sig; 1206 struct signal_struct *sig;
1204 1207
diff --git a/kernel/futex.c b/kernel/futex.c
index 80b5ce716596..794c862125fe 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -284,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
284 drop_futex_key_refs(key); 284 drop_futex_key_refs(key);
285} 285}
286 286
287/*
288 * fault_in_user_writeable - fault in user address and verify RW access
289 * @uaddr: pointer to faulting user space address
290 *
291 * Slow path to fixup the fault we just took in the atomic write
292 * access to @uaddr.
293 *
294 * We have no generic implementation of a non destructive write to the
295 * user address. We know that we faulted in the atomic pagefault
296 * disabled section so we can as well avoid the #PF overhead by
297 * calling get_user_pages() right away.
298 */
299static int fault_in_user_writeable(u32 __user *uaddr)
300{
301 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
302 1, 1, 0, NULL, NULL);
303 return ret < 0 ? ret : 0;
304}
305
287/** 306/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex 307 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in 308 * @hb: the hash bucket the futex_q's reside in
@@ -896,7 +915,6 @@ retry:
896retry_private: 915retry_private:
897 op_ret = futex_atomic_op_inuser(op, uaddr2); 916 op_ret = futex_atomic_op_inuser(op, uaddr2);
898 if (unlikely(op_ret < 0)) { 917 if (unlikely(op_ret < 0)) {
899 u32 dummy;
900 918
901 double_unlock_hb(hb1, hb2); 919 double_unlock_hb(hb1, hb2);
902 920
@@ -914,7 +932,7 @@ retry_private:
914 goto out_put_keys; 932 goto out_put_keys;
915 } 933 }
916 934
917 ret = get_user(dummy, uaddr2); 935 ret = fault_in_user_writeable(uaddr2);
918 if (ret) 936 if (ret)
919 goto out_put_keys; 937 goto out_put_keys;
920 938
@@ -1204,7 +1222,7 @@ retry_private:
1204 double_unlock_hb(hb1, hb2); 1222 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2); 1223 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1); 1224 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2); 1225 ret = fault_in_user_writeable(uaddr2);
1208 if (!ret) 1226 if (!ret)
1209 goto retry; 1227 goto retry;
1210 goto out; 1228 goto out;
@@ -1482,7 +1500,7 @@ retry:
1482handle_fault: 1500handle_fault:
1483 spin_unlock(q->lock_ptr); 1501 spin_unlock(q->lock_ptr);
1484 1502
1485 ret = get_user(uval, uaddr); 1503 ret = fault_in_user_writeable(uaddr);
1486 1504
1487 spin_lock(q->lock_ptr); 1505 spin_lock(q->lock_ptr);
1488 1506
@@ -1807,7 +1825,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1807{ 1825{
1808 struct hrtimer_sleeper timeout, *to = NULL; 1826 struct hrtimer_sleeper timeout, *to = NULL;
1809 struct futex_hash_bucket *hb; 1827 struct futex_hash_bucket *hb;
1810 u32 uval;
1811 struct futex_q q; 1828 struct futex_q q;
1812 int res, ret; 1829 int res, ret;
1813 1830
@@ -1909,16 +1926,9 @@ out:
1909 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1926 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1910 1927
1911uaddr_faulted: 1928uaddr_faulted:
1912 /*
1913 * We have to r/w *(int __user *)uaddr, and we have to modify it
1914 * atomically. Therefore, if we continue to fault after get_user()
1915 * below, we need to handle the fault ourselves, while still holding
1916 * the mmap_sem. This can occur if the uaddr is under contention as
1917 * we have to drop the mmap_sem in order to call get_user().
1918 */
1919 queue_unlock(&q, hb); 1929 queue_unlock(&q, hb);
1920 1930
1921 ret = get_user(uval, uaddr); 1931 ret = fault_in_user_writeable(uaddr);
1922 if (ret) 1932 if (ret)
1923 goto out_put_key; 1933 goto out_put_key;
1924 1934
@@ -2013,17 +2023,10 @@ out:
2013 return ret; 2023 return ret;
2014 2024
2015pi_faulted: 2025pi_faulted:
2016 /*
2017 * We have to r/w *(int __user *)uaddr, and we have to modify it
2018 * atomically. Therefore, if we continue to fault after get_user()
2019 * below, we need to handle the fault ourselves, while still holding
2020 * the mmap_sem. This can occur if the uaddr is under contention as
2021 * we have to drop the mmap_sem in order to call get_user().
2022 */
2023 spin_unlock(&hb->lock); 2026 spin_unlock(&hb->lock);
2024 put_futex_key(fshared, &key); 2027 put_futex_key(fshared, &key);
2025 2028
2026 ret = get_user(uval, uaddr); 2029 ret = fault_in_user_writeable(uaddr);
2027 if (!ret) 2030 if (!ret)
2028 goto retry; 2031 goto retry;
2029 2032
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index aaf5c9d05770..50da67672901 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -856,7 +856,7 @@ EXPORT_SYMBOL(free_irq);
856 * still called in hard interrupt context and has to check 856 * still called in hard interrupt context and has to check
857 * whether the interrupt originates from the device. If yes it 857 * whether the interrupt originates from the device. If yes it
858 * needs to disable the interrupt on the device and return 858 * needs to disable the interrupt on the device and return
859 * IRQ_THREAD_WAKE which will wake up the handler thread and run 859 * IRQ_WAKE_THREAD which will wake up the handler thread and run
860 * @thread_fn. This split handler design is necessary to support 860 * @thread_fn. This split handler design is necessary to support
861 * shared interrupts. 861 * shared interrupts.
862 * 862 *
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 29b685f551aa..1a933a221ea4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -124,7 +124,7 @@ void perf_enable(void)
124 124
125static void get_ctx(struct perf_counter_context *ctx) 125static void get_ctx(struct perf_counter_context *ctx)
126{ 126{
127 atomic_inc(&ctx->refcount); 127 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
128} 128}
129 129
130static void free_ctx(struct rcu_head *head) 130static void free_ctx(struct rcu_head *head)
@@ -175,6 +175,11 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
175 spin_unlock_irqrestore(&ctx->lock, *flags); 175 spin_unlock_irqrestore(&ctx->lock, *flags);
176 goto retry; 176 goto retry;
177 } 177 }
178
179 if (!atomic_inc_not_zero(&ctx->refcount)) {
180 spin_unlock_irqrestore(&ctx->lock, *flags);
181 ctx = NULL;
182 }
178 } 183 }
179 rcu_read_unlock(); 184 rcu_read_unlock();
180 return ctx; 185 return ctx;
@@ -193,7 +198,6 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta
193 ctx = perf_lock_task_context(task, &flags); 198 ctx = perf_lock_task_context(task, &flags);
194 if (ctx) { 199 if (ctx) {
195 ++ctx->pin_count; 200 ++ctx->pin_count;
196 get_ctx(ctx);
197 spin_unlock_irqrestore(&ctx->lock, flags); 201 spin_unlock_irqrestore(&ctx->lock, flags);
198 } 202 }
199 return ctx; 203 return ctx;
@@ -1283,7 +1287,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1283 if (!interrupts) { 1287 if (!interrupts) {
1284 perf_disable(); 1288 perf_disable();
1285 counter->pmu->disable(counter); 1289 counter->pmu->disable(counter);
1286 atomic_set(&hwc->period_left, 0); 1290 atomic64_set(&hwc->period_left, 0);
1287 counter->pmu->enable(counter); 1291 counter->pmu->enable(counter);
1288 perf_enable(); 1292 perf_enable();
1289 } 1293 }
@@ -1459,11 +1463,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1459 put_ctx(parent_ctx); 1463 put_ctx(parent_ctx);
1460 ctx->parent_ctx = NULL; /* no longer a clone */ 1464 ctx->parent_ctx = NULL; /* no longer a clone */
1461 } 1465 }
1462 /*
1463 * Get an extra reference before dropping the lock so that
1464 * this context won't get freed if the task exits.
1465 */
1466 get_ctx(ctx);
1467 spin_unlock_irqrestore(&ctx->lock, flags); 1466 spin_unlock_irqrestore(&ctx->lock, flags);
1468 } 1467 }
1469 1468
@@ -1553,7 +1552,7 @@ static int perf_release(struct inode *inode, struct file *file)
1553static ssize_t 1552static ssize_t
1554perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) 1553perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1555{ 1554{
1556 u64 values[3]; 1555 u64 values[4];
1557 int n; 1556 int n;
1558 1557
1559 /* 1558 /*
@@ -1620,22 +1619,6 @@ static void perf_counter_reset(struct perf_counter *counter)
1620 perf_counter_update_userpage(counter); 1619 perf_counter_update_userpage(counter);
1621} 1620}
1622 1621
1623static void perf_counter_for_each_sibling(struct perf_counter *counter,
1624 void (*func)(struct perf_counter *))
1625{
1626 struct perf_counter_context *ctx = counter->ctx;
1627 struct perf_counter *sibling;
1628
1629 WARN_ON_ONCE(ctx->parent_ctx);
1630 mutex_lock(&ctx->mutex);
1631 counter = counter->group_leader;
1632
1633 func(counter);
1634 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1635 func(sibling);
1636 mutex_unlock(&ctx->mutex);
1637}
1638
1639/* 1622/*
1640 * Holding the top-level counter's child_mutex means that any 1623 * Holding the top-level counter's child_mutex means that any
1641 * descendant process that has inherited this counter will block 1624 * descendant process that has inherited this counter will block
@@ -1658,14 +1641,18 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
1658static void perf_counter_for_each(struct perf_counter *counter, 1641static void perf_counter_for_each(struct perf_counter *counter,
1659 void (*func)(struct perf_counter *)) 1642 void (*func)(struct perf_counter *))
1660{ 1643{
1661 struct perf_counter *child; 1644 struct perf_counter_context *ctx = counter->ctx;
1645 struct perf_counter *sibling;
1662 1646
1663 WARN_ON_ONCE(counter->ctx->parent_ctx); 1647 WARN_ON_ONCE(ctx->parent_ctx);
1664 mutex_lock(&counter->child_mutex); 1648 mutex_lock(&ctx->mutex);
1665 perf_counter_for_each_sibling(counter, func); 1649 counter = counter->group_leader;
1666 list_for_each_entry(child, &counter->child_list, child_list) 1650
1667 perf_counter_for_each_sibling(child, func); 1651 perf_counter_for_each_child(counter, func);
1668 mutex_unlock(&counter->child_mutex); 1652 func(counter);
1653 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1654 perf_counter_for_each_child(counter, func);
1655 mutex_unlock(&ctx->mutex);
1669} 1656}
1670 1657
1671static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) 1658static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
@@ -1806,6 +1793,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1806 struct perf_mmap_data *data; 1793 struct perf_mmap_data *data;
1807 int ret = VM_FAULT_SIGBUS; 1794 int ret = VM_FAULT_SIGBUS;
1808 1795
1796 if (vmf->flags & FAULT_FLAG_MKWRITE) {
1797 if (vmf->pgoff == 0)
1798 ret = 0;
1799 return ret;
1800 }
1801
1809 rcu_read_lock(); 1802 rcu_read_lock();
1810 data = rcu_dereference(counter->data); 1803 data = rcu_dereference(counter->data);
1811 if (!data) 1804 if (!data)
@@ -1819,9 +1812,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1819 if ((unsigned)nr > data->nr_pages) 1812 if ((unsigned)nr > data->nr_pages)
1820 goto unlock; 1813 goto unlock;
1821 1814
1815 if (vmf->flags & FAULT_FLAG_WRITE)
1816 goto unlock;
1817
1822 vmf->page = virt_to_page(data->data_pages[nr]); 1818 vmf->page = virt_to_page(data->data_pages[nr]);
1823 } 1819 }
1820
1824 get_page(vmf->page); 1821 get_page(vmf->page);
1822 vmf->page->mapping = vma->vm_file->f_mapping;
1823 vmf->page->index = vmf->pgoff;
1824
1825 ret = 0; 1825 ret = 0;
1826unlock: 1826unlock:
1827 rcu_read_unlock(); 1827 rcu_read_unlock();
@@ -1874,6 +1874,14 @@ fail:
1874 return -ENOMEM; 1874 return -ENOMEM;
1875} 1875}
1876 1876
1877static void perf_mmap_free_page(unsigned long addr)
1878{
1879 struct page *page = virt_to_page(addr);
1880
1881 page->mapping = NULL;
1882 __free_page(page);
1883}
1884
1877static void __perf_mmap_data_free(struct rcu_head *rcu_head) 1885static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1878{ 1886{
1879 struct perf_mmap_data *data; 1887 struct perf_mmap_data *data;
@@ -1881,9 +1889,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1881 1889
1882 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 1890 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
1883 1891
1884 free_page((unsigned long)data->user_page); 1892 perf_mmap_free_page((unsigned long)data->user_page);
1885 for (i = 0; i < data->nr_pages; i++) 1893 for (i = 0; i < data->nr_pages; i++)
1886 free_page((unsigned long)data->data_pages[i]); 1894 perf_mmap_free_page((unsigned long)data->data_pages[i]);
1895
1887 kfree(data); 1896 kfree(data);
1888} 1897}
1889 1898
@@ -1920,9 +1929,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
1920} 1929}
1921 1930
1922static struct vm_operations_struct perf_mmap_vmops = { 1931static struct vm_operations_struct perf_mmap_vmops = {
1923 .open = perf_mmap_open, 1932 .open = perf_mmap_open,
1924 .close = perf_mmap_close, 1933 .close = perf_mmap_close,
1925 .fault = perf_mmap_fault, 1934 .fault = perf_mmap_fault,
1935 .page_mkwrite = perf_mmap_fault,
1926}; 1936};
1927 1937
1928static int perf_mmap(struct file *file, struct vm_area_struct *vma) 1938static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1936,7 +1946,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1936 long user_extra, extra; 1946 long user_extra, extra;
1937 int ret = 0; 1947 int ret = 0;
1938 1948
1939 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) 1949 if (!(vma->vm_flags & VM_SHARED))
1940 return -EINVAL; 1950 return -EINVAL;
1941 1951
1942 vma_size = vma->vm_end - vma->vm_start; 1952 vma_size = vma->vm_end - vma->vm_start;
@@ -1995,10 +2005,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1995 atomic_long_add(user_extra, &user->locked_vm); 2005 atomic_long_add(user_extra, &user->locked_vm);
1996 vma->vm_mm->locked_vm += extra; 2006 vma->vm_mm->locked_vm += extra;
1997 counter->data->nr_locked = extra; 2007 counter->data->nr_locked = extra;
2008 if (vma->vm_flags & VM_WRITE)
2009 counter->data->writable = 1;
2010
1998unlock: 2011unlock:
1999 mutex_unlock(&counter->mmap_mutex); 2012 mutex_unlock(&counter->mmap_mutex);
2000 2013
2001 vma->vm_flags &= ~VM_MAYWRITE;
2002 vma->vm_flags |= VM_RESERVED; 2014 vma->vm_flags |= VM_RESERVED;
2003 vma->vm_ops = &perf_mmap_vmops; 2015 vma->vm_ops = &perf_mmap_vmops;
2004 2016
@@ -2175,11 +2187,38 @@ struct perf_output_handle {
2175 unsigned long head; 2187 unsigned long head;
2176 unsigned long offset; 2188 unsigned long offset;
2177 int nmi; 2189 int nmi;
2178 int overflow; 2190 int sample;
2179 int locked; 2191 int locked;
2180 unsigned long flags; 2192 unsigned long flags;
2181}; 2193};
2182 2194
2195static bool perf_output_space(struct perf_mmap_data *data,
2196 unsigned int offset, unsigned int head)
2197{
2198 unsigned long tail;
2199 unsigned long mask;
2200
2201 if (!data->writable)
2202 return true;
2203
2204 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2205 /*
2206 * Userspace could choose to issue a mb() before updating the tail
2207 * pointer. So that all reads will be completed before the write is
2208 * issued.
2209 */
2210 tail = ACCESS_ONCE(data->user_page->data_tail);
2211 smp_rmb();
2212
2213 offset = (offset - tail) & mask;
2214 head = (head - tail) & mask;
2215
2216 if ((int)(head - offset) < 0)
2217 return false;
2218
2219 return true;
2220}
2221
2183static void perf_output_wakeup(struct perf_output_handle *handle) 2222static void perf_output_wakeup(struct perf_output_handle *handle)
2184{ 2223{
2185 atomic_set(&handle->data->poll, POLL_IN); 2224 atomic_set(&handle->data->poll, POLL_IN);
@@ -2270,12 +2309,57 @@ out:
2270 local_irq_restore(handle->flags); 2309 local_irq_restore(handle->flags);
2271} 2310}
2272 2311
2312static void perf_output_copy(struct perf_output_handle *handle,
2313 const void *buf, unsigned int len)
2314{
2315 unsigned int pages_mask;
2316 unsigned int offset;
2317 unsigned int size;
2318 void **pages;
2319
2320 offset = handle->offset;
2321 pages_mask = handle->data->nr_pages - 1;
2322 pages = handle->data->data_pages;
2323
2324 do {
2325 unsigned int page_offset;
2326 int nr;
2327
2328 nr = (offset >> PAGE_SHIFT) & pages_mask;
2329 page_offset = offset & (PAGE_SIZE - 1);
2330 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2331
2332 memcpy(pages[nr] + page_offset, buf, size);
2333
2334 len -= size;
2335 buf += size;
2336 offset += size;
2337 } while (len);
2338
2339 handle->offset = offset;
2340
2341 /*
2342 * Check we didn't copy past our reservation window, taking the
2343 * possible unsigned int wrap into account.
2344 */
2345 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2346}
2347
2348#define perf_output_put(handle, x) \
2349 perf_output_copy((handle), &(x), sizeof(x))
2350
2273static int perf_output_begin(struct perf_output_handle *handle, 2351static int perf_output_begin(struct perf_output_handle *handle,
2274 struct perf_counter *counter, unsigned int size, 2352 struct perf_counter *counter, unsigned int size,
2275 int nmi, int overflow) 2353 int nmi, int sample)
2276{ 2354{
2277 struct perf_mmap_data *data; 2355 struct perf_mmap_data *data;
2278 unsigned int offset, head; 2356 unsigned int offset, head;
2357 int have_lost;
2358 struct {
2359 struct perf_event_header header;
2360 u64 id;
2361 u64 lost;
2362 } lost_event;
2279 2363
2280 /* 2364 /*
2281 * For inherited counters we send all the output towards the parent. 2365 * For inherited counters we send all the output towards the parent.
@@ -2288,19 +2372,25 @@ static int perf_output_begin(struct perf_output_handle *handle,
2288 if (!data) 2372 if (!data)
2289 goto out; 2373 goto out;
2290 2374
2291 handle->data = data; 2375 handle->data = data;
2292 handle->counter = counter; 2376 handle->counter = counter;
2293 handle->nmi = nmi; 2377 handle->nmi = nmi;
2294 handle->overflow = overflow; 2378 handle->sample = sample;
2295 2379
2296 if (!data->nr_pages) 2380 if (!data->nr_pages)
2297 goto fail; 2381 goto fail;
2298 2382
2383 have_lost = atomic_read(&data->lost);
2384 if (have_lost)
2385 size += sizeof(lost_event);
2386
2299 perf_output_lock(handle); 2387 perf_output_lock(handle);
2300 2388
2301 do { 2389 do {
2302 offset = head = atomic_long_read(&data->head); 2390 offset = head = atomic_long_read(&data->head);
2303 head += size; 2391 head += size;
2392 if (unlikely(!perf_output_space(data, offset, head)))
2393 goto fail;
2304 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 2394 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2305 2395
2306 handle->offset = offset; 2396 handle->offset = offset;
@@ -2309,55 +2399,27 @@ static int perf_output_begin(struct perf_output_handle *handle,
2309 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) 2399 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2310 atomic_set(&data->wakeup, 1); 2400 atomic_set(&data->wakeup, 1);
2311 2401
2402 if (have_lost) {
2403 lost_event.header.type = PERF_EVENT_LOST;
2404 lost_event.header.misc = 0;
2405 lost_event.header.size = sizeof(lost_event);
2406 lost_event.id = counter->id;
2407 lost_event.lost = atomic_xchg(&data->lost, 0);
2408
2409 perf_output_put(handle, lost_event);
2410 }
2411
2312 return 0; 2412 return 0;
2313 2413
2314fail: 2414fail:
2315 perf_output_wakeup(handle); 2415 atomic_inc(&data->lost);
2416 perf_output_unlock(handle);
2316out: 2417out:
2317 rcu_read_unlock(); 2418 rcu_read_unlock();
2318 2419
2319 return -ENOSPC; 2420 return -ENOSPC;
2320} 2421}
2321 2422
2322static void perf_output_copy(struct perf_output_handle *handle,
2323 const void *buf, unsigned int len)
2324{
2325 unsigned int pages_mask;
2326 unsigned int offset;
2327 unsigned int size;
2328 void **pages;
2329
2330 offset = handle->offset;
2331 pages_mask = handle->data->nr_pages - 1;
2332 pages = handle->data->data_pages;
2333
2334 do {
2335 unsigned int page_offset;
2336 int nr;
2337
2338 nr = (offset >> PAGE_SHIFT) & pages_mask;
2339 page_offset = offset & (PAGE_SIZE - 1);
2340 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2341
2342 memcpy(pages[nr] + page_offset, buf, size);
2343
2344 len -= size;
2345 buf += size;
2346 offset += size;
2347 } while (len);
2348
2349 handle->offset = offset;
2350
2351 /*
2352 * Check we didn't copy past our reservation window, taking the
2353 * possible unsigned int wrap into account.
2354 */
2355 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2356}
2357
2358#define perf_output_put(handle, x) \
2359 perf_output_copy((handle), &(x), sizeof(x))
2360
2361static void perf_output_end(struct perf_output_handle *handle) 2423static void perf_output_end(struct perf_output_handle *handle)
2362{ 2424{
2363 struct perf_counter *counter = handle->counter; 2425 struct perf_counter *counter = handle->counter;
@@ -2365,7 +2427,7 @@ static void perf_output_end(struct perf_output_handle *handle)
2365 2427
2366 int wakeup_events = counter->attr.wakeup_events; 2428 int wakeup_events = counter->attr.wakeup_events;
2367 2429
2368 if (handle->overflow && wakeup_events) { 2430 if (handle->sample && wakeup_events) {
2369 int events = atomic_inc_return(&data->events); 2431 int events = atomic_inc_return(&data->events);
2370 if (events >= wakeup_events) { 2432 if (events >= wakeup_events) {
2371 atomic_sub(wakeup_events, &data->events); 2433 atomic_sub(wakeup_events, &data->events);
@@ -2970,7 +3032,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
2970} 3032}
2971 3033
2972/* 3034/*
2973 * Generic counter overflow handling. 3035 * Generic counter overflow handling, sampling.
2974 */ 3036 */
2975 3037
2976int perf_counter_overflow(struct perf_counter *counter, int nmi, 3038int perf_counter_overflow(struct perf_counter *counter, int nmi,
@@ -3109,20 +3171,15 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3109} 3171}
3110 3172
3111static void perf_swcounter_overflow(struct perf_counter *counter, 3173static void perf_swcounter_overflow(struct perf_counter *counter,
3112 int nmi, struct pt_regs *regs, u64 addr) 3174 int nmi, struct perf_sample_data *data)
3113{ 3175{
3114 struct perf_sample_data data = { 3176 data->period = counter->hw.last_period;
3115 .regs = regs,
3116 .addr = addr,
3117 .period = counter->hw.last_period,
3118 };
3119 3177
3120 perf_swcounter_update(counter); 3178 perf_swcounter_update(counter);
3121 perf_swcounter_set_period(counter); 3179 perf_swcounter_set_period(counter);
3122 if (perf_counter_overflow(counter, nmi, &data)) 3180 if (perf_counter_overflow(counter, nmi, data))
3123 /* soft-disable the counter */ 3181 /* soft-disable the counter */
3124 ; 3182 ;
3125
3126} 3183}
3127 3184
3128static int perf_swcounter_is_counting(struct perf_counter *counter) 3185static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3187,18 +3244,18 @@ static int perf_swcounter_match(struct perf_counter *counter,
3187} 3244}
3188 3245
3189static void perf_swcounter_add(struct perf_counter *counter, u64 nr, 3246static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3190 int nmi, struct pt_regs *regs, u64 addr) 3247 int nmi, struct perf_sample_data *data)
3191{ 3248{
3192 int neg = atomic64_add_negative(nr, &counter->hw.count); 3249 int neg = atomic64_add_negative(nr, &counter->hw.count);
3193 3250
3194 if (counter->hw.sample_period && !neg && regs) 3251 if (counter->hw.sample_period && !neg && data->regs)
3195 perf_swcounter_overflow(counter, nmi, regs, addr); 3252 perf_swcounter_overflow(counter, nmi, data);
3196} 3253}
3197 3254
3198static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, 3255static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3199 enum perf_type_id type, u32 event, 3256 enum perf_type_id type,
3200 u64 nr, int nmi, struct pt_regs *regs, 3257 u32 event, u64 nr, int nmi,
3201 u64 addr) 3258 struct perf_sample_data *data)
3202{ 3259{
3203 struct perf_counter *counter; 3260 struct perf_counter *counter;
3204 3261
@@ -3207,8 +3264,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3207 3264
3208 rcu_read_lock(); 3265 rcu_read_lock();
3209 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3266 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3210 if (perf_swcounter_match(counter, type, event, regs)) 3267 if (perf_swcounter_match(counter, type, event, data->regs))
3211 perf_swcounter_add(counter, nr, nmi, regs, addr); 3268 perf_swcounter_add(counter, nr, nmi, data);
3212 } 3269 }
3213 rcu_read_unlock(); 3270 rcu_read_unlock();
3214} 3271}
@@ -3227,9 +3284,9 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3227 return &cpuctx->recursion[0]; 3284 return &cpuctx->recursion[0];
3228} 3285}
3229 3286
3230static void __perf_swcounter_event(enum perf_type_id type, u32 event, 3287static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3231 u64 nr, int nmi, struct pt_regs *regs, 3288 u64 nr, int nmi,
3232 u64 addr) 3289 struct perf_sample_data *data)
3233{ 3290{
3234 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3291 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3235 int *recursion = perf_swcounter_recursion_context(cpuctx); 3292 int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3242,7 +3299,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3242 barrier(); 3299 barrier();
3243 3300
3244 perf_swcounter_ctx_event(&cpuctx->ctx, type, event, 3301 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3245 nr, nmi, regs, addr); 3302 nr, nmi, data);
3246 rcu_read_lock(); 3303 rcu_read_lock();
3247 /* 3304 /*
3248 * doesn't really matter which of the child contexts the 3305 * doesn't really matter which of the child contexts the
@@ -3250,7 +3307,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3250 */ 3307 */
3251 ctx = rcu_dereference(current->perf_counter_ctxp); 3308 ctx = rcu_dereference(current->perf_counter_ctxp);
3252 if (ctx) 3309 if (ctx)
3253 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); 3310 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3254 rcu_read_unlock(); 3311 rcu_read_unlock();
3255 3312
3256 barrier(); 3313 barrier();
@@ -3263,7 +3320,12 @@ out:
3263void 3320void
3264perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) 3321perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
3265{ 3322{
3266 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); 3323 struct perf_sample_data data = {
3324 .regs = regs,
3325 .addr = addr,
3326 };
3327
3328 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3267} 3329}
3268 3330
3269static void perf_swcounter_read(struct perf_counter *counter) 3331static void perf_swcounter_read(struct perf_counter *counter)
@@ -3404,36 +3466,18 @@ static const struct pmu perf_ops_task_clock = {
3404 .read = task_clock_perf_counter_read, 3466 .read = task_clock_perf_counter_read,
3405}; 3467};
3406 3468
3407/*
3408 * Software counter: cpu migrations
3409 */
3410void perf_counter_task_migration(struct task_struct *task, int cpu)
3411{
3412 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3413 struct perf_counter_context *ctx;
3414
3415 perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
3416 PERF_COUNT_SW_CPU_MIGRATIONS,
3417 1, 1, NULL, 0);
3418
3419 ctx = perf_pin_task_context(task);
3420 if (ctx) {
3421 perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
3422 PERF_COUNT_SW_CPU_MIGRATIONS,
3423 1, 1, NULL, 0);
3424 perf_unpin_context(ctx);
3425 }
3426}
3427
3428#ifdef CONFIG_EVENT_PROFILE 3469#ifdef CONFIG_EVENT_PROFILE
3429void perf_tpcounter_event(int event_id) 3470void perf_tpcounter_event(int event_id)
3430{ 3471{
3431 struct pt_regs *regs = get_irq_regs(); 3472 struct perf_sample_data data = {
3473 .regs = get_irq_regs();
3474 .addr = 0,
3475 };
3432 3476
3433 if (!regs) 3477 if (!data.regs)
3434 regs = task_pt_regs(current); 3478 data.regs = task_pt_regs(current);
3435 3479
3436 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); 3480 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
3437} 3481}
3438EXPORT_SYMBOL_GPL(perf_tpcounter_event); 3482EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3439 3483
diff --git a/kernel/sched.c b/kernel/sched.c
index 247fd0fedd0b..7c9098d186e6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1978,7 +1978,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1978 if (task_hot(p, old_rq->clock, NULL)) 1978 if (task_hot(p, old_rq->clock, NULL))
1979 schedstat_inc(p, se.nr_forced2_migrations); 1979 schedstat_inc(p, se.nr_forced2_migrations);
1980#endif 1980#endif
1981 perf_counter_task_migration(p, new_cpu); 1981 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1982 1, 1, NULL, 0);
1982 } 1983 }
1983 p->se.vruntime -= old_cfsrq->min_vruntime - 1984 p->se.vruntime -= old_cfsrq->min_vruntime -
1984 new_cfsrq->min_vruntime; 1985 new_cfsrq->min_vruntime;
@@ -7822,7 +7823,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7822 free_rootdomain(old_rd); 7823 free_rootdomain(old_rd);
7823} 7824}
7824 7825
7825static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7826static int init_rootdomain(struct root_domain *rd, bool bootmem)
7826{ 7827{
7827 gfp_t gfp = GFP_KERNEL; 7828 gfp_t gfp = GFP_KERNEL;
7828 7829
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 7deffc9f0e5f..e6c251790dde 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -152,7 +152,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
152 * 152 *
153 * Returns: -ENOMEM if memory fails. 153 * Returns: -ENOMEM if memory fails.
154 */ 154 */
155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 155int cpupri_init(struct cpupri *cp, bool bootmem)
156{ 156{
157 gfp_t gfp = GFP_KERNEL; 157 gfp_t gfp = GFP_KERNEL;
158 int i; 158 int i;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 467ca72f1657..70c7e0b79946 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162{ 162{
163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
164 spread, rq0_min_vruntime, spread0; 164 spread, rq0_min_vruntime, spread0;
165 struct rq *rq = &per_cpu(runqueues, cpu); 165 struct rq *rq = cpu_rq(cpu);
166 struct sched_entity *last; 166 struct sched_entity *last;
167 unsigned long flags; 167 unsigned long flags;
168 168
@@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
191 if (last) 191 if (last)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
@@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
248 248
249static void print_cpu(struct seq_file *m, int cpu) 249static void print_cpu(struct seq_file *m, int cpu)
250{ 250{
251 struct rq *rq = &per_cpu(runqueues, cpu); 251 struct rq *rq = cpu_rq(cpu);
252 252
253#ifdef CONFIG_X86 253#ifdef CONFIG_X86
254 { 254 {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5f9650e8fe75..ba7fd6e9556f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -430,12 +430,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
430 430
431 for_each_sched_entity(se) { 431 for_each_sched_entity(se) {
432 struct load_weight *load; 432 struct load_weight *load;
433 struct load_weight lw;
433 434
434 cfs_rq = cfs_rq_of(se); 435 cfs_rq = cfs_rq_of(se);
435 load = &cfs_rq->load; 436 load = &cfs_rq->load;
436 437
437 if (unlikely(!se->on_rq)) { 438 if (unlikely(!se->on_rq)) {
438 struct load_weight lw = cfs_rq->load; 439 lw = cfs_rq->load;
439 440
440 update_load_add(&lw, se->load.weight); 441 update_load_add(&lw, se->load.weight);
441 load = &lw; 442 load = &lw;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62e4ff9968b5..98e02328c67d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -335,7 +335,10 @@ static struct ctl_table kern_table[] = {
335 .data = &sysctl_timer_migration, 335 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int), 336 .maxlen = sizeof(unsigned int),
337 .mode = 0644, 337 .mode = 0644,
338 .proc_handler = &proc_dointvec, 338 .proc_handler = &proc_dointvec_minmax,
339 .strategy = &sysctl_intvec,
340 .extra1 = &zero,
341 .extra2 = &one,
339 }, 342 },
340#endif 343#endif
341 { 344 {
@@ -744,6 +747,14 @@ static struct ctl_table kern_table[] = {
744 .proc_handler = &proc_dointvec, 747 .proc_handler = &proc_dointvec,
745 }, 748 },
746 { 749 {
750 .ctl_name = CTL_UNNUMBERED,
751 .procname = "panic_on_io_nmi",
752 .data = &panic_on_io_nmi,
753 .maxlen = sizeof(int),
754 .mode = 0644,
755 .proc_handler = &proc_dointvec,
756 },
757 {
747 .ctl_name = KERN_BOOTLOADER_TYPE, 758 .ctl_name = KERN_BOOTLOADER_TYPE,
748 .procname = "bootloader_type", 759 .procname = "bootloader_type",
749 .data = &bootloader_type, 760 .data = &bootloader_type,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2aff39c6f10c..e0f59a21c061 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle)
222 222
223 cpu = smp_processor_id(); 223 cpu = smp_processor_id();
224 ts = &per_cpu(tick_cpu_sched, cpu); 224 ts = &per_cpu(tick_cpu_sched, cpu);
225
226 /*
227 * Call to tick_nohz_start_idle stops the last_update_time from being
228 * updated. Thus, it must not be called in the event we are called from
229 * irq_exit() with the prior state different than idle.
230 */
231 if (!inidle && !ts->inidle)
232 goto end;
233
225 now = tick_nohz_start_idle(ts); 234 now = tick_nohz_start_idle(ts);
226 235
227 /* 236 /*
@@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
240 goto end; 249 goto end;
241 250
242 if (!inidle && !ts->inidle)
243 goto end;
244
245 ts->inidle = 1; 251 ts->inidle = 1;
246 252
247 if (need_resched()) 253 if (need_resched())
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
96/* 96/*
97 * Collection status, active/inactive: 97 * Collection status, active/inactive:
98 */ 98 */
99static int __read_mostly active; 99int __read_mostly timer_stats_active;
100 100
101/* 101/*
102 * Beginning/end timestamps of measurement: 102 * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
257 if (!active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
260 entry = tstat_lookup(&input, comm); 260 entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
290 /* 290 /*
291 * If still active then calculate up to now: 291 * If still active then calculate up to now:
292 */ 292 */
293 if (active) 293 if (timer_stats_active)
294 time_stop = ktime_get(); 294 time_stop = ktime_get();
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
368 mutex_lock(&show_mutex); 368 mutex_lock(&show_mutex);
369 switch (ctl[0]) { 369 switch (ctl[0]) {
370 case '0': 370 case '0':
371 if (active) { 371 if (timer_stats_active) {
372 active = 0; 372 timer_stats_active = 0;
373 time_stop = ktime_get(); 373 time_stop = ktime_get();
374 sync_access(); 374 sync_access();
375 } 375 }
376 break; 376 break;
377 case '1': 377 case '1':
378 if (!active) { 378 if (!timer_stats_active) {
379 reset_entries(); 379 reset_entries();
380 time_start = ktime_get(); 380 time_start = ktime_get();
381 smp_mb(); 381 smp_mb();
382 active = 1; 382 timer_stats_active = 1;
383 } 383 }
384 break; 384 break;
385 default: 385 default:
diff --git a/kernel/timer.c b/kernel/timer.c
index 54d3912f8cad..0b36b9e5cc8b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
380{ 380{
381 unsigned int flag = 0; 381 unsigned int flag = 0;
382 382
383 if (likely(!timer->start_site))
384 return;
383 if (unlikely(tbase_get_deferrable(timer->base))) 385 if (unlikely(tbase_get_deferrable(timer->base)))
384 flag |= TIMER_STATS_FLAG_DEFERRABLE; 386 flag |= TIMER_STATS_FLAG_DEFERRABLE;
385 387
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 61071fecc82e..1551f47e7669 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER
18config HAVE_FUNCTION_GRAPH_TRACER 18config HAVE_FUNCTION_GRAPH_TRACER
19 bool 19 bool
20 20
21config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool
23 help
24 An arch may pass in a unique value (frame pointer) to both the
25 entering and exiting of a function. On exit, the value is compared
26 and if it does not match, then it will panic the kernel.
27
21config HAVE_FUNCTION_TRACE_MCOUNT_TEST 28config HAVE_FUNCTION_TRACE_MCOUNT_TEST
22 bool 29 bool
23 help 30 help
@@ -121,6 +128,7 @@ config FUNCTION_GRAPH_TRACER
121 bool "Kernel Function Graph Tracer" 128 bool "Kernel Function Graph Tracer"
122 depends on HAVE_FUNCTION_GRAPH_TRACER 129 depends on HAVE_FUNCTION_GRAPH_TRACER
123 depends on FUNCTION_TRACER 130 depends on FUNCTION_TRACER
131 depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
124 default y 132 default y
125 help 133 help
126 Enable the kernel to trace a function at both its return 134 Enable the kernel to trace a function at both its return
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bb60732ade0c..f3716bf04df6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -291,7 +291,9 @@ function_stat_next(void *v, int idx)
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); 291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292 292
293 again: 293 again:
294 rec++; 294 if (idx != 0)
295 rec++;
296
295 if ((void *)rec >= (void *)&pg->records[pg->index]) { 297 if ((void *)rec >= (void *)&pg->records[pg->index]) {
296 pg = pg->next; 298 pg = pg->next;
297 if (!pg) 299 if (!pg)
@@ -1224,6 +1226,13 @@ static void ftrace_shutdown(int command)
1224 return; 1226 return;
1225 1227
1226 ftrace_start_up--; 1228 ftrace_start_up--;
1229 /*
1230 * Just warn in case of unbalance, no need to kill ftrace, it's not
1231 * critical but the ftrace_call callers may be never nopped again after
1232 * further ftrace uses.
1233 */
1234 WARN_ON_ONCE(ftrace_start_up < 0);
1235
1227 if (!ftrace_start_up) 1236 if (!ftrace_start_up)
1228 command |= FTRACE_DISABLE_CALLS; 1237 command |= FTRACE_DISABLE_CALLS;
1229 1238
@@ -1410,10 +1419,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1410{ 1419{
1411 struct ftrace_iterator *iter = m->private; 1420 struct ftrace_iterator *iter = m->private;
1412 void *p = NULL; 1421 void *p = NULL;
1422 loff_t l;
1423
1424 if (!(iter->flags & FTRACE_ITER_HASH))
1425 *pos = 0;
1413 1426
1414 iter->flags |= FTRACE_ITER_HASH; 1427 iter->flags |= FTRACE_ITER_HASH;
1415 1428
1416 return t_hash_next(m, p, pos); 1429 iter->hidx = 0;
1430 for (l = 0; l <= *pos; ) {
1431 p = t_hash_next(m, p, &l);
1432 if (!p)
1433 break;
1434 }
1435 return p;
1417} 1436}
1418 1437
1419static int t_hash_show(struct seq_file *m, void *v) 1438static int t_hash_show(struct seq_file *m, void *v)
@@ -1460,8 +1479,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1460 iter->pg = iter->pg->next; 1479 iter->pg = iter->pg->next;
1461 iter->idx = 0; 1480 iter->idx = 0;
1462 goto retry; 1481 goto retry;
1463 } else {
1464 iter->idx = -1;
1465 } 1482 }
1466 } else { 1483 } else {
1467 rec = &iter->pg->records[iter->idx++]; 1484 rec = &iter->pg->records[iter->idx++];
@@ -1490,6 +1507,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1490{ 1507{
1491 struct ftrace_iterator *iter = m->private; 1508 struct ftrace_iterator *iter = m->private;
1492 void *p = NULL; 1509 void *p = NULL;
1510 loff_t l;
1493 1511
1494 mutex_lock(&ftrace_lock); 1512 mutex_lock(&ftrace_lock);
1495 /* 1513 /*
@@ -1501,23 +1519,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1501 if (*pos > 0) 1519 if (*pos > 0)
1502 return t_hash_start(m, pos); 1520 return t_hash_start(m, pos);
1503 iter->flags |= FTRACE_ITER_PRINTALL; 1521 iter->flags |= FTRACE_ITER_PRINTALL;
1504 (*pos)++;
1505 return iter; 1522 return iter;
1506 } 1523 }
1507 1524
1508 if (iter->flags & FTRACE_ITER_HASH) 1525 if (iter->flags & FTRACE_ITER_HASH)
1509 return t_hash_start(m, pos); 1526 return t_hash_start(m, pos);
1510 1527
1511 if (*pos > 0) { 1528 iter->pg = ftrace_pages_start;
1512 if (iter->idx < 0) 1529 iter->idx = 0;
1513 return p; 1530 for (l = 0; l <= *pos; ) {
1514 (*pos)--; 1531 p = t_next(m, p, &l);
1515 iter->idx--; 1532 if (!p)
1533 break;
1516 } 1534 }
1517 1535
1518 p = t_next(m, p, pos); 1536 if (!p && iter->flags & FTRACE_ITER_FILTER)
1519
1520 if (!p)
1521 return t_hash_start(m, pos); 1537 return t_hash_start(m, pos);
1522 1538
1523 return p; 1539 return p;
@@ -2493,32 +2509,31 @@ int ftrace_graph_count;
2493unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2509unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2494 2510
2495static void * 2511static void *
2496g_next(struct seq_file *m, void *v, loff_t *pos) 2512__g_next(struct seq_file *m, loff_t *pos)
2497{ 2513{
2498 unsigned long *array = m->private; 2514 unsigned long *array = m->private;
2499 int index = *pos;
2500
2501 (*pos)++;
2502 2515
2503 if (index >= ftrace_graph_count) 2516 if (*pos >= ftrace_graph_count)
2504 return NULL; 2517 return NULL;
2518 return &array[*pos];
2519}
2505 2520
2506 return &array[index]; 2521static void *
2522g_next(struct seq_file *m, void *v, loff_t *pos)
2523{
2524 (*pos)++;
2525 return __g_next(m, pos);
2507} 2526}
2508 2527
2509static void *g_start(struct seq_file *m, loff_t *pos) 2528static void *g_start(struct seq_file *m, loff_t *pos)
2510{ 2529{
2511 void *p = NULL;
2512
2513 mutex_lock(&graph_lock); 2530 mutex_lock(&graph_lock);
2514 2531
2515 /* Nothing, tell g_show to print all functions are enabled */ 2532 /* Nothing, tell g_show to print all functions are enabled */
2516 if (!ftrace_graph_count && !*pos) 2533 if (!ftrace_graph_count && !*pos)
2517 return (void *)1; 2534 return (void *)1;
2518 2535
2519 p = g_next(m, p, pos); 2536 return __g_next(m, pos);
2520
2521 return p;
2522} 2537}
2523 2538
2524static void g_stop(struct seq_file *m, void *p) 2539static void g_stop(struct seq_file *m, void *p)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 86cdf671d7e2..1edaa9516e81 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -186,7 +186,7 @@ static int kmem_trace_init(struct trace_array *tr)
186 int cpu; 186 int cpu;
187 kmemtrace_array = tr; 187 kmemtrace_array = tr;
188 188
189 for_each_cpu_mask(cpu, cpu_possible_map) 189 for_each_cpu(cpu, cpu_possible_mask)
190 tracing_reset(tr, cpu); 190 tracing_reset(tr, cpu);
191 191
192 kmemtrace_start_probes(); 192 kmemtrace_start_probes();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dc4dc70171ce..bf27bb7a63e2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -206,6 +206,7 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 207#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
209 210
210/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 211/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
211#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 212#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -415,6 +416,8 @@ struct ring_buffer_per_cpu {
415 unsigned long overrun; 416 unsigned long overrun;
416 unsigned long read; 417 unsigned long read;
417 local_t entries; 418 local_t entries;
419 local_t committing;
420 local_t commits;
418 u64 write_stamp; 421 u64 write_stamp;
419 u64 read_stamp; 422 u64 read_stamp;
420 atomic_t record_disabled; 423 atomic_t record_disabled;
@@ -618,12 +621,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
618 kfree(cpu_buffer); 621 kfree(cpu_buffer);
619} 622}
620 623
621/*
622 * Causes compile errors if the struct buffer_page gets bigger
623 * than the struct page.
624 */
625extern int ring_buffer_page_too_big(void);
626
627#ifdef CONFIG_HOTPLUG_CPU 624#ifdef CONFIG_HOTPLUG_CPU
628static int rb_cpu_notify(struct notifier_block *self, 625static int rb_cpu_notify(struct notifier_block *self,
629 unsigned long action, void *hcpu); 626 unsigned long action, void *hcpu);
@@ -646,11 +643,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
646 int bsize; 643 int bsize;
647 int cpu; 644 int cpu;
648 645
649 /* Paranoid! Optimizes out when all is well */
650 if (sizeof(struct buffer_page) > sizeof(struct page))
651 ring_buffer_page_too_big();
652
653
654 /* keep it in its own cache line */ 646 /* keep it in its own cache line */
655 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 647 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
656 GFP_KERNEL); 648 GFP_KERNEL);
@@ -666,8 +658,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
666 buffer->reader_lock_key = key; 658 buffer->reader_lock_key = key;
667 659
668 /* need at least two pages */ 660 /* need at least two pages */
669 if (buffer->pages == 1) 661 if (buffer->pages < 2)
670 buffer->pages++; 662 buffer->pages = 2;
671 663
672 /* 664 /*
673 * In case of non-hotplug cpu, if the ring-buffer is allocated 665 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1011,12 +1003,12 @@ rb_event_index(struct ring_buffer_event *event)
1011{ 1003{
1012 unsigned long addr = (unsigned long)event; 1004 unsigned long addr = (unsigned long)event;
1013 1005
1014 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 1006 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
1015} 1007}
1016 1008
1017static inline int 1009static inline int
1018rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 1010rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1019 struct ring_buffer_event *event) 1011 struct ring_buffer_event *event)
1020{ 1012{
1021 unsigned long addr = (unsigned long)event; 1013 unsigned long addr = (unsigned long)event;
1022 unsigned long index; 1014 unsigned long index;
@@ -1029,31 +1021,6 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1029} 1021}
1030 1022
1031static void 1023static void
1032rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
1033 struct ring_buffer_event *event)
1034{
1035 unsigned long addr = (unsigned long)event;
1036 unsigned long index;
1037
1038 index = rb_event_index(event);
1039 addr &= PAGE_MASK;
1040
1041 while (cpu_buffer->commit_page->page != (void *)addr) {
1042 if (RB_WARN_ON(cpu_buffer,
1043 cpu_buffer->commit_page == cpu_buffer->tail_page))
1044 return;
1045 cpu_buffer->commit_page->page->commit =
1046 cpu_buffer->commit_page->write;
1047 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1048 cpu_buffer->write_stamp =
1049 cpu_buffer->commit_page->page->time_stamp;
1050 }
1051
1052 /* Now set the commit to the event's index */
1053 local_set(&cpu_buffer->commit_page->page->commit, index);
1054}
1055
1056static void
1057rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1024rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1058{ 1025{
1059 /* 1026 /*
@@ -1171,6 +1138,60 @@ static unsigned rb_calculate_event_length(unsigned length)
1171 return length; 1138 return length;
1172} 1139}
1173 1140
1141static inline void
1142rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1143 struct buffer_page *tail_page,
1144 unsigned long tail, unsigned long length)
1145{
1146 struct ring_buffer_event *event;
1147
1148 /*
1149 * Only the event that crossed the page boundary
1150 * must fill the old tail_page with padding.
1151 */
1152 if (tail >= BUF_PAGE_SIZE) {
1153 local_sub(length, &tail_page->write);
1154 return;
1155 }
1156
1157 event = __rb_page_index(tail_page, tail);
1158 kmemcheck_annotate_bitfield(event, bitfield);
1159
1160 /*
1161 * If this event is bigger than the minimum size, then
1162 * we need to be careful that we don't subtract the
1163 * write counter enough to allow another writer to slip
1164 * in on this page.
1165 * We put in a discarded commit instead, to make sure
1166 * that this space is not used again.
1167 *
1168 * If we are less than the minimum size, we don't need to
1169 * worry about it.
1170 */
1171 if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
1172 /* No room for any events */
1173
1174 /* Mark the rest of the page with padding */
1175 rb_event_set_padding(event);
1176
1177 /* Set the write back to the previous setting */
1178 local_sub(length, &tail_page->write);
1179 return;
1180 }
1181
1182 /* Put in a discarded event */
1183 event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
1184 event->type_len = RINGBUF_TYPE_PADDING;
1185 /* time delta must be non zero */
1186 event->time_delta = 1;
1187 /* Account for this as an entry */
1188 local_inc(&tail_page->entries);
1189 local_inc(&cpu_buffer->entries);
1190
1191 /* Set write to end of buffer */
1192 length = (tail + length) - BUF_PAGE_SIZE;
1193 local_sub(length, &tail_page->write);
1194}
1174 1195
1175static struct ring_buffer_event * 1196static struct ring_buffer_event *
1176rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1197rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1180,7 +1201,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1180{ 1201{
1181 struct buffer_page *next_page, *head_page, *reader_page; 1202 struct buffer_page *next_page, *head_page, *reader_page;
1182 struct ring_buffer *buffer = cpu_buffer->buffer; 1203 struct ring_buffer *buffer = cpu_buffer->buffer;
1183 struct ring_buffer_event *event;
1184 bool lock_taken = false; 1204 bool lock_taken = false;
1185 unsigned long flags; 1205 unsigned long flags;
1186 1206
@@ -1265,27 +1285,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1265 cpu_buffer->tail_page->page->time_stamp = *ts; 1285 cpu_buffer->tail_page->page->time_stamp = *ts;
1266 } 1286 }
1267 1287
1268 /* 1288 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1269 * The actual tail page has moved forward.
1270 */
1271 if (tail < BUF_PAGE_SIZE) {
1272 /* Mark the rest of the page with padding */
1273 event = __rb_page_index(tail_page, tail);
1274 kmemcheck_annotate_bitfield(event, bitfield);
1275 rb_event_set_padding(event);
1276 }
1277
1278 /* Set the write back to the previous setting */
1279 local_sub(length, &tail_page->write);
1280
1281 /*
1282 * If this was a commit entry that failed,
1283 * increment that too
1284 */
1285 if (tail_page == cpu_buffer->commit_page &&
1286 tail == rb_commit_index(cpu_buffer)) {
1287 rb_set_commit_to_write(cpu_buffer);
1288 }
1289 1289
1290 __raw_spin_unlock(&cpu_buffer->lock); 1290 __raw_spin_unlock(&cpu_buffer->lock);
1291 local_irq_restore(flags); 1291 local_irq_restore(flags);
@@ -1295,7 +1295,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1295 1295
1296 out_reset: 1296 out_reset:
1297 /* reset write */ 1297 /* reset write */
1298 local_sub(length, &tail_page->write); 1298 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1299 1299
1300 if (likely(lock_taken)) 1300 if (likely(lock_taken))
1301 __raw_spin_unlock(&cpu_buffer->lock); 1301 __raw_spin_unlock(&cpu_buffer->lock);
@@ -1325,9 +1325,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1325 1325
1326 /* We reserved something on the buffer */ 1326 /* We reserved something on the buffer */
1327 1327
1328 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1329 return NULL;
1330
1331 event = __rb_page_index(tail_page, tail); 1328 event = __rb_page_index(tail_page, tail);
1332 kmemcheck_annotate_bitfield(event, bitfield); 1329 kmemcheck_annotate_bitfield(event, bitfield);
1333 rb_update_event(event, type, length); 1330 rb_update_event(event, type, length);
@@ -1337,11 +1334,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1337 local_inc(&tail_page->entries); 1334 local_inc(&tail_page->entries);
1338 1335
1339 /* 1336 /*
1340 * If this is a commit and the tail is zero, then update 1337 * If this is the first commit on the page, then update
1341 * this page's time stamp. 1338 * its timestamp.
1342 */ 1339 */
1343 if (!tail && rb_is_commit(cpu_buffer, event)) 1340 if (!tail)
1344 cpu_buffer->commit_page->page->time_stamp = *ts; 1341 tail_page->page->time_stamp = *ts;
1345 1342
1346 return event; 1343 return event;
1347} 1344}
@@ -1410,16 +1407,16 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1410 return -EAGAIN; 1407 return -EAGAIN;
1411 1408
1412 /* Only a commited time event can update the write stamp */ 1409 /* Only a commited time event can update the write stamp */
1413 if (rb_is_commit(cpu_buffer, event)) { 1410 if (rb_event_is_commit(cpu_buffer, event)) {
1414 /* 1411 /*
1415 * If this is the first on the page, then we need to 1412 * If this is the first on the page, then it was
1416 * update the page itself, and just put in a zero. 1413 * updated with the page itself. Try to discard it
1414 * and if we can't just make it zero.
1417 */ 1415 */
1418 if (rb_event_index(event)) { 1416 if (rb_event_index(event)) {
1419 event->time_delta = *delta & TS_MASK; 1417 event->time_delta = *delta & TS_MASK;
1420 event->array[0] = *delta >> TS_SHIFT; 1418 event->array[0] = *delta >> TS_SHIFT;
1421 } else { 1419 } else {
1422 cpu_buffer->commit_page->page->time_stamp = *ts;
1423 /* try to discard, since we do not need this */ 1420 /* try to discard, since we do not need this */
1424 if (!rb_try_to_discard(cpu_buffer, event)) { 1421 if (!rb_try_to_discard(cpu_buffer, event)) {
1425 /* nope, just zero it */ 1422 /* nope, just zero it */
@@ -1445,6 +1442,44 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1445 return ret; 1442 return ret;
1446} 1443}
1447 1444
1445static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
1446{
1447 local_inc(&cpu_buffer->committing);
1448 local_inc(&cpu_buffer->commits);
1449}
1450
1451static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
1452{
1453 unsigned long commits;
1454
1455 if (RB_WARN_ON(cpu_buffer,
1456 !local_read(&cpu_buffer->committing)))
1457 return;
1458
1459 again:
1460 commits = local_read(&cpu_buffer->commits);
1461 /* synchronize with interrupts */
1462 barrier();
1463 if (local_read(&cpu_buffer->committing) == 1)
1464 rb_set_commit_to_write(cpu_buffer);
1465
1466 local_dec(&cpu_buffer->committing);
1467
1468 /* synchronize with interrupts */
1469 barrier();
1470
1471 /*
1472 * Need to account for interrupts coming in between the
1473 * updating of the commit page and the clearing of the
1474 * committing counter.
1475 */
1476 if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
1477 !local_read(&cpu_buffer->committing)) {
1478 local_inc(&cpu_buffer->committing);
1479 goto again;
1480 }
1481}
1482
1448static struct ring_buffer_event * 1483static struct ring_buffer_event *
1449rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 1484rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1450 unsigned long length) 1485 unsigned long length)
@@ -1454,6 +1489,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1454 int commit = 0; 1489 int commit = 0;
1455 int nr_loops = 0; 1490 int nr_loops = 0;
1456 1491
1492 rb_start_commit(cpu_buffer);
1493
1457 length = rb_calculate_event_length(length); 1494 length = rb_calculate_event_length(length);
1458 again: 1495 again:
1459 /* 1496 /*
@@ -1466,7 +1503,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1466 * Bail! 1503 * Bail!
1467 */ 1504 */
1468 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 1505 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1469 return NULL; 1506 goto out_fail;
1470 1507
1471 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 1508 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1472 1509
@@ -1497,7 +1534,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1497 1534
1498 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 1535 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1499 if (commit == -EBUSY) 1536 if (commit == -EBUSY)
1500 return NULL; 1537 goto out_fail;
1501 1538
1502 if (commit == -EAGAIN) 1539 if (commit == -EAGAIN)
1503 goto again; 1540 goto again;
@@ -1511,30 +1548,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1511 if (unlikely(PTR_ERR(event) == -EAGAIN)) 1548 if (unlikely(PTR_ERR(event) == -EAGAIN))
1512 goto again; 1549 goto again;
1513 1550
1514 if (!event) { 1551 if (!event)
1515 if (unlikely(commit)) 1552 goto out_fail;
1516 /*
1517 * Ouch! We needed a timestamp and it was commited. But
1518 * we didn't get our event reserved.
1519 */
1520 rb_set_commit_to_write(cpu_buffer);
1521 return NULL;
1522 }
1523 1553
1524 /* 1554 if (!rb_event_is_commit(cpu_buffer, event))
1525 * If the timestamp was commited, make the commit our entry
1526 * now so that we will update it when needed.
1527 */
1528 if (unlikely(commit))
1529 rb_set_commit_event(cpu_buffer, event);
1530 else if (!rb_is_commit(cpu_buffer, event))
1531 delta = 0; 1555 delta = 0;
1532 1556
1533 event->time_delta = delta; 1557 event->time_delta = delta;
1534 1558
1535 return event; 1559 return event;
1560
1561 out_fail:
1562 rb_end_commit(cpu_buffer);
1563 return NULL;
1536} 1564}
1537 1565
1566#ifdef CONFIG_TRACING
1567
1538#define TRACE_RECURSIVE_DEPTH 16 1568#define TRACE_RECURSIVE_DEPTH 16
1539 1569
1540static int trace_recursive_lock(void) 1570static int trace_recursive_lock(void)
@@ -1565,6 +1595,13 @@ static void trace_recursive_unlock(void)
1565 current->trace_recursion--; 1595 current->trace_recursion--;
1566} 1596}
1567 1597
1598#else
1599
1600#define trace_recursive_lock() (0)
1601#define trace_recursive_unlock() do { } while (0)
1602
1603#endif
1604
1568static DEFINE_PER_CPU(int, rb_need_resched); 1605static DEFINE_PER_CPU(int, rb_need_resched);
1569 1606
1570/** 1607/**
@@ -1642,13 +1679,14 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1642{ 1679{
1643 local_inc(&cpu_buffer->entries); 1680 local_inc(&cpu_buffer->entries);
1644 1681
1645 /* Only process further if we own the commit */ 1682 /*
1646 if (!rb_is_commit(cpu_buffer, event)) 1683 * The event first in the commit queue updates the
1647 return; 1684 * time stamp.
1648 1685 */
1649 cpu_buffer->write_stamp += event->time_delta; 1686 if (rb_event_is_commit(cpu_buffer, event))
1687 cpu_buffer->write_stamp += event->time_delta;
1650 1688
1651 rb_set_commit_to_write(cpu_buffer); 1689 rb_end_commit(cpu_buffer);
1652} 1690}
1653 1691
1654/** 1692/**
@@ -1737,15 +1775,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1737 /* The event is discarded regardless */ 1775 /* The event is discarded regardless */
1738 rb_event_discard(event); 1776 rb_event_discard(event);
1739 1777
1778 cpu = smp_processor_id();
1779 cpu_buffer = buffer->buffers[cpu];
1780
1740 /* 1781 /*
1741 * This must only be called if the event has not been 1782 * This must only be called if the event has not been
1742 * committed yet. Thus we can assume that preemption 1783 * committed yet. Thus we can assume that preemption
1743 * is still disabled. 1784 * is still disabled.
1744 */ 1785 */
1745 RB_WARN_ON(buffer, preemptible()); 1786 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1746
1747 cpu = smp_processor_id();
1748 cpu_buffer = buffer->buffers[cpu];
1749 1787
1750 if (!rb_try_to_discard(cpu_buffer, event)) 1788 if (!rb_try_to_discard(cpu_buffer, event))
1751 goto out; 1789 goto out;
@@ -1756,13 +1794,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1756 */ 1794 */
1757 local_inc(&cpu_buffer->entries); 1795 local_inc(&cpu_buffer->entries);
1758 out: 1796 out:
1759 /* 1797 rb_end_commit(cpu_buffer);
1760 * If a write came in and pushed the tail page
1761 * we still need to update the commit pointer
1762 * if we were the commit.
1763 */
1764 if (rb_is_commit(cpu_buffer, event))
1765 rb_set_commit_to_write(cpu_buffer);
1766 1798
1767 trace_recursive_unlock(); 1799 trace_recursive_unlock();
1768 1800
@@ -2446,6 +2478,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2446} 2478}
2447EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); 2479EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
2448 2480
2481static inline int rb_ok_to_lock(void)
2482{
2483 /*
2484 * If an NMI die dumps out the content of the ring buffer
2485 * do not grab locks. We also permanently disable the ring
2486 * buffer too. A one time deal is all you get from reading
2487 * the ring buffer from an NMI.
2488 */
2489 if (likely(!in_nmi() && !oops_in_progress))
2490 return 1;
2491
2492 tracing_off_permanent();
2493 return 0;
2494}
2495
2449/** 2496/**
2450 * ring_buffer_peek - peek at the next event to be read 2497 * ring_buffer_peek - peek at the next event to be read
2451 * @buffer: The ring buffer to read 2498 * @buffer: The ring buffer to read
@@ -2461,14 +2508,20 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2461 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2508 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2462 struct ring_buffer_event *event; 2509 struct ring_buffer_event *event;
2463 unsigned long flags; 2510 unsigned long flags;
2511 int dolock;
2464 2512
2465 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2513 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2466 return NULL; 2514 return NULL;
2467 2515
2516 dolock = rb_ok_to_lock();
2468 again: 2517 again:
2469 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2518 local_irq_save(flags);
2519 if (dolock)
2520 spin_lock(&cpu_buffer->reader_lock);
2470 event = rb_buffer_peek(buffer, cpu, ts); 2521 event = rb_buffer_peek(buffer, cpu, ts);
2471 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2522 if (dolock)
2523 spin_unlock(&cpu_buffer->reader_lock);
2524 local_irq_restore(flags);
2472 2525
2473 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 2526 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2474 cpu_relax(); 2527 cpu_relax();
@@ -2520,6 +2573,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2520 struct ring_buffer_per_cpu *cpu_buffer; 2573 struct ring_buffer_per_cpu *cpu_buffer;
2521 struct ring_buffer_event *event = NULL; 2574 struct ring_buffer_event *event = NULL;
2522 unsigned long flags; 2575 unsigned long flags;
2576 int dolock;
2577
2578 dolock = rb_ok_to_lock();
2523 2579
2524 again: 2580 again:
2525 /* might be called in atomic */ 2581 /* might be called in atomic */
@@ -2529,7 +2585,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2529 goto out; 2585 goto out;
2530 2586
2531 cpu_buffer = buffer->buffers[cpu]; 2587 cpu_buffer = buffer->buffers[cpu];
2532 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2588 local_irq_save(flags);
2589 if (dolock)
2590 spin_lock(&cpu_buffer->reader_lock);
2533 2591
2534 event = rb_buffer_peek(buffer, cpu, ts); 2592 event = rb_buffer_peek(buffer, cpu, ts);
2535 if (!event) 2593 if (!event)
@@ -2538,7 +2596,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2538 rb_advance_reader(cpu_buffer); 2596 rb_advance_reader(cpu_buffer);
2539 2597
2540 out_unlock: 2598 out_unlock:
2541 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2599 if (dolock)
2600 spin_unlock(&cpu_buffer->reader_lock);
2601 local_irq_restore(flags);
2542 2602
2543 out: 2603 out:
2544 preempt_enable(); 2604 preempt_enable();
@@ -2680,6 +2740,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2680 cpu_buffer->overrun = 0; 2740 cpu_buffer->overrun = 0;
2681 cpu_buffer->read = 0; 2741 cpu_buffer->read = 0;
2682 local_set(&cpu_buffer->entries, 0); 2742 local_set(&cpu_buffer->entries, 0);
2743 local_set(&cpu_buffer->committing, 0);
2744 local_set(&cpu_buffer->commits, 0);
2683 2745
2684 cpu_buffer->write_stamp = 0; 2746 cpu_buffer->write_stamp = 0;
2685 cpu_buffer->read_stamp = 0; 2747 cpu_buffer->read_stamp = 0;
@@ -2734,12 +2796,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
2734int ring_buffer_empty(struct ring_buffer *buffer) 2796int ring_buffer_empty(struct ring_buffer *buffer)
2735{ 2797{
2736 struct ring_buffer_per_cpu *cpu_buffer; 2798 struct ring_buffer_per_cpu *cpu_buffer;
2799 unsigned long flags;
2800 int dolock;
2737 int cpu; 2801 int cpu;
2802 int ret;
2803
2804 dolock = rb_ok_to_lock();
2738 2805
2739 /* yes this is racy, but if you don't like the race, lock the buffer */ 2806 /* yes this is racy, but if you don't like the race, lock the buffer */
2740 for_each_buffer_cpu(buffer, cpu) { 2807 for_each_buffer_cpu(buffer, cpu) {
2741 cpu_buffer = buffer->buffers[cpu]; 2808 cpu_buffer = buffer->buffers[cpu];
2742 if (!rb_per_cpu_empty(cpu_buffer)) 2809 local_irq_save(flags);
2810 if (dolock)
2811 spin_lock(&cpu_buffer->reader_lock);
2812 ret = rb_per_cpu_empty(cpu_buffer);
2813 if (dolock)
2814 spin_unlock(&cpu_buffer->reader_lock);
2815 local_irq_restore(flags);
2816
2817 if (!ret)
2743 return 0; 2818 return 0;
2744 } 2819 }
2745 2820
@@ -2755,14 +2830,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2755int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 2830int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2756{ 2831{
2757 struct ring_buffer_per_cpu *cpu_buffer; 2832 struct ring_buffer_per_cpu *cpu_buffer;
2833 unsigned long flags;
2834 int dolock;
2758 int ret; 2835 int ret;
2759 2836
2760 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2837 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2761 return 1; 2838 return 1;
2762 2839
2840 dolock = rb_ok_to_lock();
2841
2763 cpu_buffer = buffer->buffers[cpu]; 2842 cpu_buffer = buffer->buffers[cpu];
2843 local_irq_save(flags);
2844 if (dolock)
2845 spin_lock(&cpu_buffer->reader_lock);
2764 ret = rb_per_cpu_empty(cpu_buffer); 2846 ret = rb_per_cpu_empty(cpu_buffer);
2765 2847 if (dolock)
2848 spin_unlock(&cpu_buffer->reader_lock);
2849 local_irq_restore(flags);
2766 2850
2767 return ret; 2851 return ret;
2768} 2852}
@@ -3029,6 +3113,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3029} 3113}
3030EXPORT_SYMBOL_GPL(ring_buffer_read_page); 3114EXPORT_SYMBOL_GPL(ring_buffer_read_page);
3031 3115
3116#ifdef CONFIG_TRACING
3032static ssize_t 3117static ssize_t
3033rb_simple_read(struct file *filp, char __user *ubuf, 3118rb_simple_read(struct file *filp, char __user *ubuf,
3034 size_t cnt, loff_t *ppos) 3119 size_t cnt, loff_t *ppos)
@@ -3096,6 +3181,7 @@ static __init int rb_init_debugfs(void)
3096} 3181}
3097 3182
3098fs_initcall(rb_init_debugfs); 3183fs_initcall(rb_init_debugfs);
3184#endif
3099 3185
3100#ifdef CONFIG_HOTPLUG_CPU 3186#ifdef CONFIG_HOTPLUG_CPU
3101static int rb_cpu_notify(struct notifier_block *self, 3187static int rb_cpu_notify(struct notifier_block *self,
@@ -3108,7 +3194,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3108 switch (action) { 3194 switch (action) {
3109 case CPU_UP_PREPARE: 3195 case CPU_UP_PREPARE:
3110 case CPU_UP_PREPARE_FROZEN: 3196 case CPU_UP_PREPARE_FROZEN:
3111 if (cpu_isset(cpu, *buffer->cpumask)) 3197 if (cpumask_test_cpu(cpu, buffer->cpumask))
3112 return NOTIFY_OK; 3198 return NOTIFY_OK;
3113 3199
3114 buffer->buffers[cpu] = 3200 buffer->buffers[cpu] =
@@ -3119,7 +3205,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3119 return NOTIFY_OK; 3205 return NOTIFY_OK;
3120 } 3206 }
3121 smp_wmb(); 3207 smp_wmb();
3122 cpu_set(cpu, *buffer->cpumask); 3208 cpumask_set_cpu(cpu, buffer->cpumask);
3123 break; 3209 break;
3124 case CPU_DOWN_PREPARE: 3210 case CPU_DOWN_PREPARE:
3125 case CPU_DOWN_PREPARE_FROZEN: 3211 case CPU_DOWN_PREPARE_FROZEN:
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 8d68e149a8b3..573d3cc762c3 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -102,8 +102,10 @@ static enum event_status read_page(int cpu)
102 event = (void *)&rpage->data[i]; 102 event = (void *)&rpage->data[i];
103 switch (event->type_len) { 103 switch (event->type_len) {
104 case RINGBUF_TYPE_PADDING: 104 case RINGBUF_TYPE_PADDING:
105 /* We don't expect any padding */ 105 /* failed writes may be discarded events */
106 KILL_TEST(); 106 if (!event->time_delta)
107 KILL_TEST();
108 inc = event->array[0] + 4;
107 break; 109 break;
108 case RINGBUF_TYPE_TIME_EXTEND: 110 case RINGBUF_TYPE_TIME_EXTEND:
109 inc = 8; 111 inc = 8;
@@ -119,7 +121,7 @@ static enum event_status read_page(int cpu)
119 KILL_TEST(); 121 KILL_TEST();
120 break; 122 break;
121 } 123 }
122 inc = event->array[0]; 124 inc = event->array[0] + 4;
123 break; 125 break;
124 default: 126 default:
125 entry = ring_buffer_event_data(event); 127 entry = ring_buffer_event_data(event);
@@ -201,7 +203,7 @@ static void ring_buffer_producer(void)
201 * Hammer the buffer for 10 secs (this may 203 * Hammer the buffer for 10 secs (this may
202 * make the system stall) 204 * make the system stall)
203 */ 205 */
204 pr_info("Starting ring buffer hammer\n"); 206 trace_printk("Starting ring buffer hammer\n");
205 do_gettimeofday(&start_tv); 207 do_gettimeofday(&start_tv);
206 do { 208 do {
207 struct ring_buffer_event *event; 209 struct ring_buffer_event *event;
@@ -237,7 +239,7 @@ static void ring_buffer_producer(void)
237#endif 239#endif
238 240
239 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); 241 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
240 pr_info("End ring buffer hammer\n"); 242 trace_printk("End ring buffer hammer\n");
241 243
242 if (consumer) { 244 if (consumer) {
243 /* Init both completions here to avoid races */ 245 /* Init both completions here to avoid races */
@@ -260,49 +262,50 @@ static void ring_buffer_producer(void)
260 overruns = ring_buffer_overruns(buffer); 262 overruns = ring_buffer_overruns(buffer);
261 263
262 if (kill_test) 264 if (kill_test)
263 pr_info("ERROR!\n"); 265 trace_printk("ERROR!\n");
264 pr_info("Time: %lld (usecs)\n", time); 266 trace_printk("Time: %lld (usecs)\n", time);
265 pr_info("Overruns: %lld\n", overruns); 267 trace_printk("Overruns: %lld\n", overruns);
266 if (disable_reader) 268 if (disable_reader)
267 pr_info("Read: (reader disabled)\n"); 269 trace_printk("Read: (reader disabled)\n");
268 else 270 else
269 pr_info("Read: %ld (by %s)\n", read, 271 trace_printk("Read: %ld (by %s)\n", read,
270 read_events ? "events" : "pages"); 272 read_events ? "events" : "pages");
271 pr_info("Entries: %lld\n", entries); 273 trace_printk("Entries: %lld\n", entries);
272 pr_info("Total: %lld\n", entries + overruns + read); 274 trace_printk("Total: %lld\n", entries + overruns + read);
273 pr_info("Missed: %ld\n", missed); 275 trace_printk("Missed: %ld\n", missed);
274 pr_info("Hit: %ld\n", hit); 276 trace_printk("Hit: %ld\n", hit);
275 277
276 /* Convert time from usecs to millisecs */ 278 /* Convert time from usecs to millisecs */
277 do_div(time, USEC_PER_MSEC); 279 do_div(time, USEC_PER_MSEC);
278 if (time) 280 if (time)
279 hit /= (long)time; 281 hit /= (long)time;
280 else 282 else
281 pr_info("TIME IS ZERO??\n"); 283 trace_printk("TIME IS ZERO??\n");
282 284
283 pr_info("Entries per millisec: %ld\n", hit); 285 trace_printk("Entries per millisec: %ld\n", hit);
284 286
285 if (hit) { 287 if (hit) {
286 /* Calculate the average time in nanosecs */ 288 /* Calculate the average time in nanosecs */
287 avg = NSEC_PER_MSEC / hit; 289 avg = NSEC_PER_MSEC / hit;
288 pr_info("%ld ns per entry\n", avg); 290 trace_printk("%ld ns per entry\n", avg);
289 } 291 }
290 292
291 if (missed) { 293 if (missed) {
292 if (time) 294 if (time)
293 missed /= (long)time; 295 missed /= (long)time;
294 296
295 pr_info("Total iterations per millisec: %ld\n", hit + missed); 297 trace_printk("Total iterations per millisec: %ld\n",
298 hit + missed);
296 299
297 /* it is possible that hit + missed will overflow and be zero */ 300 /* it is possible that hit + missed will overflow and be zero */
298 if (!(hit + missed)) { 301 if (!(hit + missed)) {
299 pr_info("hit + missed overflowed and totalled zero!\n"); 302 trace_printk("hit + missed overflowed and totalled zero!\n");
300 hit--; /* make it non zero */ 303 hit--; /* make it non zero */
301 } 304 }
302 305
303 /* Caculate the average time in nanosecs */ 306 /* Caculate the average time in nanosecs */
304 avg = NSEC_PER_MSEC / (hit + missed); 307 avg = NSEC_PER_MSEC / (hit + missed);
305 pr_info("%ld ns per entry\n", avg); 308 trace_printk("%ld ns per entry\n", avg);
306 } 309 }
307} 310}
308 311
@@ -353,7 +356,7 @@ static int ring_buffer_producer_thread(void *arg)
353 356
354 ring_buffer_producer(); 357 ring_buffer_producer();
355 358
356 pr_info("Sleeping for 10 secs\n"); 359 trace_printk("Sleeping for 10 secs\n");
357 set_current_state(TASK_INTERRUPTIBLE); 360 set_current_state(TASK_INTERRUPTIBLE);
358 schedule_timeout(HZ * SLEEP_TIME); 361 schedule_timeout(HZ * SLEEP_TIME);
359 __set_current_state(TASK_RUNNING); 362 __set_current_state(TASK_RUNNING);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c1878bfb2e1e..3aa0a0dfdfa8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -284,13 +284,12 @@ void trace_wake_up(void)
284static int __init set_buf_size(char *str) 284static int __init set_buf_size(char *str)
285{ 285{
286 unsigned long buf_size; 286 unsigned long buf_size;
287 int ret;
288 287
289 if (!str) 288 if (!str)
290 return 0; 289 return 0;
291 ret = strict_strtoul(str, 0, &buf_size); 290 buf_size = memparse(str, &str);
292 /* nr_entries can not be zero */ 291 /* nr_entries can not be zero */
293 if (ret < 0 || buf_size == 0) 292 if (buf_size == 0)
294 return 0; 293 return 0;
295 trace_buf_size = buf_size; 294 trace_buf_size = buf_size;
296 return 1; 295 return 1;
@@ -2053,25 +2052,23 @@ static int tracing_open(struct inode *inode, struct file *file)
2053static void * 2052static void *
2054t_next(struct seq_file *m, void *v, loff_t *pos) 2053t_next(struct seq_file *m, void *v, loff_t *pos)
2055{ 2054{
2056 struct tracer *t = m->private; 2055 struct tracer *t = v;
2057 2056
2058 (*pos)++; 2057 (*pos)++;
2059 2058
2060 if (t) 2059 if (t)
2061 t = t->next; 2060 t = t->next;
2062 2061
2063 m->private = t;
2064
2065 return t; 2062 return t;
2066} 2063}
2067 2064
2068static void *t_start(struct seq_file *m, loff_t *pos) 2065static void *t_start(struct seq_file *m, loff_t *pos)
2069{ 2066{
2070 struct tracer *t = m->private; 2067 struct tracer *t;
2071 loff_t l = 0; 2068 loff_t l = 0;
2072 2069
2073 mutex_lock(&trace_types_lock); 2070 mutex_lock(&trace_types_lock);
2074 for (; t && l < *pos; t = t_next(m, t, &l)) 2071 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
2075 ; 2072 ;
2076 2073
2077 return t; 2074 return t;
@@ -2107,18 +2104,10 @@ static struct seq_operations show_traces_seq_ops = {
2107 2104
2108static int show_traces_open(struct inode *inode, struct file *file) 2105static int show_traces_open(struct inode *inode, struct file *file)
2109{ 2106{
2110 int ret;
2111
2112 if (tracing_disabled) 2107 if (tracing_disabled)
2113 return -ENODEV; 2108 return -ENODEV;
2114 2109
2115 ret = seq_open(file, &show_traces_seq_ops); 2110 return seq_open(file, &show_traces_seq_ops);
2116 if (!ret) {
2117 struct seq_file *m = file->private_data;
2118 m->private = trace_types;
2119 }
2120
2121 return ret;
2122} 2111}
2123 2112
2124static ssize_t 2113static ssize_t
@@ -2191,11 +2180,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2191 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 2180 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2192 return -ENOMEM; 2181 return -ENOMEM;
2193 2182
2194 mutex_lock(&tracing_cpumask_update_lock);
2195 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2183 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2196 if (err) 2184 if (err)
2197 goto err_unlock; 2185 goto err_unlock;
2198 2186
2187 mutex_lock(&tracing_cpumask_update_lock);
2188
2199 local_irq_disable(); 2189 local_irq_disable();
2200 __raw_spin_lock(&ftrace_max_lock); 2190 __raw_spin_lock(&ftrace_max_lock);
2201 for_each_tracing_cpu(cpu) { 2191 for_each_tracing_cpu(cpu) {
@@ -2223,8 +2213,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2223 return count; 2213 return count;
2224 2214
2225err_unlock: 2215err_unlock:
2226 mutex_unlock(&tracing_cpumask_update_lock); 2216 free_cpumask_var(tracing_cpumask_new);
2227 free_cpumask_var(tracing_cpumask);
2228 2217
2229 return err; 2218 return err;
2230} 2219}
@@ -3626,7 +3615,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3626 struct trace_seq *s; 3615 struct trace_seq *s;
3627 unsigned long cnt; 3616 unsigned long cnt;
3628 3617
3629 s = kmalloc(sizeof(*s), GFP_ATOMIC); 3618 s = kmalloc(sizeof(*s), GFP_KERNEL);
3630 if (!s) 3619 if (!s)
3631 return ENOMEM; 3620 return ENOMEM;
3632 3621
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6e735d4771f8..3548ae5cc780 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -597,6 +597,7 @@ print_graph_function(struct trace_iterator *iter)
597 597
598extern struct pid *ftrace_pid_trace; 598extern struct pid *ftrace_pid_trace;
599 599
600#ifdef CONFIG_FUNCTION_TRACER
600static inline int ftrace_trace_task(struct task_struct *task) 601static inline int ftrace_trace_task(struct task_struct *task)
601{ 602{
602 if (!ftrace_pid_trace) 603 if (!ftrace_pid_trace)
@@ -604,6 +605,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
604 605
605 return test_tsk_trace_trace(task); 606 return test_tsk_trace_trace(task);
606} 607}
608#else
609static inline int ftrace_trace_task(struct task_struct *task)
610{
611 return 1;
612}
613#endif
607 614
608/* 615/*
609 * trace_iterator_flags is an enumeration that defines bit 616 * trace_iterator_flags is an enumeration that defines bit
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa08be69a1b6..53c8fd376a88 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -300,10 +300,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
300 300
301static void *t_start(struct seq_file *m, loff_t *pos) 301static void *t_start(struct seq_file *m, loff_t *pos)
302{ 302{
303 struct ftrace_event_call *call = NULL;
304 loff_t l;
305
303 mutex_lock(&event_mutex); 306 mutex_lock(&event_mutex);
304 if (*pos == 0) 307
305 m->private = ftrace_events.next; 308 m->private = ftrace_events.next;
306 return t_next(m, NULL, pos); 309 for (l = 0; l <= *pos; ) {
310 call = t_next(m, NULL, &l);
311 if (!call)
312 break;
313 }
314 return call;
307} 315}
308 316
309static void * 317static void *
@@ -332,10 +340,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
332 340
333static void *s_start(struct seq_file *m, loff_t *pos) 341static void *s_start(struct seq_file *m, loff_t *pos)
334{ 342{
343 struct ftrace_event_call *call = NULL;
344 loff_t l;
345
335 mutex_lock(&event_mutex); 346 mutex_lock(&event_mutex);
336 if (*pos == 0) 347
337 m->private = ftrace_events.next; 348 m->private = ftrace_events.next;
338 return s_next(m, NULL, pos); 349 for (l = 0; l <= *pos; ) {
350 call = s_next(m, NULL, &l);
351 if (!call)
352 break;
353 }
354 return call;
339} 355}
340 356
341static int t_show(struct seq_file *m, void *v) 357static int t_show(struct seq_file *m, void *v)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index db6e54bdb596..936c621bbf46 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,8 +27,6 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30static DEFINE_MUTEX(filter_mutex);
31
32enum filter_op_ids 30enum filter_op_ids
33{ 31{
34 OP_OR, 32 OP_OR,
@@ -178,7 +176,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
178static int filter_pred_strloc(struct filter_pred *pred, void *event, 176static int filter_pred_strloc(struct filter_pred *pred, void *event,
179 int val1, int val2) 177 int val1, int val2)
180{ 178{
181 int str_loc = *(int *)(event + pred->offset); 179 unsigned short str_loc = *(unsigned short *)(event + pred->offset);
182 char *addr = (char *)(event + str_loc); 180 char *addr = (char *)(event + str_loc);
183 int cmp, match; 181 int cmp, match;
184 182
@@ -294,12 +292,12 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
294{ 292{
295 struct event_filter *filter = call->filter; 293 struct event_filter *filter = call->filter;
296 294
297 mutex_lock(&filter_mutex); 295 mutex_lock(&event_mutex);
298 if (filter->filter_string) 296 if (filter->filter_string)
299 trace_seq_printf(s, "%s\n", filter->filter_string); 297 trace_seq_printf(s, "%s\n", filter->filter_string);
300 else 298 else
301 trace_seq_printf(s, "none\n"); 299 trace_seq_printf(s, "none\n");
302 mutex_unlock(&filter_mutex); 300 mutex_unlock(&event_mutex);
303} 301}
304 302
305void print_subsystem_event_filter(struct event_subsystem *system, 303void print_subsystem_event_filter(struct event_subsystem *system,
@@ -307,12 +305,12 @@ void print_subsystem_event_filter(struct event_subsystem *system,
307{ 305{
308 struct event_filter *filter = system->filter; 306 struct event_filter *filter = system->filter;
309 307
310 mutex_lock(&filter_mutex); 308 mutex_lock(&event_mutex);
311 if (filter->filter_string) 309 if (filter->filter_string)
312 trace_seq_printf(s, "%s\n", filter->filter_string); 310 trace_seq_printf(s, "%s\n", filter->filter_string);
313 else 311 else
314 trace_seq_printf(s, "none\n"); 312 trace_seq_printf(s, "none\n");
315 mutex_unlock(&filter_mutex); 313 mutex_unlock(&event_mutex);
316} 314}
317 315
318static struct ftrace_event_field * 316static struct ftrace_event_field *
@@ -381,6 +379,7 @@ void destroy_preds(struct ftrace_event_call *call)
381 filter_free_pred(filter->preds[i]); 379 filter_free_pred(filter->preds[i]);
382 } 380 }
383 kfree(filter->preds); 381 kfree(filter->preds);
382 kfree(filter->filter_string);
384 kfree(filter); 383 kfree(filter);
385 call->filter = NULL; 384 call->filter = NULL;
386} 385}
@@ -433,7 +432,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
433 filter->n_preds = 0; 432 filter->n_preds = 0;
434 } 433 }
435 434
436 mutex_lock(&event_mutex);
437 list_for_each_entry(call, &ftrace_events, list) { 435 list_for_each_entry(call, &ftrace_events, list) {
438 if (!call->define_fields) 436 if (!call->define_fields)
439 continue; 437 continue;
@@ -443,7 +441,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
443 remove_filter_string(call->filter); 441 remove_filter_string(call->filter);
444 } 442 }
445 } 443 }
446 mutex_unlock(&event_mutex);
447} 444}
448 445
449static int filter_add_pred_fn(struct filter_parse_state *ps, 446static int filter_add_pred_fn(struct filter_parse_state *ps,
@@ -546,6 +543,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
546 filter_pred_fn_t fn; 543 filter_pred_fn_t fn;
547 unsigned long long val; 544 unsigned long long val;
548 int string_type; 545 int string_type;
546 int ret;
549 547
550 pred->fn = filter_pred_none; 548 pred->fn = filter_pred_none;
551 549
@@ -581,7 +579,11 @@ static int filter_add_pred(struct filter_parse_state *ps,
581 pred->not = 1; 579 pred->not = 1;
582 return filter_add_pred_fn(ps, call, pred, fn); 580 return filter_add_pred_fn(ps, call, pred, fn);
583 } else { 581 } else {
584 if (strict_strtoull(pred->str_val, 0, &val)) { 582 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val);
584 else
585 ret = strict_strtoull(pred->str_val, 0, &val);
586 if (ret) {
585 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 587 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
586 return -EINVAL; 588 return -EINVAL;
587 } 589 }
@@ -625,7 +627,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
625 filter->preds[filter->n_preds] = pred; 627 filter->preds[filter->n_preds] = pred;
626 filter->n_preds++; 628 filter->n_preds++;
627 629
628 mutex_lock(&event_mutex);
629 list_for_each_entry(call, &ftrace_events, list) { 630 list_for_each_entry(call, &ftrace_events, list) {
630 631
631 if (!call->define_fields) 632 if (!call->define_fields)
@@ -636,14 +637,12 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
636 637
637 err = filter_add_pred(ps, call, pred); 638 err = filter_add_pred(ps, call, pred);
638 if (err) { 639 if (err) {
639 mutex_unlock(&event_mutex);
640 filter_free_subsystem_preds(system); 640 filter_free_subsystem_preds(system);
641 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 641 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
642 goto out; 642 goto out;
643 } 643 }
644 replace_filter_string(call->filter, filter_string); 644 replace_filter_string(call->filter, filter_string);
645 } 645 }
646 mutex_unlock(&event_mutex);
647out: 646out:
648 return err; 647 return err;
649} 648}
@@ -1070,12 +1069,12 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1070 1069
1071 struct filter_parse_state *ps; 1070 struct filter_parse_state *ps;
1072 1071
1073 mutex_lock(&filter_mutex); 1072 mutex_lock(&event_mutex);
1074 1073
1075 if (!strcmp(strstrip(filter_string), "0")) { 1074 if (!strcmp(strstrip(filter_string), "0")) {
1076 filter_disable_preds(call); 1075 filter_disable_preds(call);
1077 remove_filter_string(call->filter); 1076 remove_filter_string(call->filter);
1078 mutex_unlock(&filter_mutex); 1077 mutex_unlock(&event_mutex);
1079 return 0; 1078 return 0;
1080 } 1079 }
1081 1080
@@ -1103,7 +1102,7 @@ out:
1103 postfix_clear(ps); 1102 postfix_clear(ps);
1104 kfree(ps); 1103 kfree(ps);
1105out_unlock: 1104out_unlock:
1106 mutex_unlock(&filter_mutex); 1105 mutex_unlock(&event_mutex);
1107 1106
1108 return err; 1107 return err;
1109} 1108}
@@ -1115,12 +1114,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1115 1114
1116 struct filter_parse_state *ps; 1115 struct filter_parse_state *ps;
1117 1116
1118 mutex_lock(&filter_mutex); 1117 mutex_lock(&event_mutex);
1119 1118
1120 if (!strcmp(strstrip(filter_string), "0")) { 1119 if (!strcmp(strstrip(filter_string), "0")) {
1121 filter_free_subsystem_preds(system); 1120 filter_free_subsystem_preds(system);
1122 remove_filter_string(system->filter); 1121 remove_filter_string(system->filter);
1123 mutex_unlock(&filter_mutex); 1122 mutex_unlock(&event_mutex);
1124 return 0; 1123 return 0;
1125 } 1124 }
1126 1125
@@ -1148,7 +1147,7 @@ out:
1148 postfix_clear(ps); 1147 postfix_clear(ps);
1149 kfree(ps); 1148 kfree(ps);
1150out_unlock: 1149out_unlock:
1151 mutex_unlock(&filter_mutex); 1150 mutex_unlock(&event_mutex);
1152 1151
1153 return err; 1152 return err;
1154} 1153}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c9a0b7df44ff..7402144bff21 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -193,9 +193,11 @@ static void tracing_start_function_trace(void)
193static void tracing_stop_function_trace(void) 193static void tracing_stop_function_trace(void)
194{ 194{
195 ftrace_function_enabled = 0; 195 ftrace_function_enabled = 0;
196 /* OK if they are not registered */ 196
197 unregister_ftrace_function(&trace_stack_ops); 197 if (func_flags.val & TRACE_FUNC_OPT_STACK)
198 unregister_ftrace_function(&trace_ops); 198 unregister_ftrace_function(&trace_stack_ops);
199 else
200 unregister_ftrace_function(&trace_ops);
199} 201}
200 202
201static int func_set_flag(u32 old_flags, u32 bit, int set) 203static int func_set_flag(u32 old_flags, u32 bit, int set)
@@ -300,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
300 if (count == -1) 302 if (count == -1)
301 seq_printf(m, ":unlimited\n"); 303 seq_printf(m, ":unlimited\n");
302 else 304 else
303 seq_printf(m, ":count=%ld", count); 305 seq_printf(m, ":count=%ld\n", count);
304 seq_putc(m, '\n');
305 306
306 return 0; 307 return 0;
307} 308}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8b592418d8b2..d2249abafb53 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -57,7 +57,8 @@ static struct tracer_flags tracer_flags = {
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
59int 59int
60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) 60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
61 unsigned long frame_pointer)
61{ 62{
62 unsigned long long calltime; 63 unsigned long long calltime;
63 int index; 64 int index;
@@ -85,6 +86,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
85 current->ret_stack[index].func = func; 86 current->ret_stack[index].func = func;
86 current->ret_stack[index].calltime = calltime; 87 current->ret_stack[index].calltime = calltime;
87 current->ret_stack[index].subtime = 0; 88 current->ret_stack[index].subtime = 0;
89 current->ret_stack[index].fp = frame_pointer;
88 *depth = index; 90 *depth = index;
89 91
90 return 0; 92 return 0;
@@ -92,7 +94,8 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
92 94
93/* Retrieve a function return address to the trace stack on thread info.*/ 95/* Retrieve a function return address to the trace stack on thread info.*/
94static void 96static void
95ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) 97ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
98 unsigned long frame_pointer)
96{ 99{
97 int index; 100 int index;
98 101
@@ -106,6 +109,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
106 return; 109 return;
107 } 110 }
108 111
112#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
113 /*
114 * The arch may choose to record the frame pointer used
115 * and check it here to make sure that it is what we expect it
116 * to be. If gcc does not set the place holder of the return
117 * address in the frame pointer, and does a copy instead, then
118 * the function graph trace will fail. This test detects this
119 * case.
120 *
121 * Currently, x86_32 with optimize for size (-Os) makes the latest
122 * gcc do the above.
123 */
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n",
128 current->ret_stack[index].fp,
129 frame_pointer,
130 (void *)current->ret_stack[index].func,
131 current->ret_stack[index].ret);
132 *ret = (unsigned long)panic;
133 return;
134 }
135#endif
136
109 *ret = current->ret_stack[index].ret; 137 *ret = current->ret_stack[index].ret;
110 trace->func = current->ret_stack[index].func; 138 trace->func = current->ret_stack[index].func;
111 trace->calltime = current->ret_stack[index].calltime; 139 trace->calltime = current->ret_stack[index].calltime;
@@ -117,12 +145,12 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
117 * Send the trace to the ring-buffer. 145 * Send the trace to the ring-buffer.
118 * @return the original return address. 146 * @return the original return address.
119 */ 147 */
120unsigned long ftrace_return_to_handler(void) 148unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
121{ 149{
122 struct ftrace_graph_ret trace; 150 struct ftrace_graph_ret trace;
123 unsigned long ret; 151 unsigned long ret;
124 152
125 ftrace_pop_return_trace(&trace, &ret); 153 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
126 trace.rettime = trace_clock_local(); 154 trace.rettime = trace_clock_local();
127 ftrace_graph_return(&trace); 155 ftrace_graph_return(&trace);
128 barrier(); 156 barrier();
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 9bece9687b62..7b6278110827 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
155EXPORT_SYMBOL_GPL(__ftrace_vprintk); 155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156 156
157static void * 157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos) 158t_start(struct seq_file *m, loff_t *pos)
159{ 159{
160 const char **fmt = m->private; 160 const char **fmt = __start___trace_bprintk_fmt + *pos;
161 const char **next = fmt;
162
163 (*pos)++;
164 161
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) 162 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL; 163 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt; 164 return fmt;
172} 165}
173 166
174static void *t_start(struct seq_file *m, loff_t *pos) 167static void *t_next(struct seq_file *m, void * v, loff_t *pos)
175{ 168{
176 return t_next(m, NULL, pos); 169 (*pos)++;
170 return t_start(m, pos);
177} 171}
178 172
179static int t_show(struct seq_file *m, void *v) 173static int t_show(struct seq_file *m, void *v)
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
224static int 218static int
225ftrace_formats_open(struct inode *inode, struct file *file) 219ftrace_formats_open(struct inode *inode, struct file *file)
226{ 220{
227 int ret; 221 return seq_open(file, &show_format_seq_ops);
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236} 222}
237 223
238static const struct file_operations ftrace_formats_fops = { 224static const struct file_operations ftrace_formats_fops = {
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index c00643733f4c..e66f5e493342 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -199,17 +199,13 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
199 mutex_lock(&session->stat_mutex); 199 mutex_lock(&session->stat_mutex);
200 200
201 /* If we are in the beginning of the file, print the headers */ 201 /* If we are in the beginning of the file, print the headers */
202 if (!*pos && session->ts->stat_headers) { 202 if (!*pos && session->ts->stat_headers)
203 (*pos)++;
204 return SEQ_START_TOKEN; 203 return SEQ_START_TOKEN;
205 }
206 204
207 node = rb_first(&session->stat_root); 205 node = rb_first(&session->stat_root);
208 for (i = 0; node && i < *pos; i++) 206 for (i = 0; node && i < *pos; i++)
209 node = rb_next(node); 207 node = rb_next(node);
210 208
211 (*pos)++;
212
213 return node; 209 return node;
214} 210}
215 211