Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile              2
-rw-r--r--  kernel/acct.c                6
-rw-r--r--  kernel/audit.c             146
-rw-r--r--  kernel/audit.h              43
-rw-r--r--  kernel/audit_tree.c         66
-rw-r--r--  kernel/audit_watch.c       543
-rw-r--r--  kernel/auditfilter.c       518
-rw-r--r--  kernel/auditsc.c            33
-rw-r--r--  kernel/exit.c                1
-rw-r--r--  kernel/fork.c                1
-rw-r--r--  kernel/futex.c              45
-rw-r--r--  kernel/kmod.c                1
-rw-r--r--  kernel/module.c              6
-rw-r--r--  kernel/perf_counter.c      322
-rw-r--r--  kernel/pid.c                 7
-rw-r--r--  kernel/ptrace.c              4
-rw-r--r--  kernel/resource.c            2
-rw-r--r--  kernel/sysctl.c             13
-rw-r--r--  kernel/time/timer_stats.c   16
-rw-r--r--  kernel/timer.c               2
20 files changed, 1065 insertions(+), 712 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 0630e293cd49..2093a691f1c2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,7 +69,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
-obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa31564162..9f3391090b3e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 static int acct_on(char *name)
 {
 	struct file *file;
+	struct vfsmount *mnt;
 	int error;
 	struct pid_namespace *ns;
 	struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
 		acct = NULL;
 	}
 
-	mnt_pin(file->f_path.mnt);
+	mnt = file->f_path.mnt;
+	mnt_pin(mnt);
 	acct_file_reopen(ns->bacct, file, ns);
 	spin_unlock(&acct_lock);
 
-	mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+	mntput(mnt); /* it's pinned, now give up active reference */
 	kfree(acct);
 
 	return 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba9..defc2e6f1e3b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
 /* The netlink socket. */
 static struct sock *audit_sock;
 
-/* Inotify handle. */
-struct inotify_handle *audit_ih;
-
 /* Hash for inode-based rules */
 struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
 
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
 /* Serialize requests from userspace. */
-static DEFINE_MUTEX(audit_cmd_mutex);
+DEFINE_MUTEX(audit_cmd_mutex);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+/*
+ * For one reason or another this nlh isn't getting delivered to the userspace
+ * audit daemon, just send it to printk.
+ */
+static void audit_printk_skb(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(skb);
+	char *data = NLMSG_DATA(nlh);
+
+	if (nlh->nlmsg_type != AUDIT_EOE) {
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
+		else
+			audit_log_lost("printk limit exceeded\n");
+	}
+
+	audit_hold_skb(skb);
+}
+
 static void kauditd_send_skb(struct sk_buff *skb)
 {
 	int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
 		if (skb) {
 			if (audit_pid)
 				kauditd_send_skb(skb);
-			else {
-				if (printk_ratelimit())
-					printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
-				else
-					audit_log_lost("printk limit exceeded\n");
-
-				audit_hold_skb(skb);
-			}
+			else
+				audit_printk_skb(skb);
 		} else {
 			DECLARE_WAITQUEUE(wait, current);
 			set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
 	return 0;
 }
 
-#ifdef CONFIG_AUDIT_TREE
-static int prune_tree_thread(void *unused)
-{
-	mutex_lock(&audit_cmd_mutex);
-	audit_prune_trees();
-	mutex_unlock(&audit_cmd_mutex);
-	return 0;
-}
-
-void audit_schedule_prune(void)
-{
-	kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
-}
-#endif
-
 struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
 				 int multi, void *payload, int size)
 {
 	struct sk_buff *skb;
 	struct nlmsghdr *nlh;
-	int len = NLMSG_SPACE(size);
 	void *data;
 	int flags = multi ? NLM_F_MULTI : 0;
 	int t = done ? NLMSG_DONE : type;
 
-	skb = alloc_skb(len, GFP_KERNEL);
+	skb = nlmsg_new(size, GFP_KERNEL);
 	if (!skb)
 		return NULL;
 
-	nlh = NLMSG_PUT(skb, pid, seq, t, size);
-	nlh->nlmsg_flags = flags;
-	data = NLMSG_DATA(nlh);
+	nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
+	data = NLMSG_DATA(nlh);
 	memcpy(data, payload, size);
 	return skb;
 
-nlmsg_failure:			/* Used by NLMSG_PUT */
+nlmsg_failure:			/* Used by NLMSG_NEW */
 	if (skb)
 		kfree_skb(skb);
 	return NULL;
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 }
 
 /*
- * Get message from skb (based on rtnetlink_rcv_skb). Each message is
- * processed by audit_receive_msg. Malformed skbs with wrong length are
- * discarded silently.
+ * Get message from skb. Each message is processed by audit_receive_msg.
+ * Malformed skbs with wrong length are discarded silently.
  */
 static void audit_receive_skb(struct sk_buff *skb)
 {
-	int err;
-	struct nlmsghdr *nlh;
-	u32 rlen;
+	struct nlmsghdr *nlh;
+	/*
+	 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+	 * if the nlmsg_len was not aligned
+	 */
+	int len;
+	int err;
 
-	while (skb->len >= NLMSG_SPACE(0)) {
-		nlh = nlmsg_hdr(skb);
-		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
-			return;
-		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
-		if (rlen > skb->len)
-			rlen = skb->len;
-		if ((err = audit_receive_msg(skb, nlh))) {
+	nlh = nlmsg_hdr(skb);
+	len = skb->len;
+
+	while (NLMSG_OK(nlh, len)) {
+		err = audit_receive_msg(skb, nlh);
+		/* if err or if this message says it wants a response */
+		if (err || (nlh->nlmsg_flags & NLM_F_ACK))
 			netlink_ack(skb, nlh, err);
-		} else if (nlh->nlmsg_flags & NLM_F_ACK)
-			netlink_ack(skb, nlh, 0);
-		skb_pull(skb, rlen);
+
+		nlh = NLMSG_NEXT(nlh, len);
 	}
 }
 
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
 	mutex_unlock(&audit_cmd_mutex);
 }
 
-#ifdef CONFIG_AUDITSYSCALL
-static const struct inotify_operations audit_inotify_ops = {
-	.handle_event	= audit_handle_ievent,
-	.destroy_watch	= audit_free_parent,
-};
-#endif
-
 /* Initialize audit support at boot time. */
 static int __init audit_init(void)
 {
@@ -991,12 +978,6 @@ static int __init audit_init(void)
 
 	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
 
-#ifdef CONFIG_AUDITSYSCALL
-	audit_ih = inotify_init(&audit_inotify_ops);
-	if (IS_ERR(audit_ih))
-		audit_panic("cannot initialize inotify handle");
-#endif
-
 	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
 		INIT_LIST_HEAD(&audit_inode_hash[i]);
 
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 		goto err;
 	}
 
-	ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
-	if (!ab->skb)
-		goto err;
-
 	ab->ctx = ctx;
 	ab->gfp_mask = gfp_mask;
-	nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
-	nlh->nlmsg_type = type;
-	nlh->nlmsg_flags = 0;
-	nlh->nlmsg_pid = 0;
-	nlh->nlmsg_seq = 0;
+
+	ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
+	if (!ab->skb)
+		goto nlmsg_failure;
+
+	nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+
 	return ab;
+
+nlmsg_failure:			/* Used by NLMSG_NEW */
+	kfree_skb(ab->skb);
+	ab->skb = NULL;
 err:
 	audit_buffer_free(ab);
 	return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	kfree(pathname);
 }
 
+void audit_log_key(struct audit_buffer *ab, char *key)
+{
+	audit_log_format(ab, " key=");
+	if (key)
+		audit_log_untrustedstring(ab, key);
+	else
+		audit_log_format(ab, "(null)");
+}
+
 /**
  * audit_log_end - end one audit record
  * @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
 		skb_queue_tail(&audit_skb_queue, ab->skb);
 		wake_up_interruptible(&kauditd_wait);
 	} else {
-		if (nlh->nlmsg_type != AUDIT_EOE) {
-			if (printk_ratelimit()) {
-				printk(KERN_NOTICE "type=%d %s\n",
-					nlh->nlmsg_type,
-					ab->skb->data + NLMSG_SPACE(0));
-			} else
-				audit_log_lost("printk limit exceeded\n");
-		}
-		audit_hold_skb(ab->skb);
+		audit_printk_skb(ab->skb);
 	}
 	ab->skb = NULL;
 }
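Editorial note, not part of the patch: the reworked audit_receive_skb() above replaces open-coded length checks with the standard NLMSG_OK()/NLMSG_NEXT() iterators. Below is a minimal, compilable userspace sketch of that same idiom, with an invented one-message buffer; the point it illustrates is the in-code comment that len must be a signed int, because NLMSG_NEXT() decrements it and NLMSG_OK() relies on seeing it drop below zero when the final message is truncated or unaligned.

#include <stdio.h>
#include <string.h>
#include <linux/netlink.h>

static void walk_nlmsgs(void *buf, int len)	/* signed on purpose */
{
	struct nlmsghdr *nlh;

	/* NLMSG_NEXT() advances nlh and decrements len in one step */
	for (nlh = buf; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len))
		printf("type=%u len=%u flags=%#x\n",
		       nlh->nlmsg_type, nlh->nlmsg_len, nlh->nlmsg_flags);
}

int main(void)
{
	char buf[NLMSG_SPACE(16)];
	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

	memset(buf, 0, sizeof(buf));
	nlh->nlmsg_len = NLMSG_LENGTH(16);	/* header + 16-byte payload */
	nlh->nlmsg_type = 1000;			/* hypothetical message type */

	walk_nlmsgs(buf, sizeof(buf));
	return 0;
}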
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661b..208687be4f30 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
 };
 
 /* Rule lists */
-struct audit_parent;
-
-struct audit_watch {
-	atomic_t count;	/* reference count */
-	char *path;	/* insertion path */
-	dev_t dev;	/* associated superblock device */
-	unsigned long ino;	/* associated inode number */
-	struct audit_parent *parent;	/* associated parent */
-	struct list_head wlist;	/* entry in parent->watches list */
-	struct list_head rules;	/* associated rules */
-};
-
+struct audit_watch;
 struct audit_tree;
 struct audit_chunk;
 
@@ -108,19 +97,28 @@ struct audit_netlink_list {
 
 int audit_send_list(void *);
 
-struct inotify_watch;
-/* Inotify handle */
-extern struct inotify_handle *audit_ih;
-
-extern void audit_free_parent(struct inotify_watch *);
-extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
-				const char *, struct inode *);
 extern int selinux_audit_rule_update(void);
 
 extern struct mutex audit_filter_mutex;
 extern void audit_free_rule_rcu(struct rcu_head *);
 extern struct list_head audit_filter_list[];
 
+/* audit watch functions */
+extern unsigned long audit_watch_inode(struct audit_watch *watch);
+extern dev_t audit_watch_dev(struct audit_watch *watch);
+extern void audit_put_watch(struct audit_watch *watch);
+extern void audit_get_watch(struct audit_watch *watch);
+extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
+extern int audit_add_watch(struct audit_krule *krule);
+extern void audit_remove_watch(struct audit_watch *watch);
+extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
+extern void audit_inotify_unregister(struct list_head *in_list);
+extern char *audit_watch_path(struct audit_watch *watch);
+extern struct list_head *audit_watch_rules(struct audit_watch *watch);
+
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+					   struct audit_watch *watch);
+
 #ifdef CONFIG_AUDIT_TREE
 extern struct audit_chunk *audit_tree_lookup(const struct inode *);
 extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
 extern int audit_remove_tree_rule(struct audit_krule *);
 extern void audit_trim_trees(void);
 extern int audit_tag_tree(char *old, char *new);
-extern void audit_schedule_prune(void);
-extern void audit_prune_trees(void);
 extern const char *audit_tree_path(struct audit_tree *);
 extern void audit_put_tree(struct audit_tree *);
+extern void audit_kill_trees(struct list_head *);
 #else
 #define audit_remove_tree_rule(rule) BUG()
 #define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
 #define audit_put_tree(tree) (void)0
 #define audit_tag_tree(old, new) -EINVAL
 #define audit_tree_path(rule) ""	/* never called */
+#define audit_kill_trees(list) BUG()
 #endif
 
 extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
 	return 0;
 }
 extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
+extern struct list_head *audit_killed_trees(void);
 #else
 #define audit_signal_info(s,t) AUDIT_DISABLED
 #define audit_filter_inodes(t,c) AUDIT_DISABLED
 #endif
+
+extern struct mutex audit_cmd_mutex;
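Editorial note, not part of the patch: with the field definitions removed above, struct audit_watch becomes an opaque type everywhere outside kernel/audit_watch.c, and other audit code must use the new accessors (audit_watch_path(), audit_watch_inode(), audit_watch_dev()). A toy single-file sketch of that information-hiding pattern, with invented names:

#include <stdlib.h>
#include <string.h>

/* What a public header would expose: an incomplete type plus accessors. */
struct watch;				/* opaque, like struct audit_watch */
struct watch *watch_new(const char *path);
const char *watch_path(const struct watch *w);

/* What only the implementation file knows about. */
struct watch {
	char *path;			/* hidden, like watch->path */
	unsigned long ino;		/* hidden, like watch->ino */
};

struct watch *watch_new(const char *path)
{
	struct watch *w = calloc(1, sizeof(*w));

	if (!w)
		return NULL;
	w->path = malloc(strlen(path) + 1);
	if (w->path)
		strcpy(w->path, path);
	return w;
}

const char *watch_path(const struct watch *w)
{
	return w->path;			/* accessor, like audit_watch_path() */
}

int main(void)
{
	struct watch *w = watch_new("/etc");

	return (w && watch_path(w)) ? 0 : 1;
}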
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1f6396d76687..2451dc6f3282 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
 #include <linux/inotify.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/kthread.h>
 
 struct audit_tree;
 struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
 		if (rule->tree) {
 			/* not a half-baked one */
 			ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
-			audit_log_format(ab, "op=remove rule dir=");
+			audit_log_format(ab, "op=");
+			audit_log_string(ab, "remove rule");
+			audit_log_format(ab, " dir=");
 			audit_log_untrustedstring(ab, rule->tree->pathname);
-			if (rule->filterkey) {
-				audit_log_format(ab, " key=");
-				audit_log_untrustedstring(ab, rule->filterkey);
-			} else
-				audit_log_format(ab, " key=(null)");
+			audit_log_key(ab, rule->filterkey);
 			audit_log_format(ab, " list=%d res=1", rule->listnr);
 			audit_log_end(ab);
 			rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
 	}
 }
 
+static void audit_schedule_prune(void);
+
 /* called with audit_filter_mutex */
 int audit_remove_tree_rule(struct audit_krule *rule)
 {
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
 
 /*
  * That gets run when evict_chunk() ends up needing to kill audit_tree.
- * Runs from a separate thread, with audit_cmd_mutex held.
+ * Runs from a separate thread.
  */
-void audit_prune_trees(void)
+static int prune_tree_thread(void *unused)
 {
+	mutex_lock(&audit_cmd_mutex);
 	mutex_lock(&audit_filter_mutex);
 
 	while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
 	}
 
 	mutex_unlock(&audit_filter_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+	return 0;
+}
+
+static void audit_schedule_prune(void)
+{
+	kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
+}
+
+/*
+ * ... and that one is done if evict_chunk() decides to delay until the end
+ * of syscall. Runs synchronously.
+ */
+void audit_kill_trees(struct list_head *list)
+{
+	mutex_lock(&audit_cmd_mutex);
+	mutex_lock(&audit_filter_mutex);
+
+	while (!list_empty(list)) {
+		struct audit_tree *victim;
+
+		victim = list_entry(list->next, struct audit_tree, list);
+		kill_rules(victim);
+		list_del_init(&victim->list);
+
+		mutex_unlock(&audit_filter_mutex);
+
+		prune_one(victim);
+
+		mutex_lock(&audit_filter_mutex);
+	}
+
+	mutex_unlock(&audit_filter_mutex);
+	mutex_unlock(&audit_cmd_mutex);
 }
 
849/* 885/*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
 static void evict_chunk(struct audit_chunk *chunk)
 {
 	struct audit_tree *owner;
+	struct list_head *postponed = audit_killed_trees();
+	int need_prune = 0;
 	int n;
 
 	if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
 		owner->root = NULL;
 		list_del_init(&owner->same_root);
 		spin_unlock(&hash_lock);
-		kill_rules(owner);
-		list_move(&owner->list, &prune_list);
-		audit_schedule_prune();
+		if (!postponed) {
+			kill_rules(owner);
+			list_move(&owner->list, &prune_list);
+			need_prune = 1;
+		} else {
+			list_move(&owner->list, postponed);
+		}
 		spin_lock(&hash_lock);
 	}
 	list_del_rcu(&chunk->hash);
 	for (n = 0; n < chunk->count; n++)
 		list_del_init(&chunk->owners[n].list);
 	spin_unlock(&hash_lock);
+	if (need_prune)
+		audit_schedule_prune();
 	mutex_unlock(&audit_filter_mutex);
 }
 
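Editorial note, not part of the patch: the evict_chunk() change above chooses between two cleanup paths. Inside a syscall, audit_killed_trees() returns a per-context list and the dying trees are queued there, to be handled synchronously by audit_kill_trees() at syscall exit; otherwise they go on the global prune_list and the pruning kthread is scheduled. A standalone sketch of that defer-or-schedule decision, with invented names and a hard-coded "in syscall" flag:

#include <stdio.h>

struct tree {
	struct tree *next;
};

static struct tree *prune_list;			/* global, pruned by a worker */

/* Stand-in for audit_killed_trees(): non-NULL only inside a "syscall". */
static struct tree **killed_trees(void)
{
	static struct tree *postponed;
	static int in_syscall = 1;		/* pretend we are in one */

	return in_syscall ? &postponed : NULL;
}

static void schedule_prune(void)
{
	/* stand-in for kthread_run(prune_tree_thread, ...) */
	printf("async prune scheduled\n");
}

static void evict(struct tree *t)
{
	struct tree **postponed = killed_trees();

	if (postponed) {			/* defer to syscall exit */
		t->next = *postponed;
		*postponed = t;
	} else {				/* no context: go async */
		t->next = prune_list;
		prune_list = t;
		schedule_prune();
	}
}

int main(void)
{
	struct tree t = { 0 };

	evict(&t);
	return 0;
}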
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 000000000000..0e96dbc60ea9
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
+/* audit_watch.c -- watching inodes
+ *
+ * Copyright 2003-2009 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ *	event. Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ *	audit_remove_watch(). Additionally, an audit_watch may exist
+ *	temporarily to assist in searching existing filter data. Each
+ *	audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_watch {
+	atomic_t count;	/* reference count */
+	char *path;	/* insertion path */
+	dev_t dev;	/* associated superblock device */
+	unsigned long ino;	/* associated inode number */
+	struct audit_parent *parent;	/* associated parent */
+	struct list_head wlist;	/* entry in parent->watches list */
+	struct list_head rules;	/* associated rules */
+};
+
+struct audit_parent {
+	struct list_head ilist;	/* entry in inotify registration list */
+	struct list_head watches;	/* associated watches */
+	struct inotify_watch wdata;	/* inotify watch data */
+	unsigned flags;	/* status flags */
+};
+
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID	0x001
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH	IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+static void audit_free_parent(struct inotify_watch *i_watch)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(i_watch, struct audit_parent, wdata);
+	WARN_ON(!list_empty(&parent->watches));
+	kfree(parent);
+}
+
+void audit_get_watch(struct audit_watch *watch)
+{
+	atomic_inc(&watch->count);
+}
+
+void audit_put_watch(struct audit_watch *watch)
+{
+	if (atomic_dec_and_test(&watch->count)) {
+		WARN_ON(watch->parent);
+		WARN_ON(!list_empty(&watch->rules));
+		kfree(watch->path);
+		kfree(watch);
+	}
+}
+
+void audit_remove_watch(struct audit_watch *watch)
+{
+	list_del(&watch->wlist);
+	put_inotify_watch(&watch->parent->wdata);
+	watch->parent = NULL;
+	audit_put_watch(watch); /* match initial get */
+}
+
+char *audit_watch_path(struct audit_watch *watch)
+{
+	return watch->path;
+}
+
+struct list_head *audit_watch_rules(struct audit_watch *watch)
+{
+	return &watch->rules;
+}
+
+unsigned long audit_watch_inode(struct audit_watch *watch)
+{
+	return watch->ino;
+}
+
+dev_t audit_watch_dev(struct audit_watch *watch)
+{
+	return watch->dev;
+}
+
+/* Initialize a parent watch entry. */
+static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+{
+	struct audit_parent *parent;
+	s32 wd;
+
+	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+	if (unlikely(!parent))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&parent->watches);
+	parent->flags = 0;
+
+	inotify_init_watch(&parent->wdata);
+	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
+	get_inotify_watch(&parent->wdata);
+	wd = inotify_add_watch(audit_ih, &parent->wdata,
+			       ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
+	if (wd < 0) {
+		audit_free_parent(&parent->wdata);
+		return ERR_PTR(wd);
+	}
+
+	return parent;
+}
+
+/* Initialize a watch entry. */
+static struct audit_watch *audit_init_watch(char *path)
+{
+	struct audit_watch *watch;
+
+	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+	if (unlikely(!watch))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&watch->rules);
+	atomic_set(&watch->count, 1);
+	watch->path = path;
+	watch->dev = (dev_t)-1;
+	watch->ino = (unsigned long)-1;
+
+	return watch;
+}
+
+/* Translate a watch string to kernel respresentation. */
+int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
+{
+	struct audit_watch *watch;
+
+	if (!audit_ih)
+		return -EOPNOTSUPP;
+
+	if (path[0] != '/' || path[len-1] == '/' ||
+	    krule->listnr != AUDIT_FILTER_EXIT ||
+	    op != Audit_equal ||
+	    krule->inode_f || krule->watch || krule->tree)
+		return -EINVAL;
+
+	watch = audit_init_watch(path);
+	if (IS_ERR(watch))
+		return PTR_ERR(watch);
+
+	audit_get_watch(watch);
+	krule->watch = watch;
+
+	return 0;
+}
+
+/* Duplicate the given audit watch. The new watch's rules list is initialized
+ * to an empty list and wlist is undefined. */
+static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
+{
+	char *path;
+	struct audit_watch *new;
+
+	path = kstrdup(old->path, GFP_KERNEL);
+	if (unlikely(!path))
+		return ERR_PTR(-ENOMEM);
+
+	new = audit_init_watch(path);
+	if (IS_ERR(new)) {
+		kfree(path);
+		goto out;
+	}
+
+	new->dev = old->dev;
+	new->ino = old->ino;
+	get_inotify_watch(&old->parent->wdata);
+	new->parent = old->parent;
+
+out:
+	return new;
+}
+
+static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
+{
+	if (audit_enabled) {
+		struct audit_buffer *ab;
+		ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+		audit_log_format(ab, "auid=%u ses=%u op=",
+				 audit_get_loginuid(current),
+				 audit_get_sessionid(current));
+		audit_log_string(ab, op);
+		audit_log_format(ab, " path=");
+		audit_log_untrustedstring(ab, w->path);
+		audit_log_key(ab, r->filterkey);
+		audit_log_format(ab, " list=%d res=1", r->listnr);
+		audit_log_end(ab);
+	}
+}
+
+/* Update inode info in audit rules based on filesystem event. */
+static void audit_update_watch(struct audit_parent *parent,
+			       const char *dname, dev_t dev,
+			       unsigned long ino, unsigned invalidating)
+{
+	struct audit_watch *owatch, *nwatch, *nextw;
+	struct audit_krule *r, *nextr;
+	struct audit_entry *oentry, *nentry;
+
+	mutex_lock(&audit_filter_mutex);
+	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
+		if (audit_compare_dname_path(dname, owatch->path, NULL))
+			continue;
+
+		/* If the update involves invalidating rules, do the inode-based
+		 * filtering now, so we don't omit records. */
+		if (invalidating && current->audit_context)
+			audit_filter_inodes(current, current->audit_context);
+
+		nwatch = audit_dupe_watch(owatch);
+		if (IS_ERR(nwatch)) {
+			mutex_unlock(&audit_filter_mutex);
+			audit_panic("error updating watch, skipping");
+			return;
+		}
+		nwatch->dev = dev;
+		nwatch->ino = ino;
+
+		list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
+
+			oentry = container_of(r, struct audit_entry, rule);
+			list_del(&oentry->rule.rlist);
+			list_del_rcu(&oentry->list);
+
+			nentry = audit_dupe_rule(&oentry->rule, nwatch);
+			if (IS_ERR(nentry)) {
+				list_del(&oentry->rule.list);
+				audit_panic("error updating watch, removing");
+			} else {
+				int h = audit_hash_ino((u32)ino);
+				list_add(&nentry->rule.rlist, &nwatch->rules);
+				list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+				list_replace(&oentry->rule.list,
+					     &nentry->rule.list);
+			}
+
+			audit_watch_log_rule_change(r, owatch, "updated rules");
+
+			call_rcu(&oentry->rcu, audit_free_rule_rcu);
+		}
+
+		audit_remove_watch(owatch);
+		goto add_watch_to_parent; /* event applies to a single watch */
+	}
+	mutex_unlock(&audit_filter_mutex);
+	return;
+
+add_watch_to_parent:
+	list_add(&nwatch->wlist, &parent->watches);
+	mutex_unlock(&audit_filter_mutex);
+	return;
+}
+
+/* Remove all watches & rules associated with a parent that is going away. */
+static void audit_remove_parent_watches(struct audit_parent *parent)
+{
+	struct audit_watch *w, *nextw;
+	struct audit_krule *r, *nextr;
+	struct audit_entry *e;
+
+	mutex_lock(&audit_filter_mutex);
+	parent->flags |= AUDIT_PARENT_INVALID;
+	list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
+		list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
+			e = container_of(r, struct audit_entry, rule);
+			audit_watch_log_rule_change(r, w, "remove rule");
+			list_del(&r->rlist);
+			list_del(&r->list);
+			list_del_rcu(&e->list);
+			call_rcu(&e->rcu, audit_free_rule_rcu);
+		}
+		audit_remove_watch(w);
+	}
+	mutex_unlock(&audit_filter_mutex);
+}
+
+/* Unregister inotify watches for parents on in_list.
+ * Generates an IN_IGNORED event. */
+void audit_inotify_unregister(struct list_head *in_list)
+{
+	struct audit_parent *p, *n;
+
+	list_for_each_entry_safe(p, n, in_list, ilist) {
+		list_del(&p->ilist);
+		inotify_rm_watch(audit_ih, &p->wdata);
+		/* the unpin matching the pin in audit_do_del_rule() */
+		unpin_inotify_watch(&p->wdata);
+	}
+}
+
+/* Get path information necessary for adding watches. */
+static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
+{
+	struct nameidata *ndparent, *ndwatch;
+	int err;
+
+	ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
+	if (unlikely(!ndparent))
+		return -ENOMEM;
+
+	ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
+	if (unlikely(!ndwatch)) {
+		kfree(ndparent);
+		return -ENOMEM;
+	}
+
+	err = path_lookup(path, LOOKUP_PARENT, ndparent);
+	if (err) {
+		kfree(ndparent);
+		kfree(ndwatch);
+		return err;
+	}
+
+	err = path_lookup(path, 0, ndwatch);
+	if (err) {
+		kfree(ndwatch);
+		ndwatch = NULL;
+	}
+
+	*ndp = ndparent;
+	*ndw = ndwatch;
+
+	return 0;
+}
+
+/* Release resources used for watch path information. */
+static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
+{
+	if (ndp) {
+		path_put(&ndp->path);
+		kfree(ndp);
+	}
+	if (ndw) {
+		path_put(&ndw->path);
+		kfree(ndw);
+	}
+}
+
+/* Associate the given rule with an existing parent inotify_watch.
+ * Caller must hold audit_filter_mutex. */
+static void audit_add_to_parent(struct audit_krule *krule,
+				struct audit_parent *parent)
+{
+	struct audit_watch *w, *watch = krule->watch;
+	int watch_found = 0;
+
+	list_for_each_entry(w, &parent->watches, wlist) {
+		if (strcmp(watch->path, w->path))
+			continue;
+
+		watch_found = 1;
+
+		/* put krule's and initial refs to temporary watch */
+		audit_put_watch(watch);
+		audit_put_watch(watch);
+
+		audit_get_watch(w);
+		krule->watch = watch = w;
+		break;
+	}
+
+	if (!watch_found) {
+		get_inotify_watch(&parent->wdata);
+		watch->parent = parent;
+
+		list_add(&watch->wlist, &parent->watches);
+	}
+	list_add(&krule->rlist, &watch->rules);
+}
+
+/* Find a matching watch entry, or add this one.
+ * Caller must hold audit_filter_mutex. */
+int audit_add_watch(struct audit_krule *krule)
+{
+	struct audit_watch *watch = krule->watch;
+	struct inotify_watch *i_watch;
+	struct audit_parent *parent;
+	struct nameidata *ndp = NULL, *ndw = NULL;
+	int ret = 0;
+
+	mutex_unlock(&audit_filter_mutex);
+
+	/* Avoid calling path_lookup under audit_filter_mutex. */
+	ret = audit_get_nd(watch->path, &ndp, &ndw);
+	if (ret) {
+		/* caller expects mutex locked */
+		mutex_lock(&audit_filter_mutex);
+		goto error;
+	}
+
+	/* update watch filter fields */
+	if (ndw) {
+		watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
+		watch->ino = ndw->path.dentry->d_inode->i_ino;
+	}
+
+	/* The audit_filter_mutex must not be held during inotify calls because
+	 * we hold it during inotify event callback processing. If an existing
+	 * inotify watch is found, inotify_find_watch() grabs a reference before
+	 * returning.
+	 */
+	if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
+			       &i_watch) < 0) {
+		parent = audit_init_parent(ndp);
+		if (IS_ERR(parent)) {
+			/* caller expects mutex locked */
+			mutex_lock(&audit_filter_mutex);
+			ret = PTR_ERR(parent);
+			goto error;
+		}
+	} else
+		parent = container_of(i_watch, struct audit_parent, wdata);
+
+	mutex_lock(&audit_filter_mutex);
+
+	/* parent was moved before we took audit_filter_mutex */
+	if (parent->flags & AUDIT_PARENT_INVALID)
+		ret = -ENOENT;
+	else
+		audit_add_to_parent(krule, parent);
+
+	/* match get in audit_init_parent or inotify_find_watch */
+	put_inotify_watch(&parent->wdata);
+
+error:
+	audit_put_nd(ndp, ndw);		/* NULL args OK */
+	return ret;
+
+}
+
+void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
+{
+	struct audit_watch *watch = krule->watch;
+	struct audit_parent *parent = watch->parent;
+
+	list_del(&krule->rlist);
+
+	if (list_empty(&watch->rules)) {
+		audit_remove_watch(watch);
+
+		if (list_empty(&parent->watches)) {
+			/* Put parent on the inotify un-registration
+			 * list. Grab a reference before releasing
+			 * audit_filter_mutex, to be released in
+			 * audit_inotify_unregister().
+			 * If filesystem is going away, just leave
+			 * the sucker alone, eviction will take
+			 * care of it. */
+			if (pin_inotify_watch(&parent->wdata))
+				list_add(&parent->ilist, list);
+		}
+	}
+}
+
+/* Update watch data in audit rules based on inotify events. */
+static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
+				u32 cookie, const char *dname, struct inode *inode)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(i_watch, struct audit_parent, wdata);
+
+	if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
+		audit_update_watch(parent, dname, inode->i_sb->s_dev,
+				   inode->i_ino, 0);
+	else if (mask & (IN_DELETE|IN_MOVED_FROM))
+		audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+	/* inotify automatically removes the watch and sends IN_IGNORED */
+	else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
+		audit_remove_parent_watches(parent);
+	/* inotify does not remove the watch, so remove it manually */
+	else if (mask & IN_MOVE_SELF) {
+		audit_remove_parent_watches(parent);
+		inotify_remove_watch_locked(audit_ih, i_watch);
+	} else if (mask & IN_IGNORED)
+		put_inotify_watch(i_watch);
+}
+
+static const struct inotify_operations audit_inotify_ops = {
+	.handle_event	= audit_handle_ievent,
+	.destroy_watch	= audit_free_parent,
+};
+
+static int __init audit_watch_init(void)
+{
+	audit_ih = inotify_init(&audit_inotify_ops);
+	if (IS_ERR(audit_ih))
+		audit_panic("cannot initialize inotify handle");
+	return 0;
+}
+subsys_initcall(audit_watch_init);
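Editorial note, not part of the patch: audit_get_watch()/audit_put_watch() above follow the usual get/put reference-counting discipline, with the creator holding the initial reference (count starts at 1 in audit_init_watch()), each additional holder taking one, and the final put freeing the object. A compilable userspace sketch of the same pattern using C11 atomics, with invented names:

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct watch {
	atomic_int count;	/* reference count, starts at 1 */
	char *path;		/* owned string, freed on last put */
};

static struct watch *watch_new(const char *path)
{
	struct watch *w = calloc(1, sizeof(*w));

	if (!w)
		return NULL;
	atomic_init(&w->count, 1);	/* the creator's initial reference */
	w->path = malloc(strlen(path) + 1);
	if (w->path)
		strcpy(w->path, path);
	return w;
}

static void watch_get(struct watch *w)
{
	atomic_fetch_add(&w->count, 1);
}

static void watch_put(struct watch *w)
{
	/* fetch_sub returns the old value; 1 means we held the last ref */
	if (atomic_fetch_sub(&w->count, 1) == 1) {
		free(w->path);
		free(w);
	}
}

int main(void)
{
	struct watch *w = watch_new("/etc/passwd");

	if (!w)
		return 1;
	watch_get(w);	/* a rule takes a reference, as an audit_krule would */
	watch_put(w);	/* the rule drops it */
	watch_put(w);	/* the creator's put frees the watch */
	return 0;
}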
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a02..a70604047f3c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
-#include <linux/inotify.h>
 #include <linux/security.h>
 #include "audit.h"
 
@@ -44,36 +43,6 @@
  * be written directly provided audit_filter_mutex is held.
  */
 
-/*
- * Reference counting:
- *
- * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
- *	event. Each audit_watch holds a reference to its associated parent.
- *
- * audit_watch: if added to lists, lifetime is from audit_init_watch() to
- *	audit_remove_watch(). Additionally, an audit_watch may exist
- *	temporarily to assist in searching existing filter data. Each
- *	audit_krule holds a reference to its associated watch.
- */
-
-struct audit_parent {
-	struct list_head ilist;	/* entry in inotify registration list */
-	struct list_head watches;	/* associated watches */
-	struct inotify_watch wdata;	/* inotify watch data */
-	unsigned flags;	/* status flags */
-};
-
-/*
- * audit_parent status flags:
- *
- * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
- * a filesystem event to ensure we're adding audit watches to a valid parent.
- * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
- * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
- * we can receive while holding nameidata.
- */
-#define AUDIT_PARENT_INVALID	0x001
-
 /* Audit filter lists, defined in <linux/audit.h> */
 struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
 
 DEFINE_MUTEX(audit_filter_mutex);
 
-/* Inotify events we care about. */
-#define AUDIT_IN_WATCH	IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
-
-void audit_free_parent(struct inotify_watch *i_watch)
-{
-	struct audit_parent *parent;
-
-	parent = container_of(i_watch, struct audit_parent, wdata);
-	WARN_ON(!list_empty(&parent->watches));
-	kfree(parent);
-}
-
-static inline void audit_get_watch(struct audit_watch *watch)
-{
-	atomic_inc(&watch->count);
-}
-
-static void audit_put_watch(struct audit_watch *watch)
-{
-	if (atomic_dec_and_test(&watch->count)) {
-		WARN_ON(watch->parent);
-		WARN_ON(!list_empty(&watch->rules));
-		kfree(watch->path);
-		kfree(watch);
-	}
-}
-
-static void audit_remove_watch(struct audit_watch *watch)
-{
-	list_del(&watch->wlist);
-	put_inotify_watch(&watch->parent->wdata);
-	watch->parent = NULL;
-	audit_put_watch(watch); /* match initial get */
-}
-
 static inline void audit_free_rule(struct audit_entry *e)
 {
 	int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
 	audit_free_rule(e);
 }
 
-/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct nameidata *ndp)
-{
-	struct audit_parent *parent;
-	s32 wd;
-
-	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
-	if (unlikely(!parent))
-		return ERR_PTR(-ENOMEM);
-
-	INIT_LIST_HEAD(&parent->watches);
-	parent->flags = 0;
-
-	inotify_init_watch(&parent->wdata);
-	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
-	get_inotify_watch(&parent->wdata);
-	wd = inotify_add_watch(audit_ih, &parent->wdata,
-			       ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
-	if (wd < 0) {
-		audit_free_parent(&parent->wdata);
-		return ERR_PTR(wd);
-	}
-
-	return parent;
-}
-
-/* Initialize a watch entry. */
-static struct audit_watch *audit_init_watch(char *path)
-{
-	struct audit_watch *watch;
-
-	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
-	if (unlikely(!watch))
-		return ERR_PTR(-ENOMEM);
-
-	INIT_LIST_HEAD(&watch->rules);
-	atomic_set(&watch->count, 1);
-	watch->path = path;
-	watch->dev = (dev_t)-1;
-	watch->ino = (unsigned long)-1;
-
-	return watch;
-}
-
 /* Initialize an audit filterlist entry. */
 static inline struct audit_entry *audit_init_entry(u32 field_count)
 {
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
 	return 0;
 }
 
-/* Translate a watch string to kernel respresentation. */
-static int audit_to_watch(struct audit_krule *krule, char *path, int len,
-			  u32 op)
-{
-	struct audit_watch *watch;
-
-	if (!audit_ih)
-		return -EOPNOTSUPP;
-
-	if (path[0] != '/' || path[len-1] == '/' ||
-	    krule->listnr != AUDIT_FILTER_EXIT ||
-	    op != Audit_equal ||
-	    krule->inode_f || krule->watch || krule->tree)
-		return -EINVAL;
-
-	watch = audit_init_watch(path);
-	if (IS_ERR(watch))
-		return PTR_ERR(watch);
-
-	audit_get_watch(watch);
-	krule->watch = watch;
-
-	return 0;
-}
-
 static __u32 *classes[AUDIT_SYSCALL_CLASSES];
 
 int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
 		break;
 	case AUDIT_WATCH:
 		data->buflen += data->values[i] =
-			audit_pack_string(&bufp, krule->watch->path);
+			audit_pack_string(&bufp,
+					  audit_watch_path(krule->watch));
 		break;
 	case AUDIT_DIR:
 		data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 			return 1;
 		break;
 	case AUDIT_WATCH:
-		if (strcmp(a->watch->path, b->watch->path))
+		if (strcmp(audit_watch_path(a->watch),
+			   audit_watch_path(b->watch)))
 			return 1;
 		break;
 	case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 	return 0;
 }
 
-/* Duplicate the given audit watch. The new watch's rules list is initialized
- * to an empty list and wlist is undefined. */
-static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
-{
-	char *path;
-	struct audit_watch *new;
-
-	path = kstrdup(old->path, GFP_KERNEL);
-	if (unlikely(!path))
-		return ERR_PTR(-ENOMEM);
-
-	new = audit_init_watch(path);
-	if (IS_ERR(new)) {
-		kfree(path);
-		goto out;
-	}
-
-	new->dev = old->dev;
-	new->ino = old->ino;
-	get_inotify_watch(&old->parent->wdata);
-	new->parent = old->parent;
-
-out:
-	return new;
-}
-
 /* Duplicate LSM field information. The lsm_rule is opaque, so must be
  * re-initialized. */
 static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
  * rule with the new rule in the filterlist, then free the old rule.
  * The rlist element is undefined; list manipulations are handled apart from
  * the initial copy. */
-static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
-					   struct audit_watch *watch)
+struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+				    struct audit_watch *watch)
 {
 	u32 fcount = old->field_count;
 	struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
 	return entry;
 }
 
-/* Update inode info in audit rules based on filesystem event. */
-static void audit_update_watch(struct audit_parent *parent,
-			       const char *dname, dev_t dev,
-			       unsigned long ino, unsigned invalidating)
-{
-	struct audit_watch *owatch, *nwatch, *nextw;
-	struct audit_krule *r, *nextr;
-	struct audit_entry *oentry, *nentry;
-
-	mutex_lock(&audit_filter_mutex);
-	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
-		if (audit_compare_dname_path(dname, owatch->path, NULL))
-			continue;
-
-		/* If the update involves invalidating rules, do the inode-based
-		 * filtering now, so we don't omit records. */
-		if (invalidating && current->audit_context)
-			audit_filter_inodes(current, current->audit_context);
-
-		nwatch = audit_dupe_watch(owatch);
-		if (IS_ERR(nwatch)) {
-			mutex_unlock(&audit_filter_mutex);
-			audit_panic("error updating watch, skipping");
-			return;
-		}
-		nwatch->dev = dev;
-		nwatch->ino = ino;
-
-		list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
-
-			oentry = container_of(r, struct audit_entry, rule);
-			list_del(&oentry->rule.rlist);
-			list_del_rcu(&oentry->list);
-
-			nentry = audit_dupe_rule(&oentry->rule, nwatch);
-			if (IS_ERR(nentry)) {
-				list_del(&oentry->rule.list);
-				audit_panic("error updating watch, removing");
-			} else {
-				int h = audit_hash_ino((u32)ino);
-				list_add(&nentry->rule.rlist, &nwatch->rules);
-				list_add_rcu(&nentry->list, &audit_inode_hash[h]);
-				list_replace(&oentry->rule.list,
-					     &nentry->rule.list);
-			}
-
-			call_rcu(&oentry->rcu, audit_free_rule_rcu);
-		}
-
-		if (audit_enabled) {
-			struct audit_buffer *ab;
-			ab = audit_log_start(NULL, GFP_NOFS,
-				AUDIT_CONFIG_CHANGE);
-			audit_log_format(ab, "auid=%u ses=%u",
-				audit_get_loginuid(current),
-				audit_get_sessionid(current));
-			audit_log_format(ab,
-				" op=updated rules specifying path=");
-			audit_log_untrustedstring(ab, owatch->path);
-			audit_log_format(ab, " with dev=%u ino=%lu\n",
-				dev, ino);
-			audit_log_format(ab, " list=%d res=1", r->listnr);
-			audit_log_end(ab);
-		}
-		audit_remove_watch(owatch);
-		goto add_watch_to_parent; /* event applies to a single watch */
-	}
-	mutex_unlock(&audit_filter_mutex);
-	return;
-
-add_watch_to_parent:
-	list_add(&nwatch->wlist, &parent->watches);
-	mutex_unlock(&audit_filter_mutex);
-	return;
-}
-
-/* Remove all watches & rules associated with a parent that is going away. */
-static void audit_remove_parent_watches(struct audit_parent *parent)
-{
-	struct audit_watch *w, *nextw;
-	struct audit_krule *r, *nextr;
-	struct audit_entry *e;
-
-	mutex_lock(&audit_filter_mutex);
-	parent->flags |= AUDIT_PARENT_INVALID;
-	list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
-		list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
-			e = container_of(r, struct audit_entry, rule);
-			if (audit_enabled) {
-				struct audit_buffer *ab;
-				ab = audit_log_start(NULL, GFP_NOFS,
-					AUDIT_CONFIG_CHANGE);
-				audit_log_format(ab, "auid=%u ses=%u",
-					audit_get_loginuid(current),
-					audit_get_sessionid(current));
-				audit_log_format(ab, " op=remove rule path=");
-				audit_log_untrustedstring(ab, w->path);
-				if (r->filterkey) {
-					audit_log_format(ab, " key=");
-					audit_log_untrustedstring(ab,
-							r->filterkey);
-				} else
-					audit_log_format(ab, " key=(null)");
-				audit_log_format(ab, " list=%d res=1",
-					r->listnr);
-				audit_log_end(ab);
-			}
-			list_del(&r->rlist);
-			list_del(&r->list);
-			list_del_rcu(&e->list);
-			call_rcu(&e->rcu, audit_free_rule_rcu);
-		}
-		audit_remove_watch(w);
-	}
-	mutex_unlock(&audit_filter_mutex);
-}
-
-/* Unregister inotify watches for parents on in_list.
- * Generates an IN_IGNORED event. */
-static void audit_inotify_unregister(struct list_head *in_list)
-{
-	struct audit_parent *p, *n;
-
-	list_for_each_entry_safe(p, n, in_list, ilist) {
-		list_del(&p->ilist);
-		inotify_rm_watch(audit_ih, &p->wdata);
-		/* the unpin matching the pin in audit_do_del_rule() */
-		unpin_inotify_watch(&p->wdata);
-	}
-}
-
 /* Find an existing audit rule.
  * Caller must hold audit_filter_mutex to prevent stale rule data. */
 static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
 	return found;
 }
 
-/* Get path information necessary for adding watches. */
-static int audit_get_nd(char *path, struct nameidata **ndp,
-			struct nameidata **ndw)
-{
-	struct nameidata *ndparent, *ndwatch;
-	int err;
-
-	ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
-	if (unlikely(!ndparent))
-		return -ENOMEM;
-
-	ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
-	if (unlikely(!ndwatch)) {
-		kfree(ndparent);
-		return -ENOMEM;
-	}
-
-	err = path_lookup(path, LOOKUP_PARENT, ndparent);
-	if (err) {
-		kfree(ndparent);
-		kfree(ndwatch);
-		return err;
-	}
-
-	err = path_lookup(path, 0, ndwatch);
-	if (err) {
-		kfree(ndwatch);
-		ndwatch = NULL;
-	}
-
-	*ndp = ndparent;
-	*ndw = ndwatch;
-
-	return 0;
-}
-
-/* Release resources used for watch path information. */
-static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
-{
-	if (ndp) {
-		path_put(&ndp->path);
-		kfree(ndp);
-	}
-	if (ndw) {
-		path_put(&ndw->path);
-		kfree(ndw);
-	}
-}
-
-/* Associate the given rule with an existing parent inotify_watch.
- * Caller must hold audit_filter_mutex. */
-static void audit_add_to_parent(struct audit_krule *krule,
-				struct audit_parent *parent)
-{
-	struct audit_watch *w, *watch = krule->watch;
-	int watch_found = 0;
-
-	list_for_each_entry(w, &parent->watches, wlist) {
-		if (strcmp(watch->path, w->path))
-			continue;
-
-		watch_found = 1;
-
-		/* put krule's and initial refs to temporary watch */
-		audit_put_watch(watch);
-		audit_put_watch(watch);
-
-		audit_get_watch(w);
-		krule->watch = watch = w;
-		break;
-	}
-
-	if (!watch_found) {
-		get_inotify_watch(&parent->wdata);
-		watch->parent = parent;
-
-		list_add(&watch->wlist, &parent->watches);
-	}
-	list_add(&krule->rlist, &watch->rules);
-}
-
-/* Find a matching watch entry, or add this one.
- * Caller must hold audit_filter_mutex. */
-static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
-			   struct nameidata *ndw)
-{
-	struct audit_watch *watch = krule->watch;
-	struct inotify_watch *i_watch;
-	struct audit_parent *parent;
-	int ret = 0;
-
-	/* update watch filter fields */
-	if (ndw) {
-		watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
-		watch->ino = ndw->path.dentry->d_inode->i_ino;
-	}
-
-	/* The audit_filter_mutex must not be held during inotify calls because
-	 * we hold it during inotify event callback processing. If an existing
-	 * inotify watch is found, inotify_find_watch() grabs a reference before
-	 * returning.
-	 */
-	mutex_unlock(&audit_filter_mutex);
-
-	if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
-			       &i_watch) < 0) {
-		parent = audit_init_parent(ndp);
-		if (IS_ERR(parent)) {
-			/* caller expects mutex locked */
-			mutex_lock(&audit_filter_mutex);
-			return PTR_ERR(parent);
-		}
-	} else
-		parent = container_of(i_watch, struct audit_parent, wdata);
-
-	mutex_lock(&audit_filter_mutex);
-
-	/* parent was moved before we took audit_filter_mutex */
-	if (parent->flags & AUDIT_PARENT_INVALID)
-		ret = -ENOENT;
-	else
-		audit_add_to_parent(krule, parent);
-
-	/* match get in audit_init_parent or inotify_find_watch */
-	put_inotify_watch(&parent->wdata);
-	return ret;
-}
-
 static u64 prio_low = ~0ULL/2;
 static u64 prio_high = ~0ULL/2 - 1;
 
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
 	struct audit_entry *e;
 	struct audit_watch *watch = entry->rule.watch;
 	struct audit_tree *tree = entry->rule.tree;
-	struct nameidata *ndp = NULL, *ndw = NULL;
 	struct list_head *list;
 	int h, err;
 #ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
 
 	mutex_lock(&audit_filter_mutex);
 	e = audit_find_rule(entry, &list);
-	mutex_unlock(&audit_filter_mutex);
 	if (e) {
+		mutex_unlock(&audit_filter_mutex);
 		err = -EEXIST;
 		/* normally audit_add_tree_rule() will free it on failure */
 		if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
 		goto error;
 	}
 
-	/* Avoid calling path_lookup under audit_filter_mutex. */
-	if (watch) {
-		err = audit_get_nd(watch->path, &ndp, &ndw);
-		if (err)
-			goto error;
-	}
-
-	mutex_lock(&audit_filter_mutex);
 	if (watch) {
 		/* audit_filter_mutex is dropped and re-taken during this call */
-		err = audit_add_watch(&entry->rule, ndp, ndw);
+		err = audit_add_watch(&entry->rule);
 		if (err) {
 			mutex_unlock(&audit_filter_mutex);
 			goto error;
 		}
-		h = audit_hash_ino((u32)watch->ino);
+		/* entry->rule.watch may have changed during audit_add_watch() */
+		watch = entry->rule.watch;
+		h = audit_hash_ino((u32)audit_watch_inode(watch));
 		list = &audit_inode_hash[h];
 	}
 	if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
 #endif
 	mutex_unlock(&audit_filter_mutex);
 
-	audit_put_nd(ndp, ndw);		/* NULL args OK */
 	return 0;
 
 error:
-	audit_put_nd(ndp, ndw);		/* NULL args OK */
 	if (watch)
 		audit_put_watch(watch); /* tmp watch, matches initial get */
 	return err;
@@ -1372,7 +945,7 @@ error:
1372static inline int audit_del_rule(struct audit_entry *entry) 945static inline int audit_del_rule(struct audit_entry *entry)
1373{ 946{
1374 struct audit_entry *e; 947 struct audit_entry *e;
1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 948 struct audit_watch *watch = entry->rule.watch;
1376 struct audit_tree *tree = entry->rule.tree; 949 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list; 950 struct list_head *list;
1378 LIST_HEAD(inotify_list); 951 LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1394 goto out; 967 goto out;
1395 } 968 }
1396 969
1397 watch = e->rule.watch; 970 if (e->rule.watch)
1398 if (watch) { 971 audit_remove_watch_rule(&e->rule, &inotify_list);
1399 struct audit_parent *parent = watch->parent;
1400
1401 list_del(&e->rule.rlist);
1402
1403 if (list_empty(&watch->rules)) {
1404 audit_remove_watch(watch);
1405
1406 if (list_empty(&parent->watches)) {
1407 /* Put parent on the inotify un-registration
1408 * list. Grab a reference before releasing
1409 * audit_filter_mutex, to be released in
1410 * audit_inotify_unregister().
1411 * If filesystem is going away, just leave
1412 * the sucker alone, eviction will take
1413 * care of it.
1414 */
1415 if (pin_inotify_watch(&parent->wdata))
1416 list_add(&parent->ilist, &inotify_list);
1417 }
1418 }
1419 }
1420 972
1421 if (e->rule.tree) 973 if (e->rule.tree)
1422 audit_remove_tree_rule(&e->rule); 974 audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1438 audit_inotify_unregister(&inotify_list); 990 audit_inotify_unregister(&inotify_list);
1439 991
1440out: 992out:
1441 if (tmp_watch) 993 if (watch)
1442 audit_put_watch(tmp_watch); /* match initial get */ 994 audit_put_watch(watch); /* match initial get */
1443 if (tree) 995 if (tree)
1444 audit_put_tree(tree); /* that's the temporary one */ 996 audit_put_tree(tree); /* that's the temporary one */
1445 997
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1527 security_release_secctx(ctx, len); 1079 security_release_secctx(ctx, len);
1528 } 1080 }
1529 } 1081 }
1530 audit_log_format(ab, " op=%s rule key=", action); 1082 audit_log_format(ab, " op=");
1531 if (rule->filterkey) 1083 audit_log_string(ab, action);
1532 audit_log_untrustedstring(ab, rule->filterkey); 1084 audit_log_key(ab, rule->filterkey);
1533 else
1534 audit_log_format(ab, "(null)");
1535 audit_log_format(ab, " list=%d res=%d", rule->listnr, res); 1085 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1536 audit_log_end(ab); 1086 audit_log_end(ab);
1537} 1087}
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1595 return PTR_ERR(entry); 1145 return PTR_ERR(entry);
1596 1146
1597 err = audit_add_rule(entry); 1147 err = audit_add_rule(entry);
1598 audit_log_rule_change(loginuid, sessionid, sid, "add", 1148 audit_log_rule_change(loginuid, sessionid, sid, "add rule",
1599 &entry->rule, !err); 1149 &entry->rule, !err);
1600 1150
1601 if (err) 1151 if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1611 return PTR_ERR(entry); 1161 return PTR_ERR(entry);
1612 1162
1613 err = audit_del_rule(entry); 1163 err = audit_del_rule(entry);
1614 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1164 audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
1615 &entry->rule, !err); 1165 &entry->rule, !err);
1616 1166
1617 audit_free_rule(entry); 1167 audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
1793 list_del(&r->list); 1343 list_del(&r->list);
1794 } else { 1344 } else {
1795 if (watch) { 1345 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules); 1346 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1797 list_del(&r->rlist); 1347 list_del(&r->rlist);
1798 } else if (tree) 1348 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist); 1349 list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
1829 1379
1830 return err; 1380 return err;
1831} 1381}
1832
1833/* Update watch data in audit rules based on inotify events. */
1834void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1835 u32 cookie, const char *dname, struct inode *inode)
1836{
1837 struct audit_parent *parent;
1838
1839 parent = container_of(i_watch, struct audit_parent, wdata);
1840
1841 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1842 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1843 inode->i_ino, 0);
1844 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1845 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1846 /* inotify automatically removes the watch and sends IN_IGNORED */
1847 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1848 audit_remove_parent_watches(parent);
1849 /* inotify does not remove the watch, so remove it manually */
1850 else if(mask & IN_MOVE_SELF) {
1851 audit_remove_parent_watches(parent);
1852 inotify_remove_watch_locked(audit_ih, i_watch);
1853 } else if (mask & IN_IGNORED)
1854 put_inotify_watch(i_watch);
1855}
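
The locking comment in audit_add_watch() above captures the invariant behind much of this refactor: audit_filter_mutex is taken inside the inotify event callback, so it must never be held across a call into inotify. A minimal sketch of that drop-and-retake pattern follows; state_lock, blocking_subsys_call() and add_object() are hypothetical stand-ins, not audit code.

    #include <linux/mutex.h>

    /*
     * Sketch only: state_lock stands in for audit_filter_mutex and
     * blocking_subsys_call() for inotify_find_watch().
     */
    static DEFINE_MUTEX(state_lock);

    extern int blocking_subsys_call(void *obj);     /* hypothetical */

    static int add_object(void *obj)
    {
            int err;

            mutex_lock(&state_lock);
            /* ... look up existing state under the lock ... */

            /*
             * Drop the lock before calling into the subsystem: its event
             * callback takes state_lock, so holding it here can deadlock.
             */
            mutex_unlock(&state_lock);
            err = blocking_subsys_call(obj);
            mutex_lock(&state_lock);        /* callers expect it held on return */

            return err;
    }
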
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f414..68d3c6a0ecd6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count; 201 int tree_count;
202 struct list_head killed_trees;
202 203
203 int type; 204 int type;
204 union { 205 union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
548 } 549 }
549 break; 550 break;
550 case AUDIT_WATCH: 551 case AUDIT_WATCH:
551 if (name && rule->watch->ino != (unsigned long)-1) 552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
552 result = (name->dev == rule->watch->dev && 553 result = (name->dev == audit_watch_dev(rule->watch) &&
553 name->ino == rule->watch->ino); 554 name->ino == audit_watch_inode(rule->watch));
554 break; 555 break;
555 case AUDIT_DIR: 556 case AUDIT_DIR:
556 if (ctx) 557 if (ctx)
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
853 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) 854 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
854 return NULL; 855 return NULL;
855 audit_zero_context(context, state); 856 audit_zero_context(context, state);
857 INIT_LIST_HEAD(&context->killed_trees);
856 return context; 858 return context;
857} 859}
858 860
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1024{ 1026{
1025 char arg_num_len_buf[12]; 1027 char arg_num_len_buf[12];
1026 const char __user *tmp_p = p; 1028 const char __user *tmp_p = p;
1027 /* how many digits are in arg_num? 3 is the length of " a=" */ 1029 /* how many digits are in arg_num? 5 is the length of ' a=""' */
1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1030 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
1029 size_t len, len_left, to_send; 1031 size_t len, len_left, to_send;
1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1032 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
1031 unsigned int i, has_cntl = 0, too_long = 0; 1033 unsigned int i, has_cntl = 0, too_long = 0;
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1137 if (has_cntl) 1139 if (has_cntl)
1138 audit_log_n_hex(*ab, buf, to_send); 1140 audit_log_n_hex(*ab, buf, to_send);
1139 else 1141 else
1140 audit_log_format(*ab, "\"%s\"", buf); 1142 audit_log_string(*ab, buf);
1141 1143
1142 p += to_send; 1144 p += to_send;
1143 len_left -= to_send; 1145 len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1372 1374
1373 1375
1374 audit_log_task_info(ab, tsk); 1376 audit_log_task_info(ab, tsk);
1375 if (context->filterkey) { 1377 audit_log_key(ab, context->filterkey);
1376 audit_log_format(ab, " key=");
1377 audit_log_untrustedstring(ab, context->filterkey);
1378 } else
1379 audit_log_format(ab, " key=(null)");
1380 audit_log_end(ab); 1378 audit_log_end(ab);
1381 1379
1382 for (aux = context->aux; aux; aux = aux->next) { 1380 for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
1549 /* that can happen only if we are called from do_exit() */ 1547 /* that can happen only if we are called from do_exit() */
1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1548 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1551 audit_log_exit(context, tsk); 1549 audit_log_exit(context, tsk);
1550 if (!list_empty(&context->killed_trees))
1551 audit_kill_trees(&context->killed_trees);
1552 1552
1553 audit_free_context(context); 1553 audit_free_context(context);
1554} 1554}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
1692 context->in_syscall = 0; 1692 context->in_syscall = 0;
1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1694 1694
1695 if (!list_empty(&context->killed_trees))
1696 audit_kill_trees(&context->killed_trees);
1697
1695 if (context->previous) { 1698 if (context->previous) {
1696 struct audit_context *new_context = context->previous; 1699 struct audit_context *new_context = context->previous;
1697 context->previous = NULL; 1700 context->previous = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
2525 audit_log_format(ab, " sig=%ld", signr); 2528 audit_log_format(ab, " sig=%ld", signr);
2526 audit_log_end(ab); 2529 audit_log_end(ab);
2527} 2530}
2531
2532struct list_head *audit_killed_trees(void)
2533{
2534 struct audit_context *ctx = current->audit_context;
2535 if (likely(!ctx || !ctx->in_syscall))
2536 return NULL;
2537 return &ctx->killed_trees;
2538}
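
audit_killed_trees() exposes the new per-syscall staging list: trees condemned while a syscall is in flight are queued on the context and reaped by audit_kill_trees() at syscall or task exit, as the hunks above show. A hedged sketch of how pruning code might use the accessor; prune_now() and the trimmed-down audit_tree are illustrative only.

    #include <linux/list.h>

    /* Sketch only: a trimmed-down tree object and a hypothetical fallback. */
    struct audit_tree {
            struct list_head list;
            /* ... */
    };

    extern struct list_head *audit_killed_trees(void);
    extern void prune_now(struct audit_tree *tree); /* hypothetical */

    static void mark_tree_dead(struct audit_tree *tree)
    {
            struct list_head *postponed = audit_killed_trees();

            if (postponed)          /* in a syscall: reaped at syscall exit */
                    list_add(&tree->list, postponed);
            else                    /* no syscall context: prune immediately */
                    prune_now(tree);
    }
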
diff --git a/kernel/exit.c b/kernel/exit.c
index 628d41f0dd54..869dc221733e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h> 15#include <linux/iocontext.h>
17#include <linux/key.h> 16#include <linux/key.h>
18#include <linux/security.h> 17#include <linux/security.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 467746b3f0aa..bd2959228871 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/mnt_namespace.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/mempolicy.h> 21#include <linux/mempolicy.h>
23#include <linux/sem.h> 22#include <linux/sem.h>
diff --git a/kernel/futex.c b/kernel/futex.c
index 80b5ce716596..794c862125fe 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -284,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
284 drop_futex_key_refs(key); 284 drop_futex_key_refs(key);
285} 285}
286 286
287/*
288 * fault_in_user_writeable - fault in user address and verify RW access
289 * @uaddr: pointer to faulting user space address
290 *
291 * Slow path to fix up the fault we just took in the atomic write
292 * access to @uaddr.
293 *
294 * We have no generic implementation of a non-destructive write to the
295 * user address. We know that we faulted inside the pagefault-disabled
296 * atomic section, so we might as well avoid the #PF overhead by
297 * calling get_user_pages() right away.
298 */
299static int fault_in_user_writeable(u32 __user *uaddr)
300{
301 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
302 1, 1, 0, NULL, NULL);
303 return ret < 0 ? ret : 0;
304}
305
287/** 306/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex 307 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in 308 * @hb: the hash bucket the futex_q's reside in
@@ -896,7 +915,6 @@ retry:
896retry_private: 915retry_private:
897 op_ret = futex_atomic_op_inuser(op, uaddr2); 916 op_ret = futex_atomic_op_inuser(op, uaddr2);
898 if (unlikely(op_ret < 0)) { 917 if (unlikely(op_ret < 0)) {
899 u32 dummy;
900 918
901 double_unlock_hb(hb1, hb2); 919 double_unlock_hb(hb1, hb2);
902 920
@@ -914,7 +932,7 @@ retry_private:
914 goto out_put_keys; 932 goto out_put_keys;
915 } 933 }
916 934
917 ret = get_user(dummy, uaddr2); 935 ret = fault_in_user_writeable(uaddr2);
918 if (ret) 936 if (ret)
919 goto out_put_keys; 937 goto out_put_keys;
920 938
@@ -1204,7 +1222,7 @@ retry_private:
1204 double_unlock_hb(hb1, hb2); 1222 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2); 1223 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1); 1224 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2); 1225 ret = fault_in_user_writeable(uaddr2);
1208 if (!ret) 1226 if (!ret)
1209 goto retry; 1227 goto retry;
1210 goto out; 1228 goto out;
@@ -1482,7 +1500,7 @@ retry:
1482handle_fault: 1500handle_fault:
1483 spin_unlock(q->lock_ptr); 1501 spin_unlock(q->lock_ptr);
1484 1502
1485 ret = get_user(uval, uaddr); 1503 ret = fault_in_user_writeable(uaddr);
1486 1504
1487 spin_lock(q->lock_ptr); 1505 spin_lock(q->lock_ptr);
1488 1506
@@ -1807,7 +1825,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1807{ 1825{
1808 struct hrtimer_sleeper timeout, *to = NULL; 1826 struct hrtimer_sleeper timeout, *to = NULL;
1809 struct futex_hash_bucket *hb; 1827 struct futex_hash_bucket *hb;
1810 u32 uval;
1811 struct futex_q q; 1828 struct futex_q q;
1812 int res, ret; 1829 int res, ret;
1813 1830
@@ -1909,16 +1926,9 @@ out:
1909 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1926 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1910 1927
1911uaddr_faulted: 1928uaddr_faulted:
1912 /*
1913 * We have to r/w *(int __user *)uaddr, and we have to modify it
1914 * atomically. Therefore, if we continue to fault after get_user()
1915 * below, we need to handle the fault ourselves, while still holding
1916 * the mmap_sem. This can occur if the uaddr is under contention as
1917 * we have to drop the mmap_sem in order to call get_user().
1918 */
1919 queue_unlock(&q, hb); 1929 queue_unlock(&q, hb);
1920 1930
1921 ret = get_user(uval, uaddr); 1931 ret = fault_in_user_writeable(uaddr);
1922 if (ret) 1932 if (ret)
1923 goto out_put_key; 1933 goto out_put_key;
1924 1934
@@ -2013,17 +2023,10 @@ out:
2013 return ret; 2023 return ret;
2014 2024
2015pi_faulted: 2025pi_faulted:
2016 /*
2017 * We have to r/w *(int __user *)uaddr, and we have to modify it
2018 * atomically. Therefore, if we continue to fault after get_user()
2019 * below, we need to handle the fault ourselves, while still holding
2020 * the mmap_sem. This can occur if the uaddr is under contention as
2021 * we have to drop the mmap_sem in order to call get_user().
2022 */
2023 spin_unlock(&hb->lock); 2026 spin_unlock(&hb->lock);
2024 put_futex_key(fshared, &key); 2027 put_futex_key(fshared, &key);
2025 2028
2026 ret = get_user(uval, uaddr); 2029 ret = fault_in_user_writeable(uaddr);
2027 if (!ret) 2030 if (!ret)
2028 goto retry; 2031 goto retry;
2029 2032
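
Every converted fault site in futex.c now follows the same shape: drop the hash-bucket locks, fault the page in with write access via fault_in_user_writeable(), and retry the atomic user-space operation. A condensed sketch of that retry loop; lock_bucket(), unlock_bucket() and atomic_user_op() are hypothetical stand-ins for the futex internals.

    #include <linux/types.h>
    #include <linux/errno.h>

    /* Hypothetical stand-ins for the futex hash-bucket machinery. */
    extern void lock_bucket(void);
    extern void unlock_bucket(void);
    extern int atomic_user_op(u32 __user *uaddr);   /* may fail with -EFAULT */
    extern int fault_in_user_writeable(u32 __user *uaddr);

    static int futex_op_with_fault_retry(u32 __user *uaddr)
    {
            int ret;

            for (;;) {
                    lock_bucket();
                    ret = atomic_user_op(uaddr);
                    unlock_bucket();

                    if (ret != -EFAULT)
                            return ret;

                    /* Fault the page in with write access, then retry. */
                    ret = fault_in_user_writeable(uaddr);
                    if (ret)
                            return ret;
            }
    }
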
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bfc..385c31a1bdbf 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 27#include <linux/completion.h>
29#include <linux/file.h> 28#include <linux/file.h>
30#include <linux/fdtable.h> 29#include <linux/fdtable.h>
diff --git a/kernel/module.c b/kernel/module.c
index 38928fcaff2b..0a049837008e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2451,9 +2451,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2451 return ret; 2451 return ret;
2452 } 2452 }
2453 if (ret > 0) { 2453 if (ret > 0) {
2454 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " 2454 printk(KERN_WARNING
2455 "it should follow 0/-E convention\n" 2455"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2456 KERN_WARNING "%s: loading module anyway...\n", 2456"%s: loading module anyway...\n",
2457 __func__, mod->name, ret, 2457 __func__, mod->name, ret,
2458 __func__); 2458 __func__);
2459 dump_stack(); 2459 dump_stack();
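
The rewritten warning uses a single loglevel marker and keeps each user-visible line as one unbroken format string, so the message stays greppable in the source. A minimal sketch of the same shape with a made-up module and message:

    #include <linux/kernel.h>

    static void warn_bad_retval(int err)    /* hypothetical example message */
    {
            printk(KERN_WARNING
    "mymod: frobnicate() returned %d, it should follow the 0/-E convention\n"
    "mymod: continuing anyway\n", err);
    }
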
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1a933a221ea4..a641eb753b8c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -236,6 +236,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
236 236
237 list_add_rcu(&counter->event_entry, &ctx->event_list); 237 list_add_rcu(&counter->event_entry, &ctx->event_list);
238 ctx->nr_counters++; 238 ctx->nr_counters++;
239 if (counter->attr.inherit_stat)
240 ctx->nr_stat++;
239} 241}
240 242
241/* 243/*
@@ -250,6 +252,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
250 if (list_empty(&counter->list_entry)) 252 if (list_empty(&counter->list_entry))
251 return; 253 return;
252 ctx->nr_counters--; 254 ctx->nr_counters--;
255 if (counter->attr.inherit_stat)
256 ctx->nr_stat--;
253 257
254 list_del_init(&counter->list_entry); 258 list_del_init(&counter->list_entry);
255 list_del_rcu(&counter->event_entry); 259 list_del_rcu(&counter->event_entry);
@@ -1006,6 +1010,81 @@ static int context_equiv(struct perf_counter_context *ctx1,
1006 && !ctx1->pin_count && !ctx2->pin_count; 1010 && !ctx1->pin_count && !ctx2->pin_count;
1007} 1011}
1008 1012
1013static void __perf_counter_read(void *counter);
1014
1015static void __perf_counter_sync_stat(struct perf_counter *counter,
1016 struct perf_counter *next_counter)
1017{
1018 u64 value;
1019
1020 if (!counter->attr.inherit_stat)
1021 return;
1022
1023 /*
1024 * Update the counter value. We cannot use perf_counter_read()
1025 * here: we are in the middle of a context switch with IRQs
1026 * disabled, which upsets smp_call_function_single(). But we
1027 * know the counter is on the current CPU, so we can read it
1028 * directly instead.
1029 */
1030 switch (counter->state) {
1031 case PERF_COUNTER_STATE_ACTIVE:
1032 __perf_counter_read(counter);
1033 break;
1034
1035 case PERF_COUNTER_STATE_INACTIVE:
1036 update_counter_times(counter);
1037 break;
1038
1039 default:
1040 break;
1041 }
1042
1043 /*
1044 * In order to keep per-task stats reliable we need to flip the counter
1045 * values when we flip the contexts.
1046 */
1047 value = atomic64_read(&next_counter->count);
1048 value = atomic64_xchg(&counter->count, value);
1049 atomic64_set(&next_counter->count, value);
1050
1051 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1052 swap(counter->total_time_running, next_counter->total_time_running);
1053
1054 /*
1055 * Since we swizzled the values, update the user-visible data too.
1056 */
1057 perf_counter_update_userpage(counter);
1058 perf_counter_update_userpage(next_counter);
1059}
1060
1061#define list_next_entry(pos, member) \
1062 list_entry((pos)->member.next, typeof(*(pos)), member)
1063
1064static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1065 struct perf_counter_context *next_ctx)
1066{
1067 struct perf_counter *counter, *next_counter;
1068
1069 if (!ctx->nr_stat)
1070 return;
1071
1072 counter = list_first_entry(&ctx->event_list,
1073 struct perf_counter, event_entry);
1074
1075 next_counter = list_first_entry(&next_ctx->event_list,
1076 struct perf_counter, event_entry);
1077
1078 while (&counter->event_entry != &ctx->event_list &&
1079 &next_counter->event_entry != &next_ctx->event_list) {
1080
1081 __perf_counter_sync_stat(counter, next_counter);
1082
1083 counter = list_next_entry(counter, event_entry);
1084 next_counter = list_next_entry(next_counter, event_entry);
1085 }
1086}
1087
1009/* 1088/*
1010 * Called from scheduler to remove the counters of the current task, 1089 * Called from scheduler to remove the counters of the current task,
1011 * with interrupts disabled. 1090 * with interrupts disabled.
@@ -1061,6 +1140,8 @@ void perf_counter_task_sched_out(struct task_struct *task,
1061 ctx->task = next; 1140 ctx->task = next;
1062 next_ctx->task = task; 1141 next_ctx->task = task;
1063 do_switch = 0; 1142 do_switch = 0;
1143
1144 perf_counter_sync_stat(ctx, next_ctx);
1064 } 1145 }
1065 spin_unlock(&next_ctx->lock); 1146 spin_unlock(&next_ctx->lock);
1066 spin_unlock(&ctx->lock); 1147 spin_unlock(&ctx->lock);
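
inherit_stat keeps per-task counts exact for inherited counters by swapping the parent's and child's values whenever their contexts are swapped, as implemented above. A hedged userspace sketch of requesting the attribute; sys_perf_counter_open() is a placeholder for a raw-syscall wrapper, not a libc function:

    #include <string.h>
    #include <sys/types.h>
    #include <linux/perf_counter.h>

    /* Hypothetical wrapper around the raw perf_counter_open syscall. */
    extern int sys_perf_counter_open(struct perf_counter_attr *attr,
                                     pid_t pid, int cpu, int group_fd,
                                     unsigned long flags);

    int open_inherited_cycle_counter(pid_t pid)
    {
            struct perf_counter_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.inherit = 1;       /* follow the task across fork/clone */
            attr.inherit_stat = 1;  /* swap counts on context switch (this patch) */

            return sys_perf_counter_open(&attr, pid, -1 /* any cpu */, -1, 0);
    }
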
@@ -1348,9 +1429,56 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
1348} 1429}
1349 1430
1350/* 1431/*
1432 * Enable all of a task's counters that have been marked enable-on-exec.
1433 * This expects task == current.
1434 */
1435static void perf_counter_enable_on_exec(struct task_struct *task)
1436{
1437 struct perf_counter_context *ctx;
1438 struct perf_counter *counter;
1439 unsigned long flags;
1440 int enabled = 0;
1441
1442 local_irq_save(flags);
1443 ctx = task->perf_counter_ctxp;
1444 if (!ctx || !ctx->nr_counters)
1445 goto out;
1446
1447 __perf_counter_task_sched_out(ctx);
1448
1449 spin_lock(&ctx->lock);
1450
1451 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1452 if (!counter->attr.enable_on_exec)
1453 continue;
1454 counter->attr.enable_on_exec = 0;
1455 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1456 continue;
1457 counter->state = PERF_COUNTER_STATE_INACTIVE;
1458 counter->tstamp_enabled =
1459 ctx->time - counter->total_time_enabled;
1460 enabled = 1;
1461 }
1462
1463 /*
1464 * Unclone this context if we enabled any counter.
1465 */
1466 if (enabled && ctx->parent_ctx) {
1467 put_ctx(ctx->parent_ctx);
1468 ctx->parent_ctx = NULL;
1469 }
1470
1471 spin_unlock(&ctx->lock);
1472
1473 perf_counter_task_sched_in(task, smp_processor_id());
1474 out:
1475 local_irq_restore(flags);
1476}
1477
1478/*
1351 * Cross CPU call to read the hardware counter 1479 * Cross CPU call to read the hardware counter
1352 */ 1480 */
1353static void __read(void *info) 1481static void __perf_counter_read(void *info)
1354{ 1482{
1355 struct perf_counter *counter = info; 1483 struct perf_counter *counter = info;
1356 struct perf_counter_context *ctx = counter->ctx; 1484 struct perf_counter_context *ctx = counter->ctx;
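
enable_on_exec, added above, lets a counter be created disabled and armed automatically at execve(), so none of the fork/exec plumbing is counted. A sketch of the corresponding attribute setup, again with a hypothetical syscall wrapper:

    #include <string.h>
    #include <sys/types.h>
    #include <linux/perf_counter.h>

    extern int sys_perf_counter_open(struct perf_counter_attr *attr,
                                     pid_t pid, int cpu, int group_fd,
                                     unsigned long flags);  /* hypothetical */

    int open_exec_armed_counter(pid_t child_pid)
    {
            struct perf_counter_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_INSTRUCTIONS;
            attr.disabled = 1;              /* off through fork() and setup... */
            attr.enable_on_exec = 1;        /* ...armed automatically at execve() */

            return sys_perf_counter_open(&attr, child_pid, -1, -1, 0);
    }
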
@@ -1372,7 +1500,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
1372 */ 1500 */
1373 if (counter->state == PERF_COUNTER_STATE_ACTIVE) { 1501 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1374 smp_call_function_single(counter->oncpu, 1502 smp_call_function_single(counter->oncpu,
1375 __read, counter, 1); 1503 __perf_counter_read, counter, 1);
1376 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 1504 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1377 update_counter_times(counter); 1505 update_counter_times(counter);
1378 } 1506 }
@@ -1508,11 +1636,13 @@ static void free_counter(struct perf_counter *counter)
1508{ 1636{
1509 perf_pending_sync(counter); 1637 perf_pending_sync(counter);
1510 1638
1511 atomic_dec(&nr_counters); 1639 if (!counter->parent) {
1512 if (counter->attr.mmap) 1640 atomic_dec(&nr_counters);
1513 atomic_dec(&nr_mmap_counters); 1641 if (counter->attr.mmap)
1514 if (counter->attr.comm) 1642 atomic_dec(&nr_mmap_counters);
1515 atomic_dec(&nr_comm_counters); 1643 if (counter->attr.comm)
1644 atomic_dec(&nr_comm_counters);
1645 }
1516 1646
1517 if (counter->destroy) 1647 if (counter->destroy)
1518 counter->destroy(counter); 1648 counter->destroy(counter);
@@ -1751,6 +1881,14 @@ int perf_counter_task_disable(void)
1751 return 0; 1881 return 0;
1752} 1882}
1753 1883
1884static int perf_counter_index(struct perf_counter *counter)
1885{
1886 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1887 return 0;
1888
1889 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
1890}
1891
1754/* 1892/*
1755 * Callers need to ensure there can be no nesting of this function, otherwise 1893 * Callers need to ensure there can be no nesting of this function, otherwise
1756 * the seqlock logic goes bad. We can not serialize this because the arch 1894 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -1775,11 +1913,17 @@ void perf_counter_update_userpage(struct perf_counter *counter)
1775 preempt_disable(); 1913 preempt_disable();
1776 ++userpg->lock; 1914 ++userpg->lock;
1777 barrier(); 1915 barrier();
1778 userpg->index = counter->hw.idx; 1916 userpg->index = perf_counter_index(counter);
1779 userpg->offset = atomic64_read(&counter->count); 1917 userpg->offset = atomic64_read(&counter->count);
1780 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 1918 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1781 userpg->offset -= atomic64_read(&counter->hw.prev_count); 1919 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1782 1920
1921 userpg->time_enabled = counter->total_time_enabled +
1922 atomic64_read(&counter->child_total_time_enabled);
1923
1924 userpg->time_running = counter->total_time_running +
1925 atomic64_read(&counter->child_total_time_running);
1926
1783 barrier(); 1927 barrier();
1784 ++userpg->lock; 1928 ++userpg->lock;
1785 preempt_enable(); 1929 preempt_enable();
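
With time_enabled and time_running now published in the mmap()ed control page, userspace can scale counts without a read() syscall. The page is versioned by the lock field, which the writer increments before and after each update; a sketch of the matching lock-free reader, using a plain compiler barrier as a stand-in for a proper read barrier:

    #include <linux/perf_counter.h>

    /* Stand-in for an architecture-appropriate read barrier. */
    #define read_barrier()  __asm__ __volatile__("" ::: "memory")

    static void read_counter_times(volatile struct perf_counter_mmap_page *pc,
                                   unsigned long long *enabled,
                                   unsigned long long *running)
    {
            unsigned int seq;

            do {
                    seq = pc->lock;                 /* bumped around each update */
                    read_barrier();
                    *enabled = pc->time_enabled;    /* new in this patch */
                    *running = pc->time_running;    /* new in this patch */
                    read_barrier();
            } while (pc->lock != seq || (seq & 1)); /* retry if update raced */
    }
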
@@ -1876,7 +2020,7 @@ fail:
1876 2020
1877static void perf_mmap_free_page(unsigned long addr) 2021static void perf_mmap_free_page(unsigned long addr)
1878{ 2022{
1879 struct page *page = virt_to_page(addr); 2023 struct page *page = virt_to_page((void *)addr);
1880 2024
1881 page->mapping = NULL; 2025 page->mapping = NULL;
1882 __free_page(page); 2026 __free_page(page);
@@ -2483,15 +2627,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2483 u32 cpu, reserved; 2627 u32 cpu, reserved;
2484 } cpu_entry; 2628 } cpu_entry;
2485 2629
2486 header.type = 0; 2630 header.type = PERF_EVENT_SAMPLE;
2487 header.size = sizeof(header); 2631 header.size = sizeof(header);
2488 2632
2489 header.misc = PERF_EVENT_MISC_OVERFLOW; 2633 header.misc = 0;
2490 header.misc |= perf_misc_flags(data->regs); 2634 header.misc |= perf_misc_flags(data->regs);
2491 2635
2492 if (sample_type & PERF_SAMPLE_IP) { 2636 if (sample_type & PERF_SAMPLE_IP) {
2493 ip = perf_instruction_pointer(data->regs); 2637 ip = perf_instruction_pointer(data->regs);
2494 header.type |= PERF_SAMPLE_IP;
2495 header.size += sizeof(ip); 2638 header.size += sizeof(ip);
2496 } 2639 }
2497 2640
@@ -2500,7 +2643,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2500 tid_entry.pid = perf_counter_pid(counter, current); 2643 tid_entry.pid = perf_counter_pid(counter, current);
2501 tid_entry.tid = perf_counter_tid(counter, current); 2644 tid_entry.tid = perf_counter_tid(counter, current);
2502 2645
2503 header.type |= PERF_SAMPLE_TID;
2504 header.size += sizeof(tid_entry); 2646 header.size += sizeof(tid_entry);
2505 } 2647 }
2506 2648
@@ -2510,34 +2652,25 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2510 */ 2652 */
2511 time = sched_clock(); 2653 time = sched_clock();
2512 2654
2513 header.type |= PERF_SAMPLE_TIME;
2514 header.size += sizeof(u64); 2655 header.size += sizeof(u64);
2515 } 2656 }
2516 2657
2517 if (sample_type & PERF_SAMPLE_ADDR) { 2658 if (sample_type & PERF_SAMPLE_ADDR)
2518 header.type |= PERF_SAMPLE_ADDR;
2519 header.size += sizeof(u64); 2659 header.size += sizeof(u64);
2520 }
2521 2660
2522 if (sample_type & PERF_SAMPLE_ID) { 2661 if (sample_type & PERF_SAMPLE_ID)
2523 header.type |= PERF_SAMPLE_ID;
2524 header.size += sizeof(u64); 2662 header.size += sizeof(u64);
2525 }
2526 2663
2527 if (sample_type & PERF_SAMPLE_CPU) { 2664 if (sample_type & PERF_SAMPLE_CPU) {
2528 header.type |= PERF_SAMPLE_CPU;
2529 header.size += sizeof(cpu_entry); 2665 header.size += sizeof(cpu_entry);
2530 2666
2531 cpu_entry.cpu = raw_smp_processor_id(); 2667 cpu_entry.cpu = raw_smp_processor_id();
2532 } 2668 }
2533 2669
2534 if (sample_type & PERF_SAMPLE_PERIOD) { 2670 if (sample_type & PERF_SAMPLE_PERIOD)
2535 header.type |= PERF_SAMPLE_PERIOD;
2536 header.size += sizeof(u64); 2671 header.size += sizeof(u64);
2537 }
2538 2672
2539 if (sample_type & PERF_SAMPLE_GROUP) { 2673 if (sample_type & PERF_SAMPLE_GROUP) {
2540 header.type |= PERF_SAMPLE_GROUP;
2541 header.size += sizeof(u64) + 2674 header.size += sizeof(u64) +
2542 counter->nr_siblings * sizeof(group_entry); 2675 counter->nr_siblings * sizeof(group_entry);
2543 } 2676 }
@@ -2547,10 +2680,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2547 2680
2548 if (callchain) { 2681 if (callchain) {
2549 callchain_size = (1 + callchain->nr) * sizeof(u64); 2682 callchain_size = (1 + callchain->nr) * sizeof(u64);
2550
2551 header.type |= PERF_SAMPLE_CALLCHAIN;
2552 header.size += callchain_size; 2683 header.size += callchain_size;
2553 } 2684 } else
2685 header.size += sizeof(u64);
2554 } 2686 }
2555 2687
2556 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2688 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2601,13 +2733,79 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2601 } 2733 }
2602 } 2734 }
2603 2735
2604 if (callchain) 2736 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2605 perf_output_copy(&handle, callchain, callchain_size); 2737 if (callchain)
2738 perf_output_copy(&handle, callchain, callchain_size);
2739 else {
2740 u64 nr = 0;
2741 perf_output_put(&handle, nr);
2742 }
2743 }
2606 2744
2607 perf_output_end(&handle); 2745 perf_output_end(&handle);
2608} 2746}
2609 2747
2610/* 2748/*
2749 * read event
2750 */
2751
2752struct perf_read_event {
2753 struct perf_event_header header;
2754
2755 u32 pid;
2756 u32 tid;
2757 u64 value;
2758 u64 format[3];
2759};
2760
2761static void
2762perf_counter_read_event(struct perf_counter *counter,
2763 struct task_struct *task)
2764{
2765 struct perf_output_handle handle;
2766 struct perf_read_event event = {
2767 .header = {
2768 .type = PERF_EVENT_READ,
2769 .misc = 0,
2770 .size = sizeof(event) - sizeof(event.format),
2771 },
2772 .pid = perf_counter_pid(counter, task),
2773 .tid = perf_counter_tid(counter, task),
2774 .value = atomic64_read(&counter->count),
2775 };
2776 int ret, i = 0;
2777
2778 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2779 event.header.size += sizeof(u64);
2780 event.format[i++] = counter->total_time_enabled;
2781 }
2782
2783 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2784 event.header.size += sizeof(u64);
2785 event.format[i++] = counter->total_time_running;
2786 }
2787
2788 if (counter->attr.read_format & PERF_FORMAT_ID) {
2789 u64 id;
2790
2791 event.header.size += sizeof(u64);
2792 if (counter->parent)
2793 id = counter->parent->id;
2794 else
2795 id = counter->id;
2796
2797 event.format[i++] = id;
2798 }
2799
2800 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
2801 if (ret)
2802 return;
2803
2804 perf_output_copy(&handle, &event, event.header.size);
2805 perf_output_end(&handle);
2806}
2807
2808/*
2611 * fork tracking 2809 * fork tracking
2612 */ 2810 */
2613 2811
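
A PERF_EVENT_READ record carries a fixed pid/tid/value payload followed by up to three optional u64s selected by the counter's read_format, exactly as perf_counter_read_event() above emits them. A hedged userspace sketch of decoding one such record:

    #include <stdio.h>
    #include <linux/perf_counter.h>

    /* Layout per perf_counter_read_event() above; the tail is variable. */
    struct read_event {
            struct perf_event_header header;
            __u32 pid, tid;
            __u64 value;
            /* then 0-3 optional __u64s, selected by the counter's read_format */
    };

    static void decode_read_event(const struct read_event *ev, __u64 read_format)
    {
            const __u64 *opt = (const __u64 *)(ev + 1);

            printf("pid=%u tid=%u value=%llu", ev->pid, ev->tid,
                   (unsigned long long)ev->value);
            if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                    printf(" enabled=%llu", (unsigned long long)*opt++);
            if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                    printf(" running=%llu", (unsigned long long)*opt++);
            if (read_format & PERF_FORMAT_ID)
                    printf(" id=%llu", (unsigned long long)*opt++);
            printf("\n");
    }
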
@@ -2798,6 +2996,9 @@ void perf_counter_comm(struct task_struct *task)
2798{ 2996{
2799 struct perf_comm_event comm_event; 2997 struct perf_comm_event comm_event;
2800 2998
2999 if (task->perf_counter_ctxp)
3000 perf_counter_enable_on_exec(task);
3001
2801 if (!atomic_read(&nr_comm_counters)) 3002 if (!atomic_read(&nr_comm_counters))
2802 return; 3003 return;
2803 3004
@@ -3317,8 +3518,8 @@ out:
3317 put_cpu_var(perf_cpu_context); 3518 put_cpu_var(perf_cpu_context);
3318} 3519}
3319 3520
3320void 3521void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3321perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) 3522 struct pt_regs *regs, u64 addr)
3322{ 3523{
3323 struct perf_sample_data data = { 3524 struct perf_sample_data data = {
3324 .regs = regs, 3525 .regs = regs,
@@ -3509,9 +3710,21 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3509} 3710}
3510#endif 3711#endif
3511 3712
3713atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3714
3715static void sw_perf_counter_destroy(struct perf_counter *counter)
3716{
3717 u64 event = counter->attr.config;
3718
3719 WARN_ON(counter->parent);
3720
3721 atomic_dec(&perf_swcounter_enabled[event]);
3722}
3723
3512static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) 3724static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3513{ 3725{
3514 const struct pmu *pmu = NULL; 3726 const struct pmu *pmu = NULL;
3727 u64 event = counter->attr.config;
3515 3728
3516 /* 3729 /*
3517 * Software counters (currently) can't in general distinguish 3730 * Software counters (currently) can't in general distinguish
@@ -3520,7 +3733,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3520 * to be kernel events, and page faults are never hypervisor 3733 * to be kernel events, and page faults are never hypervisor
3521 * events. 3734 * events.
3522 */ 3735 */
3523 switch (counter->attr.config) { 3736 switch (event) {
3524 case PERF_COUNT_SW_CPU_CLOCK: 3737 case PERF_COUNT_SW_CPU_CLOCK:
3525 pmu = &perf_ops_cpu_clock; 3738 pmu = &perf_ops_cpu_clock;
3526 3739
@@ -3541,6 +3754,10 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3541 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 3754 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3542 case PERF_COUNT_SW_CONTEXT_SWITCHES: 3755 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3543 case PERF_COUNT_SW_CPU_MIGRATIONS: 3756 case PERF_COUNT_SW_CPU_MIGRATIONS:
3757 if (!counter->parent) {
3758 atomic_inc(&perf_swcounter_enabled[event]);
3759 counter->destroy = sw_perf_counter_destroy;
3760 }
3544 pmu = &perf_ops_generic; 3761 pmu = &perf_ops_generic;
3545 break; 3762 break;
3546 } 3763 }
@@ -3556,6 +3773,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3556 int cpu, 3773 int cpu,
3557 struct perf_counter_context *ctx, 3774 struct perf_counter_context *ctx,
3558 struct perf_counter *group_leader, 3775 struct perf_counter *group_leader,
3776 struct perf_counter *parent_counter,
3559 gfp_t gfpflags) 3777 gfp_t gfpflags)
3560{ 3778{
3561 const struct pmu *pmu; 3779 const struct pmu *pmu;
@@ -3591,6 +3809,8 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3591 counter->ctx = ctx; 3809 counter->ctx = ctx;
3592 counter->oncpu = -1; 3810 counter->oncpu = -1;
3593 3811
3812 counter->parent = parent_counter;
3813
3594 counter->ns = get_pid_ns(current->nsproxy->pid_ns); 3814 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3595 counter->id = atomic64_inc_return(&perf_counter_id); 3815 counter->id = atomic64_inc_return(&perf_counter_id);
3596 3816
@@ -3648,11 +3868,13 @@ done:
3648 3868
3649 counter->pmu = pmu; 3869 counter->pmu = pmu;
3650 3870
3651 atomic_inc(&nr_counters); 3871 if (!counter->parent) {
3652 if (counter->attr.mmap) 3872 atomic_inc(&nr_counters);
3653 atomic_inc(&nr_mmap_counters); 3873 if (counter->attr.mmap)
3654 if (counter->attr.comm) 3874 atomic_inc(&nr_mmap_counters);
3655 atomic_inc(&nr_comm_counters); 3875 if (counter->attr.comm)
3876 atomic_inc(&nr_comm_counters);
3877 }
3656 3878
3657 return counter; 3879 return counter;
3658} 3880}
@@ -3815,7 +4037,7 @@ SYSCALL_DEFINE5(perf_counter_open,
3815 } 4037 }
3816 4038
3817 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4039 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3818 GFP_KERNEL); 4040 NULL, GFP_KERNEL);
3819 ret = PTR_ERR(counter); 4041 ret = PTR_ERR(counter);
3820 if (IS_ERR(counter)) 4042 if (IS_ERR(counter))
3821 goto err_put_context; 4043 goto err_put_context;
@@ -3881,7 +4103,8 @@ inherit_counter(struct perf_counter *parent_counter,
3881 4103
3882 child_counter = perf_counter_alloc(&parent_counter->attr, 4104 child_counter = perf_counter_alloc(&parent_counter->attr,
3883 parent_counter->cpu, child_ctx, 4105 parent_counter->cpu, child_ctx,
3884 group_leader, GFP_KERNEL); 4106 group_leader, parent_counter,
4107 GFP_KERNEL);
3885 if (IS_ERR(child_counter)) 4108 if (IS_ERR(child_counter))
3886 return child_counter; 4109 return child_counter;
3887 get_ctx(child_ctx); 4110 get_ctx(child_ctx);
@@ -3904,12 +4127,6 @@ inherit_counter(struct perf_counter *parent_counter,
3904 */ 4127 */
3905 add_counter_to_ctx(child_counter, child_ctx); 4128 add_counter_to_ctx(child_counter, child_ctx);
3906 4129
3907 child_counter->parent = parent_counter;
3908 /*
3909 * inherit into child's child as well:
3910 */
3911 child_counter->attr.inherit = 1;
3912
3913 /* 4130 /*
3914 * Get a reference to the parent filp - we will fput it 4131 * Get a reference to the parent filp - we will fput it
3915 * when the child counter exits. This is safe to do because 4132 * when the child counter exits. This is safe to do because
@@ -3953,10 +4170,14 @@ static int inherit_group(struct perf_counter *parent_counter,
3953} 4170}
3954 4171
3955static void sync_child_counter(struct perf_counter *child_counter, 4172static void sync_child_counter(struct perf_counter *child_counter,
3956 struct perf_counter *parent_counter) 4173 struct task_struct *child)
3957{ 4174{
4175 struct perf_counter *parent_counter = child_counter->parent;
3958 u64 child_val; 4176 u64 child_val;
3959 4177
4178 if (child_counter->attr.inherit_stat)
4179 perf_counter_read_event(child_counter, child);
4180
3960 child_val = atomic64_read(&child_counter->count); 4181 child_val = atomic64_read(&child_counter->count);
3961 4182
3962 /* 4183 /*
@@ -3985,7 +4206,8 @@ static void sync_child_counter(struct perf_counter *child_counter,
3985 4206
3986static void 4207static void
3987__perf_counter_exit_task(struct perf_counter *child_counter, 4208__perf_counter_exit_task(struct perf_counter *child_counter,
3988 struct perf_counter_context *child_ctx) 4209 struct perf_counter_context *child_ctx,
4210 struct task_struct *child)
3989{ 4211{
3990 struct perf_counter *parent_counter; 4212 struct perf_counter *parent_counter;
3991 4213
@@ -3999,7 +4221,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter,
3999 * counters need to be zapped - but otherwise linger. 4221 * counters need to be zapped - but otherwise linger.
4000 */ 4222 */
4001 if (parent_counter) { 4223 if (parent_counter) {
4002 sync_child_counter(child_counter, parent_counter); 4224 sync_child_counter(child_counter, child);
4003 free_counter(child_counter); 4225 free_counter(child_counter);
4004 } 4226 }
4005} 4227}
@@ -4061,7 +4283,7 @@ void perf_counter_exit_task(struct task_struct *child)
4061again: 4283again:
4062 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, 4284 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4063 list_entry) 4285 list_entry)
4064 __perf_counter_exit_task(child_counter, child_ctx); 4286 __perf_counter_exit_task(child_counter, child_ctx, child);
4065 4287
4066 /* 4288 /*
4067 * If the last counter was a group counter, it will have appended all 4289 * If the last counter was a group counter, it will have appended all
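
Renaming the slow path to __perf_swcounter_event() and exporting a per-event enable count points at a header-side fast path (outside this diff) that skips the call while no software counter of that type exists anywhere. A sketch of what that wrapper presumably looks like:

    #include <linux/perf_counter.h>

    extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
    extern void __perf_swcounter_event(u32 event, u64 nr, int nmi,
                                       struct pt_regs *regs, u64 addr);

    /* Sketch of the presumed inline wrapper: one atomic read on the fast path. */
    static inline void perf_swcounter_event(u32 event, u64 nr, int nmi,
                                            struct pt_regs *regs, u64 addr)
    {
            if (atomic_read(&perf_swcounter_enabled[event]))
                    __perf_swcounter_event(event, nr, nmi, regs, addr);
    }
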
diff --git a/kernel/pid.c b/kernel/pid.c
index 31310b5d3f50..5fa1db48d8b7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/kmemleak.h>
39 40
40#define pid_hashfn(nr, ns) \ 41#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -512,6 +513,12 @@ void __init pidhash_init(void)
512 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); 513 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
513 if (!pid_hash) 514 if (!pid_hash)
514 panic("Could not alloc pidhash!\n"); 515 panic("Could not alloc pidhash!\n");
516 /*
517 * pid_hash contains references to allocated struct pid objects and it
518 * must be scanned by kmemleak to avoid false positives.
519 */
520 kmemleak_alloc(pid_hash, pidhash_size * sizeof(*(pid_hash)), 0,
521 GFP_KERNEL);
515 for (i = 0; i < pidhash_size; i++) 522 for (i = 0; i < pidhash_size; i++)
516 INIT_HLIST_HEAD(&pid_hash[i]); 523 INIT_HLIST_HEAD(&pid_hash[i]);
517} 524}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61c78b2c07ba..082c320e4dbf 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,8 +181,8 @@ int ptrace_attach(struct task_struct *task)
181 * interference; SUID, SGID and LSM creds get determined differently 181 * interference; SUID, SGID and LSM creds get determined differently
182 * under ptrace. 182 * under ptrace.
183 */ 183 */
184 retval = mutex_lock_interruptible(&task->cred_guard_mutex); 184 retval = -ERESTARTNOINTR;
185 if (retval < 0) 185 if (mutex_lock_interruptible(&task->cred_guard_mutex))
186 goto out; 186 goto out;
187 187
188 task_lock(task); 188 task_lock(task);
diff --git a/kernel/resource.c b/kernel/resource.c
index ac5f3a36923f..78b087221c15 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -787,7 +787,7 @@ static int __init reserve_setup(char *str)
787 static struct resource reserve[MAXRESERVE]; 787 static struct resource reserve[MAXRESERVE];
788 788
789 for (;;) { 789 for (;;) {
790 int io_start, io_num; 790 unsigned int io_start, io_num;
791 int x = reserved; 791 int x = reserved;
792 792
793 if (get_option (&str, &io_start) != 2) 793 if (get_option (&str, &io_start) != 2)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62e4ff9968b5..98e02328c67d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -335,7 +335,10 @@ static struct ctl_table kern_table[] = {
335 .data = &sysctl_timer_migration, 335 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int), 336 .maxlen = sizeof(unsigned int),
337 .mode = 0644, 337 .mode = 0644,
338 .proc_handler = &proc_dointvec, 338 .proc_handler = &proc_dointvec_minmax,
339 .strategy = &sysctl_intvec,
340 .extra1 = &zero,
341 .extra2 = &one,
339 }, 342 },
340#endif 343#endif
341 { 344 {
@@ -744,6 +747,14 @@ static struct ctl_table kern_table[] = {
744 .proc_handler = &proc_dointvec, 747 .proc_handler = &proc_dointvec,
745 }, 748 },
746 { 749 {
750 .ctl_name = CTL_UNNUMBERED,
751 .procname = "panic_on_io_nmi",
752 .data = &panic_on_io_nmi,
753 .maxlen = sizeof(int),
754 .mode = 0644,
755 .proc_handler = &proc_dointvec,
756 },
757 {
747 .ctl_name = KERN_BOOTLOADER_TYPE, 758 .ctl_name = KERN_BOOTLOADER_TYPE,
748 .procname = "bootloader_type", 759 .procname = "bootloader_type",
749 .data = &bootloader_type, 760 .data = &bootloader_type,
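
proc_dointvec_minmax rejects writes outside the [*extra1, *extra2] range; the bounds are pointers to ints, conventionally small statics already defined near the top of sysctl.c. A sketch of a complete 0/1-bounded entry in the same style, built around a hypothetical flag:

    #include <linux/sysctl.h>

    static int example_flag;                /* hypothetical 0/1 tunable */
    static int zero;                        /* sysctl.c already has these */
    static int one = 1;

    static struct ctl_table example_table[] = {
            {
                    .ctl_name       = CTL_UNNUMBERED,
                    .procname       = "example_flag",
                    .data           = &example_flag,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = &proc_dointvec_minmax,
                    .strategy       = &sysctl_intvec,
                    .extra1         = &zero,        /* reject writes below 0 */
                    .extra2         = &one,         /* reject writes above 1 */
            },
            { .ctl_name = 0 }
    };
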
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
96/* 96/*
97 * Collection status, active/inactive: 97 * Collection status, active/inactive:
98 */ 98 */
99static int __read_mostly active; 99int __read_mostly timer_stats_active;
100 100
101/* 101/*
102 * Beginning/end timestamps of measurement: 102 * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
257 if (!active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
260 entry = tstat_lookup(&input, comm); 260 entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
290 /* 290 /*
291 * If still active then calculate up to now: 291 * If still active then calculate up to now:
292 */ 292 */
293 if (active) 293 if (timer_stats_active)
294 time_stop = ktime_get(); 294 time_stop = ktime_get();
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
368 mutex_lock(&show_mutex); 368 mutex_lock(&show_mutex);
369 switch (ctl[0]) { 369 switch (ctl[0]) {
370 case '0': 370 case '0':
371 if (active) { 371 if (timer_stats_active) {
372 active = 0; 372 timer_stats_active = 0;
373 time_stop = ktime_get(); 373 time_stop = ktime_get();
374 sync_access(); 374 sync_access();
375 } 375 }
376 break; 376 break;
377 case '1': 377 case '1':
378 if (!active) { 378 if (!timer_stats_active) {
379 reset_entries(); 379 reset_entries();
380 time_start = ktime_get(); 380 time_start = ktime_get();
381 smp_mb(); 381 smp_mb();
382 active = 1; 382 timer_stats_active = 1;
383 } 383 }
384 break; 384 break;
385 default: 385 default:
diff --git a/kernel/timer.c b/kernel/timer.c
index 54d3912f8cad..0b36b9e5cc8b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
380{ 380{
381 unsigned int flag = 0; 381 unsigned int flag = 0;
382 382
383 if (likely(!timer->start_site))
384 return;
383 if (unlikely(tbase_get_deferrable(timer->base))) 385 if (unlikely(tbase_get_deferrable(timer->base)))
384 flag |= TIMER_STATS_FLAG_DEFERRABLE; 386 flag |= TIMER_STATS_FLAG_DEFERRABLE;
385 387
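
Making the collection flag global as timer_stats_active pairs with the timer.c early-out above: recording and accounting sites can now bail with a single load while collection is off. A sketch of the header-side quick check this presumably enables; the matching include/linux/timer.h hunk is outside this diff:

    #include <linux/timer.h>

    extern int timer_stats_active;
    extern void __timer_stats_timer_set_start_info(struct timer_list *timer,
                                                   void *addr);

    /* Sketch of the presumed quick check at the recording site. */
    static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
    {
            if (likely(!timer_stats_active))
                    return;
            __timer_stats_timer_set_start_info(timer, __builtin_return_address(0));
    }
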