author    Dmitry Torokhov <dmitry.torokhov@gmail.com>    2009-09-14 00:16:56 -0400
committer Dmitry Torokhov <dmitry.torokhov@gmail.com>    2009-09-14 00:16:56 -0400
commit    fc8e1ead9314cf0e0f1922e661428b93d3a50d88
tree      f3cb97c4769b74f6627a59769f1ed5c92a13c58a /kernel
parent    2bcaa6a4238094c5695d5b1943078388d82d3004
parent    9de48cc300fb10f7d9faa978670becf5e352462a

Merge branch 'next' into for-linus
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 7
-rw-r--r--  kernel/acct.c | 6
-rw-r--r--  kernel/audit.c | 146
-rw-r--r--  kernel/audit.h | 43
-rw-r--r--  kernel/audit_tree.c | 72
-rw-r--r--  kernel/audit_watch.c | 543
-rw-r--r--  kernel/auditfilter.c | 518
-rw-r--r--  kernel/auditsc.c | 33
-rw-r--r--  kernel/cgroup.c | 168
-rw-r--r--  kernel/compat.c | 11
-rw-r--r--  kernel/cpu.c | 13
-rw-r--r--  kernel/cpuset.c | 260
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/exit.c | 317
-rw-r--r--  kernel/fork.c | 84
-rw-r--r--  kernel/freezer.c | 7
-rw-r--r--  kernel/futex.c | 1246
-rw-r--r--  kernel/futex_compat.c | 6
-rw-r--r--  kernel/gcov/Kconfig | 48
-rw-r--r--  kernel/gcov/Makefile | 3
-rw-r--r--  kernel/gcov/base.c | 148
-rw-r--r--  kernel/gcov/fs.c | 673
-rw-r--r--  kernel/gcov/gcc_3_4.c | 447
-rw-r--r--  kernel/gcov/gcov.h | 128
-rw-r--r--  kernel/groups.c | 288
-rw-r--r--  kernel/hrtimer.c | 86
-rw-r--r--  kernel/irq/Makefile | 2
-rw-r--r--  kernel/irq/chip.c | 12
-rw-r--r--  kernel/irq/handle.c | 74
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/manage.c | 95
-rw-r--r--  kernel/irq/migration.c | 14
-rw-r--r--  kernel/irq/numa_migrate.c | 42
-rw-r--r--  kernel/kallsyms.c | 134
-rw-r--r--  kernel/kexec.c | 16
-rw-r--r--  kernel/kfifo.c | 4
-rw-r--r--  kernel/kmod.c | 1
-rw-r--r--  kernel/kprobes.c | 8
-rw-r--r--  kernel/kthread.c | 97
-rw-r--r--  kernel/lockdep.c | 16
-rw-r--r--  kernel/lockdep_proc.c | 3
-rw-r--r--  kernel/module.c | 126
-rw-r--r--  kernel/mutex.c | 31
-rw-r--r--  kernel/nsproxy.c | 19
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/params.c | 46
-rw-r--r--  kernel/perf_counter.c | 4860
-rw-r--r--  kernel/pid.c | 17
-rw-r--r--  kernel/pid_namespace.c | 24
-rw-r--r--  kernel/posix-cpu-timers.c | 7
-rw-r--r--  kernel/posix-timers.c | 7
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/Makefile | 5
-rw-r--r--  kernel/power/hibernate.c (renamed from kernel/power/disk.c) | 34
-rw-r--r--  kernel/power/hibernate_nvs.c | 135
-rw-r--r--  kernel/power/main.c | 521
-rw-r--r--  kernel/power/power.h | 25
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/power/snapshot.c | 80
-rw-r--r--  kernel/power/suspend.c | 300
-rw-r--r--  kernel/power/suspend_test.c | 187
-rw-r--r--  kernel/power/swsusp.c | 198
-rw-r--r--  kernel/power/user.c | 1
-rw-r--r--  kernel/printk.c | 33
-rw-r--r--  kernel/profile.c | 19
-rw-r--r--  kernel/ptrace.c | 178
-rw-r--r--  kernel/rcupreempt.c | 8
-rw-r--r--  kernel/rcutree.c | 28
-rw-r--r--  kernel/rcutree_trace.c | 64
-rw-r--r--  kernel/res_counter.c | 12
-rw-r--r--  kernel/resource.c | 2
-rw-r--r--  kernel/rtmutex.c | 248
-rw-r--r--  kernel/rtmutex_common.h | 8
-rw-r--r--  kernel/sched.c | 530
-rw-r--r--  kernel/sched_cpupri.c | 25
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 61
-rw-r--r--  kernel/sched_idletask.c | 3
-rw-r--r--  kernel/sched_rt.c | 18
-rw-r--r--  kernel/signal.c | 119
-rw-r--r--  kernel/slow-work.c | 27
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/softirq.c | 85
-rw-r--r--  kernel/sys.c | 290
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl.c | 93
-rw-r--r--  kernel/time/clockevents.c | 19
-rw-r--r--  kernel/time/clocksource.c | 23
-rw-r--r--  kernel/time/tick-broadcast.c | 9
-rw-r--r--  kernel/time/tick-oneshot.c | 17
-rw-r--r--  kernel/time/tick-sched.c | 19
-rw-r--r--  kernel/time/timekeeping.c | 9
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/time/timer_stats.c | 16
-rw-r--r--  kernel/timer.c | 145
-rw-r--r--  kernel/trace/Kconfig | 167
-rw-r--r--  kernel/trace/Makefile | 20
-rw-r--r--  kernel/trace/blktrace.c | 296
-rw-r--r--  kernel/trace/events.c | 14
-rw-r--r--  kernel/trace/ftrace.c | 901
-rw-r--r--  kernel/trace/kmemtrace.c | 12
-rw-r--r--  kernel/trace/ring_buffer.c | 1043
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 419
-rw-r--r--  kernel/trace/trace.c | 461
-rw-r--r--  kernel/trace/trace.h | 254
-rw-r--r--  kernel/trace/trace_boot.c | 5
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_event_profile.c | 24
-rw-r--r--  kernel/trace/trace_event_types.h | 15
-rw-r--r--  kernel/trace/trace_events.c | 861
-rw-r--r--  kernel/trace/trace_events_filter.c | 1213
-rw-r--r--  kernel/trace/trace_events_stage_1.h | 39
-rw-r--r--  kernel/trace/trace_events_stage_2.h | 176
-rw-r--r--  kernel/trace/trace_events_stage_3.h | 281
-rw-r--r--  kernel/trace/trace_export.c | 110
-rw-r--r--  kernel/trace/trace_functions.c | 13
-rw-r--r--  kernel/trace/trace_functions_graph.c | 78
-rw-r--r--  kernel/trace/trace_hw_branches.c | 203
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 6
-rw-r--r--  kernel/trace/trace_output.c | 239
-rw-r--r--  kernel/trace/trace_output.h | 34
-rw-r--r--  kernel/trace/trace_power.c | 8
-rw-r--r--  kernel/trace/trace_printk.c | 34
-rw-r--r--  kernel/trace/trace_sched_switch.c | 12
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 8
-rw-r--r--  kernel/trace/trace_selftest.c | 58
-rw-r--r--  kernel/trace/trace_stack.c | 26
-rw-r--r--  kernel/trace/trace_stat.c | 232
-rw-r--r--  kernel/trace/trace_stat.h | 2
-rw-r--r--  kernel/trace/trace_sysprof.c | 9
-rw-r--r--  kernel/trace/trace_workqueue.c | 25
-rw-r--r--  kernel/user.c | 67
-rw-r--r--  kernel/utsname.c | 13
-rw-r--r--  kernel/wait.c | 7
-rw-r--r--  kernel/workqueue.c | 11
136 files changed, 16537 insertions(+), 5450 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 42423665660a..2093a691f1c2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
 	    async.o
+obj-y += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
@@ -68,8 +69,9 @@ obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
-obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += kgdb.o
@@ -93,8 +95,11 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa31564162..9f3391090b3e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 static int acct_on(char *name)
 {
 	struct file *file;
+	struct vfsmount *mnt;
 	int error;
 	struct pid_namespace *ns;
 	struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
 		acct = NULL;
 	}
 
-	mnt_pin(file->f_path.mnt);
+	mnt = file->f_path.mnt;
+	mnt_pin(mnt);
 	acct_file_reopen(ns->bacct, file, ns);
 	spin_unlock(&acct_lock);
 
-	mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+	mntput(mnt); /* it's pinned, now give up active reference */
 	kfree(acct);
 
 	return 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba9..defc2e6f1e3b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
115/* The netlink socket. */ 115/* The netlink socket. */
116static struct sock *audit_sock; 116static struct sock *audit_sock;
117 117
118/* Inotify handle. */
119struct inotify_handle *audit_ih;
120
121/* Hash for inode-based rules */ 118/* Hash for inode-based rules */
122struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 119struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
123 120
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
136static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 133static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
137 134
138/* Serialize requests from userspace. */ 135/* Serialize requests from userspace. */
139static DEFINE_MUTEX(audit_cmd_mutex); 136DEFINE_MUTEX(audit_cmd_mutex);
140 137
141/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 138/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
142 * audit records. Since printk uses a 1024 byte buffer, this buffer 139 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
375 kfree_skb(skb); 372 kfree_skb(skb);
376} 373}
377 374
375/*
376 * For one reason or another this nlh isn't getting delivered to the userspace
377 * audit daemon, just send it to printk.
378 */
379static void audit_printk_skb(struct sk_buff *skb)
380{
381 struct nlmsghdr *nlh = nlmsg_hdr(skb);
382 char *data = NLMSG_DATA(nlh);
383
384 if (nlh->nlmsg_type != AUDIT_EOE) {
385 if (printk_ratelimit())
386 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
387 else
388 audit_log_lost("printk limit exceeded\n");
389 }
390
391 audit_hold_skb(skb);
392}
393
378static void kauditd_send_skb(struct sk_buff *skb) 394static void kauditd_send_skb(struct sk_buff *skb)
379{ 395{
380 int err; 396 int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
427 if (skb) { 443 if (skb) {
428 if (audit_pid) 444 if (audit_pid)
429 kauditd_send_skb(skb); 445 kauditd_send_skb(skb);
430 else { 446 else
431 if (printk_ratelimit()) 447 audit_printk_skb(skb);
432 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
433 else
434 audit_log_lost("printk limit exceeded\n");
435
436 audit_hold_skb(skb);
437 }
438 } else { 448 } else {
439 DECLARE_WAITQUEUE(wait, current); 449 DECLARE_WAITQUEUE(wait, current);
440 set_current_state(TASK_INTERRUPTIBLE); 450 set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
495 return 0; 505 return 0;
496} 506}
497 507
498#ifdef CONFIG_AUDIT_TREE
499static int prune_tree_thread(void *unused)
500{
501 mutex_lock(&audit_cmd_mutex);
502 audit_prune_trees();
503 mutex_unlock(&audit_cmd_mutex);
504 return 0;
505}
506
507void audit_schedule_prune(void)
508{
509 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
510}
511#endif
512
513struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
514 int multi, void *payload, int size) 509 int multi, void *payload, int size)
515{ 510{
516 struct sk_buff *skb; 511 struct sk_buff *skb;
517 struct nlmsghdr *nlh; 512 struct nlmsghdr *nlh;
518 int len = NLMSG_SPACE(size);
519 void *data; 513 void *data;
520 int flags = multi ? NLM_F_MULTI : 0; 514 int flags = multi ? NLM_F_MULTI : 0;
521 int t = done ? NLMSG_DONE : type; 515 int t = done ? NLMSG_DONE : type;
522 516
523 skb = alloc_skb(len, GFP_KERNEL); 517 skb = nlmsg_new(size, GFP_KERNEL);
524 if (!skb) 518 if (!skb)
525 return NULL; 519 return NULL;
526 520
527 nlh = NLMSG_PUT(skb, pid, seq, t, size); 521 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
528 nlh->nlmsg_flags = flags; 522 data = NLMSG_DATA(nlh);
529 data = NLMSG_DATA(nlh);
530 memcpy(data, payload, size); 523 memcpy(data, payload, size);
531 return skb; 524 return skb;
532 525
533nlmsg_failure: /* Used by NLMSG_PUT */ 526nlmsg_failure: /* Used by NLMSG_NEW */
534 if (skb) 527 if (skb)
535 kfree_skb(skb); 528 kfree_skb(skb);
536 return NULL; 529 return NULL;
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
926} 919}
927 920
928/* 921/*
929 * Get message from skb (based on rtnetlink_rcv_skb). Each message is 922 * Get message from skb. Each message is processed by audit_receive_msg.
930 * processed by audit_receive_msg. Malformed skbs with wrong length are 923 * Malformed skbs with wrong length are discarded silently.
931 * discarded silently.
932 */ 924 */
933static void audit_receive_skb(struct sk_buff *skb) 925static void audit_receive_skb(struct sk_buff *skb)
934{ 926{
935 int err; 927 struct nlmsghdr *nlh;
936 struct nlmsghdr *nlh; 928 /*
937 u32 rlen; 929 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
930 * if the nlmsg_len was not aligned
931 */
932 int len;
933 int err;
938 934
939 while (skb->len >= NLMSG_SPACE(0)) { 935 nlh = nlmsg_hdr(skb);
940 nlh = nlmsg_hdr(skb); 936 len = skb->len;
941 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) 937
942 return; 938 while (NLMSG_OK(nlh, len)) {
943 rlen = NLMSG_ALIGN(nlh->nlmsg_len); 939 err = audit_receive_msg(skb, nlh);
944 if (rlen > skb->len) 940 /* if err or if this message says it wants a response */
945 rlen = skb->len; 941 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
946 if ((err = audit_receive_msg(skb, nlh))) {
947 netlink_ack(skb, nlh, err); 942 netlink_ack(skb, nlh, err);
948 } else if (nlh->nlmsg_flags & NLM_F_ACK) 943
949 netlink_ack(skb, nlh, 0); 944 nlh = NLMSG_NEXT(nlh, len);
950 skb_pull(skb, rlen);
951 } 945 }
952} 946}
953 947
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
959 mutex_unlock(&audit_cmd_mutex); 953 mutex_unlock(&audit_cmd_mutex);
960} 954}
961 955
962#ifdef CONFIG_AUDITSYSCALL
963static const struct inotify_operations audit_inotify_ops = {
964 .handle_event = audit_handle_ievent,
965 .destroy_watch = audit_free_parent,
966};
967#endif
968
969/* Initialize audit support at boot time. */ 956/* Initialize audit support at boot time. */
970static int __init audit_init(void) 957static int __init audit_init(void)
971{ 958{
@@ -991,12 +978,6 @@ static int __init audit_init(void)
991 978
992 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 979 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
993 980
994#ifdef CONFIG_AUDITSYSCALL
995 audit_ih = inotify_init(&audit_inotify_ops);
996 if (IS_ERR(audit_ih))
997 audit_panic("cannot initialize inotify handle");
998#endif
999
1000 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) 981 for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
1001 INIT_LIST_HEAD(&audit_inode_hash[i]); 982 INIT_LIST_HEAD(&audit_inode_hash[i]);
1002 983
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1070 goto err; 1051 goto err;
1071 } 1052 }
1072 1053
1073 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
1074 if (!ab->skb)
1075 goto err;
1076
1077 ab->ctx = ctx; 1054 ab->ctx = ctx;
1078 ab->gfp_mask = gfp_mask; 1055 ab->gfp_mask = gfp_mask;
1079 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); 1056
1080 nlh->nlmsg_type = type; 1057 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1081 nlh->nlmsg_flags = 0; 1058 if (!ab->skb)
1082 nlh->nlmsg_pid = 0; 1059 goto nlmsg_failure;
1083 nlh->nlmsg_seq = 0; 1060
1061 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
1062
1084 return ab; 1063 return ab;
1064
1065nlmsg_failure: /* Used by NLMSG_NEW */
1066 kfree_skb(ab->skb);
1067 ab->skb = NULL;
1085err: 1068err:
1086 audit_buffer_free(ab); 1069 audit_buffer_free(ab);
1087 return NULL; 1070 return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1452 kfree(pathname); 1435 kfree(pathname);
1453} 1436}
1454 1437
1438void audit_log_key(struct audit_buffer *ab, char *key)
1439{
1440 audit_log_format(ab, " key=");
1441 if (key)
1442 audit_log_untrustedstring(ab, key);
1443 else
1444 audit_log_format(ab, "(null)");
1445}
1446
1455/** 1447/**
1456 * audit_log_end - end one audit record 1448 * audit_log_end - end one audit record
1457 * @ab: the audit_buffer 1449 * @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
1475 skb_queue_tail(&audit_skb_queue, ab->skb); 1467 skb_queue_tail(&audit_skb_queue, ab->skb);
1476 wake_up_interruptible(&kauditd_wait); 1468 wake_up_interruptible(&kauditd_wait);
1477 } else { 1469 } else {
1478 if (nlh->nlmsg_type != AUDIT_EOE) { 1470 audit_printk_skb(ab->skb);
1479 if (printk_ratelimit()) {
1480 printk(KERN_NOTICE "type=%d %s\n",
1481 nlh->nlmsg_type,
1482 ab->skb->data + NLMSG_SPACE(0));
1483 } else
1484 audit_log_lost("printk limit exceeded\n");
1485 }
1486 audit_hold_skb(ab->skb);
1487 } 1471 }
1488 ab->skb = NULL; 1472 ab->skb = NULL;
1489 } 1473 }
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661b..208687be4f30 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
53}; 53};
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_parent; 56struct audit_watch;
57
58struct audit_watch {
59 atomic_t count; /* reference count */
60 char *path; /* insertion path */
61 dev_t dev; /* associated superblock device */
62 unsigned long ino; /* associated inode number */
63 struct audit_parent *parent; /* associated parent */
64 struct list_head wlist; /* entry in parent->watches list */
65 struct list_head rules; /* associated rules */
66};
67
68struct audit_tree; 57struct audit_tree;
69struct audit_chunk; 58struct audit_chunk;
70 59
@@ -108,19 +97,28 @@ struct audit_netlink_list {
108 97
109int audit_send_list(void *); 98int audit_send_list(void *);
110 99
111struct inotify_watch;
112/* Inotify handle */
113extern struct inotify_handle *audit_ih;
114
115extern void audit_free_parent(struct inotify_watch *);
116extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
117 const char *, struct inode *);
118extern int selinux_audit_rule_update(void); 100extern int selinux_audit_rule_update(void);
119 101
120extern struct mutex audit_filter_mutex; 102extern struct mutex audit_filter_mutex;
121extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
122extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
123 105
106/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch);
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule);
113extern void audit_remove_watch(struct audit_watch *watch);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch);
118
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
120 struct audit_watch *watch);
121
124#ifdef CONFIG_AUDIT_TREE 122#ifdef CONFIG_AUDIT_TREE
125extern struct audit_chunk *audit_tree_lookup(const struct inode *); 123extern struct audit_chunk *audit_tree_lookup(const struct inode *);
126extern void audit_put_chunk(struct audit_chunk *); 124extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
130extern int audit_remove_tree_rule(struct audit_krule *); 128extern int audit_remove_tree_rule(struct audit_krule *);
131extern void audit_trim_trees(void); 129extern void audit_trim_trees(void);
132extern int audit_tag_tree(char *old, char *new); 130extern int audit_tag_tree(char *old, char *new);
133extern void audit_schedule_prune(void);
134extern void audit_prune_trees(void);
135extern const char *audit_tree_path(struct audit_tree *); 131extern const char *audit_tree_path(struct audit_tree *);
136extern void audit_put_tree(struct audit_tree *); 132extern void audit_put_tree(struct audit_tree *);
133extern void audit_kill_trees(struct list_head *);
137#else 134#else
138#define audit_remove_tree_rule(rule) BUG() 135#define audit_remove_tree_rule(rule) BUG()
139#define audit_add_tree_rule(rule) -EINVAL 136#define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
142#define audit_put_tree(tree) (void)0 139#define audit_put_tree(tree) (void)0
143#define audit_tag_tree(old, new) -EINVAL 140#define audit_tag_tree(old, new) -EINVAL
144#define audit_tree_path(rule) "" /* never called */ 141#define audit_tree_path(rule) "" /* never called */
142#define audit_kill_trees(list) BUG()
145#endif 143#endif
146 144
147extern char *audit_unpack_string(void **, size_t *, size_t); 145extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
160 return 0; 158 return 0;
161} 159}
162extern void audit_filter_inodes(struct task_struct *, struct audit_context *); 160extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
161extern struct list_head *audit_killed_trees(void);
163#else 162#else
164#define audit_signal_info(s,t) AUDIT_DISABLED 163#define audit_signal_info(s,t) AUDIT_DISABLED
165#define audit_filter_inodes(t,c) AUDIT_DISABLED 164#define audit_filter_inodes(t,c) AUDIT_DISABLED
166#endif 165#endif
166
167extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 6e7351739a82..2451dc6f3282 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h>
5 6
6struct audit_tree; 7struct audit_tree;
7struct audit_chunk; 8struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
441 if (rule->tree) { 442 if (rule->tree) {
442 /* not a half-baked one */ 443 /* not a half-baked one */
443 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 444 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
444 audit_log_format(ab, "op=remove rule dir="); 445 audit_log_format(ab, "op=");
446 audit_log_string(ab, "remove rule");
447 audit_log_format(ab, " dir=");
445 audit_log_untrustedstring(ab, rule->tree->pathname); 448 audit_log_untrustedstring(ab, rule->tree->pathname);
446 if (rule->filterkey) { 449 audit_log_key(ab, rule->filterkey);
447 audit_log_format(ab, " key=");
448 audit_log_untrustedstring(ab, rule->filterkey);
449 } else
450 audit_log_format(ab, " key=(null)");
451 audit_log_format(ab, " list=%d res=1", rule->listnr); 450 audit_log_format(ab, " list=%d res=1", rule->listnr);
452 audit_log_end(ab); 451 audit_log_end(ab);
453 rule->tree = NULL; 452 rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
519 } 518 }
520} 519}
521 520
521static void audit_schedule_prune(void);
522
522/* called with audit_filter_mutex */ 523/* called with audit_filter_mutex */
523int audit_remove_tree_rule(struct audit_krule *rule) 524int audit_remove_tree_rule(struct audit_krule *rule)
524{ 525{
@@ -568,7 +569,7 @@ void audit_trim_trees(void)
568 if (err) 569 if (err)
569 goto skip_it; 570 goto skip_it;
570 571
571 root_mnt = collect_mounts(path.mnt, path.dentry); 572 root_mnt = collect_mounts(&path);
572 path_put(&path); 573 path_put(&path);
573 if (!root_mnt) 574 if (!root_mnt)
574 goto skip_it; 575 goto skip_it;
@@ -660,7 +661,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
660 err = kern_path(tree->pathname, 0, &path); 661 err = kern_path(tree->pathname, 0, &path);
661 if (err) 662 if (err)
662 goto Err; 663 goto Err;
663 mnt = collect_mounts(path.mnt, path.dentry); 664 mnt = collect_mounts(&path);
664 path_put(&path); 665 path_put(&path);
665 if (!mnt) { 666 if (!mnt) {
666 err = -ENOMEM; 667 err = -ENOMEM;
@@ -720,7 +721,7 @@ int audit_tag_tree(char *old, char *new)
720 err = kern_path(new, 0, &path); 721 err = kern_path(new, 0, &path);
721 if (err) 722 if (err)
722 return err; 723 return err;
723 tagged = collect_mounts(path.mnt, path.dentry); 724 tagged = collect_mounts(&path);
724 path_put(&path); 725 path_put(&path);
725 if (!tagged) 726 if (!tagged)
726 return -ENOMEM; 727 return -ENOMEM;
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
824 825
825/* 826/*
826 * That gets run when evict_chunk() ends up needing to kill audit_tree. 827 * That gets run when evict_chunk() ends up needing to kill audit_tree.
827 * Runs from a separate thread, with audit_cmd_mutex held. 828 * Runs from a separate thread.
828 */ 829 */
829void audit_prune_trees(void) 830static int prune_tree_thread(void *unused)
830{ 831{
832 mutex_lock(&audit_cmd_mutex);
831 mutex_lock(&audit_filter_mutex); 833 mutex_lock(&audit_filter_mutex);
832 834
833 while (!list_empty(&prune_list)) { 835 while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
844 } 846 }
845 847
846 mutex_unlock(&audit_filter_mutex); 848 mutex_unlock(&audit_filter_mutex);
849 mutex_unlock(&audit_cmd_mutex);
850 return 0;
851}
852
853static void audit_schedule_prune(void)
854{
855 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
856}
857
858/*
859 * ... and that one is done if evict_chunk() decides to delay until the end
860 * of syscall. Runs synchronously.
861 */
862void audit_kill_trees(struct list_head *list)
863{
864 mutex_lock(&audit_cmd_mutex);
865 mutex_lock(&audit_filter_mutex);
866
867 while (!list_empty(list)) {
868 struct audit_tree *victim;
869
870 victim = list_entry(list->next, struct audit_tree, list);
871 kill_rules(victim);
872 list_del_init(&victim->list);
873
874 mutex_unlock(&audit_filter_mutex);
875
876 prune_one(victim);
877
878 mutex_lock(&audit_filter_mutex);
879 }
880
881 mutex_unlock(&audit_filter_mutex);
882 mutex_unlock(&audit_cmd_mutex);
847} 883}
848 884
849/* 885/*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
854static void evict_chunk(struct audit_chunk *chunk) 890static void evict_chunk(struct audit_chunk *chunk)
855{ 891{
856 struct audit_tree *owner; 892 struct audit_tree *owner;
893 struct list_head *postponed = audit_killed_trees();
894 int need_prune = 0;
857 int n; 895 int n;
858 896
859 if (chunk->dead) 897 if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
869 owner->root = NULL; 907 owner->root = NULL;
870 list_del_init(&owner->same_root); 908 list_del_init(&owner->same_root);
871 spin_unlock(&hash_lock); 909 spin_unlock(&hash_lock);
872 kill_rules(owner); 910 if (!postponed) {
873 list_move(&owner->list, &prune_list); 911 kill_rules(owner);
874 audit_schedule_prune(); 912 list_move(&owner->list, &prune_list);
913 need_prune = 1;
914 } else {
915 list_move(&owner->list, postponed);
916 }
875 spin_lock(&hash_lock); 917 spin_lock(&hash_lock);
876 } 918 }
877 list_del_rcu(&chunk->hash); 919 list_del_rcu(&chunk->hash);
878 for (n = 0; n < chunk->count; n++) 920 for (n = 0; n < chunk->count; n++)
879 list_del_init(&chunk->owners[n].list); 921 list_del_init(&chunk->owners[n].list);
880 spin_unlock(&hash_lock); 922 spin_unlock(&hash_lock);
923 if (need_prune)
924 audit_schedule_prune();
881 mutex_unlock(&audit_filter_mutex); 925 mutex_unlock(&audit_filter_mutex);
882} 926}
883 927
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 000000000000..0e96dbc60ea9
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
1/* audit_watch.c -- watching inodes
2 *
3 * Copyright 2003-2009 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/mutex.h>
26#include <linux/fs.h>
27#include <linux/namei.h>
28#include <linux/netlink.h>
29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h>
32#include "audit.h"
33
34/*
35 * Reference counting:
36 *
37 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
38 * event. Each audit_watch holds a reference to its associated parent.
39 *
40 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
41 * audit_remove_watch(). Additionally, an audit_watch may exist
42 * temporarily to assist in searching existing filter data. Each
43 * audit_krule holds a reference to its associated watch.
44 */
45
46struct audit_watch {
47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */
50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */
53 struct list_head rules; /* associated rules */
54};
55
56struct audit_parent {
57 struct list_head ilist; /* entry in inotify registration list */
58 struct list_head watches; /* associated watches */
59 struct inotify_watch wdata; /* inotify watch data */
60 unsigned flags; /* status flags */
61};
62
63/* Inotify handle. */
64struct inotify_handle *audit_ih;
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Inotify events we care about. */
78#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
79
80static void audit_free_parent(struct inotify_watch *i_watch)
81{
82 struct audit_parent *parent;
83
84 parent = container_of(i_watch, struct audit_parent, wdata);
85 WARN_ON(!list_empty(&parent->watches));
86 kfree(parent);
87}
88
89void audit_get_watch(struct audit_watch *watch)
90{
91 atomic_inc(&watch->count);
92}
93
94void audit_put_watch(struct audit_watch *watch)
95{
96 if (atomic_dec_and_test(&watch->count)) {
97 WARN_ON(watch->parent);
98 WARN_ON(!list_empty(&watch->rules));
99 kfree(watch->path);
100 kfree(watch);
101 }
102}
103
104void audit_remove_watch(struct audit_watch *watch)
105{
106 list_del(&watch->wlist);
107 put_inotify_watch(&watch->parent->wdata);
108 watch->parent = NULL;
109 audit_put_watch(watch); /* match initial get */
110}
111
112char *audit_watch_path(struct audit_watch *watch)
113{
114 return watch->path;
115}
116
117struct list_head *audit_watch_rules(struct audit_watch *watch)
118{
119 return &watch->rules;
120}
121
122unsigned long audit_watch_inode(struct audit_watch *watch)
123{
124 return watch->ino;
125}
126
127dev_t audit_watch_dev(struct audit_watch *watch)
128{
129 return watch->dev;
130}
131
132/* Initialize a parent watch entry. */
133static struct audit_parent *audit_init_parent(struct nameidata *ndp)
134{
135 struct audit_parent *parent;
136 s32 wd;
137
138 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
139 if (unlikely(!parent))
140 return ERR_PTR(-ENOMEM);
141
142 INIT_LIST_HEAD(&parent->watches);
143 parent->flags = 0;
144
145 inotify_init_watch(&parent->wdata);
146 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
147 get_inotify_watch(&parent->wdata);
148 wd = inotify_add_watch(audit_ih, &parent->wdata,
149 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
150 if (wd < 0) {
151 audit_free_parent(&parent->wdata);
152 return ERR_PTR(wd);
153 }
154
155 return parent;
156}
157
158/* Initialize a watch entry. */
159static struct audit_watch *audit_init_watch(char *path)
160{
161 struct audit_watch *watch;
162
163 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
164 if (unlikely(!watch))
165 return ERR_PTR(-ENOMEM);
166
167 INIT_LIST_HEAD(&watch->rules);
168 atomic_set(&watch->count, 1);
169 watch->path = path;
170 watch->dev = (dev_t)-1;
171 watch->ino = (unsigned long)-1;
172
173 return watch;
174}
175
176/* Translate a watch string to kernel respresentation. */
177int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
178{
179 struct audit_watch *watch;
180
181 if (!audit_ih)
182 return -EOPNOTSUPP;
183
184 if (path[0] != '/' || path[len-1] == '/' ||
185 krule->listnr != AUDIT_FILTER_EXIT ||
186 op != Audit_equal ||
187 krule->inode_f || krule->watch || krule->tree)
188 return -EINVAL;
189
190 watch = audit_init_watch(path);
191 if (IS_ERR(watch))
192 return PTR_ERR(watch);
193
194 audit_get_watch(watch);
195 krule->watch = watch;
196
197 return 0;
198}
199
200/* Duplicate the given audit watch. The new watch's rules list is initialized
201 * to an empty list and wlist is undefined. */
202static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
203{
204 char *path;
205 struct audit_watch *new;
206
207 path = kstrdup(old->path, GFP_KERNEL);
208 if (unlikely(!path))
209 return ERR_PTR(-ENOMEM);
210
211 new = audit_init_watch(path);
212 if (IS_ERR(new)) {
213 kfree(path);
214 goto out;
215 }
216
217 new->dev = old->dev;
218 new->ino = old->ino;
219 get_inotify_watch(&old->parent->wdata);
220 new->parent = old->parent;
221
222out:
223 return new;
224}
225
226static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
227{
228 if (audit_enabled) {
229 struct audit_buffer *ab;
230 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
231 audit_log_format(ab, "auid=%u ses=%u op=",
232 audit_get_loginuid(current),
233 audit_get_sessionid(current));
234 audit_log_string(ab, op);
235 audit_log_format(ab, " path=");
236 audit_log_untrustedstring(ab, w->path);
237 audit_log_key(ab, r->filterkey);
238 audit_log_format(ab, " list=%d res=1", r->listnr);
239 audit_log_end(ab);
240 }
241}
242
243/* Update inode info in audit rules based on filesystem event. */
244static void audit_update_watch(struct audit_parent *parent,
245 const char *dname, dev_t dev,
246 unsigned long ino, unsigned invalidating)
247{
248 struct audit_watch *owatch, *nwatch, *nextw;
249 struct audit_krule *r, *nextr;
250 struct audit_entry *oentry, *nentry;
251
252 mutex_lock(&audit_filter_mutex);
253 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
254 if (audit_compare_dname_path(dname, owatch->path, NULL))
255 continue;
256
257 /* If the update involves invalidating rules, do the inode-based
258 * filtering now, so we don't omit records. */
259 if (invalidating && current->audit_context)
260 audit_filter_inodes(current, current->audit_context);
261
262 nwatch = audit_dupe_watch(owatch);
263 if (IS_ERR(nwatch)) {
264 mutex_unlock(&audit_filter_mutex);
265 audit_panic("error updating watch, skipping");
266 return;
267 }
268 nwatch->dev = dev;
269 nwatch->ino = ino;
270
271 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
272
273 oentry = container_of(r, struct audit_entry, rule);
274 list_del(&oentry->rule.rlist);
275 list_del_rcu(&oentry->list);
276
277 nentry = audit_dupe_rule(&oentry->rule, nwatch);
278 if (IS_ERR(nentry)) {
279 list_del(&oentry->rule.list);
280 audit_panic("error updating watch, removing");
281 } else {
282 int h = audit_hash_ino((u32)ino);
283 list_add(&nentry->rule.rlist, &nwatch->rules);
284 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
285 list_replace(&oentry->rule.list,
286 &nentry->rule.list);
287 }
288
289 audit_watch_log_rule_change(r, owatch, "updated rules");
290
291 call_rcu(&oentry->rcu, audit_free_rule_rcu);
292 }
293
294 audit_remove_watch(owatch);
295 goto add_watch_to_parent; /* event applies to a single watch */
296 }
297 mutex_unlock(&audit_filter_mutex);
298 return;
299
300add_watch_to_parent:
301 list_add(&nwatch->wlist, &parent->watches);
302 mutex_unlock(&audit_filter_mutex);
303 return;
304}
305
306/* Remove all watches & rules associated with a parent that is going away. */
307static void audit_remove_parent_watches(struct audit_parent *parent)
308{
309 struct audit_watch *w, *nextw;
310 struct audit_krule *r, *nextr;
311 struct audit_entry *e;
312
313 mutex_lock(&audit_filter_mutex);
314 parent->flags |= AUDIT_PARENT_INVALID;
315 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
316 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
317 e = container_of(r, struct audit_entry, rule);
318 audit_watch_log_rule_change(r, w, "remove rule");
319 list_del(&r->rlist);
320 list_del(&r->list);
321 list_del_rcu(&e->list);
322 call_rcu(&e->rcu, audit_free_rule_rcu);
323 }
324 audit_remove_watch(w);
325 }
326 mutex_unlock(&audit_filter_mutex);
327}
328
329/* Unregister inotify watches for parents on in_list.
330 * Generates an IN_IGNORED event. */
331void audit_inotify_unregister(struct list_head *in_list)
332{
333 struct audit_parent *p, *n;
334
335 list_for_each_entry_safe(p, n, in_list, ilist) {
336 list_del(&p->ilist);
337 inotify_rm_watch(audit_ih, &p->wdata);
338 /* the unpin matching the pin in audit_do_del_rule() */
339 unpin_inotify_watch(&p->wdata);
340 }
341}
342
343/* Get path information necessary for adding watches. */
344static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
345{
346 struct nameidata *ndparent, *ndwatch;
347 int err;
348
349 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
350 if (unlikely(!ndparent))
351 return -ENOMEM;
352
353 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
354 if (unlikely(!ndwatch)) {
355 kfree(ndparent);
356 return -ENOMEM;
357 }
358
359 err = path_lookup(path, LOOKUP_PARENT, ndparent);
360 if (err) {
361 kfree(ndparent);
362 kfree(ndwatch);
363 return err;
364 }
365
366 err = path_lookup(path, 0, ndwatch);
367 if (err) {
368 kfree(ndwatch);
369 ndwatch = NULL;
370 }
371
372 *ndp = ndparent;
373 *ndw = ndwatch;
374
375 return 0;
376}
377
378/* Release resources used for watch path information. */
379static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
380{
381 if (ndp) {
382 path_put(&ndp->path);
383 kfree(ndp);
384 }
385 if (ndw) {
386 path_put(&ndw->path);
387 kfree(ndw);
388 }
389}
390
391/* Associate the given rule with an existing parent inotify_watch.
392 * Caller must hold audit_filter_mutex. */
393static void audit_add_to_parent(struct audit_krule *krule,
394 struct audit_parent *parent)
395{
396 struct audit_watch *w, *watch = krule->watch;
397 int watch_found = 0;
398
399 list_for_each_entry(w, &parent->watches, wlist) {
400 if (strcmp(watch->path, w->path))
401 continue;
402
403 watch_found = 1;
404
405 /* put krule's and initial refs to temporary watch */
406 audit_put_watch(watch);
407 audit_put_watch(watch);
408
409 audit_get_watch(w);
410 krule->watch = watch = w;
411 break;
412 }
413
414 if (!watch_found) {
415 get_inotify_watch(&parent->wdata);
416 watch->parent = parent;
417
418 list_add(&watch->wlist, &parent->watches);
419 }
420 list_add(&krule->rlist, &watch->rules);
421}
422
423/* Find a matching watch entry, or add this one.
424 * Caller must hold audit_filter_mutex. */
425int audit_add_watch(struct audit_krule *krule)
426{
427 struct audit_watch *watch = krule->watch;
428 struct inotify_watch *i_watch;
429 struct audit_parent *parent;
430 struct nameidata *ndp = NULL, *ndw = NULL;
431 int ret = 0;
432
433 mutex_unlock(&audit_filter_mutex);
434
435 /* Avoid calling path_lookup under audit_filter_mutex. */
436 ret = audit_get_nd(watch->path, &ndp, &ndw);
437 if (ret) {
438 /* caller expects mutex locked */
439 mutex_lock(&audit_filter_mutex);
440 goto error;
441 }
442
443 /* update watch filter fields */
444 if (ndw) {
445 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
446 watch->ino = ndw->path.dentry->d_inode->i_ino;
447 }
448
449 /* The audit_filter_mutex must not be held during inotify calls because
450 * we hold it during inotify event callback processing. If an existing
451 * inotify watch is found, inotify_find_watch() grabs a reference before
452 * returning.
453 */
454 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
455 &i_watch) < 0) {
456 parent = audit_init_parent(ndp);
457 if (IS_ERR(parent)) {
458 /* caller expects mutex locked */
459 mutex_lock(&audit_filter_mutex);
460 ret = PTR_ERR(parent);
461 goto error;
462 }
463 } else
464 parent = container_of(i_watch, struct audit_parent, wdata);
465
466 mutex_lock(&audit_filter_mutex);
467
468 /* parent was moved before we took audit_filter_mutex */
469 if (parent->flags & AUDIT_PARENT_INVALID)
470 ret = -ENOENT;
471 else
472 audit_add_to_parent(krule, parent);
473
474 /* match get in audit_init_parent or inotify_find_watch */
475 put_inotify_watch(&parent->wdata);
476
477error:
478 audit_put_nd(ndp, ndw); /* NULL args OK */
479 return ret;
480
481}
482
483void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
484{
485 struct audit_watch *watch = krule->watch;
486 struct audit_parent *parent = watch->parent;
487
488 list_del(&krule->rlist);
489
490 if (list_empty(&watch->rules)) {
491 audit_remove_watch(watch);
492
493 if (list_empty(&parent->watches)) {
494 /* Put parent on the inotify un-registration
495 * list. Grab a reference before releasing
496 * audit_filter_mutex, to be released in
497 * audit_inotify_unregister().
498 * If filesystem is going away, just leave
499 * the sucker alone, eviction will take
500 * care of it. */
501 if (pin_inotify_watch(&parent->wdata))
502 list_add(&parent->ilist, list);
503 }
504 }
505}
506
507/* Update watch data in audit rules based on inotify events. */
508static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
509 u32 cookie, const char *dname, struct inode *inode)
510{
511 struct audit_parent *parent;
512
513 parent = container_of(i_watch, struct audit_parent, wdata);
514
515 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
516 audit_update_watch(parent, dname, inode->i_sb->s_dev,
517 inode->i_ino, 0);
518 else if (mask & (IN_DELETE|IN_MOVED_FROM))
519 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
520 /* inotify automatically removes the watch and sends IN_IGNORED */
521 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
522 audit_remove_parent_watches(parent);
523 /* inotify does not remove the watch, so remove it manually */
524 else if(mask & IN_MOVE_SELF) {
525 audit_remove_parent_watches(parent);
526 inotify_remove_watch_locked(audit_ih, i_watch);
527 } else if (mask & IN_IGNORED)
528 put_inotify_watch(i_watch);
529}
530
531static const struct inotify_operations audit_inotify_ops = {
532 .handle_event = audit_handle_ievent,
533 .destroy_watch = audit_free_parent,
534};
535
536static int __init audit_watch_init(void)
537{
538 audit_ih = inotify_init(&audit_inotify_ops);
539 if (IS_ERR(audit_ih))
540 audit_panic("cannot initialize inotify handle");
541 return 0;
542}
543subsys_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a02..a70604047f3c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h> 30#include <linux/security.h>
32#include "audit.h" 31#include "audit.h"
33 32
@@ -44,36 +43,6 @@
44 * be written directly provided audit_filter_mutex is held. 43 * be written directly provided audit_filter_mutex is held.
45 */ 44 */
46 45
47/*
48 * Reference counting:
49 *
50 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
51 * event. Each audit_watch holds a reference to its associated parent.
52 *
53 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
54 * audit_remove_watch(). Additionally, an audit_watch may exist
55 * temporarily to assist in searching existing filter data. Each
56 * audit_krule holds a reference to its associated watch.
57 */
58
59struct audit_parent {
60 struct list_head ilist; /* entry in inotify registration list */
61 struct list_head watches; /* associated watches */
62 struct inotify_watch wdata; /* inotify watch data */
63 unsigned flags; /* status flags */
64};
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Audit filter lists, defined in <linux/audit.h> */ 46/* Audit filter lists, defined in <linux/audit.h> */
78struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { 47struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
79 LIST_HEAD_INIT(audit_filter_list[0]), 48 LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
97 66
98DEFINE_MUTEX(audit_filter_mutex); 67DEFINE_MUTEX(audit_filter_mutex);
99 68
100/* Inotify events we care about. */
101#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
102
103void audit_free_parent(struct inotify_watch *i_watch)
104{
105 struct audit_parent *parent;
106
107 parent = container_of(i_watch, struct audit_parent, wdata);
108 WARN_ON(!list_empty(&parent->watches));
109 kfree(parent);
110}
111
112static inline void audit_get_watch(struct audit_watch *watch)
113{
114 atomic_inc(&watch->count);
115}
116
117static void audit_put_watch(struct audit_watch *watch)
118{
119 if (atomic_dec_and_test(&watch->count)) {
120 WARN_ON(watch->parent);
121 WARN_ON(!list_empty(&watch->rules));
122 kfree(watch->path);
123 kfree(watch);
124 }
125}
126
127static void audit_remove_watch(struct audit_watch *watch)
128{
129 list_del(&watch->wlist);
130 put_inotify_watch(&watch->parent->wdata);
131 watch->parent = NULL;
132 audit_put_watch(watch); /* match initial get */
133}
134
135static inline void audit_free_rule(struct audit_entry *e) 69static inline void audit_free_rule(struct audit_entry *e)
136{ 70{
137 int i; 71 int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
156 audit_free_rule(e); 90 audit_free_rule(e);
157} 91}
158 92
159/* Initialize a parent watch entry. */
160static struct audit_parent *audit_init_parent(struct nameidata *ndp)
161{
162 struct audit_parent *parent;
163 s32 wd;
164
165 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
166 if (unlikely(!parent))
167 return ERR_PTR(-ENOMEM);
168
169 INIT_LIST_HEAD(&parent->watches);
170 parent->flags = 0;
171
172 inotify_init_watch(&parent->wdata);
173 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
174 get_inotify_watch(&parent->wdata);
175 wd = inotify_add_watch(audit_ih, &parent->wdata,
176 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
177 if (wd < 0) {
178 audit_free_parent(&parent->wdata);
179 return ERR_PTR(wd);
180 }
181
182 return parent;
183}
184
185/* Initialize a watch entry. */
186static struct audit_watch *audit_init_watch(char *path)
187{
188 struct audit_watch *watch;
189
190 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
191 if (unlikely(!watch))
192 return ERR_PTR(-ENOMEM);
193
194 INIT_LIST_HEAD(&watch->rules);
195 atomic_set(&watch->count, 1);
196 watch->path = path;
197 watch->dev = (dev_t)-1;
198 watch->ino = (unsigned long)-1;
199
200 return watch;
201}
202
203/* Initialize an audit filterlist entry. */ 93/* Initialize an audit filterlist entry. */
204static inline struct audit_entry *audit_init_entry(u32 field_count) 94static inline struct audit_entry *audit_init_entry(u32 field_count)
205{ 95{
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
260 return 0; 150 return 0;
261} 151}
262 152
263/* Translate a watch string to kernel respresentation. */
264static int audit_to_watch(struct audit_krule *krule, char *path, int len,
265 u32 op)
266{
267 struct audit_watch *watch;
268
269 if (!audit_ih)
270 return -EOPNOTSUPP;
271
272 if (path[0] != '/' || path[len-1] == '/' ||
273 krule->listnr != AUDIT_FILTER_EXIT ||
274 op != Audit_equal ||
275 krule->inode_f || krule->watch || krule->tree)
276 return -EINVAL;
277
278 watch = audit_init_watch(path);
279 if (IS_ERR(watch))
280 return PTR_ERR(watch);
281
282 audit_get_watch(watch);
283 krule->watch = watch;
284
285 return 0;
286}
287
288static __u32 *classes[AUDIT_SYSCALL_CLASSES]; 153static __u32 *classes[AUDIT_SYSCALL_CLASSES];
289 154
290int __init audit_register_class(int class, unsigned *list) 155int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
766 break; 631 break;
767 case AUDIT_WATCH: 632 case AUDIT_WATCH:
768 data->buflen += data->values[i] = 633 data->buflen += data->values[i] =
769 audit_pack_string(&bufp, krule->watch->path); 634 audit_pack_string(&bufp,
635 audit_watch_path(krule->watch));
770 break; 636 break;
771 case AUDIT_DIR: 637 case AUDIT_DIR:
772 data->buflen += data->values[i] = 638 data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
818 return 1; 684 return 1;
819 break; 685 break;
820 case AUDIT_WATCH: 686 case AUDIT_WATCH:
821 if (strcmp(a->watch->path, b->watch->path)) 687 if (strcmp(audit_watch_path(a->watch),
688 audit_watch_path(b->watch)))
822 return 1; 689 return 1;
823 break; 690 break;
824 case AUDIT_DIR: 691 case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
844 return 0; 711 return 0;
845} 712}
846 713
847/* Duplicate the given audit watch. The new watch's rules list is initialized
848 * to an empty list and wlist is undefined. */
849static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
850{
851 char *path;
852 struct audit_watch *new;
853
854 path = kstrdup(old->path, GFP_KERNEL);
855 if (unlikely(!path))
856 return ERR_PTR(-ENOMEM);
857
858 new = audit_init_watch(path);
859 if (IS_ERR(new)) {
860 kfree(path);
861 goto out;
862 }
863
864 new->dev = old->dev;
865 new->ino = old->ino;
866 get_inotify_watch(&old->parent->wdata);
867 new->parent = old->parent;
868
869out:
870 return new;
871}
872
873/* Duplicate LSM field information. The lsm_rule is opaque, so must be 714/* Duplicate LSM field information. The lsm_rule is opaque, so must be
874 * re-initialized. */ 715 * re-initialized. */
875static inline int audit_dupe_lsm_field(struct audit_field *df, 716static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
904 * rule with the new rule in the filterlist, then free the old rule. 745 * rule with the new rule in the filterlist, then free the old rule.
905 * The rlist element is undefined; list manipulations are handled apart from 746 * The rlist element is undefined; list manipulations are handled apart from
906 * the initial copy. */ 747 * the initial copy. */
907static struct audit_entry *audit_dupe_rule(struct audit_krule *old, 748struct audit_entry *audit_dupe_rule(struct audit_krule *old,
908 struct audit_watch *watch) 749 struct audit_watch *watch)
909{ 750{
910 u32 fcount = old->field_count; 751 u32 fcount = old->field_count;
911 struct audit_entry *entry; 752 struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
977 return entry; 818 return entry;
978} 819}
979 820
980/* Update inode info in audit rules based on filesystem event. */
981static void audit_update_watch(struct audit_parent *parent,
982 const char *dname, dev_t dev,
983 unsigned long ino, unsigned invalidating)
984{
985 struct audit_watch *owatch, *nwatch, *nextw;
986 struct audit_krule *r, *nextr;
987 struct audit_entry *oentry, *nentry;
988
989 mutex_lock(&audit_filter_mutex);
990 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
991 if (audit_compare_dname_path(dname, owatch->path, NULL))
992 continue;
993
994 /* If the update involves invalidating rules, do the inode-based
995 * filtering now, so we don't omit records. */
996 if (invalidating && current->audit_context)
997 audit_filter_inodes(current, current->audit_context);
998
999 nwatch = audit_dupe_watch(owatch);
1000 if (IS_ERR(nwatch)) {
1001 mutex_unlock(&audit_filter_mutex);
1002 audit_panic("error updating watch, skipping");
1003 return;
1004 }
1005 nwatch->dev = dev;
1006 nwatch->ino = ino;
1007
1008 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
1009
1010 oentry = container_of(r, struct audit_entry, rule);
1011 list_del(&oentry->rule.rlist);
1012 list_del_rcu(&oentry->list);
1013
1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1017 audit_panic("error updating watch, removing");
1018 } else {
1019 int h = audit_hash_ino((u32)ino);
1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1024 }
1025
1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
1027 }
1028
1029 if (audit_enabled) {
1030 struct audit_buffer *ab;
1031 ab = audit_log_start(NULL, GFP_NOFS,
1032 AUDIT_CONFIG_CHANGE);
1033 audit_log_format(ab, "auid=%u ses=%u",
1034 audit_get_loginuid(current),
1035 audit_get_sessionid(current));
1036 audit_log_format(ab,
1037 " op=updated rules specifying path=");
1038 audit_log_untrustedstring(ab, owatch->path);
1039 audit_log_format(ab, " with dev=%u ino=%lu\n",
1040 dev, ino);
1041 audit_log_format(ab, " list=%d res=1", r->listnr);
1042 audit_log_end(ab);
1043 }
1044 audit_remove_watch(owatch);
1045 goto add_watch_to_parent; /* event applies to a single watch */
1046 }
1047 mutex_unlock(&audit_filter_mutex);
1048 return;
1049
1050add_watch_to_parent:
1051 list_add(&nwatch->wlist, &parent->watches);
1052 mutex_unlock(&audit_filter_mutex);
1053 return;
1054}
1055
1056/* Remove all watches & rules associated with a parent that is going away. */
1057static void audit_remove_parent_watches(struct audit_parent *parent)
1058{
1059 struct audit_watch *w, *nextw;
1060 struct audit_krule *r, *nextr;
1061 struct audit_entry *e;
1062
1063 mutex_lock(&audit_filter_mutex);
1064 parent->flags |= AUDIT_PARENT_INVALID;
1065 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
1066 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
1067 e = container_of(r, struct audit_entry, rule);
1068 if (audit_enabled) {
1069 struct audit_buffer *ab;
1070 ab = audit_log_start(NULL, GFP_NOFS,
1071 AUDIT_CONFIG_CHANGE);
1072 audit_log_format(ab, "auid=%u ses=%u",
1073 audit_get_loginuid(current),
1074 audit_get_sessionid(current));
1075 audit_log_format(ab, " op=remove rule path=");
1076 audit_log_untrustedstring(ab, w->path);
1077 if (r->filterkey) {
1078 audit_log_format(ab, " key=");
1079 audit_log_untrustedstring(ab,
1080 r->filterkey);
1081 } else
1082 audit_log_format(ab, " key=(null)");
1083 audit_log_format(ab, " list=%d res=1",
1084 r->listnr);
1085 audit_log_end(ab);
1086 }
1087 list_del(&r->rlist);
1088 list_del(&r->list);
1089 list_del_rcu(&e->list);
1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1091 }
1092 audit_remove_watch(w);
1093 }
1094 mutex_unlock(&audit_filter_mutex);
1095}
1096
1097/* Unregister inotify watches for parents on in_list.
1098 * Generates an IN_IGNORED event. */
1099static void audit_inotify_unregister(struct list_head *in_list)
1100{
1101 struct audit_parent *p, *n;
1102
1103 list_for_each_entry_safe(p, n, in_list, ilist) {
1104 list_del(&p->ilist);
1105 inotify_rm_watch(audit_ih, &p->wdata);
1106 /* the unpin matching the pin in audit_do_del_rule() */
1107 unpin_inotify_watch(&p->wdata);
1108 }
1109}
1110
1111/* Find an existing audit rule. 821/* Find an existing audit rule.
1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 822 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1113static struct audit_entry *audit_find_rule(struct audit_entry *entry, 823static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
1145 return found; 855 return found;
1146} 856}
1147 857
1148/* Get path information necessary for adding watches. */
1149static int audit_get_nd(char *path, struct nameidata **ndp,
1150 struct nameidata **ndw)
1151{
1152 struct nameidata *ndparent, *ndwatch;
1153 int err;
1154
1155 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
1156 if (unlikely(!ndparent))
1157 return -ENOMEM;
1158
1159 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
1160 if (unlikely(!ndwatch)) {
1161 kfree(ndparent);
1162 return -ENOMEM;
1163 }
1164
1165 err = path_lookup(path, LOOKUP_PARENT, ndparent);
1166 if (err) {
1167 kfree(ndparent);
1168 kfree(ndwatch);
1169 return err;
1170 }
1171
1172 err = path_lookup(path, 0, ndwatch);
1173 if (err) {
1174 kfree(ndwatch);
1175 ndwatch = NULL;
1176 }
1177
1178 *ndp = ndparent;
1179 *ndw = ndwatch;
1180
1181 return 0;
1182}
1183
1184/* Release resources used for watch path information. */
1185static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
1186{
1187 if (ndp) {
1188 path_put(&ndp->path);
1189 kfree(ndp);
1190 }
1191 if (ndw) {
1192 path_put(&ndw->path);
1193 kfree(ndw);
1194 }
1195}
1196
1197/* Associate the given rule with an existing parent inotify_watch.
1198 * Caller must hold audit_filter_mutex. */
1199static void audit_add_to_parent(struct audit_krule *krule,
1200 struct audit_parent *parent)
1201{
1202 struct audit_watch *w, *watch = krule->watch;
1203 int watch_found = 0;
1204
1205 list_for_each_entry(w, &parent->watches, wlist) {
1206 if (strcmp(watch->path, w->path))
1207 continue;
1208
1209 watch_found = 1;
1210
1211 /* put krule's and initial refs to temporary watch */
1212 audit_put_watch(watch);
1213 audit_put_watch(watch);
1214
1215 audit_get_watch(w);
1216 krule->watch = watch = w;
1217 break;
1218 }
1219
1220 if (!watch_found) {
1221 get_inotify_watch(&parent->wdata);
1222 watch->parent = parent;
1223
1224 list_add(&watch->wlist, &parent->watches);
1225 }
1226 list_add(&krule->rlist, &watch->rules);
1227}
1228
1229/* Find a matching watch entry, or add this one.
1230 * Caller must hold audit_filter_mutex. */
1231static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1232 struct nameidata *ndw)
1233{
1234 struct audit_watch *watch = krule->watch;
1235 struct inotify_watch *i_watch;
1236 struct audit_parent *parent;
1237 int ret = 0;
1238
1239 /* update watch filter fields */
1240 if (ndw) {
1241 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
1242 watch->ino = ndw->path.dentry->d_inode->i_ino;
1243 }
1244
1245 /* The audit_filter_mutex must not be held during inotify calls because
1246 * we hold it during inotify event callback processing. If an existing
1247 * inotify watch is found, inotify_find_watch() grabs a reference before
1248 * returning.
1249 */
1250 mutex_unlock(&audit_filter_mutex);
1251
1252 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
1253 &i_watch) < 0) {
1254 parent = audit_init_parent(ndp);
1255 if (IS_ERR(parent)) {
1256 /* caller expects mutex locked */
1257 mutex_lock(&audit_filter_mutex);
1258 return PTR_ERR(parent);
1259 }
1260 } else
1261 parent = container_of(i_watch, struct audit_parent, wdata);
1262
1263 mutex_lock(&audit_filter_mutex);
1264
1265 /* parent was moved before we took audit_filter_mutex */
1266 if (parent->flags & AUDIT_PARENT_INVALID)
1267 ret = -ENOENT;
1268 else
1269 audit_add_to_parent(krule, parent);
1270
1271 /* match get in audit_init_parent or inotify_find_watch */
1272 put_inotify_watch(&parent->wdata);
1273 return ret;
1274}
1275
1276static u64 prio_low = ~0ULL/2; 858static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1; 859static u64 prio_high = ~0ULL/2 - 1;
1278 860
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
1282 struct audit_entry *e; 864 struct audit_entry *e;
1283 struct audit_watch *watch = entry->rule.watch; 865 struct audit_watch *watch = entry->rule.watch;
1284 struct audit_tree *tree = entry->rule.tree; 866 struct audit_tree *tree = entry->rule.tree;
1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list; 867 struct list_head *list;
1287 int h, err; 868 int h, err;
1288#ifdef CONFIG_AUDITSYSCALL 869#ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
1296 877
1297 mutex_lock(&audit_filter_mutex); 878 mutex_lock(&audit_filter_mutex);
1298 e = audit_find_rule(entry, &list); 879 e = audit_find_rule(entry, &list);
1299 mutex_unlock(&audit_filter_mutex);
1300 if (e) { 880 if (e) {
881 mutex_unlock(&audit_filter_mutex);
1301 err = -EEXIST; 882 err = -EEXIST;
1302 /* normally audit_add_tree_rule() will free it on failure */ 883 /* normally audit_add_tree_rule() will free it on failure */
1303 if (tree) 884 if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
1305 goto error; 886 goto error;
1306 } 887 }
1307 888
1308 /* Avoid calling path_lookup under audit_filter_mutex. */
1309 if (watch) {
1310 err = audit_get_nd(watch->path, &ndp, &ndw);
1311 if (err)
1312 goto error;
1313 }
1314
1315 mutex_lock(&audit_filter_mutex);
1316 if (watch) { 889 if (watch) {
1317 /* audit_filter_mutex is dropped and re-taken during this call */ 890 /* audit_filter_mutex is dropped and re-taken during this call */
1318 err = audit_add_watch(&entry->rule, ndp, ndw); 891 err = audit_add_watch(&entry->rule);
1319 if (err) { 892 if (err) {
1320 mutex_unlock(&audit_filter_mutex); 893 mutex_unlock(&audit_filter_mutex);
1321 goto error; 894 goto error;
1322 } 895 }
1323 h = audit_hash_ino((u32)watch->ino); 896 /* entry->rule.watch may have changed during audit_add_watch() */
897 watch = entry->rule.watch;
898 h = audit_hash_ino((u32)audit_watch_inode(watch));
1324 list = &audit_inode_hash[h]; 899 list = &audit_inode_hash[h];
1325 } 900 }
1326 if (tree) { 901 if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
1358#endif 933#endif
1359 mutex_unlock(&audit_filter_mutex); 934 mutex_unlock(&audit_filter_mutex);
1360 935
1361 audit_put_nd(ndp, ndw); /* NULL args OK */
1362 return 0; 936 return 0;
1363 937
1364error: 938error:
1365 audit_put_nd(ndp, ndw); /* NULL args OK */
1366 if (watch) 939 if (watch)
1367 audit_put_watch(watch); /* tmp watch, matches initial get */ 940 audit_put_watch(watch); /* tmp watch, matches initial get */
1368 return err; 941 return err;
@@ -1372,7 +945,7 @@ error:
1372static inline int audit_del_rule(struct audit_entry *entry) 945static inline int audit_del_rule(struct audit_entry *entry)
1373{ 946{
1374 struct audit_entry *e; 947 struct audit_entry *e;
1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 948 struct audit_watch *watch = entry->rule.watch;
1376 struct audit_tree *tree = entry->rule.tree; 949 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list; 950 struct list_head *list;
1378 LIST_HEAD(inotify_list); 951 LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1394 goto out; 967 goto out;
1395 } 968 }
1396 969
1397 watch = e->rule.watch; 970 if (e->rule.watch)
1398 if (watch) { 971 audit_remove_watch_rule(&e->rule, &inotify_list);
1399 struct audit_parent *parent = watch->parent;
1400
1401 list_del(&e->rule.rlist);
1402
1403 if (list_empty(&watch->rules)) {
1404 audit_remove_watch(watch);
1405
1406 if (list_empty(&parent->watches)) {
1407 /* Put parent on the inotify un-registration
1408 * list. Grab a reference before releasing
1409 * audit_filter_mutex, to be released in
1410 * audit_inotify_unregister().
1411 * If filesystem is going away, just leave
1412 * the sucker alone, eviction will take
1413 * care of it.
1414 */
1415 if (pin_inotify_watch(&parent->wdata))
1416 list_add(&parent->ilist, &inotify_list);
1417 }
1418 }
1419 }
1420 972
1421 if (e->rule.tree) 973 if (e->rule.tree)
1422 audit_remove_tree_rule(&e->rule); 974 audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1438 audit_inotify_unregister(&inotify_list); 990 audit_inotify_unregister(&inotify_list);
1439 991
1440out: 992out:
1441 if (tmp_watch) 993 if (watch)
1442 audit_put_watch(tmp_watch); /* match initial get */ 994 audit_put_watch(watch); /* match initial get */
1443 if (tree) 995 if (tree)
1444 audit_put_tree(tree); /* that's the temporary one */ 996 audit_put_tree(tree); /* that's the temporary one */
1445 997
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1527 security_release_secctx(ctx, len); 1079 security_release_secctx(ctx, len);
1528 } 1080 }
1529 } 1081 }
1530 audit_log_format(ab, " op=%s rule key=", action); 1082 audit_log_format(ab, " op=");
1531 if (rule->filterkey) 1083 audit_log_string(ab, action);
1532 audit_log_untrustedstring(ab, rule->filterkey); 1084 audit_log_key(ab, rule->filterkey);
1533 else
1534 audit_log_format(ab, "(null)");
1535 audit_log_format(ab, " list=%d res=%d", rule->listnr, res); 1085 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1536 audit_log_end(ab); 1086 audit_log_end(ab);
1537} 1087}
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1595 return PTR_ERR(entry); 1145 return PTR_ERR(entry);
1596 1146
1597 err = audit_add_rule(entry); 1147 err = audit_add_rule(entry);
1598 audit_log_rule_change(loginuid, sessionid, sid, "add", 1148 audit_log_rule_change(loginuid, sessionid, sid, "add rule",
1599 &entry->rule, !err); 1149 &entry->rule, !err);
1600 1150
1601 if (err) 1151 if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1611 return PTR_ERR(entry); 1161 return PTR_ERR(entry);
1612 1162
1613 err = audit_del_rule(entry); 1163 err = audit_del_rule(entry);
1614 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1164 audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
1615 &entry->rule, !err); 1165 &entry->rule, !err);
1616 1166
1617 audit_free_rule(entry); 1167 audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
1793 list_del(&r->list); 1343 list_del(&r->list);
1794 } else { 1344 } else {
1795 if (watch) { 1345 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules); 1346 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1797 list_del(&r->rlist); 1347 list_del(&r->rlist);
1798 } else if (tree) 1348 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist); 1349 list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
1829 1379
1830 return err; 1380 return err;
1831} 1381}
1832
1833/* Update watch data in audit rules based on inotify events. */
1834void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1835 u32 cookie, const char *dname, struct inode *inode)
1836{
1837 struct audit_parent *parent;
1838
1839 parent = container_of(i_watch, struct audit_parent, wdata);
1840
1841 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1842 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1843 inode->i_ino, 0);
1844 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1845 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1846 /* inotify automatically removes the watch and sends IN_IGNORED */
1847 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1848 audit_remove_parent_watches(parent);
1849 /* inotify does not remove the watch, so remove it manually */
1850 else if(mask & IN_MOVE_SELF) {
1851 audit_remove_parent_watches(parent);
1852 inotify_remove_watch_locked(audit_ih, i_watch);
1853 } else if (mask & IN_IGNORED)
1854 put_inotify_watch(i_watch);
1855}
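
The auditfilter.c hunks above replace direct uses of watch->ino and watch->dev with accessor calls (audit_watch_inode(), audit_watch_dev(), audit_watch_rules()) so that struct audit_watch can become private to the new kernel/audit_watch.c. Below is a small, self-contained userspace sketch of that opaque-handle-plus-accessors idea; the field layout and the main() driver are illustrative, not kernel code.

#include <stdio.h>

/* Illustrative only: in the kernel the structure definition is private to
 * audit_watch.c, while auditfilter.c and auditsc.c see just the accessors. */
struct audit_watch {
        unsigned long ino;
        unsigned int dev;
};

static unsigned long audit_watch_inode(const struct audit_watch *w)
{
        return w->ino;          /* callers no longer touch w->ino directly */
}

static unsigned int audit_watch_dev(const struct audit_watch *w)
{
        return w->dev;
}

int main(void)
{
        struct audit_watch w = { .ino = 42, .dev = 8 };

        printf("ino=%lu dev=%u\n", audit_watch_inode(&w), audit_watch_dev(&w));
        return 0;
}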
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f414..68d3c6a0ecd6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count; 201 int tree_count;
202 struct list_head killed_trees;
202 203
203 int type; 204 int type;
204 union { 205 union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
548 } 549 }
549 break; 550 break;
550 case AUDIT_WATCH: 551 case AUDIT_WATCH:
551 if (name && rule->watch->ino != (unsigned long)-1) 552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
552 result = (name->dev == rule->watch->dev && 553 result = (name->dev == audit_watch_dev(rule->watch) &&
553 name->ino == rule->watch->ino); 554 name->ino == audit_watch_inode(rule->watch));
554 break; 555 break;
555 case AUDIT_DIR: 556 case AUDIT_DIR:
556 if (ctx) 557 if (ctx)
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
853 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) 854 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
854 return NULL; 855 return NULL;
855 audit_zero_context(context, state); 856 audit_zero_context(context, state);
857 INIT_LIST_HEAD(&context->killed_trees);
856 return context; 858 return context;
857} 859}
858 860
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1024{ 1026{
1025 char arg_num_len_buf[12]; 1027 char arg_num_len_buf[12];
1026 const char __user *tmp_p = p; 1028 const char __user *tmp_p = p;
1027 /* how many digits are in arg_num? 3 is the length of " a=" */ 1029 /* how many digits are in arg_num? 5 is the length of ' a=""' */
1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1030 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
1029 size_t len, len_left, to_send; 1031 size_t len, len_left, to_send;
1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1032 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
1031 unsigned int i, has_cntl = 0, too_long = 0; 1033 unsigned int i, has_cntl = 0, too_long = 0;
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1137 if (has_cntl) 1139 if (has_cntl)
1138 audit_log_n_hex(*ab, buf, to_send); 1140 audit_log_n_hex(*ab, buf, to_send);
1139 else 1141 else
1140 audit_log_format(*ab, "\"%s\"", buf); 1142 audit_log_string(*ab, buf);
1141 1143
1142 p += to_send; 1144 p += to_send;
1143 len_left -= to_send; 1145 len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1372 1374
1373 1375
1374 audit_log_task_info(ab, tsk); 1376 audit_log_task_info(ab, tsk);
1375 if (context->filterkey) { 1377 audit_log_key(ab, context->filterkey);
1376 audit_log_format(ab, " key=");
1377 audit_log_untrustedstring(ab, context->filterkey);
1378 } else
1379 audit_log_format(ab, " key=(null)");
1380 audit_log_end(ab); 1378 audit_log_end(ab);
1381 1379
1382 for (aux = context->aux; aux; aux = aux->next) { 1380 for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
1549 /* that can happen only if we are called from do_exit() */ 1547 /* that can happen only if we are called from do_exit() */
1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1548 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1551 audit_log_exit(context, tsk); 1549 audit_log_exit(context, tsk);
1550 if (!list_empty(&context->killed_trees))
1551 audit_kill_trees(&context->killed_trees);
1552 1552
1553 audit_free_context(context); 1553 audit_free_context(context);
1554} 1554}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
1692 context->in_syscall = 0; 1692 context->in_syscall = 0;
1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1694 1694
1695 if (!list_empty(&context->killed_trees))
1696 audit_kill_trees(&context->killed_trees);
1697
1695 if (context->previous) { 1698 if (context->previous) {
1696 struct audit_context *new_context = context->previous; 1699 struct audit_context *new_context = context->previous;
1697 context->previous = NULL; 1700 context->previous = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
2525 audit_log_format(ab, " sig=%ld", signr); 2528 audit_log_format(ab, " sig=%ld", signr);
2526 audit_log_end(ab); 2529 audit_log_end(ab);
2527} 2530}
2531
2532struct list_head *audit_killed_trees(void)
2533{
2534 struct audit_context *ctx = current->audit_context;
2535 if (likely(!ctx || !ctx->in_syscall))
2536 return NULL;
2537 return &ctx->killed_trees;
2538}
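
The auditsc.c changes add a killed_trees list to struct audit_context and drain it in audit_free() and audit_syscall_exit(). A rough userspace sketch of that defer-then-drain pattern follows; every type and helper name in it is invented for illustration.

#include <stdio.h>
#include <stdlib.h>

/* Work that cannot be finished in the middle of an operation is queued on a
 * per-context list and released once, at the end of the operation. */
struct dead_tree {
        int id;
        struct dead_tree *next;
};

struct op_context {
        struct dead_tree *killed_trees;         /* collected during the operation */
};

static void defer_kill(struct op_context *ctx, int id)
{
        struct dead_tree *t = malloc(sizeof(*t));

        if (!t)
                return;
        t->id = id;
        t->next = ctx->killed_trees;
        ctx->killed_trees = t;
}

static void kill_trees(struct op_context *ctx)  /* drained at exit */
{
        while (ctx->killed_trees) {
                struct dead_tree *t = ctx->killed_trees;

                ctx->killed_trees = t->next;
                printf("releasing tree %d\n", t->id);
                free(t);
        }
}

int main(void)
{
        struct op_context ctx = { .killed_trees = NULL };

        defer_kill(&ctx, 1);
        defer_kill(&ctx, 2);
        kill_trees(&ctx);       /* analogous to the checks at syscall/process exit */
        return 0;
}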
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7267bfd3765..b6eadfe30e7b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,8 @@
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h>
49 51
50#include <asm/atomic.h> 52#include <asm/atomic.h>
51 53
@@ -733,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
733 * reference to css->refcnt. In general, this refcnt is expected to goes down 735 * reference to css->refcnt. In general, this refcnt is expected to goes down
734 * to zero, soon. 736 * to zero, soon.
735 * 737 *
736 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; 738 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
737 */ 739 */
738DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 740DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
739 741
740static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) 742static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
741{ 743{
742 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 744 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
743 wake_up_all(&cgroup_rmdir_waitq); 745 wake_up_all(&cgroup_rmdir_waitq);
744} 746}
745 747
748void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
749{
750 css_get(css);
751}
752
753void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
754{
755 cgroup_wakeup_rmdir_waiter(css->cgroup);
756 css_put(css);
757}
758
759
746static int rebind_subsystems(struct cgroupfs_root *root, 760static int rebind_subsystems(struct cgroupfs_root *root,
747 unsigned long final_bits) 761 unsigned long final_bits)
748{ 762{
@@ -842,6 +856,11 @@ static int parse_cgroupfs_options(char *data,
842 struct cgroup_sb_opts *opts) 856 struct cgroup_sb_opts *opts)
843{ 857{
844 char *token, *o = data ?: "all"; 858 char *token, *o = data ?: "all";
859 unsigned long mask = (unsigned long)-1;
860
861#ifdef CONFIG_CPUSETS
862 mask = ~(1UL << cpuset_subsys_id);
863#endif
845 864
846 opts->subsys_bits = 0; 865 opts->subsys_bits = 0;
847 opts->flags = 0; 866 opts->flags = 0;
@@ -886,6 +905,15 @@ static int parse_cgroupfs_options(char *data,
886 } 905 }
887 } 906 }
888 907
908 /*
909 * Option noprefix was introduced just for backward compatibility
910 * with the old cpuset, so we allow noprefix only if mounting just
911 * the cpuset subsystem.
912 */
913 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
914 (opts->subsys_bits & mask))
915 return -EINVAL;
916
889 /* We can't have an empty hierarchy */ 917 /* We can't have an empty hierarchy */
890 if (!opts->subsys_bits) 918 if (!opts->subsys_bits)
891 return -EINVAL; 919 return -EINVAL;
@@ -900,6 +928,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
900 struct cgroup *cgrp = &root->top_cgroup; 928 struct cgroup *cgrp = &root->top_cgroup;
901 struct cgroup_sb_opts opts; 929 struct cgroup_sb_opts opts;
902 930
931 lock_kernel();
903 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 932 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
904 mutex_lock(&cgroup_mutex); 933 mutex_lock(&cgroup_mutex);
905 934
@@ -927,6 +956,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
927 kfree(opts.release_agent); 956 kfree(opts.release_agent);
928 mutex_unlock(&cgroup_mutex); 957 mutex_unlock(&cgroup_mutex);
929 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 958 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
959 unlock_kernel();
930 return ret; 960 return ret;
931} 961}
932 962
@@ -943,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
943 INIT_LIST_HEAD(&cgrp->children); 973 INIT_LIST_HEAD(&cgrp->children);
944 INIT_LIST_HEAD(&cgrp->css_sets); 974 INIT_LIST_HEAD(&cgrp->css_sets);
945 INIT_LIST_HEAD(&cgrp->release_list); 975 INIT_LIST_HEAD(&cgrp->release_list);
976 INIT_LIST_HEAD(&cgrp->pids_list);
946 init_rwsem(&cgrp->pids_mutex); 977 init_rwsem(&cgrp->pids_mutex);
947} 978}
948static void init_cgroup_root(struct cgroupfs_root *root) 979static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1340,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1340 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1371 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1341 * is no longer empty. 1372 * is no longer empty.
1342 */ 1373 */
1343 cgroup_wakeup_rmdir_waiters(cgrp); 1374 cgroup_wakeup_rmdir_waiter(cgrp);
1344 return 0; 1375 return 0;
1345} 1376}
1346 1377
@@ -2184,12 +2215,30 @@ err:
2184 return ret; 2215 return ret;
2185} 2216}
2186 2217
2218/*
2219 * Cache pids for all threads in the same pid namespace that are
2220 * opening the same "tasks" file.
2221 */
2222struct cgroup_pids {
2223 /* The node in cgrp->pids_list */
2224 struct list_head list;
2225 /* The cgroup those pids belong to */
2226 struct cgroup *cgrp;
 2227 /* The namespace those pids belong to */
2228 struct pid_namespace *ns;
2229 /* Array of process ids in the cgroup */
2230 pid_t *tasks_pids;
 2231 /* How many files are using this tasks_pids array */
2232 int use_count;
2233 /* Length of the current tasks_pids array */
2234 int length;
2235};
2236
2187static int cmppid(const void *a, const void *b) 2237static int cmppid(const void *a, const void *b)
2188{ 2238{
2189 return *(pid_t *)a - *(pid_t *)b; 2239 return *(pid_t *)a - *(pid_t *)b;
2190} 2240}
2191 2241
2192
2193/* 2242/*
2194 * seq_file methods for the "tasks" file. The seq_file position is the 2243 * seq_file methods for the "tasks" file. The seq_file position is the
2195 * next pid to display; the seq_file iterator is a pointer to the pid 2244 * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2204,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2204 * after a seek to the start). Use a binary-search to find the 2253 * after a seek to the start). Use a binary-search to find the
2205 * next pid to display, if any 2254 * next pid to display, if any
2206 */ 2255 */
2207 struct cgroup *cgrp = s->private; 2256 struct cgroup_pids *cp = s->private;
2257 struct cgroup *cgrp = cp->cgrp;
2208 int index = 0, pid = *pos; 2258 int index = 0, pid = *pos;
2209 int *iter; 2259 int *iter;
2210 2260
2211 down_read(&cgrp->pids_mutex); 2261 down_read(&cgrp->pids_mutex);
2212 if (pid) { 2262 if (pid) {
2213 int end = cgrp->pids_length; 2263 int end = cp->length;
2214 2264
2215 while (index < end) { 2265 while (index < end) {
2216 int mid = (index + end) / 2; 2266 int mid = (index + end) / 2;
2217 if (cgrp->tasks_pids[mid] == pid) { 2267 if (cp->tasks_pids[mid] == pid) {
2218 index = mid; 2268 index = mid;
2219 break; 2269 break;
2220 } else if (cgrp->tasks_pids[mid] <= pid) 2270 } else if (cp->tasks_pids[mid] <= pid)
2221 index = mid + 1; 2271 index = mid + 1;
2222 else 2272 else
2223 end = mid; 2273 end = mid;
2224 } 2274 }
2225 } 2275 }
2226 /* If we're off the end of the array, we're done */ 2276 /* If we're off the end of the array, we're done */
2227 if (index >= cgrp->pids_length) 2277 if (index >= cp->length)
2228 return NULL; 2278 return NULL;
2229 /* Update the abstract position to be the actual pid that we found */ 2279 /* Update the abstract position to be the actual pid that we found */
2230 iter = cgrp->tasks_pids + index; 2280 iter = cp->tasks_pids + index;
2231 *pos = *iter; 2281 *pos = *iter;
2232 return iter; 2282 return iter;
2233} 2283}
2234 2284
2235static void cgroup_tasks_stop(struct seq_file *s, void *v) 2285static void cgroup_tasks_stop(struct seq_file *s, void *v)
2236{ 2286{
2237 struct cgroup *cgrp = s->private; 2287 struct cgroup_pids *cp = s->private;
2288 struct cgroup *cgrp = cp->cgrp;
2238 up_read(&cgrp->pids_mutex); 2289 up_read(&cgrp->pids_mutex);
2239} 2290}
2240 2291
2241static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2292static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2242{ 2293{
2243 struct cgroup *cgrp = s->private; 2294 struct cgroup_pids *cp = s->private;
2244 int *p = v; 2295 int *p = v;
2245 int *end = cgrp->tasks_pids + cgrp->pids_length; 2296 int *end = cp->tasks_pids + cp->length;
2246 2297
2247 /* 2298 /*
2248 * Advance to the next pid in the array. If this goes off the 2299 * Advance to the next pid in the array. If this goes off the
@@ -2269,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
2269 .show = cgroup_tasks_show, 2320 .show = cgroup_tasks_show,
2270}; 2321};
2271 2322
2272static void release_cgroup_pid_array(struct cgroup *cgrp) 2323static void release_cgroup_pid_array(struct cgroup_pids *cp)
2273{ 2324{
2325 struct cgroup *cgrp = cp->cgrp;
2326
2274 down_write(&cgrp->pids_mutex); 2327 down_write(&cgrp->pids_mutex);
2275 BUG_ON(!cgrp->pids_use_count); 2328 BUG_ON(!cp->use_count);
2276 if (!--cgrp->pids_use_count) { 2329 if (!--cp->use_count) {
2277 kfree(cgrp->tasks_pids); 2330 list_del(&cp->list);
2278 cgrp->tasks_pids = NULL; 2331 put_pid_ns(cp->ns);
2279 cgrp->pids_length = 0; 2332 kfree(cp->tasks_pids);
2333 kfree(cp);
2280 } 2334 }
2281 up_write(&cgrp->pids_mutex); 2335 up_write(&cgrp->pids_mutex);
2282} 2336}
2283 2337
2284static int cgroup_tasks_release(struct inode *inode, struct file *file) 2338static int cgroup_tasks_release(struct inode *inode, struct file *file)
2285{ 2339{
2286 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2340 struct seq_file *seq;
2341 struct cgroup_pids *cp;
2287 2342
2288 if (!(file->f_mode & FMODE_READ)) 2343 if (!(file->f_mode & FMODE_READ))
2289 return 0; 2344 return 0;
2290 2345
2291 release_cgroup_pid_array(cgrp); 2346 seq = file->private_data;
2347 cp = seq->private;
2348
2349 release_cgroup_pid_array(cp);
2292 return seq_release(inode, file); 2350 return seq_release(inode, file);
2293} 2351}
2294 2352
@@ -2307,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {
2307static int cgroup_tasks_open(struct inode *unused, struct file *file) 2365static int cgroup_tasks_open(struct inode *unused, struct file *file)
2308{ 2366{
2309 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2367 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2368 struct pid_namespace *ns = current->nsproxy->pid_ns;
2369 struct cgroup_pids *cp;
2310 pid_t *pidarray; 2370 pid_t *pidarray;
2311 int npids; 2371 int npids;
2312 int retval; 2372 int retval;
@@ -2333,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2333 * array if necessary 2393 * array if necessary
2334 */ 2394 */
2335 down_write(&cgrp->pids_mutex); 2395 down_write(&cgrp->pids_mutex);
2336 kfree(cgrp->tasks_pids); 2396
2337 cgrp->tasks_pids = pidarray; 2397 list_for_each_entry(cp, &cgrp->pids_list, list) {
2338 cgrp->pids_length = npids; 2398 if (ns == cp->ns)
2339 cgrp->pids_use_count++; 2399 goto found;
2400 }
2401
2402 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2403 if (!cp) {
2404 up_write(&cgrp->pids_mutex);
2405 kfree(pidarray);
2406 return -ENOMEM;
2407 }
2408 cp->cgrp = cgrp;
2409 cp->ns = ns;
2410 get_pid_ns(ns);
2411 list_add(&cp->list, &cgrp->pids_list);
2412found:
2413 kfree(cp->tasks_pids);
2414 cp->tasks_pids = pidarray;
2415 cp->length = npids;
2416 cp->use_count++;
2340 up_write(&cgrp->pids_mutex); 2417 up_write(&cgrp->pids_mutex);
2341 2418
2342 file->f_op = &cgroup_tasks_operations; 2419 file->f_op = &cgroup_tasks_operations;
2343 2420
2344 retval = seq_open(file, &cgroup_tasks_seq_operations); 2421 retval = seq_open(file, &cgroup_tasks_seq_operations);
2345 if (retval) { 2422 if (retval) {
2346 release_cgroup_pid_array(cgrp); 2423 release_cgroup_pid_array(cp);
2347 return retval; 2424 return retval;
2348 } 2425 }
2349 ((struct seq_file *)file->private_data)->private = cgrp; 2426 ((struct seq_file *)file->private_data)->private = cp;
2350 return 0; 2427 return 0;
2351} 2428}
2352 2429
@@ -2679,33 +2756,42 @@ again:
2679 mutex_unlock(&cgroup_mutex); 2756 mutex_unlock(&cgroup_mutex);
2680 2757
2681 /* 2758 /*
 2759 * In general, a subsystem has no css->refcnt after pre_destroy(). But
 2760 * in racy cases a subsystem may still take css->refcnt after
 2761 * pre_destroy(), which makes rmdir return -EBUSY. That can make rmdir
 2762 * fail far too often, so we use a waitqueue for cgroup's rmdir.
 2763 * CGRP_WAIT_ON_RMDIR synchronizes rmdir with the subsystem's
 2764 * reference-count handling; see css_get/put, css_tryget() and the
 2765 * cgroup_wakeup_rmdir_waiter() implementation.
2766 */
2767 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2768
2769 /*
2682 * Call pre_destroy handlers of subsys. Notify subsystems 2770 * Call pre_destroy handlers of subsys. Notify subsystems
2683 * that rmdir() request comes. 2771 * that rmdir() request comes.
2684 */ 2772 */
2685 ret = cgroup_call_pre_destroy(cgrp); 2773 ret = cgroup_call_pre_destroy(cgrp);
2686 if (ret) 2774 if (ret) {
2775 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2687 return ret; 2776 return ret;
2777 }
2688 2778
2689 mutex_lock(&cgroup_mutex); 2779 mutex_lock(&cgroup_mutex);
2690 parent = cgrp->parent; 2780 parent = cgrp->parent;
2691 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 2781 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2782 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2692 mutex_unlock(&cgroup_mutex); 2783 mutex_unlock(&cgroup_mutex);
2693 return -EBUSY; 2784 return -EBUSY;
2694 } 2785 }
2695 /*
2696 * css_put/get is provided for subsys to grab refcnt to css. In typical
2697 * case, subsystem has no reference after pre_destroy(). But, under
2698 * hierarchy management, some *temporal* refcnt can be hold.
2699 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2700 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2701 * is called when css_put() is called and refcnt goes down to 0.
2702 */
2703 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2704 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 2786 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2705
2706 if (!cgroup_clear_css_refs(cgrp)) { 2787 if (!cgroup_clear_css_refs(cgrp)) {
2707 mutex_unlock(&cgroup_mutex); 2788 mutex_unlock(&cgroup_mutex);
2708 schedule(); 2789 /*
2790 * Because someone may call cgroup_wakeup_rmdir_waiter() before
2791 * prepare_to_wait(), we need to check this flag.
2792 */
2793 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
2794 schedule();
2709 finish_wait(&cgroup_rmdir_waitq, &wait); 2795 finish_wait(&cgroup_rmdir_waitq, &wait);
2710 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 2796 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2711 if (signal_pending(current)) 2797 if (signal_pending(current))
@@ -3277,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
3277 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3363 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3278 check_for_release(cgrp); 3364 check_for_release(cgrp);
3279 } 3365 }
3280 cgroup_wakeup_rmdir_waiters(cgrp); 3366 cgroup_wakeup_rmdir_waiter(cgrp);
3281 } 3367 }
3282 rcu_read_unlock(); 3368 rcu_read_unlock();
3283} 3369}
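
The cgroup.c rmdir path now sets CGRP_WAIT_ON_RMDIR before calling pre_destroy(), lets cgroup_wakeup_rmdir_waiter() test-and-clear it, and re-checks the flag after prepare_to_wait() so a wakeup that arrives early is not lost. A pthreads analogue of that flag-plus-recheck idiom is sketched below; it uses a condition variable instead of a waitqueue and all names are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool wait_on_rmdir;              /* stand-in for CGRP_WAIT_ON_RMDIR */

static void *waker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        if (wait_on_rmdir) {            /* like test_and_clear_bit() + wake_up_all() */
                wait_on_rmdir = false;
                pthread_cond_broadcast(&cond);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        wait_on_rmdir = true;           /* set before the race with the waker can start */
        pthread_create(&t, NULL, waker, NULL);

        pthread_mutex_lock(&lock);
        while (wait_on_rmdir)           /* re-check: the wakeup may already have happened */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        puts("woken (or never needed to sleep)");
        return 0;
}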
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460f..f6c204f07ea6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
882 882
883} 883}
884 884
885asmlinkage long
886compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
887 struct compat_siginfo __user *uinfo)
888{
889 siginfo_t info;
890
891 if (copy_siginfo_from_user32(&info, uinfo))
892 return -EFAULT;
893 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
894}
895
885#ifdef __ARCH_WANT_COMPAT_SYS_TIME 896#ifdef __ARCH_WANT_COMPAT_SYS_TIME
886 897
887/* compat_time_t is a 32 bit "long" and needs to get converted. */ 898/* compat_time_t is a 32 bit "long" and needs to get converted. */
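
compat_sys_rt_tgsigqueueinfo() above follows the usual compat-syscall shape: copy the 32-bit siginfo layout into the native structure, then delegate to the shared do_rt_tgsigqueueinfo(). Here is a standalone sketch of that convert-then-delegate pattern; the struct layouts and helpers below are made up for the example.

#include <stdint.h>
#include <stdio.h>

struct siginfo32 { int32_t signo; int32_t code; int32_t pid; };      /* 32-bit ABI layout */
struct siginfo_native { int signo; int code; long pid; };            /* native layout */

/* One shared implementation, used by both the native and the compat entry. */
static int do_tgsigqueueinfo(int tgid, int pid, int sig,
                             const struct siginfo_native *info)
{
        printf("tgid=%d pid=%d sig=%d sender=%ld\n", tgid, pid, sig, info->pid);
        return 0;
}

/* Compat wrapper: widen the 32-bit fields, then call the common helper. */
static int compat_tgsigqueueinfo(int tgid, int pid, int sig,
                                 const struct siginfo32 *uinfo)
{
        struct siginfo_native info = {
                .signo = uinfo->signo,
                .code  = uinfo->code,
                .pid   = uinfo->pid,
        };

        return do_tgsigqueueinfo(tgid, pid, sig, &info);
}

int main(void)
{
        struct siginfo32 si = { .signo = 10, .code = 0, .pid = 1234 };

        return compat_tgsigqueueinfo(1, 1, 10, &si);
}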
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8d..8ce10043e4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
34 * an ongoing cpu hotplug operation. 34 * an ongoing cpu hotplug operation.
35 */ 35 */
36 int refcount; 36 int refcount;
37} cpu_hotplug; 37} cpu_hotplug = {
38 38 .active_writer = NULL,
39void __init cpu_hotplug_init(void) 39 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
40{ 40 .refcount = 0,
41 cpu_hotplug.active_writer = NULL; 41};
42 mutex_init(&cpu_hotplug.lock);
43 cpu_hotplug.refcount = 0;
44}
45 42
46#ifdef CONFIG_HOTPLUG_CPU 43#ifdef CONFIG_HOTPLUG_CPU
47 44
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 026faccca869..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
97 97
98 struct cpuset *parent; /* my parent */ 98 struct cpuset *parent; /* my parent */
99 99
100 /*
101 * Copy of global cpuset_mems_generation as of the most
102 * recent time this cpuset changed its mems_allowed.
103 */
104 int mems_generation;
105
106 struct fmeter fmeter; /* memory_pressure filter */ 100 struct fmeter fmeter; /* memory_pressure filter */
107 101
108 /* partition number for rebuild_sched_domains() */ 102 /* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
176 return test_bit(CS_SPREAD_SLAB, &cs->flags); 170 return test_bit(CS_SPREAD_SLAB, &cs->flags);
177} 171}
178 172
179/*
180 * Increment this integer everytime any cpuset changes its
181 * mems_allowed value. Users of cpusets can track this generation
182 * number, and avoid having to lock and reload mems_allowed unless
183 * the cpuset they're using changes generation.
184 *
185 * A single, global generation is needed because cpuset_attach_task() could
186 * reattach a task to a different cpuset, which must not have its
187 * generation numbers aliased with those of that tasks previous cpuset.
188 *
189 * Generations are needed for mems_allowed because one task cannot
190 * modify another's memory placement. So we must enable every task,
191 * on every visit to __alloc_pages(), to efficiently check whether
192 * its current->cpuset->mems_allowed has changed, requiring an update
193 * of its current->mems_allowed.
194 *
195 * Since writes to cpuset_mems_generation are guarded by the cgroup lock
196 * there is no need to mark it atomic.
197 */
198static int cpuset_mems_generation;
199
200static struct cpuset top_cpuset = { 173static struct cpuset top_cpuset = {
201 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 174 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
202}; 175};
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
228 * If a task is only holding callback_mutex, then it has read-only 201 * If a task is only holding callback_mutex, then it has read-only
229 * access to cpusets. 202 * access to cpusets.
230 * 203 *
231 * The task_struct fields mems_allowed and mems_generation may only 204 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 232 * be accessed in the context of that task, so require no locks. 205 * by another task, so we use alloc_lock in the task_struct to protect
206 * them.
233 * 207 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 208 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 209 * small pieces of code, such as when reading out possibly multi-word
@@ -331,75 +305,22 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
331 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 305 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
332} 306}
333 307
334/** 308/*
335 * cpuset_update_task_memory_state - update task memory placement 309 * update task's spread flag if cpuset's page/slab spread flag is set
336 * 310 *
337 * If the current tasks cpusets mems_allowed changed behind our 311 * Called with callback_mutex/cgroup_mutex held
338 * backs, update current->mems_allowed, mems_generation and task NUMA
339 * mempolicy to the new value.
340 *
341 * Task mempolicy is updated by rebinding it relative to the
342 * current->cpuset if a task has its memory placement changed.
343 * Do not call this routine if in_interrupt().
344 *
345 * Call without callback_mutex or task_lock() held. May be
346 * called with or without cgroup_mutex held. Thanks in part to
347 * 'the_top_cpuset_hack', the task's cpuset pointer will never
348 * be NULL. This routine also might acquire callback_mutex during
349 * call.
350 *
351 * Reading current->cpuset->mems_generation doesn't need task_lock
352 * to guard the current->cpuset derefence, because it is guarded
353 * from concurrent freeing of current->cpuset using RCU.
354 *
355 * The rcu_dereference() is technically probably not needed,
356 * as I don't actually mind if I see a new cpuset pointer but
357 * an old value of mems_generation. However this really only
358 * matters on alpha systems using cpusets heavily. If I dropped
359 * that rcu_dereference(), it would save them a memory barrier.
360 * For all other arch's, rcu_dereference is a no-op anyway, and for
361 * alpha systems not using cpusets, another planned optimization,
362 * avoiding the rcu critical section for tasks in the root cpuset
363 * which is statically allocated, so can't vanish, will make this
364 * irrelevant. Better to use RCU as intended, than to engage in
365 * some cute trick to save a memory barrier that is impossible to
366 * test, for alpha systems using cpusets heavily, which might not
367 * even exist.
368 *
369 * This routine is needed to update the per-task mems_allowed data,
370 * within the tasks context, when it is trying to allocate memory
371 * (in various mm/mempolicy.c routines) and notices that some other
372 * task has been modifying its cpuset.
373 */ 312 */
374 313static void cpuset_update_task_spread_flag(struct cpuset *cs,
375void cpuset_update_task_memory_state(void) 314 struct task_struct *tsk)
376{ 315{
377 int my_cpusets_mem_gen; 316 if (is_spread_page(cs))
378 struct task_struct *tsk = current; 317 tsk->flags |= PF_SPREAD_PAGE;
379 struct cpuset *cs; 318 else
380 319 tsk->flags &= ~PF_SPREAD_PAGE;
381 rcu_read_lock(); 320 if (is_spread_slab(cs))
382 my_cpusets_mem_gen = task_cs(tsk)->mems_generation; 321 tsk->flags |= PF_SPREAD_SLAB;
383 rcu_read_unlock(); 322 else
384 323 tsk->flags &= ~PF_SPREAD_SLAB;
385 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
386 mutex_lock(&callback_mutex);
387 task_lock(tsk);
388 cs = task_cs(tsk); /* Maybe changed when task not locked */
389 guarantee_online_mems(cs, &tsk->mems_allowed);
390 tsk->cpuset_mems_generation = cs->mems_generation;
391 if (is_spread_page(cs))
392 tsk->flags |= PF_SPREAD_PAGE;
393 else
394 tsk->flags &= ~PF_SPREAD_PAGE;
395 if (is_spread_slab(cs))
396 tsk->flags |= PF_SPREAD_SLAB;
397 else
398 tsk->flags &= ~PF_SPREAD_SLAB;
399 task_unlock(tsk);
400 mutex_unlock(&callback_mutex);
401 mpol_rebind_task(tsk, &tsk->mems_allowed);
402 }
403} 324}
404 325
405/* 326/*
@@ -1007,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1007 * other task, the task_struct mems_allowed that we are hacking 928 * other task, the task_struct mems_allowed that we are hacking
1008 * is for our current task, which must allocate new pages for that 929 * is for our current task, which must allocate new pages for that
1009 * migrating memory region. 930 * migrating memory region.
1010 *
1011 * We call cpuset_update_task_memory_state() before hacking
1012 * our tasks mems_allowed, so that we are assured of being in
1013 * sync with our tasks cpuset, and in particular, callbacks to
1014 * cpuset_update_task_memory_state() from nested page allocations
1015 * won't see any mismatch of our cpuset and task mems_generation
1016 * values, so won't overwrite our hacked tasks mems_allowed
1017 * nodemask.
1018 */ 931 */
1019 932
1020static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 933static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1022,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1022{ 935{
1023 struct task_struct *tsk = current; 936 struct task_struct *tsk = current;
1024 937
1025 cpuset_update_task_memory_state();
1026
1027 mutex_lock(&callback_mutex);
1028 tsk->mems_allowed = *to; 938 tsk->mems_allowed = *to;
1029 mutex_unlock(&callback_mutex);
1030 939
1031 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 940 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
1032 941
1033 mutex_lock(&callback_mutex);
1034 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 942 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
1035 mutex_unlock(&callback_mutex);
1036} 943}
1037 944
1038/* 945/*
1039 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new 946 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1040 * nodes if memory_migrate flag is set. Called with cgroup_mutex held. 947 * @tsk: the task to change
948 * @newmems: new nodes that the task will be set
949 *
950 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
951 * we structure updates as setting all new allowed nodes, then clearing newly
952 * disallowed ones.
953 *
954 * Called with task's alloc_lock held
955 */
956static void cpuset_change_task_nodemask(struct task_struct *tsk,
957 nodemask_t *newmems)
958{
959 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
960 mpol_rebind_task(tsk, &tsk->mems_allowed);
961 mpol_rebind_task(tsk, newmems);
962 tsk->mems_allowed = *newmems;
963}
964
965/*
966 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
967 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
968 * memory_migrate flag is set. Called with cgroup_mutex held.
1041 */ 969 */
1042static void cpuset_change_nodemask(struct task_struct *p, 970static void cpuset_change_nodemask(struct task_struct *p,
1043 struct cgroup_scanner *scan) 971 struct cgroup_scanner *scan)
@@ -1046,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
1046 struct cpuset *cs; 974 struct cpuset *cs;
1047 int migrate; 975 int migrate;
1048 const nodemask_t *oldmem = scan->data; 976 const nodemask_t *oldmem = scan->data;
977 nodemask_t newmems;
978
979 cs = cgroup_cs(scan->cg);
980 guarantee_online_mems(cs, &newmems);
981
982 task_lock(p);
983 cpuset_change_task_nodemask(p, &newmems);
984 task_unlock(p);
1049 985
1050 mm = get_task_mm(p); 986 mm = get_task_mm(p);
1051 if (!mm) 987 if (!mm)
1052 return; 988 return;
1053 989
1054 cs = cgroup_cs(scan->cg);
1055 migrate = is_memory_migrate(cs); 990 migrate = is_memory_migrate(cs);
1056 991
1057 mpol_rebind_mm(mm, &cs->mems_allowed); 992 mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1104,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1104/* 1039/*
1105 * Handle user request to change the 'mems' memory placement 1040 * Handle user request to change the 'mems' memory placement
1106 * of a cpuset. Needs to validate the request, update the 1041 * of a cpuset. Needs to validate the request, update the
1107 * cpusets mems_allowed and mems_generation, and for each 1042 * cpusets mems_allowed, and for each task in the cpuset,
1108 * task in the cpuset, rebind any vma mempolicies and if 1043 * update mems_allowed and rebind task's mempolicy and any vma
1109 * the cpuset is marked 'memory_migrate', migrate the tasks 1044 * mempolicies and if the cpuset is marked 'memory_migrate',
1110 * pages to the new memory. 1045 * migrate the tasks pages to the new memory.
1111 * 1046 *
1112 * Call with cgroup_mutex held. May take callback_mutex during call. 1047 * Call with cgroup_mutex held. May take callback_mutex during call.
1113 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1048 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1160,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1160 1095
1161 mutex_lock(&callback_mutex); 1096 mutex_lock(&callback_mutex);
1162 cs->mems_allowed = trialcs->mems_allowed; 1097 cs->mems_allowed = trialcs->mems_allowed;
1163 cs->mems_generation = cpuset_mems_generation++;
1164 mutex_unlock(&callback_mutex); 1098 mutex_unlock(&callback_mutex);
1165 1099
1166 update_tasks_nodemask(cs, &oldmem, &heap); 1100 update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1193,6 +1127,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1193} 1127}
1194 1128
1195/* 1129/*
1130 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1131 * @tsk: task to be updated
1132 * @scan: struct cgroup_scanner containing the cgroup of the task
1133 *
1134 * Called by cgroup_scan_tasks() for each task in a cgroup.
1135 *
1136 * We don't need to re-check for the cgroup/cpuset membership, since we're
1137 * holding cgroup_lock() at this point.
1138 */
1139static void cpuset_change_flag(struct task_struct *tsk,
1140 struct cgroup_scanner *scan)
1141{
1142 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1143}
1144
1145/*
1146 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 1147 * @cs: the cpuset in which each task's spread flags need to be changed
1148 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1149 *
1150 * Called with cgroup_mutex held
1151 *
1152 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1153 * calling callback functions for each.
1154 *
1155 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1156 * if @heap != NULL.
1157 */
1158static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1159{
1160 struct cgroup_scanner scan;
1161
1162 scan.cg = cs->css.cgroup;
1163 scan.test_task = NULL;
1164 scan.process_task = cpuset_change_flag;
1165 scan.heap = heap;
1166 cgroup_scan_tasks(&scan);
1167}
1168
1169/*
1196 * update_flag - read a 0 or a 1 in a file and update associated flag 1170 * update_flag - read a 0 or a 1 in a file and update associated flag
1197 * bit: the bit to update (see cpuset_flagbits_t) 1171 * bit: the bit to update (see cpuset_flagbits_t)
1198 * cs: the cpuset to update 1172 * cs: the cpuset to update
@@ -1205,8 +1179,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1205 int turning_on) 1179 int turning_on)
1206{ 1180{
1207 struct cpuset *trialcs; 1181 struct cpuset *trialcs;
1208 int err;
1209 int balance_flag_changed; 1182 int balance_flag_changed;
1183 int spread_flag_changed;
1184 struct ptr_heap heap;
1185 int err;
1210 1186
1211 trialcs = alloc_trial_cpuset(cs); 1187 trialcs = alloc_trial_cpuset(cs);
1212 if (!trialcs) 1188 if (!trialcs)
@@ -1221,9 +1197,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1221 if (err < 0) 1197 if (err < 0)
1222 goto out; 1198 goto out;
1223 1199
1200 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1201 if (err < 0)
1202 goto out;
1203
1224 balance_flag_changed = (is_sched_load_balance(cs) != 1204 balance_flag_changed = (is_sched_load_balance(cs) !=
1225 is_sched_load_balance(trialcs)); 1205 is_sched_load_balance(trialcs));
1226 1206
1207 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1208 || (is_spread_page(cs) != is_spread_page(trialcs)));
1209
1227 mutex_lock(&callback_mutex); 1210 mutex_lock(&callback_mutex);
1228 cs->flags = trialcs->flags; 1211 cs->flags = trialcs->flags;
1229 mutex_unlock(&callback_mutex); 1212 mutex_unlock(&callback_mutex);
@@ -1231,6 +1214,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1231 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1214 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1232 async_rebuild_sched_domains(); 1215 async_rebuild_sched_domains();
1233 1216
1217 if (spread_flag_changed)
1218 update_tasks_flags(cs, &heap);
1219 heap_free(&heap);
1234out: 1220out:
1235 free_trial_cpuset(trialcs); 1221 free_trial_cpuset(trialcs);
1236 return err; 1222 return err;
@@ -1372,15 +1358,20 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1372 1358
1373 if (cs == &top_cpuset) { 1359 if (cs == &top_cpuset) {
1374 cpumask_copy(cpus_attach, cpu_possible_mask); 1360 cpumask_copy(cpus_attach, cpu_possible_mask);
1361 to = node_possible_map;
1375 } else { 1362 } else {
1376 mutex_lock(&callback_mutex);
1377 guarantee_online_cpus(cs, cpus_attach); 1363 guarantee_online_cpus(cs, cpus_attach);
1378 mutex_unlock(&callback_mutex); 1364 guarantee_online_mems(cs, &to);
1379 } 1365 }
1380 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1381 if (err) 1367 if (err)
1382 return; 1368 return;
1383 1369
1370 task_lock(tsk);
1371 cpuset_change_task_nodemask(tsk, &to);
1372 task_unlock(tsk);
1373 cpuset_update_task_spread_flag(cs, tsk);
1374
1384 from = oldcs->mems_allowed; 1375 from = oldcs->mems_allowed;
1385 to = cs->mems_allowed; 1376 to = cs->mems_allowed;
1386 mm = get_task_mm(tsk); 1377 mm = get_task_mm(tsk);
@@ -1442,11 +1433,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1442 break; 1433 break;
1443 case FILE_SPREAD_PAGE: 1434 case FILE_SPREAD_PAGE:
1444 retval = update_flag(CS_SPREAD_PAGE, cs, val); 1435 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1445 cs->mems_generation = cpuset_mems_generation++;
1446 break; 1436 break;
1447 case FILE_SPREAD_SLAB: 1437 case FILE_SPREAD_SLAB:
1448 retval = update_flag(CS_SPREAD_SLAB, cs, val); 1438 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1449 cs->mems_generation = cpuset_mems_generation++;
1450 break; 1439 break;
1451 default: 1440 default:
1452 retval = -EINVAL; 1441 retval = -EINVAL;
@@ -1786,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
1786 struct cpuset *parent; 1775 struct cpuset *parent;
1787 1776
1788 if (!cont->parent) { 1777 if (!cont->parent) {
1789 /* This is early initialization for the top cgroup */
1790 top_cpuset.mems_generation = cpuset_mems_generation++;
1791 return &top_cpuset.css; 1778 return &top_cpuset.css;
1792 } 1779 }
1793 parent = cgroup_cs(cont->parent); 1780 parent = cgroup_cs(cont->parent);
@@ -1799,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
1799 return ERR_PTR(-ENOMEM); 1786 return ERR_PTR(-ENOMEM);
1800 } 1787 }
1801 1788
1802 cpuset_update_task_memory_state();
1803 cs->flags = 0; 1789 cs->flags = 0;
1804 if (is_spread_page(parent)) 1790 if (is_spread_page(parent))
1805 set_bit(CS_SPREAD_PAGE, &cs->flags); 1791 set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1808,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
1808 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1794 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1809 cpumask_clear(cs->cpus_allowed); 1795 cpumask_clear(cs->cpus_allowed);
1810 nodes_clear(cs->mems_allowed); 1796 nodes_clear(cs->mems_allowed);
1811 cs->mems_generation = cpuset_mems_generation++;
1812 fmeter_init(&cs->fmeter); 1797 fmeter_init(&cs->fmeter);
1813 cs->relax_domain_level = -1; 1798 cs->relax_domain_level = -1;
1814 1799
@@ -1827,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1827{ 1812{
1828 struct cpuset *cs = cgroup_cs(cont); 1813 struct cpuset *cs = cgroup_cs(cont);
1829 1814
1830 cpuset_update_task_memory_state();
1831
1832 if (is_sched_load_balance(cs)) 1815 if (is_sched_load_balance(cs))
1833 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1816 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1834 1817
@@ -1849,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
1849 .early_init = 1, 1832 .early_init = 1,
1850}; 1833};
1851 1834
1852/*
1853 * cpuset_init_early - just enough so that the calls to
1854 * cpuset_update_task_memory_state() in early init code
1855 * are harmless.
1856 */
1857
1858int __init cpuset_init_early(void)
1859{
1860 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
1861
1862 top_cpuset.mems_generation = cpuset_mems_generation++;
1863 return 0;
1864}
1865
1866
1867/** 1835/**
1868 * cpuset_init - initialize cpusets at system boot 1836 * cpuset_init - initialize cpusets at system boot
1869 * 1837 *
@@ -1874,11 +1842,13 @@ int __init cpuset_init(void)
1874{ 1842{
1875 int err = 0; 1843 int err = 0;
1876 1844
1845 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1846 BUG();
1847
1877 cpumask_setall(top_cpuset.cpus_allowed); 1848 cpumask_setall(top_cpuset.cpus_allowed);
1878 nodes_setall(top_cpuset.mems_allowed); 1849 nodes_setall(top_cpuset.mems_allowed);
1879 1850
1880 fmeter_init(&top_cpuset.fmeter); 1851 fmeter_init(&top_cpuset.fmeter);
1881 top_cpuset.mems_generation = cpuset_mems_generation++;
1882 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1852 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1883 top_cpuset.relax_domain_level = -1; 1853 top_cpuset.relax_domain_level = -1;
1884 1854
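
cpuset_change_task_nodemask() above deliberately ORs the new nodes into the task's mask, rebinds, and only then installs the new mask, so the task never sees an empty mems_allowed even when the old and new node sets are disjoint. A tiny standalone sketch of that grow-then-shrink ordering, with plain bitmasks standing in for nodemask_t and an invented rebind() helper:

#include <stdio.h>

static unsigned int task_mems_allowed = 0x3;    /* old nodes {0,1} */

static void rebind(unsigned int allowed)
{
        printf("rebinding against 0x%x\n", allowed);
}

static void change_task_nodemask(unsigned int newmems)
{
        task_mems_allowed |= newmems;   /* grow: old | new, never empty */
        rebind(task_mems_allowed);
        rebind(newmems);
        task_mems_allowed = newmems;    /* shrink to exactly the new set */
}

int main(void)
{
        change_task_nodemask(0xc);      /* new nodes {2,3}, disjoint from the old ones */
        printf("final mask 0x%x\n", task_mems_allowed);
        return 0;
}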
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a039189d707..1bb4d7e5d616 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds);
167 167
168/* 168/*
169 * Prepare credentials for current to perform an execve() 169 * Prepare credentials for current to perform an execve()
170 * - The caller must hold current->cred_exec_mutex 170 * - The caller must hold current->cred_guard_mutex
171 */ 171 */
172struct cred *prepare_exec_creds(void) 172struct cred *prepare_exec_creds(void)
173{ 173{
@@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
276 struct cred *new; 276 struct cred *new;
277 int ret; 277 int ret;
278 278
279 mutex_init(&p->cred_exec_mutex); 279 mutex_init(&p->cred_guard_mutex);
280 280
281 if ( 281 if (
282#ifdef CONFIG_KEYS 282#ifdef CONFIG_KEYS
diff --git a/kernel/exit.c b/kernel/exit.c
index abf9cf3b95c6..869dc221733e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h> 15#include <linux/iocontext.h>
17#include <linux/key.h> 16#include <linux/key.h>
18#include <linux/security.h> 17#include <linux/security.h>
@@ -48,7 +47,8 @@
48#include <linux/tracehook.h> 47#include <linux/tracehook.h>
49#include <linux/fs_struct.h> 48#include <linux/fs_struct.h>
50#include <linux/init_task.h> 49#include <linux/init_task.h>
51#include <trace/sched.h> 50#include <linux/perf_counter.h>
51#include <trace/events/sched.h>
52 52
53#include <asm/uaccess.h> 53#include <asm/uaccess.h>
54#include <asm/unistd.h> 54#include <asm/unistd.h>
@@ -56,10 +56,6 @@
56#include <asm/mmu_context.h> 56#include <asm/mmu_context.h>
57#include "cred-internals.h" 57#include "cred-internals.h"
58 58
59DEFINE_TRACE(sched_process_free);
60DEFINE_TRACE(sched_process_exit);
61DEFINE_TRACE(sched_process_wait);
62
63static void exit_mm(struct task_struct * tsk); 59static void exit_mm(struct task_struct * tsk);
64 60
65static void __unhash_process(struct task_struct *p) 61static void __unhash_process(struct task_struct *p)
@@ -158,6 +154,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
158{ 154{
159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
160 156
157#ifdef CONFIG_PERF_COUNTERS
158 WARN_ON_ONCE(tsk->perf_counter_ctxp);
159#endif
161 trace_sched_process_free(tsk); 160 trace_sched_process_free(tsk);
162 put_task_struct(tsk); 161 put_task_struct(tsk);
163} 162}
@@ -174,6 +173,7 @@ repeat:
174 atomic_dec(&__task_cred(p)->user->processes); 173 atomic_dec(&__task_cred(p)->user->processes);
175 174
176 proc_flush_task(p); 175 proc_flush_task(p);
176
177 write_lock_irq(&tasklist_lock); 177 write_lock_irq(&tasklist_lock);
178 tracehook_finish_release_task(p); 178 tracehook_finish_release_task(p);
179 __exit_signal(p); 179 __exit_signal(p);
@@ -374,9 +374,8 @@ static void set_special_pids(struct pid *pid)
374} 374}
375 375
376/* 376/*
377 * Let kernel threads use this to say that they 377 * Let kernel threads use this to say that they allow a certain signal.
378 * allow a certain signal (since daemonize() will 378 * Must not be used if kthread was cloned with CLONE_SIGHAND.
379 * have disabled all of them by default).
380 */ 379 */
381int allow_signal(int sig) 380int allow_signal(int sig)
382{ 381{
@@ -384,14 +383,14 @@ int allow_signal(int sig)
384 return -EINVAL; 383 return -EINVAL;
385 384
386 spin_lock_irq(&current->sighand->siglock); 385 spin_lock_irq(&current->sighand->siglock);
386 /* This is only needed for daemonize()'ed kthreads */
387 sigdelset(&current->blocked, sig); 387 sigdelset(&current->blocked, sig);
388 if (!current->mm) { 388 /*
389 /* Kernel threads handle their own signals. 389 * Kernel threads handle their own signals. Let the signal code
390 Let the signal code know it'll be handled, so 390 * know it'll be handled, so that they don't get converted to
391 that they don't get converted to SIGKILL or 391 * SIGKILL or just silently dropped.
392 just silently dropped */ 392 */
393 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; 393 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
394 }
395 recalc_sigpending(); 394 recalc_sigpending();
396 spin_unlock_irq(&current->sighand->siglock); 395 spin_unlock_irq(&current->sighand->siglock);
397 return 0; 396 return 0;
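
A small sketch (not from the patch) of the usage the rewritten allow_signal() comment describes: a kernel thread opts in to exactly one signal and handles it itself rather than letting it be dropped or converted to SIGKILL. The thread function name is hypothetical:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/signal.h>

static int demo_kthread_fn(void *data)
{
        allow_signal(SIGTERM);                  /* opt in to this one signal */

        while (!kthread_should_stop()) {
                schedule_timeout_interruptible(HZ);
                if (signal_pending(current)) {
                        flush_signals(current); /* we handle it ourselves */
                        break;
                }
        }
        return 0;
}
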
@@ -590,7 +589,7 @@ retry:
590 /* 589 /*
591 * Search in the siblings 590 * Search in the siblings
592 */ 591 */
593 list_for_each_entry(c, &p->parent->children, sibling) { 592 list_for_each_entry(c, &p->real_parent->children, sibling) {
594 if (c->mm == mm) 593 if (c->mm == mm)
595 goto assign_new_owner; 594 goto assign_new_owner;
596 } 595 }
@@ -757,7 +756,7 @@ static void reparent_thread(struct task_struct *father, struct task_struct *p,
757 p->exit_signal = SIGCHLD; 756 p->exit_signal = SIGCHLD;
758 757
759 /* If it has exited notify the new parent about this child's death. */ 758 /* If it has exited notify the new parent about this child's death. */
760 if (!p->ptrace && 759 if (!task_ptrace(p) &&
761 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 760 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
762 do_notify_parent(p, p->exit_signal); 761 do_notify_parent(p, p->exit_signal);
763 if (task_detached(p)) { 762 if (task_detached(p)) {
@@ -782,7 +781,7 @@ static void forget_original_parent(struct task_struct *father)
782 list_for_each_entry_safe(p, n, &father->children, sibling) { 781 list_for_each_entry_safe(p, n, &father->children, sibling) {
783 p->real_parent = reaper; 782 p->real_parent = reaper;
784 if (p->parent == father) { 783 if (p->parent == father) {
785 BUG_ON(p->ptrace); 784 BUG_ON(task_ptrace(p));
786 p->parent = p->real_parent; 785 p->parent = p->real_parent;
787 } 786 }
788 reparent_thread(father, p, &dead_children); 787 reparent_thread(father, p, &dead_children);
@@ -975,16 +974,19 @@ NORET_TYPE void do_exit(long code)
975 module_put(tsk->binfmt->module); 974 module_put(tsk->binfmt->module);
976 975
977 proc_exit_connector(tsk); 976 proc_exit_connector(tsk);
977
978 /*
979 * Flush inherited counters to the parent - before the parent
980 * gets woken up by child-exit notifications.
981 */
982 perf_counter_exit_task(tsk);
983
978 exit_notify(tsk, group_dead); 984 exit_notify(tsk, group_dead);
979#ifdef CONFIG_NUMA 985#ifdef CONFIG_NUMA
980 mpol_put(tsk->mempolicy); 986 mpol_put(tsk->mempolicy);
981 tsk->mempolicy = NULL; 987 tsk->mempolicy = NULL;
982#endif 988#endif
983#ifdef CONFIG_FUTEX 989#ifdef CONFIG_FUTEX
984 /*
985 * This must happen late, after the PID is not
986 * hashed anymore:
987 */
988 if (unlikely(!list_empty(&tsk->pi_state_list))) 990 if (unlikely(!list_empty(&tsk->pi_state_list)))
989 exit_pi_state_list(tsk); 991 exit_pi_state_list(tsk);
990 if (unlikely(current->pi_state_cache)) 992 if (unlikely(current->pi_state_cache))
@@ -1077,6 +1079,18 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
1077 return 0; 1079 return 0;
1078} 1080}
1079 1081
1082struct wait_opts {
1083 enum pid_type wo_type;
1084 int wo_flags;
1085 struct pid *wo_pid;
1086
1087 struct siginfo __user *wo_info;
1088 int __user *wo_stat;
1089 struct rusage __user *wo_rusage;
1090
1091 int notask_error;
1092};
1093
1080static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1094static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1081{ 1095{
1082 struct pid *pid = NULL; 1096 struct pid *pid = NULL;
@@ -1087,13 +1101,12 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1087 return pid; 1101 return pid;
1088} 1102}
1089 1103
1090static int eligible_child(enum pid_type type, struct pid *pid, int options, 1104static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1091 struct task_struct *p)
1092{ 1105{
1093 int err; 1106 int err;
1094 1107
1095 if (type < PIDTYPE_MAX) { 1108 if (wo->wo_type < PIDTYPE_MAX) {
1096 if (task_pid_type(p, type) != pid) 1109 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1097 return 0; 1110 return 0;
1098 } 1111 }
1099 1112
@@ -1102,8 +1115,8 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1102 * set; otherwise, wait for non-clone children *only*. (Note: 1115 * set; otherwise, wait for non-clone children *only*. (Note:
1103 * A "clone" child here is one that reports to its parent 1116 * A "clone" child here is one that reports to its parent
1104 * using a signal other than SIGCHLD.) */ 1117 * using a signal other than SIGCHLD.) */
1105 if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) 1118 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1106 && !(options & __WALL)) 1119 && !(wo->wo_flags & __WALL))
1107 return 0; 1120 return 0;
1108 1121
1109 err = security_task_wait(p); 1122 err = security_task_wait(p);
@@ -1113,14 +1126,15 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1113 return 1; 1126 return 1;
1114} 1127}
1115 1128
1116static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, 1129static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1117 int why, int status, 1130 pid_t pid, uid_t uid, int why, int status)
1118 struct siginfo __user *infop,
1119 struct rusage __user *rusagep)
1120{ 1131{
1121 int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; 1132 struct siginfo __user *infop;
1133 int retval = wo->wo_rusage
1134 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1122 1135
1123 put_task_struct(p); 1136 put_task_struct(p);
1137 infop = wo->wo_info;
1124 if (!retval) 1138 if (!retval)
1125 retval = put_user(SIGCHLD, &infop->si_signo); 1139 retval = put_user(SIGCHLD, &infop->si_signo);
1126 if (!retval) 1140 if (!retval)
@@ -1144,19 +1158,18 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
1144 * the lock and this task is uninteresting. If we return nonzero, we have 1158 * the lock and this task is uninteresting. If we return nonzero, we have
1145 * released the lock and the system call should return. 1159 * released the lock and the system call should return.
1146 */ 1160 */
1147static int wait_task_zombie(struct task_struct *p, int options, 1161static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1148 struct siginfo __user *infop,
1149 int __user *stat_addr, struct rusage __user *ru)
1150{ 1162{
1151 unsigned long state; 1163 unsigned long state;
1152 int retval, status, traced; 1164 int retval, status, traced;
1153 pid_t pid = task_pid_vnr(p); 1165 pid_t pid = task_pid_vnr(p);
1154 uid_t uid = __task_cred(p)->uid; 1166 uid_t uid = __task_cred(p)->uid;
1167 struct siginfo __user *infop;
1155 1168
1156 if (!likely(options & WEXITED)) 1169 if (!likely(wo->wo_flags & WEXITED))
1157 return 0; 1170 return 0;
1158 1171
1159 if (unlikely(options & WNOWAIT)) { 1172 if (unlikely(wo->wo_flags & WNOWAIT)) {
1160 int exit_code = p->exit_code; 1173 int exit_code = p->exit_code;
1161 int why, status; 1174 int why, status;
1162 1175
@@ -1169,8 +1182,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1169 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; 1182 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1170 status = exit_code & 0x7f; 1183 status = exit_code & 0x7f;
1171 } 1184 }
1172 return wait_noreap_copyout(p, pid, uid, why, 1185 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1173 status, infop, ru);
1174 } 1186 }
1175 1187
1176 /* 1188 /*
@@ -1184,11 +1196,13 @@ static int wait_task_zombie(struct task_struct *p, int options,
1184 } 1196 }
1185 1197
1186 traced = ptrace_reparented(p); 1198 traced = ptrace_reparented(p);
1187 1199 /*
1188 if (likely(!traced)) { 1200 * It can be ptraced but not reparented, check
1201 * !task_detached() to filter out sub-threads.
1202 */
1203 if (likely(!traced) && likely(!task_detached(p))) {
1189 struct signal_struct *psig; 1204 struct signal_struct *psig;
1190 struct signal_struct *sig; 1205 struct signal_struct *sig;
1191 struct task_cputime cputime;
1192 1206
1193 /* 1207 /*
1194 * The resource counters for the group leader are in its 1208 * The resource counters for the group leader are in its
@@ -1201,26 +1215,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
1201 * p->signal fields, because they are only touched by 1215 * p->signal fields, because they are only touched by
1202 * __exit_signal, which runs with tasklist_lock 1216 * __exit_signal, which runs with tasklist_lock
1203 * write-locked anyway, and so is excluded here. We do 1217 * write-locked anyway, and so is excluded here. We do
1204 * need to protect the access to p->parent->signal fields, 1218 * need to protect the access to parent->signal fields,
1205 * as other threads in the parent group can be right 1219 * as other threads in the parent group can be right
1206 * here reaping other children at the same time. 1220 * here reaping other children at the same time.
1207 *
1208 * We use thread_group_cputime() to get times for the thread
1209 * group, which consolidates times for all threads in the
1210 * group including the group leader.
1211 */ 1221 */
1212 thread_group_cputime(p, &cputime); 1222 spin_lock_irq(&p->real_parent->sighand->siglock);
1213 spin_lock_irq(&p->parent->sighand->siglock); 1223 psig = p->real_parent->signal;
1214 psig = p->parent->signal;
1215 sig = p->signal; 1224 sig = p->signal;
1216 psig->cutime = 1225 psig->cutime =
1217 cputime_add(psig->cutime, 1226 cputime_add(psig->cutime,
1218 cputime_add(cputime.utime, 1227 cputime_add(p->utime,
1219 sig->cutime)); 1228 cputime_add(sig->utime,
1229 sig->cutime)));
1220 psig->cstime = 1230 psig->cstime =
1221 cputime_add(psig->cstime, 1231 cputime_add(psig->cstime,
1222 cputime_add(cputime.stime, 1232 cputime_add(p->stime,
1223 sig->cstime)); 1233 cputime_add(sig->stime,
1234 sig->cstime)));
1224 psig->cgtime = 1235 psig->cgtime =
1225 cputime_add(psig->cgtime, 1236 cputime_add(psig->cgtime,
1226 cputime_add(p->gtime, 1237 cputime_add(p->gtime,
@@ -1242,7 +1253,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1242 sig->oublock + sig->coublock; 1253 sig->oublock + sig->coublock;
1243 task_io_accounting_add(&psig->ioac, &p->ioac); 1254 task_io_accounting_add(&psig->ioac, &p->ioac);
1244 task_io_accounting_add(&psig->ioac, &sig->ioac); 1255 task_io_accounting_add(&psig->ioac, &sig->ioac);
1245 spin_unlock_irq(&p->parent->sighand->siglock); 1256 spin_unlock_irq(&p->real_parent->sighand->siglock);
1246 } 1257 }
1247 1258
1248 /* 1259 /*
@@ -1251,11 +1262,14 @@ static int wait_task_zombie(struct task_struct *p, int options,
1251 */ 1262 */
1252 read_unlock(&tasklist_lock); 1263 read_unlock(&tasklist_lock);
1253 1264
1254 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1265 retval = wo->wo_rusage
1266 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1255 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1267 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1256 ? p->signal->group_exit_code : p->exit_code; 1268 ? p->signal->group_exit_code : p->exit_code;
1257 if (!retval && stat_addr) 1269 if (!retval && wo->wo_stat)
1258 retval = put_user(status, stat_addr); 1270 retval = put_user(status, wo->wo_stat);
1271
1272 infop = wo->wo_info;
1259 if (!retval && infop) 1273 if (!retval && infop)
1260 retval = put_user(SIGCHLD, &infop->si_signo); 1274 retval = put_user(SIGCHLD, &infop->si_signo);
1261 if (!retval && infop) 1275 if (!retval && infop)
@@ -1323,15 +1337,18 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1323 * the lock and this task is uninteresting. If we return nonzero, we have 1337 * the lock and this task is uninteresting. If we return nonzero, we have
1324 * released the lock and the system call should return. 1338 * released the lock and the system call should return.
1325 */ 1339 */
1326static int wait_task_stopped(int ptrace, struct task_struct *p, 1340static int wait_task_stopped(struct wait_opts *wo,
1327 int options, struct siginfo __user *infop, 1341 int ptrace, struct task_struct *p)
1328 int __user *stat_addr, struct rusage __user *ru)
1329{ 1342{
1343 struct siginfo __user *infop;
1330 int retval, exit_code, *p_code, why; 1344 int retval, exit_code, *p_code, why;
1331 uid_t uid = 0; /* unneeded, required by compiler */ 1345 uid_t uid = 0; /* unneeded, required by compiler */
1332 pid_t pid; 1346 pid_t pid;
1333 1347
1334 if (!(options & WUNTRACED)) 1348 /*
1349 * Traditionally we see ptrace'd stopped tasks regardless of options.
1350 */
1351 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1335 return 0; 1352 return 0;
1336 1353
1337 exit_code = 0; 1354 exit_code = 0;
@@ -1345,7 +1362,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1345 if (!exit_code) 1362 if (!exit_code)
1346 goto unlock_sig; 1363 goto unlock_sig;
1347 1364
1348 if (!unlikely(options & WNOWAIT)) 1365 if (!unlikely(wo->wo_flags & WNOWAIT))
1349 *p_code = 0; 1366 *p_code = 0;
1350 1367
1351 /* don't need the RCU readlock here as we're holding a spinlock */ 1368 /* don't need the RCU readlock here as we're holding a spinlock */
@@ -1367,14 +1384,15 @@ unlock_sig:
1367 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1384 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1368 read_unlock(&tasklist_lock); 1385 read_unlock(&tasklist_lock);
1369 1386
1370 if (unlikely(options & WNOWAIT)) 1387 if (unlikely(wo->wo_flags & WNOWAIT))
1371 return wait_noreap_copyout(p, pid, uid, 1388 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1372 why, exit_code, 1389
1373 infop, ru); 1390 retval = wo->wo_rusage
1391 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1392 if (!retval && wo->wo_stat)
1393 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1374 1394
1375 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1395 infop = wo->wo_info;
1376 if (!retval && stat_addr)
1377 retval = put_user((exit_code << 8) | 0x7f, stat_addr);
1378 if (!retval && infop) 1396 if (!retval && infop)
1379 retval = put_user(SIGCHLD, &infop->si_signo); 1397 retval = put_user(SIGCHLD, &infop->si_signo);
1380 if (!retval && infop) 1398 if (!retval && infop)
@@ -1401,15 +1419,13 @@ unlock_sig:
1401 * the lock and this task is uninteresting. If we return nonzero, we have 1419 * the lock and this task is uninteresting. If we return nonzero, we have
1402 * released the lock and the system call should return. 1420 * released the lock and the system call should return.
1403 */ 1421 */
1404static int wait_task_continued(struct task_struct *p, int options, 1422static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1405 struct siginfo __user *infop,
1406 int __user *stat_addr, struct rusage __user *ru)
1407{ 1423{
1408 int retval; 1424 int retval;
1409 pid_t pid; 1425 pid_t pid;
1410 uid_t uid; 1426 uid_t uid;
1411 1427
1412 if (!unlikely(options & WCONTINUED)) 1428 if (!unlikely(wo->wo_flags & WCONTINUED))
1413 return 0; 1429 return 0;
1414 1430
1415 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1431 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
@@ -1421,7 +1437,7 @@ static int wait_task_continued(struct task_struct *p, int options,
1421 spin_unlock_irq(&p->sighand->siglock); 1437 spin_unlock_irq(&p->sighand->siglock);
1422 return 0; 1438 return 0;
1423 } 1439 }
1424 if (!unlikely(options & WNOWAIT)) 1440 if (!unlikely(wo->wo_flags & WNOWAIT))
1425 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1441 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1426 uid = __task_cred(p)->uid; 1442 uid = __task_cred(p)->uid;
1427 spin_unlock_irq(&p->sighand->siglock); 1443 spin_unlock_irq(&p->sighand->siglock);
@@ -1430,17 +1446,17 @@ static int wait_task_continued(struct task_struct *p, int options,
1430 get_task_struct(p); 1446 get_task_struct(p);
1431 read_unlock(&tasklist_lock); 1447 read_unlock(&tasklist_lock);
1432 1448
1433 if (!infop) { 1449 if (!wo->wo_info) {
1434 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1450 retval = wo->wo_rusage
1451 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1435 put_task_struct(p); 1452 put_task_struct(p);
1436 if (!retval && stat_addr) 1453 if (!retval && wo->wo_stat)
1437 retval = put_user(0xffff, stat_addr); 1454 retval = put_user(0xffff, wo->wo_stat);
1438 if (!retval) 1455 if (!retval)
1439 retval = pid; 1456 retval = pid;
1440 } else { 1457 } else {
1441 retval = wait_noreap_copyout(p, pid, uid, 1458 retval = wait_noreap_copyout(wo, p, pid, uid,
1442 CLD_CONTINUED, SIGCONT, 1459 CLD_CONTINUED, SIGCONT);
1443 infop, ru);
1444 BUG_ON(retval == 0); 1460 BUG_ON(retval == 0);
1445 } 1461 }
1446 1462
@@ -1450,19 +1466,16 @@ static int wait_task_continued(struct task_struct *p, int options,
1450/* 1466/*
1451 * Consider @p for a wait by @parent. 1467 * Consider @p for a wait by @parent.
1452 * 1468 *
1453 * -ECHILD should be in *@notask_error before the first call. 1469 * -ECHILD should be in ->notask_error before the first call.
1454 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1470 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1455 * Returns zero if the search for a child should continue; 1471 * Returns zero if the search for a child should continue;
1456 * then *@notask_error is 0 if @p is an eligible child, 1472 * then ->notask_error is 0 if @p is an eligible child,
1457 * or another error from security_task_wait(), or still -ECHILD. 1473 * or another error from security_task_wait(), or still -ECHILD.
1458 */ 1474 */
1459static int wait_consider_task(struct task_struct *parent, int ptrace, 1475static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
1460 struct task_struct *p, int *notask_error, 1476 int ptrace, struct task_struct *p)
1461 enum pid_type type, struct pid *pid, int options,
1462 struct siginfo __user *infop,
1463 int __user *stat_addr, struct rusage __user *ru)
1464{ 1477{
1465 int ret = eligible_child(type, pid, options, p); 1478 int ret = eligible_child(wo, p);
1466 if (!ret) 1479 if (!ret)
1467 return ret; 1480 return ret;
1468 1481
@@ -1474,16 +1487,17 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1474 * to look for security policy problems, rather 1487 * to look for security policy problems, rather
1475 * than for mysterious wait bugs. 1488 * than for mysterious wait bugs.
1476 */ 1489 */
1477 if (*notask_error) 1490 if (wo->notask_error)
1478 *notask_error = ret; 1491 wo->notask_error = ret;
1492 return 0;
1479 } 1493 }
1480 1494
1481 if (likely(!ptrace) && unlikely(p->ptrace)) { 1495 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1482 /* 1496 /*
1483 * This child is hidden by ptrace. 1497 * This child is hidden by ptrace.
1484 * We aren't allowed to see it now, but eventually we will. 1498 * We aren't allowed to see it now, but eventually we will.
1485 */ 1499 */
1486 *notask_error = 0; 1500 wo->notask_error = 0;
1487 return 0; 1501 return 0;
1488 } 1502 }
1489 1503
@@ -1494,34 +1508,30 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1494 * We don't reap group leaders with subthreads. 1508 * We don't reap group leaders with subthreads.
1495 */ 1509 */
1496 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1510 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
1497 return wait_task_zombie(p, options, infop, stat_addr, ru); 1511 return wait_task_zombie(wo, p);
1498 1512
1499 /* 1513 /*
1500 * It's stopped or running now, so it might 1514 * It's stopped or running now, so it might
1501 * later continue, exit, or stop again. 1515 * later continue, exit, or stop again.
1502 */ 1516 */
1503 *notask_error = 0; 1517 wo->notask_error = 0;
1504 1518
1505 if (task_stopped_code(p, ptrace)) 1519 if (task_stopped_code(p, ptrace))
1506 return wait_task_stopped(ptrace, p, options, 1520 return wait_task_stopped(wo, ptrace, p);
1507 infop, stat_addr, ru);
1508 1521
1509 return wait_task_continued(p, options, infop, stat_addr, ru); 1522 return wait_task_continued(wo, p);
1510} 1523}
1511 1524
1512/* 1525/*
1513 * Do the work of do_wait() for one thread in the group, @tsk. 1526 * Do the work of do_wait() for one thread in the group, @tsk.
1514 * 1527 *
1515 * -ECHILD should be in *@notask_error before the first call. 1528 * -ECHILD should be in ->notask_error before the first call.
1516 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1529 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1517 * Returns zero if the search for a child should continue; then 1530 * Returns zero if the search for a child should continue; then
1518 * *@notask_error is 0 if there were any eligible children, 1531 * ->notask_error is 0 if there were any eligible children,
1519 * or another error from security_task_wait(), or still -ECHILD. 1532 * or another error from security_task_wait(), or still -ECHILD.
1520 */ 1533 */
1521static int do_wait_thread(struct task_struct *tsk, int *notask_error, 1534static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1522 enum pid_type type, struct pid *pid, int options,
1523 struct siginfo __user *infop, int __user *stat_addr,
1524 struct rusage __user *ru)
1525{ 1535{
1526 struct task_struct *p; 1536 struct task_struct *p;
1527 1537
@@ -1530,9 +1540,7 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1530 * Do not consider detached threads. 1540 * Do not consider detached threads.
1531 */ 1541 */
1532 if (!task_detached(p)) { 1542 if (!task_detached(p)) {
1533 int ret = wait_consider_task(tsk, 0, p, notask_error, 1543 int ret = wait_consider_task(wo, tsk, 0, p);
1534 type, pid, options,
1535 infop, stat_addr, ru);
1536 if (ret) 1544 if (ret)
1537 return ret; 1545 return ret;
1538 } 1546 }
@@ -1541,22 +1549,12 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1541 return 0; 1549 return 0;
1542} 1550}
1543 1551
1544static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, 1552static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1545 enum pid_type type, struct pid *pid, int options,
1546 struct siginfo __user *infop, int __user *stat_addr,
1547 struct rusage __user *ru)
1548{ 1553{
1549 struct task_struct *p; 1554 struct task_struct *p;
1550 1555
1551 /*
1552 * Traditionally we see ptrace'd stopped tasks regardless of options.
1553 */
1554 options |= WUNTRACED;
1555
1556 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1556 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1557 int ret = wait_consider_task(tsk, 1, p, notask_error, 1557 int ret = wait_consider_task(wo, tsk, 1, p);
1558 type, pid, options,
1559 infop, stat_addr, ru);
1560 if (ret) 1558 if (ret)
1561 return ret; 1559 return ret;
1562 } 1560 }
@@ -1564,65 +1562,59 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
1564 return 0; 1562 return 0;
1565} 1563}
1566 1564
1567static long do_wait(enum pid_type type, struct pid *pid, int options, 1565static long do_wait(struct wait_opts *wo)
1568 struct siginfo __user *infop, int __user *stat_addr,
1569 struct rusage __user *ru)
1570{ 1566{
1571 DECLARE_WAITQUEUE(wait, current); 1567 DECLARE_WAITQUEUE(wait, current);
1572 struct task_struct *tsk; 1568 struct task_struct *tsk;
1573 int retval; 1569 int retval;
1574 1570
1575 trace_sched_process_wait(pid); 1571 trace_sched_process_wait(wo->wo_pid);
1576 1572
1577 add_wait_queue(&current->signal->wait_chldexit,&wait); 1573 add_wait_queue(&current->signal->wait_chldexit,&wait);
1578repeat: 1574repeat:
1579 /* 1575 /*
1580 * If there is nothing that can match our criteria just get out. 1576 * If there is nothing that can match our criteria just get out.
1581 * We will clear @retval to zero if we see any child that might later 1577 * We will clear ->notask_error to zero if we see any child that
1582 * match our criteria, even if we are not able to reap it yet. 1578 * might later match our criteria, even if we are not able to reap
1579 * it yet.
1583 */ 1580 */
1584 retval = -ECHILD; 1581 wo->notask_error = -ECHILD;
1585 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) 1582 if ((wo->wo_type < PIDTYPE_MAX) &&
1586 goto end; 1583 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1584 goto notask;
1587 1585
1588 current->state = TASK_INTERRUPTIBLE; 1586 set_current_state(TASK_INTERRUPTIBLE);
1589 read_lock(&tasklist_lock); 1587 read_lock(&tasklist_lock);
1590 tsk = current; 1588 tsk = current;
1591 do { 1589 do {
1592 int tsk_result = do_wait_thread(tsk, &retval, 1590 retval = do_wait_thread(wo, tsk);
1593 type, pid, options, 1591 if (retval)
1594 infop, stat_addr, ru);
1595 if (!tsk_result)
1596 tsk_result = ptrace_do_wait(tsk, &retval,
1597 type, pid, options,
1598 infop, stat_addr, ru);
1599 if (tsk_result) {
1600 /*
1601 * tasklist_lock is unlocked and we have a final result.
1602 */
1603 retval = tsk_result;
1604 goto end; 1592 goto end;
1605 }
1606 1593
1607 if (options & __WNOTHREAD) 1594 retval = ptrace_do_wait(wo, tsk);
1595 if (retval)
1596 goto end;
1597
1598 if (wo->wo_flags & __WNOTHREAD)
1608 break; 1599 break;
1609 tsk = next_thread(tsk); 1600 } while_each_thread(current, tsk);
1610 BUG_ON(tsk->signal != current->signal);
1611 } while (tsk != current);
1612 read_unlock(&tasklist_lock); 1601 read_unlock(&tasklist_lock);
1613 1602
1614 if (!retval && !(options & WNOHANG)) { 1603notask:
1604 retval = wo->notask_error;
1605 if (!retval && !(wo->wo_flags & WNOHANG)) {
1615 retval = -ERESTARTSYS; 1606 retval = -ERESTARTSYS;
1616 if (!signal_pending(current)) { 1607 if (!signal_pending(current)) {
1617 schedule(); 1608 schedule();
1618 goto repeat; 1609 goto repeat;
1619 } 1610 }
1620 } 1611 }
1621
1622end: 1612end:
1623 current->state = TASK_RUNNING; 1613 __set_current_state(TASK_RUNNING);
1624 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1614 remove_wait_queue(&current->signal->wait_chldexit,&wait);
1625 if (infop) { 1615 if (wo->wo_info) {
1616 struct siginfo __user *infop = wo->wo_info;
1617
1626 if (retval > 0) 1618 if (retval > 0)
1627 retval = 0; 1619 retval = 0;
1628 else { 1620 else {
@@ -1651,6 +1643,7 @@ end:
1651SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1643SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1652 infop, int, options, struct rusage __user *, ru) 1644 infop, int, options, struct rusage __user *, ru)
1653{ 1645{
1646 struct wait_opts wo;
1654 struct pid *pid = NULL; 1647 struct pid *pid = NULL;
1655 enum pid_type type; 1648 enum pid_type type;
1656 long ret; 1649 long ret;
@@ -1680,7 +1673,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1680 1673
1681 if (type < PIDTYPE_MAX) 1674 if (type < PIDTYPE_MAX)
1682 pid = find_get_pid(upid); 1675 pid = find_get_pid(upid);
1683 ret = do_wait(type, pid, options, infop, NULL, ru); 1676
1677 wo.wo_type = type;
1678 wo.wo_pid = pid;
1679 wo.wo_flags = options;
1680 wo.wo_info = infop;
1681 wo.wo_stat = NULL;
1682 wo.wo_rusage = ru;
1683 ret = do_wait(&wo);
1684 put_pid(pid); 1684 put_pid(pid);
1685 1685
1686 /* avoid REGPARM breakage on x86: */ 1686 /* avoid REGPARM breakage on x86: */
@@ -1691,6 +1691,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1691SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1691SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1692 int, options, struct rusage __user *, ru) 1692 int, options, struct rusage __user *, ru)
1693{ 1693{
1694 struct wait_opts wo;
1694 struct pid *pid = NULL; 1695 struct pid *pid = NULL;
1695 enum pid_type type; 1696 enum pid_type type;
1696 long ret; 1697 long ret;
@@ -1712,7 +1713,13 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1712 pid = find_get_pid(upid); 1713 pid = find_get_pid(upid);
1713 } 1714 }
1714 1715
1715 ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); 1716 wo.wo_type = type;
1717 wo.wo_pid = pid;
1718 wo.wo_flags = options | WEXITED;
1719 wo.wo_info = NULL;
1720 wo.wo_stat = stat_addr;
1721 wo.wo_rusage = ru;
1722 ret = do_wait(&wo);
1716 put_pid(pid); 1723 put_pid(pid);
1717 1724
1718 /* avoid REGPARM breakage on x86: */ 1725 /* avoid REGPARM breakage on x86: */
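
The exit.c changes above fold the old do_wait() parameter list into struct wait_opts; wo_flags still carries the userspace options. A small userspace illustration (not from the patch, error handling omitted) of two of those flags, WEXITED and WNOWAIT, which exercise the wait_noreap_copyout() path first and the normal reaping path second:

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        siginfo_t info;
        pid_t pid = fork();

        if (pid == 0)
                _exit(42);                              /* child */

        waitid(P_PID, pid, &info, WEXITED | WNOWAIT);   /* peek, do not reap */
        printf("peeked: pid %d status %d\n", info.si_pid, info.si_status);

        waitid(P_PID, pid, &info, WEXITED);             /* now reap it */
        printf("reaped: pid %d status %d\n", info.si_pid, info.si_status);
        return 0;
}
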
diff --git a/kernel/fork.c b/kernel/fork.c
index 875ffbdd96d0..e6c04d462ab2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/mnt_namespace.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/mempolicy.h> 21#include <linux/mempolicy.h>
23#include <linux/sem.h> 22#include <linux/sem.h>
@@ -61,8 +60,8 @@
61#include <linux/proc_fs.h> 60#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 61#include <linux/blkdev.h>
63#include <linux/fs_struct.h> 62#include <linux/fs_struct.h>
64#include <trace/sched.h>
65#include <linux/magic.h> 63#include <linux/magic.h>
64#include <linux/perf_counter.h>
66 65
67#include <asm/pgtable.h> 66#include <asm/pgtable.h>
68#include <asm/pgalloc.h> 67#include <asm/pgalloc.h>
@@ -71,6 +70,8 @@
71#include <asm/cacheflush.h> 70#include <asm/cacheflush.h>
72#include <asm/tlbflush.h> 71#include <asm/tlbflush.h>
73 72
73#include <trace/events/sched.h>
74
74/* 75/*
75 * Protected counters by write_lock_irq(&tasklist_lock) 76 * Protected counters by write_lock_irq(&tasklist_lock)
76 */ 77 */
@@ -83,8 +84,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
83 84
84__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 85__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
85 86
86DEFINE_TRACE(sched_process_fork);
87
88int nr_processes(void) 87int nr_processes(void)
89{ 88{
90 int cpu; 89 int cpu;
@@ -178,7 +177,7 @@ void __init fork_init(unsigned long mempages)
178 /* create a slab on which task_structs can be allocated */ 177 /* create a slab on which task_structs can be allocated */
179 task_struct_cachep = 178 task_struct_cachep =
180 kmem_cache_create("task_struct", sizeof(struct task_struct), 179 kmem_cache_create("task_struct", sizeof(struct task_struct),
181 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); 180 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
182#endif 181#endif
183 182
184 /* do the arch specific task caches init */ 183 /* do the arch specific task caches init */
@@ -568,18 +567,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
568 * the value intact in a core dump, and to save the unnecessary 567 * the value intact in a core dump, and to save the unnecessary
569 * trouble otherwise. Userland only wants this done for a sys_exit. 568 * trouble otherwise. Userland only wants this done for a sys_exit.
570 */ 569 */
571 if (tsk->clear_child_tid 570 if (tsk->clear_child_tid) {
572 && !(tsk->flags & PF_SIGNALED) 571 if (!(tsk->flags & PF_SIGNALED) &&
573 && atomic_read(&mm->mm_users) > 1) { 572 atomic_read(&mm->mm_users) > 1) {
574 u32 __user * tidptr = tsk->clear_child_tid; 573 /*
574 * We don't check the error code - if userspace has
575 * not set up a proper pointer then tough luck.
576 */
577 put_user(0, tsk->clear_child_tid);
578 sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
579 1, NULL, NULL, 0);
580 }
575 tsk->clear_child_tid = NULL; 581 tsk->clear_child_tid = NULL;
576
577 /*
578 * We don't check the error code - if userspace has
579 * not set up a proper pointer then tough luck.
580 */
581 put_user(0, tidptr);
582 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
583 } 582 }
584} 583}
585 584
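
The mm_release() hunk above is the kernel half of the CLONE_CHILD_CLEARTID contract: on exit the child's TID word is zeroed and FUTEX_WAKE'd. A userspace sketch (not from the patch, error handling omitted) of the other half, roughly how thread libraries wait for a thread to die; it relies on CLONE_CHILD_SETTID storing the TID before clone() returns in the parent:

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

static volatile pid_t child_tid;        /* kernel writes this, then clears it on exit */

static int demo_child(void *arg)
{
        return 0;                       /* exit straight away */
}

int main(void)
{
        size_t sz = 64 * 1024;
        char *stack = malloc(sz);

        clone(demo_child, stack + sz,
              CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD,
              NULL, NULL, NULL, (pid_t *)&child_tid);

        /* Sleep until mm_release() zeroes child_tid and wakes us. */
        while (child_tid != 0)
                syscall(SYS_futex, &child_tid, FUTEX_WAIT, child_tid,
                        NULL, NULL, 0);

        printf("child exited, tid word cleared\n");
        free(stack);
        return 0;
}
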
@@ -816,11 +815,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
816{ 815{
817 struct signal_struct *sig; 816 struct signal_struct *sig;
818 817
819 if (clone_flags & CLONE_THREAD) { 818 if (clone_flags & CLONE_THREAD)
820 atomic_inc(&current->signal->count);
821 atomic_inc(&current->signal->live);
822 return 0; 819 return 0;
823 }
824 820
825 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 821 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
826 tsk->signal = sig; 822 tsk->signal = sig;
@@ -878,16 +874,6 @@ void __cleanup_signal(struct signal_struct *sig)
878 kmem_cache_free(signal_cachep, sig); 874 kmem_cache_free(signal_cachep, sig);
879} 875}
880 876
881static void cleanup_signal(struct task_struct *tsk)
882{
883 struct signal_struct *sig = tsk->signal;
884
885 atomic_dec(&sig->live);
886
887 if (atomic_dec_and_test(&sig->count))
888 __cleanup_signal(sig);
889}
890
891static void copy_flags(unsigned long clone_flags, struct task_struct *p) 877static void copy_flags(unsigned long clone_flags, struct task_struct *p)
892{ 878{
893 unsigned long new_flags = p->flags; 879 unsigned long new_flags = p->flags;
@@ -982,6 +968,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
982 if (!p) 968 if (!p)
983 goto fork_out; 969 goto fork_out;
984 970
971 ftrace_graph_init_task(p);
972
985 rt_mutex_init_task(p); 973 rt_mutex_init_task(p);
986 974
987#ifdef CONFIG_PROVE_LOCKING 975#ifdef CONFIG_PROVE_LOCKING
@@ -1027,7 +1015,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1027 p->vfork_done = NULL; 1015 p->vfork_done = NULL;
1028 spin_lock_init(&p->alloc_lock); 1016 spin_lock_init(&p->alloc_lock);
1029 1017
1030 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1031 init_sigpending(&p->pending); 1018 init_sigpending(&p->pending);
1032 1019
1033 p->utime = cputime_zero; 1020 p->utime = cputime_zero;
@@ -1089,12 +1076,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1089#ifdef CONFIG_DEBUG_MUTEXES 1076#ifdef CONFIG_DEBUG_MUTEXES
1090 p->blocked_on = NULL; /* not blocked yet */ 1077 p->blocked_on = NULL; /* not blocked yet */
1091#endif 1078#endif
1092 if (unlikely(current->ptrace)) 1079
1093 ptrace_fork(p, clone_flags); 1080 p->bts = NULL;
1094 1081
1095 /* Perform scheduler related setup. Assign this task to a CPU. */ 1082 /* Perform scheduler related setup. Assign this task to a CPU. */
1096 sched_fork(p, clone_flags); 1083 sched_fork(p, clone_flags);
1097 1084
1085 retval = perf_counter_init_task(p);
1086 if (retval)
1087 goto bad_fork_cleanup_policy;
1088
1098 if ((retval = audit_alloc(p))) 1089 if ((retval = audit_alloc(p)))
1099 goto bad_fork_cleanup_policy; 1090 goto bad_fork_cleanup_policy;
1100 /* copy all the process information */ 1091 /* copy all the process information */
@@ -1131,8 +1122,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1131 } 1122 }
1132 } 1123 }
1133 1124
1134 ftrace_graph_init_task(p);
1135
1136 p->pid = pid_nr(pid); 1125 p->pid = pid_nr(pid);
1137 p->tgid = p->pid; 1126 p->tgid = p->pid;
1138 if (clone_flags & CLONE_THREAD) 1127 if (clone_flags & CLONE_THREAD)
@@ -1141,7 +1130,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1141 if (current->nsproxy != p->nsproxy) { 1130 if (current->nsproxy != p->nsproxy) {
1142 retval = ns_cgroup_clone(p, pid); 1131 retval = ns_cgroup_clone(p, pid);
1143 if (retval) 1132 if (retval)
1144 goto bad_fork_free_graph; 1133 goto bad_fork_free_pid;
1145 } 1134 }
1146 1135
1147 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1136 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1233,10 +1222,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1233 spin_unlock(&current->sighand->siglock); 1222 spin_unlock(&current->sighand->siglock);
1234 write_unlock_irq(&tasklist_lock); 1223 write_unlock_irq(&tasklist_lock);
1235 retval = -ERESTARTNOINTR; 1224 retval = -ERESTARTNOINTR;
1236 goto bad_fork_free_graph; 1225 goto bad_fork_free_pid;
1237 } 1226 }
1238 1227
1239 if (clone_flags & CLONE_THREAD) { 1228 if (clone_flags & CLONE_THREAD) {
1229 atomic_inc(&current->signal->count);
1230 atomic_inc(&current->signal->live);
1240 p->group_leader = current->group_leader; 1231 p->group_leader = current->group_leader;
1241 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1232 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1242 } 1233 }
@@ -1266,10 +1257,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1266 write_unlock_irq(&tasklist_lock); 1257 write_unlock_irq(&tasklist_lock);
1267 proc_fork_connector(p); 1258 proc_fork_connector(p);
1268 cgroup_post_fork(p); 1259 cgroup_post_fork(p);
1260 perf_counter_fork(p);
1269 return p; 1261 return p;
1270 1262
1271bad_fork_free_graph:
1272 ftrace_graph_exit_task(p);
1273bad_fork_free_pid: 1263bad_fork_free_pid:
1274 if (pid != &init_struct_pid) 1264 if (pid != &init_struct_pid)
1275 free_pid(pid); 1265 free_pid(pid);
@@ -1281,7 +1271,8 @@ bad_fork_cleanup_mm:
1281 if (p->mm) 1271 if (p->mm)
1282 mmput(p->mm); 1272 mmput(p->mm);
1283bad_fork_cleanup_signal: 1273bad_fork_cleanup_signal:
1284 cleanup_signal(p); 1274 if (!(clone_flags & CLONE_THREAD))
1275 __cleanup_signal(p->signal);
1285bad_fork_cleanup_sighand: 1276bad_fork_cleanup_sighand:
1286 __cleanup_sighand(p->sighand); 1277 __cleanup_sighand(p->sighand);
1287bad_fork_cleanup_fs: 1278bad_fork_cleanup_fs:
@@ -1293,6 +1284,7 @@ bad_fork_cleanup_semundo:
1293bad_fork_cleanup_audit: 1284bad_fork_cleanup_audit:
1294 audit_free(p); 1285 audit_free(p);
1295bad_fork_cleanup_policy: 1286bad_fork_cleanup_policy:
1287 perf_counter_free_task(p);
1296#ifdef CONFIG_NUMA 1288#ifdef CONFIG_NUMA
1297 mpol_put(p->mempolicy); 1289 mpol_put(p->mempolicy);
1298bad_fork_cleanup_cgroup: 1290bad_fork_cleanup_cgroup:
@@ -1461,20 +1453,20 @@ void __init proc_caches_init(void)
1461{ 1453{
1462 sighand_cachep = kmem_cache_create("sighand_cache", 1454 sighand_cachep = kmem_cache_create("sighand_cache",
1463 sizeof(struct sighand_struct), 0, 1455 sizeof(struct sighand_struct), 0,
1464 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, 1456 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
1465 sighand_ctor); 1457 SLAB_NOTRACK, sighand_ctor);
1466 signal_cachep = kmem_cache_create("signal_cache", 1458 signal_cachep = kmem_cache_create("signal_cache",
1467 sizeof(struct signal_struct), 0, 1459 sizeof(struct signal_struct), 0,
1468 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1460 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1469 files_cachep = kmem_cache_create("files_cache", 1461 files_cachep = kmem_cache_create("files_cache",
1470 sizeof(struct files_struct), 0, 1462 sizeof(struct files_struct), 0,
1471 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1463 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1472 fs_cachep = kmem_cache_create("fs_cache", 1464 fs_cachep = kmem_cache_create("fs_cache",
1473 sizeof(struct fs_struct), 0, 1465 sizeof(struct fs_struct), 0,
1474 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1466 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1475 mm_cachep = kmem_cache_create("mm_struct", 1467 mm_cachep = kmem_cache_create("mm_struct",
1476 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1468 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1477 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1469 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1478 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1470 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1479 mmap_init(); 1471 mmap_init();
1480} 1472}
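
The proc_caches_init() hunk only adds SLAB_NOTRACK so kmemcheck skips these caches. For reference, a minimal sketch (not from the patch) of creating a cache with the same flag combination; the cache name and object type are hypothetical:

#include <linux/init.h>
#include <linux/slab.h>

struct demo_item {
        int id;
};

static struct kmem_cache *demo_cachep;

static int __init demo_cache_init(void)
{
        demo_cachep = kmem_cache_create("demo_cache", sizeof(struct demo_item),
                                        0, SLAB_PANIC | SLAB_NOTRACK, NULL);
        return 0;               /* SLAB_PANIC means no NULL check is needed */
}
core_initcall(demo_cache_init);
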
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf7083..bd1d42b17cb2 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
44 recalc_sigpending(); /* We sent fake signal, clean it up */ 44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock); 45 spin_unlock_irq(&current->sighand->siglock);
46 46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
47 for (;;) { 50 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE); 51 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current)) 52 if (!frozen(current))
50 break; 53 break;
51 schedule(); 54 schedule();
52 } 55 }
56
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
53 pr_debug("%s left refrigerator\n", current->comm); 60 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save); 61 __set_current_state(save);
55} 62}
diff --git a/kernel/futex.c b/kernel/futex.c
index d546b2d53a62..e18cfbdc7190 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
19 * PRIVATE futexes by Eric Dumazet 19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 * 21 *
22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23 * Copyright (C) IBM Corporation, 2009
24 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
25 *
22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
23 * enough at me, Linus for the original (flawed) idea, Matthew 27 * enough at me, Linus for the original (flawed) idea, Matthew
24 * Kirkwood for proof-of-concept implementation. 28 * Kirkwood for proof-of-concept implementation.
@@ -96,8 +100,8 @@ struct futex_pi_state {
96 */ 100 */
97struct futex_q { 101struct futex_q {
98 struct plist_node list; 102 struct plist_node list;
99 /* There can only be a single waiter */ 103 /* Waiter reference */
100 wait_queue_head_t waiter; 104 struct task_struct *task;
101 105
102 /* Which hash list lock to use: */ 106 /* Which hash list lock to use: */
103 spinlock_t *lock_ptr; 107 spinlock_t *lock_ptr;
@@ -107,7 +111,9 @@ struct futex_q {
107 111
108 /* Optional priority inheritance state: */ 112 /* Optional priority inheritance state: */
109 struct futex_pi_state *pi_state; 113 struct futex_pi_state *pi_state;
110 struct task_struct *task; 114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter;
111 117
112 /* Bitset for the optional bitmasked wakeup */ 118 /* Bitset for the optional bitmasked wakeup */
113 u32 bitset; 119 u32 bitset;
@@ -241,6 +247,7 @@ again:
241 if (err < 0) 247 if (err < 0)
242 return err; 248 return err;
243 249
250 page = compound_head(page);
244 lock_page(page); 251 lock_page(page);
245 if (!page->mapping) { 252 if (!page->mapping) {
246 unlock_page(page); 253 unlock_page(page);
@@ -278,6 +285,44 @@ void put_futex_key(int fshared, union futex_key *key)
278 drop_futex_key_refs(key); 285 drop_futex_key_refs(key);
279} 286}
280 287
288/*
289 * fault_in_user_writeable - fault in user address and verify RW access
290 * @uaddr: pointer to faulting user space address
291 *
292 * Slow path to fixup the fault we just took in the atomic write
293 * access to @uaddr.
294 *
295 * We have no generic implementation of a non destructive write to the
296 * user address. We know that we faulted in the atomic pagefault
297 * disabled section so we can as well avoid the #PF overhead by
298 * calling get_user_pages() right away.
299 */
300static int fault_in_user_writeable(u32 __user *uaddr)
301{
302 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
303 1, 1, 0, NULL, NULL);
304 return ret < 0 ? ret : 0;
305}
306
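
A sketch (not from the patch) of the retry shape fault_in_user_writeable() enables; the wrapper name is hypothetical and hash-bucket locking is elided. The atomic op runs with pagefaults disabled, so -EFAULT simply means "make the page present and writable, then retry":

static int demo_cmpxchg_user(u32 __user *uaddr, u32 oldval, u32 newval)
{
        u32 curval;

        for (;;) {
                curval = cmpxchg_futex_value_locked(uaddr, oldval, newval);
                if (curval == -EFAULT) {
                        if (fault_in_user_writeable(uaddr))
                                return -EFAULT;         /* a real fault, give up */
                        continue;                       /* page is in, retry */
                }
                return curval == oldval ? 0 : -EAGAIN;  /* someone raced us */
        }
}
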
307/**
308 * futex_top_waiter() - Return the highest priority waiter on a futex
309 * @hb: the hash bucket the futex_q's reside in
310 * @key: the futex key (to distinguish it from other futex futex_q's)
311 *
312 * Must be called with the hb lock held.
313 */
314static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
315 union futex_key *key)
316{
317 struct futex_q *this;
318
319 plist_for_each_entry(this, &hb->chain, list) {
320 if (match_futex(&this->key, key))
321 return this;
322 }
323 return NULL;
324}
325
281static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 326static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
282{ 327{
283 u32 curval; 328 u32 curval;
@@ -539,28 +584,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
539 return 0; 584 return 0;
540} 585}
541 586
587/**
588 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
589 * @uaddr: the pi futex user address
590 * @hb: the pi futex hash bucket
591 * @key: the futex key associated with uaddr and hb
592 * @ps: the pi_state pointer where we store the result of the
593 * lookup
594 * @task: the task to perform the atomic lock work for. This will
595 * be "current" except in the case of requeue pi.
596 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
597 *
598 * Returns:
599 * 0 - ready to wait
600 * 1 - acquired the lock
601 * <0 - error
602 *
603 * The hb->lock and futex_key refs shall be held by the caller.
604 */
605static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
606 union futex_key *key,
607 struct futex_pi_state **ps,
608 struct task_struct *task, int set_waiters)
609{
610 int lock_taken, ret, ownerdied = 0;
611 u32 uval, newval, curval;
612
613retry:
614 ret = lock_taken = 0;
615
616 /*
617 * To avoid races, we attempt to take the lock here again
618 * (by doing a 0 -> TID atomic cmpxchg), while holding all
619 * the locks. It will most likely not succeed.
620 */
621 newval = task_pid_vnr(task);
622 if (set_waiters)
623 newval |= FUTEX_WAITERS;
624
625 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
626
627 if (unlikely(curval == -EFAULT))
628 return -EFAULT;
629
630 /*
631 * Detect deadlocks.
632 */
633 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
634 return -EDEADLK;
635
636 /*
637 * Surprise - we got the lock. Just return to userspace:
638 */
639 if (unlikely(!curval))
640 return 1;
641
642 uval = curval;
643
644 /*
645 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
646 * to wake at the next unlock.
647 */
648 newval = curval | FUTEX_WAITERS;
649
650 /*
651 * There are two cases, where a futex might have no owner (the
652 * owner TID is 0): OWNER_DIED. We take over the futex in this
653 * case. We also do an unconditional take over, when the owner
654 * of the futex died.
655 *
656 * This is safe as we are protected by the hash bucket lock !
657 */
658 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
659 /* Keep the OWNER_DIED bit */
660 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
661 ownerdied = 0;
662 lock_taken = 1;
663 }
664
665 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
666
667 if (unlikely(curval == -EFAULT))
668 return -EFAULT;
669 if (unlikely(curval != uval))
670 goto retry;
671
672 /*
673 * We took the lock due to owner died take over.
674 */
675 if (unlikely(lock_taken))
676 return 1;
677
678 /*
 678 * We don't have the lock. Look up the PI state (or create it if
680 * we are the first waiter):
681 */
682 ret = lookup_pi_state(uval, hb, key, ps);
683
684 if (unlikely(ret)) {
685 switch (ret) {
686 case -ESRCH:
687 /*
688 * No owner found for this futex. Check if the
689 * OWNER_DIED bit is set to figure out whether
690 * this is a robust futex or not.
691 */
692 if (get_futex_value_locked(&curval, uaddr))
693 return -EFAULT;
694
695 /*
696 * We simply start over in case of a robust
697 * futex. The code above will take the futex
698 * and return happy.
699 */
700 if (curval & FUTEX_OWNER_DIED) {
701 ownerdied = 1;
702 goto retry;
703 }
704 default:
705 break;
706 }
707 }
708
709 return ret;
710}
711
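
For orientation, a userspace sketch (not from the patch) of the protocol whose kernel side futex_lock_pi_atomic() implements: the uncontended path is a 0 -> TID compare-and-swap on the futex word, and only contention enters the kernel, which may set FUTEX_WAITERS in that same word. Helper names are hypothetical; error handling is omitted:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static unsigned int demo_pi_futex;      /* 0 = unlocked, else owner TID plus flag bits */

static void demo_pi_lock(void)
{
        unsigned int tid = syscall(SYS_gettid);

        if (__sync_bool_compare_and_swap(&demo_pi_futex, 0, tid))
                return;                 /* fast path: we now own the lock */

        /* Slow path: the kernel queues us with PI boosting and fixes up the word. */
        syscall(SYS_futex, &demo_pi_futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void demo_pi_unlock(void)
{
        unsigned int tid = syscall(SYS_gettid);

        if (__sync_bool_compare_and_swap(&demo_pi_futex, tid, 0))
                return;                 /* no waiters were recorded */

        /* FUTEX_WAITERS (or owner-died bits) set: let the kernel hand it over. */
        syscall(SYS_futex, &demo_pi_futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}
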
542/* 712/*
543 * The hash bucket lock must be held when this is called. 713 * The hash bucket lock must be held when this is called.
544 * Afterwards, the futex_q must not be accessed. 714 * Afterwards, the futex_q must not be accessed.
545 */ 715 */
546static void wake_futex(struct futex_q *q) 716static void wake_futex(struct futex_q *q)
547{ 717{
548 plist_del(&q->list, &q->list.plist); 718 struct task_struct *p = q->task;
719
549 /* 720 /*
550 * The lock in wake_up_all() is a crucial memory barrier after the 721 * We set q->lock_ptr = NULL _before_ we wake up the task. If
551 * plist_del() and also before assigning to q->lock_ptr. 722 * a non futex wake up happens on another CPU then the task
723 * might exit and p would dereference a non existing task
724 * struct. Prevent this by holding a reference on p across the
725 * wake up.
552 */ 726 */
553 wake_up(&q->waiter); 727 get_task_struct(p);
728
729 plist_del(&q->list, &q->list.plist);
554 /* 730 /*
555 * The waiting task can free the futex_q as soon as this is written, 731 * The waiting task can free the futex_q as soon as
556 * without taking any locks. This must come last. 732 * q->lock_ptr = NULL is written, without taking any locks. A
557 * 733 * memory barrier is required here to prevent the following
558 * A memory barrier is required here to prevent the following store to 734 * store to lock_ptr from getting ahead of the plist_del.
559 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
560 * end of wake_up() does not prevent this store from moving.
561 */ 735 */
562 smp_wmb(); 736 smp_wmb();
563 q->lock_ptr = NULL; 737 q->lock_ptr = NULL;
738
739 wake_up_state(p, TASK_NORMAL);
740 put_task_struct(p);
564} 741}
565 742
566static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 743static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -689,7 +866,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
689 866
690 plist_for_each_entry_safe(this, next, head, list) { 867 plist_for_each_entry_safe(this, next, head, list) {
691 if (match_futex (&this->key, &key)) { 868 if (match_futex (&this->key, &key)) {
692 if (this->pi_state) { 869 if (this->pi_state || this->rt_waiter) {
693 ret = -EINVAL; 870 ret = -EINVAL;
694 break; 871 break;
695 } 872 }
@@ -739,7 +916,6 @@ retry:
739retry_private: 916retry_private:
740 op_ret = futex_atomic_op_inuser(op, uaddr2); 917 op_ret = futex_atomic_op_inuser(op, uaddr2);
741 if (unlikely(op_ret < 0)) { 918 if (unlikely(op_ret < 0)) {
742 u32 dummy;
743 919
744 double_unlock_hb(hb1, hb2); 920 double_unlock_hb(hb1, hb2);
745 921
@@ -757,7 +933,7 @@ retry_private:
757 goto out_put_keys; 933 goto out_put_keys;
758 } 934 }
759 935
760 ret = get_user(dummy, uaddr2); 936 ret = fault_in_user_writeable(uaddr2);
761 if (ret) 937 if (ret)
762 goto out_put_keys; 938 goto out_put_keys;
763 939
@@ -802,24 +978,194 @@ out:
802 return ret; 978 return ret;
803} 979}
804 980
805/* 981/**
806 * Requeue all waiters hashed on one physical page to another 982 * requeue_futex() - Requeue a futex_q from one hb to another
807 * physical page. 983 * @q: the futex_q to requeue
984 * @hb1: the source hash_bucket
985 * @hb2: the target hash_bucket
986 * @key2: the new key for the requeued futex_q
987 */
988static inline
989void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
990 struct futex_hash_bucket *hb2, union futex_key *key2)
991{
992
993 /*
994 * If key1 and key2 hash to the same bucket, no need to
995 * requeue.
996 */
997 if (likely(&hb1->chain != &hb2->chain)) {
998 plist_del(&q->list, &hb1->chain);
999 plist_add(&q->list, &hb2->chain);
1000 q->lock_ptr = &hb2->lock;
1001#ifdef CONFIG_DEBUG_PI_LIST
1002 q->list.plist.lock = &hb2->lock;
1003#endif
1004 }
1005 get_futex_key_refs(key2);
1006 q->key = *key2;
1007}
1008
1009/**
1010 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1011 * q: the futex_q
1012 * key: the key of the requeue target futex
1013 * hb: the hash_bucket of the requeue target futex
1014 *
1015 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1016 * target futex if it is uncontended or via a lock steal. Set the futex_q key
1017 * to the requeue target futex so the waiter can detect the wakeup on the right
1018 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1019 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1020 * to protect access to the pi_state to fixup the owner later. Must be called
1021 * with both q->lock_ptr and hb->lock held.
1022 */
1023static inline
1024void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1025 struct futex_hash_bucket *hb)
1026{
1027 drop_futex_key_refs(&q->key);
1028 get_futex_key_refs(key);
1029 q->key = *key;
1030
1031 WARN_ON(plist_node_empty(&q->list));
1032 plist_del(&q->list, &q->list.plist);
1033
1034 WARN_ON(!q->rt_waiter);
1035 q->rt_waiter = NULL;
1036
1037 q->lock_ptr = &hb->lock;
1038#ifdef CONFIG_DEBUG_PI_LIST
1039 q->list.plist.lock = &hb->lock;
1040#endif
1041
1042 wake_up_state(q->task, TASK_NORMAL);
1043}
1044
1045/**
1046 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1047 * @pifutex: the user address of the to futex
1048 * @hb1: the from futex hash bucket, must be locked by the caller
1049 * @hb2: the to futex hash bucket, must be locked by the caller
1050 * @key1: the from futex key
1051 * @key2: the to futex key
1052 * @ps: address to store the pi_state pointer
1053 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1054 *
1055 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1056 * Wake the top waiter if we succeed. If the caller specified set_waiters,
1057 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1058 * hb1 and hb2 must be held by the caller.
1059 *
1060 * Returns:
 1061 * 0 - failed to acquire the lock atomically
1062 * 1 - acquired the lock
1063 * <0 - error
1064 */
1065static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1066 struct futex_hash_bucket *hb1,
1067 struct futex_hash_bucket *hb2,
1068 union futex_key *key1, union futex_key *key2,
1069 struct futex_pi_state **ps, int set_waiters)
1070{
1071 struct futex_q *top_waiter = NULL;
1072 u32 curval;
1073 int ret;
1074
1075 if (get_futex_value_locked(&curval, pifutex))
1076 return -EFAULT;
1077
1078 /*
1079 * Find the top_waiter and determine if there are additional waiters.
1080 * If the caller intends to requeue more than 1 waiter to pifutex,
1081 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1082 * as we have means to handle the possible fault. If not, don't set
 1083 * the bit unnecessarily as it will force the subsequent unlock to enter
1084 * the kernel.
1085 */
1086 top_waiter = futex_top_waiter(hb1, key1);
1087
1088 /* There are no waiters, nothing for us to do. */
1089 if (!top_waiter)
1090 return 0;
1091
1092 /*
1093 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1094 * the contended case or if set_waiters is 1. The pi_state is returned
1095 * in ps in contended cases.
1096 */
1097 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1098 set_waiters);
1099 if (ret == 1)
1100 requeue_pi_wake_futex(top_waiter, key2, hb2);
1101
1102 return ret;
1103}
1104
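
Before the new requeue_pi path, a userspace sketch (not from the patch) of the plain FUTEX_CMP_REQUEUE operation that futex_requeue() has always served: wake one waiter on the condition word and requeue the rest onto the mutex word, so a broadcast does not wake everything at once. Names are hypothetical; error handling is omitted:

#include <limits.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static unsigned int demo_cond;          /* waiters block on this word */
static unsigned int demo_mutex;         /* the rest get moved onto this one */

static long demo_requeue_broadcast(unsigned int expected)
{
        /* val = wake one, val2 = requeue up to INT_MAX, val3 = expected value
         * of demo_cond; the call fails with EAGAIN if the word has changed. */
        return syscall(SYS_futex, &demo_cond, FUTEX_CMP_REQUEUE,
                       1, INT_MAX, &demo_mutex, expected);
}
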
1105/**
1106 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
 1107 * @uaddr1: source futex user address
 1108 * @uaddr2: target futex user address
 1109 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
 1110 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
 1111 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1112 * pi futex (pi to pi requeue is not supported)
1113 *
1114 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1115 * uaddr2 atomically on behalf of the top waiter.
1116 *
1117 * Returns:
1118 * >=0 - on success, the number of tasks requeued or woken
1119 * <0 - on error
808 */ 1120 */
809static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1121static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
810 int nr_wake, int nr_requeue, u32 *cmpval) 1122 int nr_wake, int nr_requeue, u32 *cmpval,
1123 int requeue_pi)
811{ 1124{
812 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1125 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1126 int drop_count = 0, task_count = 0, ret;
1127 struct futex_pi_state *pi_state = NULL;
813 struct futex_hash_bucket *hb1, *hb2; 1128 struct futex_hash_bucket *hb1, *hb2;
814 struct plist_head *head1; 1129 struct plist_head *head1;
815 struct futex_q *this, *next; 1130 struct futex_q *this, *next;
816 int ret, drop_count = 0; 1131 u32 curval2;
1132
1133 if (requeue_pi) {
1134 /*
1135 * requeue_pi requires a pi_state, try to allocate it now
1136 * without any locks in case it fails.
1137 */
1138 if (refill_pi_state_cache())
1139 return -ENOMEM;
1140 /*
1141 * requeue_pi must wake as many tasks as it can, up to nr_wake
1142 * + nr_requeue, since it acquires the rt_mutex prior to
1143 * returning to userspace, so as to not leave the rt_mutex with
1144 * waiters and no owner. However, second and third wake-ups
1145 * cannot be predicted as they involve race conditions with the
1146 * first wake and a fault while looking up the pi_state. Both
1147 * pthread_cond_signal() and pthread_cond_broadcast() should
1148 * use nr_wake=1.
1149 */
1150 if (nr_wake != 1)
1151 return -EINVAL;
1152 }
817 1153
818retry: 1154retry:
1155 if (pi_state != NULL) {
1156 /*
1157 * We will have to lookup the pi_state again, so free this one
1158 * to keep the accounting correct.
1159 */
1160 free_pi_state(pi_state);
1161 pi_state = NULL;
1162 }
1163
819 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 1164 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
820 if (unlikely(ret != 0)) 1165 if (unlikely(ret != 0))
821 goto out; 1166 goto out;
822 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ); 1167 ret = get_futex_key(uaddr2, fshared, &key2,
1168 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
823 if (unlikely(ret != 0)) 1169 if (unlikely(ret != 0))
824 goto out_put_key1; 1170 goto out_put_key1;
825 1171
@@ -854,32 +1200,106 @@ retry_private:
854 } 1200 }
855 } 1201 }
856 1202
1203 if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1204 /*
1205 * Attempt to acquire uaddr2 and wake the top waiter. If we
1206 * intend to requeue waiters, force setting the FUTEX_WAITERS
1207 * bit. We force this here where we are able to easily handle
1208 * faults rather than in the requeue loop below.
1209 */
1210 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1211 &key2, &pi_state, nr_requeue);
1212
1213 /*
1214 * At this point the top_waiter has either taken uaddr2 or is
1215 * waiting on it. If the former, then the pi_state will not
1216 * exist yet, look it up one more time to ensure we have a
1217 * reference to it.
1218 */
1219 if (ret == 1) {
1220 WARN_ON(pi_state);
1221 task_count++;
1222 ret = get_futex_value_locked(&curval2, uaddr2);
1223 if (!ret)
1224 ret = lookup_pi_state(curval2, hb2, &key2,
1225 &pi_state);
1226 }
1227
1228 switch (ret) {
1229 case 0:
1230 break;
1231 case -EFAULT:
1232 double_unlock_hb(hb1, hb2);
1233 put_futex_key(fshared, &key2);
1234 put_futex_key(fshared, &key1);
1235 ret = fault_in_user_writeable(uaddr2);
1236 if (!ret)
1237 goto retry;
1238 goto out;
1239 case -EAGAIN:
1240 /* The owner was exiting, try again. */
1241 double_unlock_hb(hb1, hb2);
1242 put_futex_key(fshared, &key2);
1243 put_futex_key(fshared, &key1);
1244 cond_resched();
1245 goto retry;
1246 default:
1247 goto out_unlock;
1248 }
1249 }
1250
857 head1 = &hb1->chain; 1251 head1 = &hb1->chain;
858 plist_for_each_entry_safe(this, next, head1, list) { 1252 plist_for_each_entry_safe(this, next, head1, list) {
859 if (!match_futex (&this->key, &key1)) 1253 if (task_count - nr_wake >= nr_requeue)
1254 break;
1255
1256 if (!match_futex(&this->key, &key1))
860 continue; 1257 continue;
861 if (++ret <= nr_wake) { 1258
1259 /*
1260 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1261 * be paired with each other and no other futex ops.
1262 */
1263 if ((requeue_pi && !this->rt_waiter) ||
1264 (!requeue_pi && this->rt_waiter)) {
1265 ret = -EINVAL;
1266 break;
1267 }
1268
1269 /*
1270 * Wake nr_wake waiters. For requeue_pi, if we acquired the
1271 * lock, we already woke the top_waiter. If not, it will be
1272 * woken by futex_unlock_pi().
1273 */
1274 if (++task_count <= nr_wake && !requeue_pi) {
862 wake_futex(this); 1275 wake_futex(this);
863 } else { 1276 continue;
864 /* 1277 }
865 * If key1 and key2 hash to the same bucket, no need to
866 * requeue.
867 */
868 if (likely(head1 != &hb2->chain)) {
869 plist_del(&this->list, &hb1->chain);
870 plist_add(&this->list, &hb2->chain);
871 this->lock_ptr = &hb2->lock;
872#ifdef CONFIG_DEBUG_PI_LIST
873 this->list.plist.lock = &hb2->lock;
874#endif
875 }
876 this->key = key2;
877 get_futex_key_refs(&key2);
878 drop_count++;
879 1278
880 if (ret - nr_wake >= nr_requeue) 1279 /*
881 break; 1280 * Requeue nr_requeue waiters and possibly one more in the case
1281 * of requeue_pi if we couldn't acquire the lock atomically.
1282 */
1283 if (requeue_pi) {
1284 /* Prepare the waiter to take the rt_mutex. */
1285 atomic_inc(&pi_state->refcount);
1286 this->pi_state = pi_state;
1287 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1288 this->rt_waiter,
1289 this->task, 1);
1290 if (ret == 1) {
1291 /* We got the lock. */
1292 requeue_pi_wake_futex(this, &key2, hb2);
1293 continue;
1294 } else if (ret) {
1295 /* -EDEADLK */
1296 this->pi_state = NULL;
1297 free_pi_state(pi_state);
1298 goto out_unlock;
1299 }
882 } 1300 }
1301 requeue_futex(this, hb1, hb2, &key2);
1302 drop_count++;
883 } 1303 }
884 1304
885out_unlock: 1305out_unlock:
@@ -899,7 +1319,9 @@ out_put_keys:
899out_put_key1: 1319out_put_key1:
900 put_futex_key(fshared, &key1); 1320 put_futex_key(fshared, &key1);
901out: 1321out:
902 return ret; 1322 if (pi_state != NULL)
1323 free_pi_state(pi_state);
1324 return ret ? ret : task_count;
903} 1325}
904 1326
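For orientation, here is a rough userspace sketch of the broadcaster side this requeue_pi path is designed for (a pthread_cond_broadcast()-style caller). It is illustrative only and not part of the patch; the names cond, mutex and cond_broadcast() are hypothetical, and the fallback #define assumes the op code this series adds to <linux/futex.h>.

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <limits.h>
#include <stdint.h>

#ifndef FUTEX_CMP_REQUEUE_PI
#define FUTEX_CMP_REQUEUE_PI	12	/* assumed value from this patch series */
#endif

/*
 * Illustrative sketch only. cond holds a sequence/generation count;
 * mutex is a PI futex whose value is the owner TID (FUTEX_LOCK_PI protocol).
 */
static long cond_broadcast(uint32_t *cond, uint32_t *mutex)
{
	uint32_t expected = *cond;

	/*
	 * val    = 1         nr_wake, must be 1 for requeue_pi
	 * utime  = INT_MAX   reinterpreted as val2/nr_requeue
	 * uaddr2 = mutex     the PI futex the waiters are moved to
	 * val3   = expected  futex_requeue() returns -EAGAIN if *cond
	 *                    no longer contains this value
	 */
	return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI, 1,
		       (void *)(unsigned long)INT_MAX, mutex, expected);
}

At most one waiter is woken (or handed the lock atomically by futex_proxy_trylock_atomic()); the rest are requeued onto mutex and are woken one at a time as futex_unlock_pi() releases it.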
905/* The key must be already stored in q->key. */ 1327/* The key must be already stored in q->key. */
@@ -907,8 +1329,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
907{ 1329{
908 struct futex_hash_bucket *hb; 1330 struct futex_hash_bucket *hb;
909 1331
910 init_waitqueue_head(&q->waiter);
911
912 get_futex_key_refs(&q->key); 1332 get_futex_key_refs(&q->key);
913 hb = hash_futex(&q->key); 1333 hb = hash_futex(&q->key);
914 q->lock_ptr = &hb->lock; 1334 q->lock_ptr = &hb->lock;
@@ -1097,7 +1517,7 @@ retry:
1097handle_fault: 1517handle_fault:
1098 spin_unlock(q->lock_ptr); 1518 spin_unlock(q->lock_ptr);
1099 1519
1100 ret = get_user(uval, uaddr); 1520 ret = fault_in_user_writeable(uaddr);
1101 1521
1102 spin_lock(q->lock_ptr); 1522 spin_lock(q->lock_ptr);
1103 1523
@@ -1119,35 +1539,149 @@ handle_fault:
1119 */ 1539 */
1120#define FLAGS_SHARED 0x01 1540#define FLAGS_SHARED 0x01
1121#define FLAGS_CLOCKRT 0x02 1541#define FLAGS_CLOCKRT 0x02
1542#define FLAGS_HAS_TIMEOUT 0x04
1122 1543
1123static long futex_wait_restart(struct restart_block *restart); 1544static long futex_wait_restart(struct restart_block *restart);
1124 1545
1125static int futex_wait(u32 __user *uaddr, int fshared, 1546/**
1126 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1547 * fixup_owner() - Post lock pi_state and corner case management
1548 * @uaddr: user address of the futex
1549 * @fshared: whether the futex is shared (1) or not (0)
1550 * @q: futex_q (contains pi_state and access to the rt_mutex)
1551 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1552 *
1553 * After attempting to lock an rt_mutex, this function is called to cleanup
1554 * the pi_state owner as well as handle race conditions that may allow us to
1555 * acquire the lock. Must be called with the hb lock held.
1556 *
1557 * Returns:
1558 * 1 - success, lock taken
1559 * 0 - success, lock not taken
1560 * <0 - on error (-EFAULT)
1561 */
1562static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1563 int locked)
1127{ 1564{
1128 struct task_struct *curr = current; 1565 struct task_struct *owner;
1129 struct restart_block *restart; 1566 int ret = 0;
1130 DECLARE_WAITQUEUE(wait, curr);
1131 struct futex_hash_bucket *hb;
1132 struct futex_q q;
1133 u32 uval;
1134 int ret;
1135 struct hrtimer_sleeper t;
1136 int rem = 0;
1137 1567
1138 if (!bitset) 1568 if (locked) {
1139 return -EINVAL; 1569 /*
1570 * Got the lock. We might not be the anticipated owner if we
1571 * did a lock-steal - fix up the PI-state in that case:
1572 */
1573 if (q->pi_state->owner != current)
1574 ret = fixup_pi_state_owner(uaddr, q, current, fshared);
1575 goto out;
1576 }
1140 1577
1141 q.pi_state = NULL; 1578 /*
1142 q.bitset = bitset; 1579 * Catch the rare case, where the lock was released when we were on the
1143retry: 1580 * way back before we locked the hash bucket.
1144 q.key = FUTEX_KEY_INIT; 1581 */
1145 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ); 1582 if (q->pi_state->owner == current) {
1146 if (unlikely(ret != 0)) 1583 /*
1584 * Try to get the rt_mutex now. This might fail as some other
1585 * task acquired the rt_mutex after we removed ourself from the
1586 * rt_mutex waiters list.
1587 */
1588 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1589 locked = 1;
1590 goto out;
1591 }
1592
1593 /*
1594 * pi_state is incorrect, some other task did a lock steal and
1595 * we returned due to timeout or signal without taking the
1596 * rt_mutex. Too late. We can access the rt_mutex_owner without
1597 * locking, as the other task is now blocked on the hash bucket
1598 * lock. Fix the state up.
1599 */
1600 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1601 ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
1147 goto out; 1602 goto out;
1603 }
1148 1604
1149retry_private: 1605 /*
1150 hb = queue_lock(&q); 1606 * Paranoia check. If we did not take the lock, then we should not be
1607 * the owner, nor the pending owner, of the rt_mutex.
1608 */
1609 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1610 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1611 "pi-state %p\n", ret,
1612 q->pi_state->pi_mutex.owner,
1613 q->pi_state->owner);
1614
1615out:
1616 return ret ? ret : locked;
1617}
1618
1619/**
1620 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1621 * @hb: the futex hash bucket, must be locked by the caller
1622 * @q: the futex_q to queue up on
1623 * @timeout: the prepared hrtimer_sleeper, or null for no timeout
1624 */
1625static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1626 struct hrtimer_sleeper *timeout)
1627{
1628 queue_me(q, hb);
1629
1630 /*
1631 * There might have been scheduling since the queue_me(), as we
1632 * cannot hold a spinlock across the get_user() in case it
1633 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
1634 * queueing ourselves into the futex hash. This code thus has to
1635 * rely on the futex_wake() code removing us from hash when it
1636 * wakes us up.
1637 */
1638 set_current_state(TASK_INTERRUPTIBLE);
1639
1640 /* Arm the timer */
1641 if (timeout) {
1642 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1643 if (!hrtimer_active(&timeout->timer))
1644 timeout->task = NULL;
1645 }
1646
1647 /*
1648 * !plist_node_empty() is safe here without any lock.
1649 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1650 */
1651 if (likely(!plist_node_empty(&q->list))) {
1652 /*
1653 * If the timer has already expired, current will already be
1654 * flagged for rescheduling. Only call schedule if there
1655 * is no timeout, or if it has yet to expire.
1656 */
1657 if (!timeout || timeout->task)
1658 schedule();
1659 }
1660 __set_current_state(TASK_RUNNING);
1661}
1662
1663/**
1664 * futex_wait_setup() - Prepare to wait on a futex
1665 * @uaddr: the futex userspace address
1666 * @val: the expected value
1667 * @fshared: whether the futex is shared (1) or not (0)
1668 * @q: the associated futex_q
1669 * @hb: storage for hash_bucket pointer to be returned to caller
1670 *
1671 * Setup the futex_q and locate the hash_bucket. Get the futex value and
1672 * compare it with the expected value. Handle atomic faults internally.
1673 * Return with the hb lock held and a q.key reference on success, and unlocked
1674 * with no q.key reference on failure.
1675 *
1676 * Returns:
1677 * 0 - uaddr contains val and hb has been locked
1678 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1679 */
1680static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1681 struct futex_q *q, struct futex_hash_bucket **hb)
1682{
1683 u32 uval;
1684 int ret;
1151 1685
1152 /* 1686 /*
1153 * Access the page AFTER the hash-bucket is locked. 1687 * Access the page AFTER the hash-bucket is locked.
@@ -1165,95 +1699,83 @@ retry_private:
1165 * A consequence is that futex_wait() can return zero and absorb 1699 * A consequence is that futex_wait() can return zero and absorb
1166 * a wakeup when *uaddr != val on entry to the syscall. This is 1700 * a wakeup when *uaddr != val on entry to the syscall. This is
1167 * rare, but normal. 1701 * rare, but normal.
1168 *
1169 * For shared futexes, we hold the mmap semaphore, so the mapping
1170 * cannot have changed since we looked it up in get_futex_key.
1171 */ 1702 */
1703retry:
1704 q->key = FUTEX_KEY_INIT;
1705 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
1706 if (unlikely(ret != 0))
1707 return ret;
1708
1709retry_private:
1710 *hb = queue_lock(q);
1711
1172 ret = get_futex_value_locked(&uval, uaddr); 1712 ret = get_futex_value_locked(&uval, uaddr);
1173 1713
1174 if (unlikely(ret)) { 1714 if (ret) {
1175 queue_unlock(&q, hb); 1715 queue_unlock(q, *hb);
1176 1716
1177 ret = get_user(uval, uaddr); 1717 ret = get_user(uval, uaddr);
1178 if (ret) 1718 if (ret)
1179 goto out_put_key; 1719 goto out;
1180 1720
1181 if (!fshared) 1721 if (!fshared)
1182 goto retry_private; 1722 goto retry_private;
1183 1723
1184 put_futex_key(fshared, &q.key); 1724 put_futex_key(fshared, &q->key);
1185 goto retry; 1725 goto retry;
1186 } 1726 }
1187 ret = -EWOULDBLOCK;
1188 if (unlikely(uval != val)) {
1189 queue_unlock(&q, hb);
1190 goto out_put_key;
1191 }
1192 1727
1193 /* Only actually queue if *uaddr contained val. */ 1728 if (uval != val) {
1194 queue_me(&q, hb); 1729 queue_unlock(q, *hb);
1730 ret = -EWOULDBLOCK;
1731 }
1195 1732
1196 /* 1733out:
1197 * There might have been scheduling since the queue_me(), as we 1734 if (ret)
1198 * cannot hold a spinlock across the get_user() in case it 1735 put_futex_key(fshared, &q->key);
1199 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1736 return ret;
1200 * queueing ourselves into the futex hash. This code thus has to 1737}
1201 * rely on the futex_wake() code removing us from hash when it
1202 * wakes us up.
1203 */
1204 1738
1205 /* add_wait_queue is the barrier after __set_current_state. */ 1739static int futex_wait(u32 __user *uaddr, int fshared,
1206 __set_current_state(TASK_INTERRUPTIBLE); 1740 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1207 add_wait_queue(&q.waiter, &wait); 1741{
1208 /* 1742 struct hrtimer_sleeper timeout, *to = NULL;
1209 * !plist_node_empty() is safe here without any lock. 1743 struct restart_block *restart;
1210 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1744 struct futex_hash_bucket *hb;
1211 */ 1745 struct futex_q q;
1212 if (likely(!plist_node_empty(&q.list))) { 1746 int ret;
1213 if (!abs_time)
1214 schedule();
1215 else {
1216 hrtimer_init_on_stack(&t.timer,
1217 clockrt ? CLOCK_REALTIME :
1218 CLOCK_MONOTONIC,
1219 HRTIMER_MODE_ABS);
1220 hrtimer_init_sleeper(&t, current);
1221 hrtimer_set_expires_range_ns(&t.timer, *abs_time,
1222 current->timer_slack_ns);
1223
1224 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1225 if (!hrtimer_active(&t.timer))
1226 t.task = NULL;
1227 1747
1228 /* 1748 if (!bitset)
1229 * the timer could have already expired, in which 1749 return -EINVAL;
1230 * case current would be flagged for rescheduling.
1231 * Don't bother calling schedule.
1232 */
1233 if (likely(t.task))
1234 schedule();
1235 1750
1236 hrtimer_cancel(&t.timer); 1751 q.pi_state = NULL;
1752 q.bitset = bitset;
1753 q.rt_waiter = NULL;
1237 1754
1238 /* Flag if a timeout occured */ 1755 if (abs_time) {
1239 rem = (t.task == NULL); 1756 to = &timeout;
1240 1757
1241 destroy_hrtimer_on_stack(&t.timer); 1758 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
1242 } 1759 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1760 hrtimer_init_sleeper(to, current);
1761 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1762 current->timer_slack_ns);
1243 } 1763 }
1244 __set_current_state(TASK_RUNNING);
1245 1764
1246 /* 1765 /* Prepare to wait on uaddr. */
1247 * NOTE: we don't remove ourselves from the waitqueue because 1766 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1248 * we are the only user of it. 1767 if (ret)
1249 */ 1768 goto out;
1769
1770 /* queue_me and wait for wakeup, timeout, or a signal. */
1771 futex_wait_queue_me(hb, &q, to);
1250 1772
1251 /* If we were woken (and unqueued), we succeeded, whatever. */ 1773 /* If we were woken (and unqueued), we succeeded, whatever. */
1252 ret = 0; 1774 ret = 0;
1253 if (!unqueue_me(&q)) 1775 if (!unqueue_me(&q))
1254 goto out_put_key; 1776 goto out_put_key;
1255 ret = -ETIMEDOUT; 1777 ret = -ETIMEDOUT;
1256 if (rem) 1778 if (to && !to->task)
1257 goto out_put_key; 1779 goto out_put_key;
1258 1780
1259 /* 1781 /*
@@ -1270,7 +1792,7 @@ retry_private:
1270 restart->futex.val = val; 1792 restart->futex.val = val;
1271 restart->futex.time = abs_time->tv64; 1793 restart->futex.time = abs_time->tv64;
1272 restart->futex.bitset = bitset; 1794 restart->futex.bitset = bitset;
1273 restart->futex.flags = 0; 1795 restart->futex.flags = FLAGS_HAS_TIMEOUT;
1274 1796
1275 if (fshared) 1797 if (fshared)
1276 restart->futex.flags |= FLAGS_SHARED; 1798 restart->futex.flags |= FLAGS_SHARED;
@@ -1282,6 +1804,10 @@ retry_private:
1282out_put_key: 1804out_put_key:
1283 put_futex_key(fshared, &q.key); 1805 put_futex_key(fshared, &q.key);
1284out: 1806out:
1807 if (to) {
1808 hrtimer_cancel(&to->timer);
1809 destroy_hrtimer_on_stack(&to->timer);
1810 }
1285 return ret; 1811 return ret;
1286} 1812}
1287 1813
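To make the value-check contract that futex_wait_setup() documents concrete, here is a minimal userspace wait/wake pair. It is illustrative only and not part of the patch; wait_for_flag() and set_flag() are hypothetical names, and the atomics use GCC/Clang __atomic builtins.

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <time.h>

static long sys_futex(uint32_t *uaddr, int op, uint32_t val,
		      const struct timespec *timeout)
{
	/* OR FUTEX_PRIVATE_FLAG into op for a process-private (fshared=0) futex. */
	return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
}

/* Sleep until *flag becomes non-zero. */
static void wait_for_flag(uint32_t *flag)
{
	while (__atomic_load_n(flag, __ATOMIC_ACQUIRE) == 0) {
		/*
		 * The kernel only queues us if *flag still equals 0;
		 * otherwise it returns EWOULDBLOCK and we re-check. A
		 * zero return can also absorb a spurious wakeup, hence
		 * the loop.
		 */
		if (sys_futex(flag, FUTEX_WAIT, 0, NULL) == -1 &&
		    errno != EWOULDBLOCK && errno != EINTR)
			break;
	}
}

/* Publish the flag and wake all waiters. */
static void set_flag(uint32_t *flag)
{
	__atomic_store_n(flag, 1, __ATOMIC_RELEASE);
	sys_futex(flag, FUTEX_WAKE, INT_MAX, NULL);
}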
@@ -1290,13 +1816,16 @@ static long futex_wait_restart(struct restart_block *restart)
1290{ 1816{
1291 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1817 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1292 int fshared = 0; 1818 int fshared = 0;
1293 ktime_t t; 1819 ktime_t t, *tp = NULL;
1294 1820
1295 t.tv64 = restart->futex.time; 1821 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
1822 t.tv64 = restart->futex.time;
1823 tp = &t;
1824 }
1296 restart->fn = do_no_restart_syscall; 1825 restart->fn = do_no_restart_syscall;
1297 if (restart->futex.flags & FLAGS_SHARED) 1826 if (restart->futex.flags & FLAGS_SHARED)
1298 fshared = 1; 1827 fshared = 1;
1299 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1828 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
1300 restart->futex.bitset, 1829 restart->futex.bitset,
1301 restart->futex.flags & FLAGS_CLOCKRT); 1830 restart->futex.flags & FLAGS_CLOCKRT);
1302} 1831}
@@ -1312,11 +1841,9 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1312 int detect, ktime_t *time, int trylock) 1841 int detect, ktime_t *time, int trylock)
1313{ 1842{
1314 struct hrtimer_sleeper timeout, *to = NULL; 1843 struct hrtimer_sleeper timeout, *to = NULL;
1315 struct task_struct *curr = current;
1316 struct futex_hash_bucket *hb; 1844 struct futex_hash_bucket *hb;
1317 u32 uval, newval, curval;
1318 struct futex_q q; 1845 struct futex_q q;
1319 int ret, lock_taken, ownerdied = 0; 1846 int res, ret;
1320 1847
1321 if (refill_pi_state_cache()) 1848 if (refill_pi_state_cache())
1322 return -ENOMEM; 1849 return -ENOMEM;
@@ -1330,6 +1857,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1330 } 1857 }
1331 1858
1332 q.pi_state = NULL; 1859 q.pi_state = NULL;
1860 q.rt_waiter = NULL;
1333retry: 1861retry:
1334 q.key = FUTEX_KEY_INIT; 1862 q.key = FUTEX_KEY_INIT;
1335 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1863 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -1339,81 +1867,15 @@ retry:
1339retry_private: 1867retry_private:
1340 hb = queue_lock(&q); 1868 hb = queue_lock(&q);
1341 1869
1342retry_locked: 1870 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
1343 ret = lock_taken = 0;
1344
1345 /*
1346 * To avoid races, we attempt to take the lock here again
1347 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1348 * the locks. It will most likely not succeed.
1349 */
1350 newval = task_pid_vnr(current);
1351
1352 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1353
1354 if (unlikely(curval == -EFAULT))
1355 goto uaddr_faulted;
1356
1357 /*
1358 * Detect deadlocks. In case of REQUEUE_PI this is a valid
1359 * situation and we return success to user space.
1360 */
1361 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1362 ret = -EDEADLK;
1363 goto out_unlock_put_key;
1364 }
1365
1366 /*
1367 * Surprise - we got the lock. Just return to userspace:
1368 */
1369 if (unlikely(!curval))
1370 goto out_unlock_put_key;
1371
1372 uval = curval;
1373
1374 /*
1375 * Set the WAITERS flag, so the owner will know it has someone
1376 * to wake at next unlock
1377 */
1378 newval = curval | FUTEX_WAITERS;
1379
1380 /*
1381 * There are two cases, where a futex might have no owner (the
1382 * owner TID is 0): OWNER_DIED. We take over the futex in this
1383 * case. We also do an unconditional take over, when the owner
1384 * of the futex died.
1385 *
1386 * This is safe as we are protected by the hash bucket lock !
1387 */
1388 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
1389 /* Keep the OWNER_DIED bit */
1390 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
1391 ownerdied = 0;
1392 lock_taken = 1;
1393 }
1394
1395 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1396
1397 if (unlikely(curval == -EFAULT))
1398 goto uaddr_faulted;
1399 if (unlikely(curval != uval))
1400 goto retry_locked;
1401
1402 /*
1403 * We took the lock due to owner died take over.
1404 */
1405 if (unlikely(lock_taken))
1406 goto out_unlock_put_key;
1407
1408 /*
1409 * We dont have the lock. Look up the PI state (or create it if
1410 * we are the first waiter):
1411 */
1412 ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1413
1414 if (unlikely(ret)) { 1871 if (unlikely(ret)) {
1415 switch (ret) { 1872 switch (ret) {
1416 1873 case 1:
1874 /* We got the lock. */
1875 ret = 0;
1876 goto out_unlock_put_key;
1877 case -EFAULT:
1878 goto uaddr_faulted;
1417 case -EAGAIN: 1879 case -EAGAIN:
1418 /* 1880 /*
1419 * Task is exiting and we just wait for the 1881 * Task is exiting and we just wait for the
@@ -1423,25 +1885,6 @@ retry_locked:
1423 put_futex_key(fshared, &q.key); 1885 put_futex_key(fshared, &q.key);
1424 cond_resched(); 1886 cond_resched();
1425 goto retry; 1887 goto retry;
1426
1427 case -ESRCH:
1428 /*
1429 * No owner found for this futex. Check if the
1430 * OWNER_DIED bit is set to figure out whether
1431 * this is a robust futex or not.
1432 */
1433 if (get_futex_value_locked(&curval, uaddr))
1434 goto uaddr_faulted;
1435
1436 /*
1437 * We simply start over in case of a robust
1438 * futex. The code above will take the futex
1439 * and return happy.
1440 */
1441 if (curval & FUTEX_OWNER_DIED) {
1442 ownerdied = 1;
1443 goto retry_locked;
1444 }
1445 default: 1888 default:
1446 goto out_unlock_put_key; 1889 goto out_unlock_put_key;
1447 } 1890 }
@@ -1465,71 +1908,21 @@ retry_locked:
1465 } 1908 }
1466 1909
1467 spin_lock(q.lock_ptr); 1910 spin_lock(q.lock_ptr);
1468 1911 /*
1469 if (!ret) { 1912 * Fixup the pi_state owner and possibly acquire the lock if we
1470 /* 1913 * haven't already.
1471 * Got the lock. We might not be the anticipated owner 1914 */
1472 * if we did a lock-steal - fix up the PI-state in 1915 res = fixup_owner(uaddr, fshared, &q, !ret);
1473 * that case: 1916 /*
1474 */ 1917 * If fixup_owner() returned an error, propagate that. If it acquired
1475 if (q.pi_state->owner != curr) 1918 * the lock, clear our -ETIMEDOUT or -EINTR.
1476 ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); 1919 */
1477 } else { 1920 if (res)
1478 /* 1921 ret = (res < 0) ? res : 0;
1479 * Catch the rare case, where the lock was released
1480 * when we were on the way back before we locked the
1481 * hash bucket.
1482 */
1483 if (q.pi_state->owner == curr) {
1484 /*
1485 * Try to get the rt_mutex now. This might
1486 * fail as some other task acquired the
1487 * rt_mutex after we removed ourself from the
1488 * rt_mutex waiters list.
1489 */
1490 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1491 ret = 0;
1492 else {
1493 /*
1494 * pi_state is incorrect, some other
1495 * task did a lock steal and we
1496 * returned due to timeout or signal
1497 * without taking the rt_mutex. Too
1498 * late. We can access the
1499 * rt_mutex_owner without locking, as
1500 * the other task is now blocked on
1501 * the hash bucket lock. Fix the state
1502 * up.
1503 */
1504 struct task_struct *owner;
1505 int res;
1506
1507 owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1508 res = fixup_pi_state_owner(uaddr, &q, owner,
1509 fshared);
1510
1511 /* propagate -EFAULT, if the fixup failed */
1512 if (res)
1513 ret = res;
1514 }
1515 } else {
1516 /*
1517 * Paranoia check. If we did not take the lock
1518 * in the trylock above, then we should not be
1519 * the owner of the rtmutex, neither the real
1520 * nor the pending one:
1521 */
1522 if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
1523 printk(KERN_ERR "futex_lock_pi: ret = %d "
1524 "pi-mutex: %p pi-state %p\n", ret,
1525 q.pi_state->pi_mutex.owner,
1526 q.pi_state->owner);
1527 }
1528 }
1529 1922
1530 /* 1923 /*
1531 * If fixup_pi_state_owner() faulted and was unable to handle the 1924 * If fixup_owner() faulted and was unable to handle the fault, unlock
1532 * fault, unlock it and return the fault to userspace. 1925 * it and return the fault to userspace.
1533 */ 1926 */
1534 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) 1927 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
1535 rt_mutex_unlock(&q.pi_state->pi_mutex); 1928 rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1537,9 +1930,7 @@ retry_locked:
1537 /* Unqueue and drop the lock */ 1930 /* Unqueue and drop the lock */
1538 unqueue_me_pi(&q); 1931 unqueue_me_pi(&q);
1539 1932
1540 if (to) 1933 goto out;
1541 destroy_hrtimer_on_stack(&to->timer);
1542 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1543 1934
1544out_unlock_put_key: 1935out_unlock_put_key:
1545 queue_unlock(&q, hb); 1936 queue_unlock(&q, hb);
@@ -1549,19 +1940,12 @@ out_put_key:
1549out: 1940out:
1550 if (to) 1941 if (to)
1551 destroy_hrtimer_on_stack(&to->timer); 1942 destroy_hrtimer_on_stack(&to->timer);
1552 return ret; 1943 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1553 1944
1554uaddr_faulted: 1945uaddr_faulted:
1555 /*
1556 * We have to r/w *(int __user *)uaddr, and we have to modify it
1557 * atomically. Therefore, if we continue to fault after get_user()
1558 * below, we need to handle the fault ourselves, while still holding
1559 * the mmap_sem. This can occur if the uaddr is under contention as
1560 * we have to drop the mmap_sem in order to call get_user().
1561 */
1562 queue_unlock(&q, hb); 1946 queue_unlock(&q, hb);
1563 1947
1564 ret = get_user(uval, uaddr); 1948 ret = fault_in_user_writeable(uaddr);
1565 if (ret) 1949 if (ret)
1566 goto out_put_key; 1950 goto out_put_key;
1567 1951
@@ -1572,7 +1956,6 @@ uaddr_faulted:
1572 goto retry; 1956 goto retry;
1573} 1957}
1574 1958
1575
1576/* 1959/*
1577 * Userspace attempted a TID -> 0 atomic transition, and failed. 1960 * Userspace attempted a TID -> 0 atomic transition, and failed.
1578 * This is the in-kernel slowpath: we look up the PI state (if any), 1961 * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1657,23 +2040,239 @@ out:
1657 return ret; 2040 return ret;
1658 2041
1659pi_faulted: 2042pi_faulted:
1660 /*
1661 * We have to r/w *(int __user *)uaddr, and we have to modify it
1662 * atomically. Therefore, if we continue to fault after get_user()
1663 * below, we need to handle the fault ourselves, while still holding
1664 * the mmap_sem. This can occur if the uaddr is under contention as
1665 * we have to drop the mmap_sem in order to call get_user().
1666 */
1667 spin_unlock(&hb->lock); 2043 spin_unlock(&hb->lock);
1668 put_futex_key(fshared, &key); 2044 put_futex_key(fshared, &key);
1669 2045
1670 ret = get_user(uval, uaddr); 2046 ret = fault_in_user_writeable(uaddr);
1671 if (!ret) 2047 if (!ret)
1672 goto retry; 2048 goto retry;
1673 2049
1674 return ret; 2050 return ret;
1675} 2051}
1676 2052
2053/**
2054 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2055 * @hb: the hash_bucket futex_q was originally enqueued on
2056 * @q: the futex_q woken while waiting to be requeued
2057 * @key2: the futex_key of the requeue target futex
2058 * @timeout: the timeout associated with the wait (NULL if none)
2059 *
2060 * Detect if the task was woken on the initial futex as opposed to the requeue
2061 * target futex. If so, determine if it was a timeout or a signal that caused
2062 * the wakeup and return the appropriate error code to the caller. Must be
2063 * called with the hb lock held.
2064 *
2065 * Returns:
2066 * 0 - no early wakeup detected
2067 * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2068 */
2069static inline
2070int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2071 struct futex_q *q, union futex_key *key2,
2072 struct hrtimer_sleeper *timeout)
2073{
2074 int ret = 0;
2075
2076 /*
2077 * With the hb lock held, we avoid races while we process the wakeup.
2078 * We only need to hold hb (and not hb2) to ensure atomicity as the
2079 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2080 * It can't be requeued from uaddr2 to something else since we don't
2081 * support a PI aware source futex for requeue.
2082 */
2083 if (!match_futex(&q->key, key2)) {
2084 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2085 /*
2086 * We were woken prior to requeue by a timeout or a signal.
2087 * Unqueue the futex_q and determine which it was.
2088 */
2089 plist_del(&q->list, &q->list.plist);
2090 drop_futex_key_refs(&q->key);
2091
2092 if (timeout && !timeout->task)
2093 ret = -ETIMEDOUT;
2094 else
2095 ret = -ERESTARTNOINTR;
2096 }
2097 return ret;
2098}
2099
2100/**
2101 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2102 * @uaddr: the futex we initially wait on (non-pi)
2103 * @fshared: whether the futexes are shared (1) or not (0). They must be
2104 * the same type, no requeueing from private to shared, etc.
2105 * @val: the expected value of uaddr
2106 * @abs_time: absolute timeout
2107 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
2108 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2109 * @uaddr2: the pi futex we will take prior to returning to user-space
2110 *
2111 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2112 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
2113 * complete the acquisition of the rt_mutex prior to returning to userspace.
2114 * This ensures the rt_mutex maintains an owner when it has waiters; without
2115 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2116 * need to.
2117 *
2118 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2119 * via the following:
2120 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2121 * 2) wakeup on uaddr2 after a requeue and subsequent unlock
2122 * 3) signal (before or after requeue)
2123 * 4) timeout (before or after requeue)
2124 *
2125 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
2126 *
2127 * If 2, we may then block on trying to take the rt_mutex and return via:
2128 * 5) successful lock
2129 * 6) signal
2130 * 7) timeout
2131 * 8) other lock acquisition failure
2132 *
2133 * If 6, we setup a restart_block with futex_lock_pi() as the function.
2134 *
2135 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2136 *
2137 * Returns:
2138 * 0 - On success
2139 * <0 - On error
2140 */
2141static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2142 u32 val, ktime_t *abs_time, u32 bitset,
2143 int clockrt, u32 __user *uaddr2)
2144{
2145 struct hrtimer_sleeper timeout, *to = NULL;
2146 struct rt_mutex_waiter rt_waiter;
2147 struct rt_mutex *pi_mutex = NULL;
2148 struct futex_hash_bucket *hb;
2149 union futex_key key2;
2150 struct futex_q q;
2151 int res, ret;
2152
2153 if (!bitset)
2154 return -EINVAL;
2155
2156 if (abs_time) {
2157 to = &timeout;
2158 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
2159 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2160 hrtimer_init_sleeper(to, current);
2161 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2162 current->timer_slack_ns);
2163 }
2164
2165 /*
2166 * The waiter is allocated on our stack, manipulated by the requeue
2167 * code while we sleep on uaddr.
2168 */
2169 debug_rt_mutex_init_waiter(&rt_waiter);
2170 rt_waiter.task = NULL;
2171
2172 q.pi_state = NULL;
2173 q.bitset = bitset;
2174 q.rt_waiter = &rt_waiter;
2175
2176 key2 = FUTEX_KEY_INIT;
2177 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2178 if (unlikely(ret != 0))
2179 goto out;
2180
2181 /* Prepare to wait on uaddr. */
2182 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2183 if (ret)
2184 goto out_key2;
2185
2186 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2187 futex_wait_queue_me(hb, &q, to);
2188
2189 spin_lock(&hb->lock);
2190 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2191 spin_unlock(&hb->lock);
2192 if (ret)
2193 goto out_put_keys;
2194
2195 /*
2196 * In order for us to be here, we know our q.key == key2, and since
2197 * we took the hb->lock above, we also know that futex_requeue() has
2198 * completed and we no longer have to concern ourselves with a wakeup
2199 * race with the atomic proxy lock acquisition by the requeue code.
2200 */
2201
2202 /* Check if the requeue code acquired the second futex for us. */
2203 if (!q.rt_waiter) {
2204 /*
2205 * Got the lock. We might not be the anticipated owner if we
2206 * did a lock-steal - fix up the PI-state in that case.
2207 */
2208 if (q.pi_state && (q.pi_state->owner != current)) {
2209 spin_lock(q.lock_ptr);
2210 ret = fixup_pi_state_owner(uaddr2, &q, current,
2211 fshared);
2212 spin_unlock(q.lock_ptr);
2213 }
2214 } else {
2215 /*
2216 * We have been woken up by futex_unlock_pi(), a timeout, or a
2217 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2218 * the pi_state.
2219 */
2220 WARN_ON(!q.pi_state);
2221 pi_mutex = &q.pi_state->pi_mutex;
2222 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2223 debug_rt_mutex_free_waiter(&rt_waiter);
2224
2225 spin_lock(q.lock_ptr);
2226 /*
2227 * Fixup the pi_state owner and possibly acquire the lock if we
2228 * haven't already.
2229 */
2230 res = fixup_owner(uaddr2, fshared, &q, !ret);
2231 /*
2232 * If fixup_owner() returned an error, propagate that. If it
2233 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
2234 */
2235 if (res)
2236 ret = (res < 0) ? res : 0;
2237
2238 /* Unqueue and drop the lock. */
2239 unqueue_me_pi(&q);
2240 }
2241
2242 /*
2243 * If fixup_pi_state_owner() faulted and was unable to handle the
2244 * fault, unlock the rt_mutex and return the fault to userspace.
2245 */
2246 if (ret == -EFAULT) {
2247 if (rt_mutex_owner(pi_mutex) == current)
2248 rt_mutex_unlock(pi_mutex);
2249 } else if (ret == -EINTR) {
2250 /*
2251 * We've already been requeued, but we have no way to
2252 * restart by calling futex_lock_pi() directly. We
2253 * could restart the syscall, but that will look at
2254 * the user space value and return right away. So we
2255 * drop back with EWOULDBLOCK to tell user space that
2256 * "val" has been changed. That's the same as what a
2257 * restart of the syscall would do in
2258 * futex_wait_setup().
2259 */
2260 ret = -EWOULDBLOCK;
2261 }
2262
2263out_put_keys:
2264 put_futex_key(fshared, &q.key);
2265out_key2:
2266 put_futex_key(fshared, &key2);
2267
2268out:
2269 if (to) {
2270 hrtimer_cancel(&to->timer);
2271 destroy_hrtimer_on_stack(&to->timer);
2272 }
2273 return ret;
2274}
2275
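The waiter half of the protocol described in the comment above might look roughly like this from userspace. It is illustrative only and not part of the patch; cond_wait_requeue_pi(), cond, seq, mutex and abs_timeout are hypothetical names, and the fallback #define assumes the op code this series adds to <linux/futex.h>.

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <time.h>

#ifndef FUTEX_WAIT_REQUEUE_PI
#define FUTEX_WAIT_REQUEUE_PI	11	/* assumed value from this patch series */
#endif

/*
 * Illustrative sketch only. cond holds a sequence count sampled while
 * holding mutex; mutex is a PI futex (owner TID in its value). On a
 * successful return the calling thread owns *mutex, exactly as if it
 * had done FUTEX_LOCK_PI on it.
 */
static long cond_wait_requeue_pi(uint32_t *cond, uint32_t seq,
				 uint32_t *mutex,
				 const struct timespec *abs_timeout)
{
	/*
	 * val3 is ignored: do_futex() forces it to FUTEX_BITSET_MATCH_ANY
	 * for this op. The timeout, if non-NULL, is absolute (no
	 * relative-to-absolute conversion is done for this command).
	 */
	return syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI, seq,
		       abs_timeout, mutex, 0);
}

The two ops are meant to be used strictly as a pair: futex_requeue() returns -EINVAL if it finds a waiter that is not set up for requeue_pi, and vice versa.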
1677/* 2276/*
1678 * Support for robust futexes: the kernel cleans up held futexes at 2277 * Support for robust futexes: the kernel cleans up held futexes at
1679 * thread exit time. 2278 * thread exit time.
@@ -1896,7 +2495,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1896 fshared = 1; 2495 fshared = 1;
1897 2496
1898 clockrt = op & FUTEX_CLOCK_REALTIME; 2497 clockrt = op & FUTEX_CLOCK_REALTIME;
1899 if (clockrt && cmd != FUTEX_WAIT_BITSET) 2498 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
1900 return -ENOSYS; 2499 return -ENOSYS;
1901 2500
1902 switch (cmd) { 2501 switch (cmd) {
@@ -1911,10 +2510,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1911 ret = futex_wake(uaddr, fshared, val, val3); 2510 ret = futex_wake(uaddr, fshared, val, val3);
1912 break; 2511 break;
1913 case FUTEX_REQUEUE: 2512 case FUTEX_REQUEUE:
1914 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); 2513 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
1915 break; 2514 break;
1916 case FUTEX_CMP_REQUEUE: 2515 case FUTEX_CMP_REQUEUE:
1917 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); 2516 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2517 0);
1918 break; 2518 break;
1919 case FUTEX_WAKE_OP: 2519 case FUTEX_WAKE_OP:
1920 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2520 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -1931,6 +2531,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1931 if (futex_cmpxchg_enabled) 2531 if (futex_cmpxchg_enabled)
1932 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2532 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
1933 break; 2533 break;
2534 case FUTEX_WAIT_REQUEUE_PI:
2535 val3 = FUTEX_BITSET_MATCH_ANY;
2536 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
2537 clockrt, uaddr2);
2538 break;
2539 case FUTEX_CMP_REQUEUE_PI:
2540 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2541 1);
2542 break;
1934 default: 2543 default:
1935 ret = -ENOSYS; 2544 ret = -ENOSYS;
1936 } 2545 }
@@ -1948,7 +2557,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1948 int cmd = op & FUTEX_CMD_MASK; 2557 int cmd = op & FUTEX_CMD_MASK;
1949 2558
1950 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 2559 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
1951 cmd == FUTEX_WAIT_BITSET)) { 2560 cmd == FUTEX_WAIT_BITSET ||
2561 cmd == FUTEX_WAIT_REQUEUE_PI)) {
1952 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 2562 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1953 return -EFAULT; 2563 return -EFAULT;
1954 if (!timespec_valid(&ts)) 2564 if (!timespec_valid(&ts))
@@ -1960,11 +2570,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1960 tp = &t; 2570 tp = &t;
1961 } 2571 }
1962 /* 2572 /*
1963 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. 2573 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
1964 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 2574 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
1965 */ 2575 */
1966 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 2576 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
1967 cmd == FUTEX_WAKE_OP) 2577 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
1968 val2 = (u32) (unsigned long) utime; 2578 val2 = (u32) (unsigned long) utime;
1969 2579
1970 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2580 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee29..235716556bf1 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
180 int cmd = op & FUTEX_CMD_MASK; 180 int cmd = op & FUTEX_CMD_MASK;
181 181
182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
183 cmd == FUTEX_WAIT_BITSET)) { 183 cmd == FUTEX_WAIT_BITSET ||
184 cmd == FUTEX_WAIT_REQUEUE_PI)) {
184 if (get_compat_timespec(&ts, utime)) 185 if (get_compat_timespec(&ts, utime))
185 return -EFAULT; 186 return -EFAULT;
186 if (!timespec_valid(&ts)) 187 if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
191 t = ktime_add_safe(ktime_get(), t); 192 t = ktime_add_safe(ktime_get(), t);
192 tp = &t; 193 tp = &t;
193 } 194 }
194 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) 195 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
196 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
195 val2 = (int) (unsigned long) utime; 197 val2 = (int) (unsigned long) utime;
196 198
197 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 199 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
new file mode 100644
index 000000000000..22e9dcfaa3d3
--- /dev/null
+++ b/kernel/gcov/Kconfig
@@ -0,0 +1,48 @@
1menu "GCOV-based kernel profiling"
2
3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS && CONSTRUCTORS
6 default n
7 ---help---
8 This option enables gcov-based code profiling (e.g. for code coverage
9 measurements).
10
11 If unsure, say N.
12
13 Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
14 for the entire kernel. To enable profiling for specific files or
15 directories, add a line similar to the following to the respective
16 Makefile:
17
18 For a single file (e.g. main.o):
19 GCOV_PROFILE_main.o := y
20
21 For all files in one directory:
22 GCOV_PROFILE := y
23
24 To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
25 is specified, use:
26
27 GCOV_PROFILE_main.o := n
28 and:
29 GCOV_PROFILE := n
30
31 Note that the debugfs filesystem has to be mounted to access
32 profiling data.
33
34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL
37 depends on S390 || X86
38 default n
39 ---help---
40 This option activates profiling for the entire kernel.
41
42 If unsure, say N.
43
44 Note that a kernel compiled with profiling flags will be significantly
45 larger and run slower. Also be sure to exclude files from profiling
46 which are not linked to the kernel image to prevent linker errors.
47
48endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
new file mode 100644
index 000000000000..3f761001d517
--- /dev/null
+++ b/kernel/gcov/Makefile
@@ -0,0 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
new file mode 100644
index 000000000000..9b22d03cc581
--- /dev/null
+++ b/kernel/gcov/base.c
@@ -0,0 +1,148 @@
1/*
2 * This code maintains a list of active profiling data structures.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 */
15
16#define pr_fmt(fmt) "gcov: " fmt
17
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/mutex.h>
21#include "gcov.h"
22
23static struct gcov_info *gcov_info_head;
24static int gcov_events_enabled;
25static DEFINE_MUTEX(gcov_lock);
26
27/*
28 * __gcov_init is called by gcc-generated constructor code for each object
29 * file compiled with -fprofile-arcs.
30 */
31void __gcov_init(struct gcov_info *info)
32{
33 static unsigned int gcov_version;
34
35 mutex_lock(&gcov_lock);
36 if (gcov_version == 0) {
37 gcov_version = info->version;
38 /*
39 * Printing gcc's version magic may prove useful for debugging
40 * incompatibility reports.
41 */
42 pr_info("version magic: 0x%x\n", gcov_version);
43 }
44 /*
45 * Add new profiling data structure to list and inform event
46 * listener.
47 */
48 info->next = gcov_info_head;
49 gcov_info_head = info;
50 if (gcov_events_enabled)
51 gcov_event(GCOV_ADD, info);
52 mutex_unlock(&gcov_lock);
53}
54EXPORT_SYMBOL(__gcov_init);
55
56/*
57 * These functions may be referenced by gcc-generated profiling code but serve
58 * no function for kernel profiling.
59 */
60void __gcov_flush(void)
61{
62 /* Unused. */
63}
64EXPORT_SYMBOL(__gcov_flush);
65
66void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
67{
68 /* Unused. */
69}
70EXPORT_SYMBOL(__gcov_merge_add);
71
72void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
73{
74 /* Unused. */
75}
76EXPORT_SYMBOL(__gcov_merge_single);
77
78void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
79{
80 /* Unused. */
81}
82EXPORT_SYMBOL(__gcov_merge_delta);
83
84/**
85 * gcov_enable_events - enable event reporting through gcov_event()
86 *
87 * Turn on reporting of profiling data load/unload-events through the
88 * gcov_event() callback. Also replay all previous events once. This function
89 * is needed because some events are potentially generated too early for the
90 * callback implementation to handle them initially.
91 */
92void gcov_enable_events(void)
93{
94 struct gcov_info *info;
95
96 mutex_lock(&gcov_lock);
97 gcov_events_enabled = 1;
98 /* Perform event callback for previously registered entries. */
99 for (info = gcov_info_head; info; info = info->next)
100 gcov_event(GCOV_ADD, info);
101 mutex_unlock(&gcov_lock);
102}
103
104#ifdef CONFIG_MODULES
105static inline int within(void *addr, void *start, unsigned long size)
106{
107 return ((addr >= start) && (addr < start + size));
108}
109
110/* Update list and generate events when modules are unloaded. */
111static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
112 void *data)
113{
114 struct module *mod = data;
115 struct gcov_info *info;
116 struct gcov_info *prev;
117
118 if (event != MODULE_STATE_GOING)
119 return NOTIFY_OK;
120 mutex_lock(&gcov_lock);
121 prev = NULL;
122 /* Remove entries located in module from linked list. */
123 for (info = gcov_info_head; info; info = info->next) {
124 if (within(info, mod->module_core, mod->core_size)) {
125 if (prev)
126 prev->next = info->next;
127 else
128 gcov_info_head = info->next;
129 if (gcov_events_enabled)
130 gcov_event(GCOV_REMOVE, info);
131 } else
132 prev = info;
133 }
134 mutex_unlock(&gcov_lock);
135
136 return NOTIFY_OK;
137}
138
139static struct notifier_block gcov_nb = {
140 .notifier_call = gcov_module_notifier,
141};
142
143static int __init gcov_init(void)
144{
145 return register_module_notifier(&gcov_nb);
146}
147device_initcall(gcov_init);
148#endif /* CONFIG_MODULES */
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
new file mode 100644
index 000000000000..ef3c3f88a7a3
--- /dev/null
+++ b/kernel/gcov/fs.c
@@ -0,0 +1,673 @@
1/*
2 * This code exports profiling data as debugfs files to userspace.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 * Yi CDL Yang
15 */
16
17#define pr_fmt(fmt) "gcov: " fmt
18
19#include <linux/init.h>
20#include <linux/module.h>
21#include <linux/debugfs.h>
22#include <linux/fs.h>
23#include <linux/list.h>
24#include <linux/string.h>
25#include <linux/slab.h>
26#include <linux/mutex.h>
27#include <linux/seq_file.h>
28#include "gcov.h"
29
30/**
31 * struct gcov_node - represents a debugfs entry
32 * @list: list head for child node list
33 * @children: child nodes
34 * @all: list head for list of all nodes
35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory
37 * @ghost: when an object file containing profiling data is unloaded we keep a
38 * copy of the profiling data here to allow collecting coverage data
39 * for cleanup code. Such a node is called a "ghost".
40 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links
42 * @name: data file basename
43 *
44 * struct gcov_node represents an entity within the gcov/ subdirectory
45 * of debugfs. There are directory and data file nodes. The latter represent
46 * the actual synthesized data file plus any associated symbolic links which
47 * are needed by the gcov tool to work correctly.
48 */
49struct gcov_node {
50 struct list_head list;
51 struct list_head children;
52 struct list_head all;
53 struct gcov_node *parent;
54 struct gcov_info *info;
55 struct gcov_info *ghost;
56 struct dentry *dentry;
57 struct dentry **links;
58 char name[0];
59};
60
61static const char objtree[] = OBJTREE;
62static const char srctree[] = SRCTREE;
63static struct gcov_node root_node;
64static struct dentry *reset_dentry;
65static LIST_HEAD(all_head);
66static DEFINE_MUTEX(node_lock);
67
68/* If non-zero, keep copies of profiling data for unloaded modules. */
69static int gcov_persist = 1;
70
71static int __init gcov_persist_setup(char *str)
72{
73 unsigned long val;
74
75 if (strict_strtoul(str, 0, &val)) {
76 pr_warning("invalid gcov_persist parameter '%s'\n", str);
77 return 0;
78 }
79 gcov_persist = val;
80 pr_info("setting gcov_persist to %d\n", gcov_persist);
81
82 return 1;
83}
84__setup("gcov_persist=", gcov_persist_setup);
85
86/*
87 * seq_file.start() implementation for gcov data files. Note that the
88 * gcov_iterator interface is designed to be more restrictive than seq_file
89 * (no start from arbitrary position, etc.), to simplify the iterator
90 * implementation.
91 */
92static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
93{
94 loff_t i;
95
96 gcov_iter_start(seq->private);
97 for (i = 0; i < *pos; i++) {
98 if (gcov_iter_next(seq->private))
99 return NULL;
100 }
101 return seq->private;
102}
103
104/* seq_file.next() implementation for gcov data files. */
105static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
106{
107 struct gcov_iterator *iter = data;
108
109 if (gcov_iter_next(iter))
110 return NULL;
111 (*pos)++;
112
113 return iter;
114}
115
116/* seq_file.show() implementation for gcov data files. */
117static int gcov_seq_show(struct seq_file *seq, void *data)
118{
119 struct gcov_iterator *iter = data;
120
121 if (gcov_iter_write(iter, seq))
122 return -EINVAL;
123 return 0;
124}
125
126static void gcov_seq_stop(struct seq_file *seq, void *data)
127{
128 /* Unused. */
129}
130
131static const struct seq_operations gcov_seq_ops = {
132 .start = gcov_seq_start,
133 .next = gcov_seq_next,
134 .show = gcov_seq_show,
135 .stop = gcov_seq_stop,
136};
137
138/*
139 * Return the profiling data set for a given node. This can either be the
140 * original profiling data structure or a duplicate (also called "ghost")
141 * in case the associated object file has been unloaded.
142 */
143static struct gcov_info *get_node_info(struct gcov_node *node)
144{
145 if (node->info)
146 return node->info;
147
148 return node->ghost;
149}
150
151/*
152 * open() implementation for gcov data files. Create a copy of the profiling
153 * data set and initialize the iterator and seq_file interface.
154 */
155static int gcov_seq_open(struct inode *inode, struct file *file)
156{
157 struct gcov_node *node = inode->i_private;
158 struct gcov_iterator *iter;
159 struct seq_file *seq;
160 struct gcov_info *info;
161 int rc = -ENOMEM;
162
163 mutex_lock(&node_lock);
164 /*
165 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access.
167 */
168 info = gcov_info_dup(get_node_info(node));
169 if (!info)
170 goto out_unlock;
171 iter = gcov_iter_new(info);
172 if (!iter)
173 goto err_free_info;
174 rc = seq_open(file, &gcov_seq_ops);
175 if (rc)
176 goto err_free_iter_info;
177 seq = file->private_data;
178 seq->private = iter;
179out_unlock:
180 mutex_unlock(&node_lock);
181 return rc;
182
183err_free_iter_info:
184 gcov_iter_free(iter);
185err_free_info:
186 gcov_info_free(info);
187 goto out_unlock;
188}
189
190/*
191 * release() implementation for gcov data files. Release resources allocated
192 * by open().
193 */
194static int gcov_seq_release(struct inode *inode, struct file *file)
195{
196 struct gcov_iterator *iter;
197 struct gcov_info *info;
198 struct seq_file *seq;
199
200 seq = file->private_data;
201 iter = seq->private;
202 info = gcov_iter_get_info(iter);
203 gcov_iter_free(iter);
204 gcov_info_free(info);
205 seq_release(inode, file);
206
207 return 0;
208}
209
210/*
211 * Find a node by the associated data file name. Needs to be called with
212 * node_lock held.
213 */
214static struct gcov_node *get_node_by_name(const char *name)
215{
216 struct gcov_node *node;
217 struct gcov_info *info;
218
219 list_for_each_entry(node, &all_head, all) {
220 info = get_node_info(node);
221 if (info && (strcmp(info->filename, name) == 0))
222 return node;
223 }
224
225 return NULL;
226}
227
228static void remove_node(struct gcov_node *node);
229
230/*
231 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is
233 * a "ghost" node), remove the debugfs node as well.
234 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos)
237{
238 struct seq_file *seq;
239 struct gcov_info *info;
240 struct gcov_node *node;
241
242 seq = file->private_data;
243 info = gcov_iter_get_info(seq->private);
244 mutex_lock(&node_lock);
245 node = get_node_by_name(info->filename);
246 if (node) {
247 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost)
249 remove_node(node);
250 else
251 gcov_info_reset(node->info);
252 }
253 /* Reset counts for open file. */
254 gcov_info_reset(info);
255 mutex_unlock(&node_lock);
256
257 return len;
258}
259
260/*
261 * Given a string <path> representing a file path of format:
262 * path/to/file.gcda
263 * construct and return a new string:
264 * <dir/>path/to/file.<ext>
265 */
266static char *link_target(const char *dir, const char *path, const char *ext)
267{
268 char *target;
269 char *old_ext;
270 char *copy;
271
272 copy = kstrdup(path, GFP_KERNEL);
273 if (!copy)
274 return NULL;
275 old_ext = strrchr(copy, '.');
276 if (old_ext)
277 *old_ext = '\0';
278 if (dir)
279 target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
280 else
281 target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
282 kfree(copy);
283
284 return target;
285}
286
287/*
288 * Construct a string representing the symbolic link target for the given
289 * gcov data file name and link type. Depending on the link type and the
290 * location of the data file, the link target can either point to a
291 * subdirectory of srctree, objtree or in an external location.
292 */
293static char *get_link_target(const char *filename, const struct gcov_link *ext)
294{
295 const char *rel;
296 char *result;
297
298 if (strncmp(filename, objtree, strlen(objtree)) == 0) {
299 rel = filename + strlen(objtree) + 1;
300 if (ext->dir == SRC_TREE)
301 result = link_target(srctree, rel, ext->ext);
302 else
303 result = link_target(objtree, rel, ext->ext);
304 } else {
305 /* External compilation. */
306 result = link_target(NULL, filename, ext->ext);
307 }
308
309 return result;
310}
311
312#define SKEW_PREFIX ".tmp_"
313
314/*
315 * For a filename .tmp_filename.ext return filename.ext. Needed to compensate
316 * for filename skewing caused by the mod-versioning mechanism.
317 */
318static const char *deskew(const char *basename)
319{
320 if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
321 return basename + sizeof(SKEW_PREFIX) - 1;
322 return basename;
323}
324
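A user-space sketch of the two path helpers above, with strdup()/asprintf() standing in for kstrdup()/kasprintf(); the example paths are made up and only illustrate the expected output:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SKEW_PREFIX ".tmp_"

/* Same logic as the kernel deskew() above. */
static const char *deskew(const char *basename)
{
        if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
                return basename + sizeof(SKEW_PREFIX) - 1;
        return basename;
}

/* User-space port of link_target(): swap the extension, prefix a directory. */
static char *link_target(const char *dir, const char *path, const char *ext)
{
        char *target = NULL;
        char *copy = strdup(path);
        char *old_ext;

        if (!copy)
                return NULL;
        old_ext = strrchr(copy, '.');
        if (old_ext)
                *old_ext = '\0';
        if (asprintf(&target, "%s%s%s.%s", dir ? dir : "", dir ? "/" : "",
                     copy, ext) < 0)
                target = NULL;
        free(copy);
        return target;
}

int main(void)
{
        char *t = link_target("/usr/src/linux", "kernel/gcov/.tmp_fs.gcda",
                              "gcno");

        if (t) {
                /* prints: /usr/src/linux/kernel/gcov/.tmp_fs.gcno -> fs.gcno */
                printf("%s -> %s\n", t, deskew(strrchr(t, '/') + 1));
                free(t);
        }
        return 0;
}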
325/*
326 * Create links to additional files (usually .c and .gcno files) which the
327 * gcov tool expects to find in the same directory as the gcov data file.
328 */
329static void add_links(struct gcov_node *node, struct dentry *parent)
330{
331 char *basename;
332 char *target;
333 int num;
334 int i;
335
336 for (num = 0; gcov_link[num].ext; num++)
337 /* Nothing. */;
338 node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
339 if (!node->links)
340 return;
341 for (i = 0; i < num; i++) {
342 target = get_link_target(get_node_info(node)->filename,
343 &gcov_link[i]);
344 if (!target)
345 goto out_err;
346 basename = strrchr(target, '/');
347 if (!basename)
348 goto out_err;
349 basename++;
350 node->links[i] = debugfs_create_symlink(deskew(basename),
351 parent, target);
352 if (!node->links[i])
353 goto out_err;
354 kfree(target);
355 }
356
357 return;
358out_err:
359 kfree(target);
360 while (i-- > 0)
361 debugfs_remove(node->links[i]);
362 kfree(node->links);
363 node->links = NULL;
364}
365
366static const struct file_operations gcov_data_fops = {
367 .open = gcov_seq_open,
368 .release = gcov_seq_release,
369 .read = seq_read,
370 .llseek = seq_lseek,
371 .write = gcov_seq_write,
372};
373
374/* Basic initialization of a new node. */
375static void init_node(struct gcov_node *node, struct gcov_info *info,
376 const char *name, struct gcov_node *parent)
377{
378 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all);
381 node->info = info;
382 node->parent = parent;
383 if (name)
384 strcpy(node->name, name);
385}
386
387/*
388 * Create a new node and associated debugfs entry. Needs to be called with
389 * node_lock held.
390 */
391static struct gcov_node *new_node(struct gcov_node *parent,
392 struct gcov_info *info, const char *name)
393{
394 struct gcov_node *node;
395
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) {
398 pr_warning("out of memory\n");
399 return NULL;
400 }
401 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */
403 if (info) {
404 node->dentry = debugfs_create_file(deskew(node->name), 0600,
405 parent->dentry, node, &gcov_data_fops);
406 } else
407 node->dentry = debugfs_create_dir(node->name, parent->dentry);
408 if (!node->dentry) {
409 pr_warning("could not create file\n");
410 kfree(node);
411 return NULL;
412 }
413 if (info)
414 add_links(node, parent->dentry);
415 list_add(&node->list, &parent->children);
416 list_add(&node->all, &all_head);
417
418 return node;
419}
420
421/* Remove symbolic links associated with node. */
422static void remove_links(struct gcov_node *node)
423{
424 int i;
425
426 if (!node->links)
427 return;
428 for (i = 0; gcov_link[i].ext; i++)
429 debugfs_remove(node->links[i]);
430 kfree(node->links);
431 node->links = NULL;
432}
433
434/*
435 * Remove node from all lists and debugfs and release associated resources.
436 * Needs to be called with node_lock held.
437 */
438static void release_node(struct gcov_node *node)
439{
440 list_del(&node->list);
441 list_del(&node->all);
442 debugfs_remove(node->dentry);
443 remove_links(node);
444 if (node->ghost)
445 gcov_info_free(node->ghost);
446 kfree(node);
447}
448
449/* Release node and empty parents. Needs to be called with node_lock held. */
450static void remove_node(struct gcov_node *node)
451{
452 struct gcov_node *parent;
453
454 while ((node != &root_node) && list_empty(&node->children)) {
455 parent = node->parent;
456 release_node(node);
457 node = parent;
458 }
459}
460
461/*
462 * Find child node with given basename. Needs to be called with node_lock
463 * held.
464 */
465static struct gcov_node *get_child_by_name(struct gcov_node *parent,
466 const char *name)
467{
468 struct gcov_node *node;
469
470 list_for_each_entry(node, &parent->children, list) {
471 if (strcmp(node->name, name) == 0)
472 return node;
473 }
474
475 return NULL;
476}
477
478/*
479 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes.
481 */
482static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos)
484{
485 struct gcov_node *node;
486
487 mutex_lock(&node_lock);
488restart:
489 list_for_each_entry(node, &all_head, all) {
490 if (node->info)
491 gcov_info_reset(node->info);
492 else if (list_empty(&node->children)) {
493 remove_node(node);
494 /* Several nodes may have gone - restart loop. */
495 goto restart;
496 }
497 }
498 mutex_unlock(&node_lock);
499
500 return len;
501}
502
503/* read() implementation for reset file. Unused. */
504static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
505 loff_t *pos)
506{
507 /* Allow read operation so that a recursive copy won't fail. */
508 return 0;
509}
510
511static const struct file_operations gcov_reset_fops = {
512 .write = reset_write,
513 .read = reset_read,
514};
515
516/*
517 * Create a node for a given profiling data set and add it to all lists and
518 * debugfs. Needs to be called with node_lock held.
519 */
520static void add_node(struct gcov_info *info)
521{
522 char *filename;
523 char *curr;
524 char *next;
525 struct gcov_node *parent;
526 struct gcov_node *node;
527
528 filename = kstrdup(info->filename, GFP_KERNEL);
529 if (!filename)
530 return;
531 parent = &root_node;
532 /* Create directory nodes along the path. */
533 for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
534 if (curr == next)
535 continue;
536 *next = 0;
537 if (strcmp(curr, ".") == 0)
538 continue;
539 if (strcmp(curr, "..") == 0) {
540 if (!parent->parent)
541 goto err_remove;
542 parent = parent->parent;
543 continue;
544 }
545 node = get_child_by_name(parent, curr);
546 if (!node) {
547 node = new_node(parent, NULL, curr);
548 if (!node)
549 goto err_remove;
550 }
551 parent = node;
552 }
553 /* Create file node. */
554 node = new_node(parent, info, curr);
555 if (!node)
556 goto err_remove;
557out:
558 kfree(filename);
559 return;
560
561err_remove:
562 remove_node(parent);
563 goto out;
564}
565
566/*
567 * The profiling data set associated with this node is being unloaded. Store a
568 * copy of the profiling data and turn this node into a "ghost".
569 */
570static int ghost_node(struct gcov_node *node)
571{
572 node->ghost = gcov_info_dup(node->info);
573 if (!node->ghost) {
574 pr_warning("could not save data for '%s' (out of memory)\n",
575 node->info->filename);
576 return -ENOMEM;
577 }
578 node->info = NULL;
579
580 return 0;
581}
582
583/*
584 * Profiling data for this node has been loaded again. Add profiling data
585 * from previous instantiation and turn this node into a regular node.
586 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info)
588{
589 if (gcov_info_is_compatible(node->ghost, info))
590 gcov_info_add(info, node->ghost);
591 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n",
593 info->filename);
594 }
595 gcov_info_free(node->ghost);
596 node->ghost = NULL;
597 node->info = info;
598}
599
600/*
601 * Callback to create/remove profiling files when code compiled with
602 * -fprofile-arcs is loaded/unloaded.
603 */
604void gcov_event(enum gcov_action action, struct gcov_info *info)
605{
606 struct gcov_node *node;
607
608 mutex_lock(&node_lock);
609 node = get_node_by_name(info->filename);
610 switch (action) {
611 case GCOV_ADD:
612 /* Add new node or revive ghost. */
613 if (!node) {
614 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break;
624 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */
626 if (!node) {
627 pr_warning("could not remove '%s' (not found)\n",
628 info->filename);
629 break;
630 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break;
637 }
638 mutex_unlock(&node_lock);
639}
640
641/* Create debugfs entries. */
642static __init int gcov_fs_init(void)
643{
644 int rc = -EIO;
645
646 init_node(&root_node, NULL, NULL, NULL);
647 /*
648 * /sys/kernel/debug/gcov will be parent for the reset control file
649 * and all profiling files.
650 */
651 root_node.dentry = debugfs_create_dir("gcov", NULL);
652 if (!root_node.dentry)
653 goto err_remove;
654 /*
655 * Create reset file which resets all profiling counts when written
656 * to.
657 */
658 reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
659 NULL, &gcov_reset_fops);
660 if (!reset_dentry)
661 goto err_remove;
662 /* Replay previous events to get our fs hierarchy up-to-date. */
663 gcov_enable_events();
664 return 0;
665
666err_remove:
667 pr_err("init failed\n");
668 if (root_node.dentry)
669 debugfs_remove(root_node.dentry);
670
671 return rc;
672}
673device_initcall(gcov_fs_init);
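To make the debugfs interface above concrete, here is a minimal user-space sketch that exercises it. It assumes debugfs is mounted at /sys/kernel/debug and that the kernel was built with gcov profiling enabled; the .gcda path used below is hypothetical and depends on the build tree layout.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd;

        /* Writing anything to "reset" zeroes all profiling counts. */
        fd = open("/sys/kernel/debug/gcov/reset", O_WRONLY);
        if (fd >= 0) {
                if (write(fd, "1", 1) != 1)
                        perror("write reset");
                close(fd);
        }

        /* Data files are served via seq_file; reading returns .gcda content.
         * The path below is a hypothetical example mirroring a build dir. */
        fd = open("/sys/kernel/debug/gcov/usr/src/linux/kernel/gcov/fs.gcda",
                  O_RDONLY);
        if (fd < 0) {
                perror("open gcov data file");
                return 1;
        }
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);
        close(fd);
        return 0;
}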
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
new file mode 100644
index 000000000000..ae5bb4260033
--- /dev/null
+++ b/kernel/gcov/gcc_3_4.c
@@ -0,0 +1,447 @@
1/*
2 * This code provides functions to handle gcc's profiling data format
3 * introduced with gcc 3.4. Future versions of gcc may change the gcov
4 * format (as happened before), so all format-specific information needs
5 * to be kept modular and easily exchangeable.
6 *
7 * This file is based on gcc-internal definitions. Functions and data
8 * structures are defined to be compatible with gcc counterparts.
9 * For a better understanding, refer to gcc source: gcc/gcov-io.h.
10 *
11 * Copyright IBM Corp. 2009
12 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 *
14 * Uses gcc-internal data definitions.
15 */
16
17#include <linux/errno.h>
18#include <linux/slab.h>
19#include <linux/string.h>
20#include <linux/seq_file.h>
21#include <linux/vmalloc.h>
22#include "gcov.h"
23
24/* Symbolic links to be created for each profiling data file. */
25const struct gcov_link gcov_link[] = {
26 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
27 { 0, NULL},
28};
29
30/*
31 * Determine whether a counter is active. Based on gcc magic. Doesn't change
32 * at run-time.
33 */
34static int counter_active(struct gcov_info *info, unsigned int type)
35{
36 return (1 << type) & info->ctr_mask;
37}
38
39/* Determine number of active counters. Based on gcc magic. */
40static unsigned int num_counter_active(struct gcov_info *info)
41{
42 unsigned int i;
43 unsigned int result = 0;
44
45 for (i = 0; i < GCOV_COUNTERS; i++) {
46 if (counter_active(info, i))
47 result++;
48 }
49 return result;
50}
51
52/**
53 * gcov_info_reset - reset profiling data to zero
54 * @info: profiling data set
55 */
56void gcov_info_reset(struct gcov_info *info)
57{
58 unsigned int active = num_counter_active(info);
59 unsigned int i;
60
61 for (i = 0; i < active; i++) {
62 memset(info->counts[i].values, 0,
63 info->counts[i].num * sizeof(gcov_type));
64 }
65}
66
67/**
68 * gcov_info_is_compatible - check if profiling data can be added
69 * @info1: first profiling data set
70 * @info2: second profiling data set
71 *
72 * Returns non-zero if profiling data can be added, zero otherwise.
73 */
74int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
75{
76 return (info1->stamp == info2->stamp);
77}
78
79/**
80 * gcov_info_add - add up profiling data
81 * @dest: profiling data set to which data is added
82 * @source: profiling data set which is added
83 *
84 * Adds profiling counts of @source to @dest.
85 */
86void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
87{
88 unsigned int i;
89 unsigned int j;
90
91 for (i = 0; i < num_counter_active(dest); i++) {
92 for (j = 0; j < dest->counts[i].num; j++) {
93 dest->counts[i].values[j] +=
94 source->counts[i].values[j];
95 }
96 }
97}
98
99/* Get size of function info entry. Based on gcc magic. */
100static size_t get_fn_size(struct gcov_info *info)
101{
102 size_t size;
103
104 size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
105 sizeof(unsigned int);
106 if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
107 size = ALIGN(size, __alignof__(struct gcov_fn_info));
108 return size;
109}
110
111/* Get address of function info entry. Based on gcc magic. */
112static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
113{
114 return (struct gcov_fn_info *)
115 ((char *) info->functions + fn * get_fn_size(info));
116}
117
118/**
119 * gcov_info_dup - duplicate profiling data set
120 * @info: profiling data set to duplicate
121 *
122 * Return newly allocated duplicate on success, %NULL on error.
123 */
124struct gcov_info *gcov_info_dup(struct gcov_info *info)
125{
126 struct gcov_info *dup;
127 unsigned int i;
128 unsigned int active;
129
130 /* Duplicate gcov_info. */
131 active = num_counter_active(info);
132 dup = kzalloc(sizeof(struct gcov_info) +
133 sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
134 if (!dup)
135 return NULL;
136 dup->version = info->version;
137 dup->stamp = info->stamp;
138 dup->n_functions = info->n_functions;
139 dup->ctr_mask = info->ctr_mask;
140 /* Duplicate filename. */
141 dup->filename = kstrdup(info->filename, GFP_KERNEL);
142 if (!dup->filename)
143 goto err_free;
144 /* Duplicate table of functions. */
145 dup->functions = kmemdup(info->functions, info->n_functions *
146 get_fn_size(info), GFP_KERNEL);
147 if (!dup->functions)
148 goto err_free;
149 /* Duplicate counter arrays. */
150 for (i = 0; i < active ; i++) {
151 struct gcov_ctr_info *ctr = &info->counts[i];
152 size_t size = ctr->num * sizeof(gcov_type);
153
154 dup->counts[i].num = ctr->num;
155 dup->counts[i].merge = ctr->merge;
156 dup->counts[i].values = vmalloc(size);
157 if (!dup->counts[i].values)
158 goto err_free;
159 memcpy(dup->counts[i].values, ctr->values, size);
160 }
161 return dup;
162
163err_free:
164 gcov_info_free(dup);
165 return NULL;
166}
167
168/**
169 * gcov_info_free - release memory for profiling data set duplicate
170 * @info: profiling data set duplicate to free
171 */
172void gcov_info_free(struct gcov_info *info)
173{
174 unsigned int active = num_counter_active(info);
175 unsigned int i;
176
177 for (i = 0; i < active ; i++)
178 vfree(info->counts[i].values);
179 kfree(info->functions);
180 kfree(info->filename);
181 kfree(info);
182}
183
184/**
185 * struct type_info - iterator helper array
186 * @ctr_type: counter type
187 * @offset: index of the first value of the current function for this type
188 *
189 * This array is needed to convert the in-memory data format into the in-file
190 * data format:
191 *
192 * In-memory:
193 * for each counter type
194 * for each function
195 * values
196 *
197 * In-file:
198 * for each function
199 * for each counter type
200 * values
201 *
202 * See gcc source gcc/gcov-io.h for more information on data organization.
203 */
204struct type_info {
205 int ctr_type;
206 unsigned int offset;
207};
208
209/**
210 * struct gcov_iterator - specifies current file position in logical records
211 * @info: associated profiling data
212 * @record: record type
213 * @function: function number
214 * @type: counter type
215 * @count: index into values array
216 * @num_types: number of counter types
217 * @type_info: helper array to get values-array offset for current function
218 */
219struct gcov_iterator {
220 struct gcov_info *info;
221
222 int record;
223 unsigned int function;
224 unsigned int type;
225 unsigned int count;
226
227 int num_types;
228 struct type_info type_info[0];
229};
230
231static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
232{
233 return get_fn_info(iter->info, iter->function);
234}
235
236static struct type_info *get_type(struct gcov_iterator *iter)
237{
238 return &iter->type_info[iter->type];
239}
240
241/**
242 * gcov_iter_new - allocate and initialize profiling data iterator
243 * @info: profiling data set to be iterated
244 *
245 * Return file iterator on success, %NULL otherwise.
246 */
247struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
248{
249 struct gcov_iterator *iter;
250
251 iter = kzalloc(sizeof(struct gcov_iterator) +
252 num_counter_active(info) * sizeof(struct type_info),
253 GFP_KERNEL);
254 if (iter)
255 iter->info = info;
256
257 return iter;
258}
259
260/**
261 * gcov_iter_free - release memory for iterator
262 * @iter: file iterator to free
263 */
264void gcov_iter_free(struct gcov_iterator *iter)
265{
266 kfree(iter);
267}
268
269/**
270 * gcov_iter_get_info - return profiling data set for given file iterator
271 * @iter: file iterator
272 */
273struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
274{
275 return iter->info;
276}
277
278/**
279 * gcov_iter_start - reset file iterator to starting position
280 * @iter: file iterator
281 */
282void gcov_iter_start(struct gcov_iterator *iter)
283{
284 int i;
285
286 iter->record = 0;
287 iter->function = 0;
288 iter->type = 0;
289 iter->count = 0;
290 iter->num_types = 0;
291 for (i = 0; i < GCOV_COUNTERS; i++) {
292 if (counter_active(iter->info, i)) {
293 iter->type_info[iter->num_types].ctr_type = i;
294 iter->type_info[iter->num_types++].offset = 0;
295 }
296 }
297}
298
299/* Mapping of logical record number to actual file content. */
300#define RECORD_FILE_MAGIC 0
301#define RECORD_GCOV_VERSION 1
302#define RECORD_TIME_STAMP 2
303#define RECORD_FUNCTION_TAG 3
304#define RECORD_FUNCTON_TAG_LEN 4
305#define RECORD_FUNCTION_IDENT 5
306#define RECORD_FUNCTION_CHECK 6
307#define RECORD_COUNT_TAG 7
308#define RECORD_COUNT_LEN 8
309#define RECORD_COUNT 9
310
311/**
312 * gcov_iter_next - advance file iterator to next logical record
313 * @iter: file iterator
314 *
315 * Return zero if new position is valid, non-zero if iterator has reached end.
316 */
317int gcov_iter_next(struct gcov_iterator *iter)
318{
319 switch (iter->record) {
320 case RECORD_FILE_MAGIC:
321 case RECORD_GCOV_VERSION:
322 case RECORD_FUNCTION_TAG:
323 case RECORD_FUNCTON_TAG_LEN:
324 case RECORD_FUNCTION_IDENT:
325 case RECORD_COUNT_TAG:
326 /* Advance to next record */
327 iter->record++;
328 break;
329 case RECORD_COUNT:
330 /* Advance to next count */
331 iter->count++;
332 /* fall through */
333 case RECORD_COUNT_LEN:
334 if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
335 iter->record = 9;
336 break;
337 }
338 /* Advance to next counter type */
339 get_type(iter)->offset += iter->count;
340 iter->count = 0;
341 iter->type++;
342 /* fall through */
343 case RECORD_FUNCTION_CHECK:
344 if (iter->type < iter->num_types) {
345 iter->record = 7;
346 break;
347 }
348 /* Advance to next function */
349 iter->type = 0;
350 iter->function++;
351 /* fall through */
352 case RECORD_TIME_STAMP:
353 if (iter->function < iter->info->n_functions)
354 iter->record = 3;
355 else
356 iter->record = -1;
357 break;
358 }
359 /* Check for EOF. */
360 if (iter->record == -1)
361 return -EINVAL;
362 else
363 return 0;
364}
365
366/**
367 * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
368 * @seq: seq_file handle
369 * @v: value to be stored
370 *
371 * Number format defined by gcc: numbers are recorded in the 32 bit
372 * unsigned binary form of the endianness of the machine generating the
373 * file.
374 */
375static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
376{
377 return seq_write(seq, &v, sizeof(v));
378}
379
380/**
381 * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
382 * @seq: seq_file handle
383 * @v: value to be stored
384 *
385 * Number format defined by gcc: numbers are recorded in the 32 bit
386 * unsigned binary form of the endianness of the machine generating the
387 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
388 * first.
389 */
390static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
391{
392 u32 data[2];
393
394 data[0] = (v & 0xffffffffUL);
395 data[1] = (v >> 32);
396 return seq_write(seq, data, sizeof(data));
397}
398
399/**
400 * gcov_iter_write - write data for current pos to seq_file
401 * @iter: file iterator
402 * @seq: seq_file handle
403 *
404 * Return zero on success, non-zero otherwise.
405 */
406int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
407{
408 int rc = -EINVAL;
409
410 switch (iter->record) {
411 case RECORD_FILE_MAGIC:
412 rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
413 break;
414 case RECORD_GCOV_VERSION:
415 rc = seq_write_gcov_u32(seq, iter->info->version);
416 break;
417 case RECORD_TIME_STAMP:
418 rc = seq_write_gcov_u32(seq, iter->info->stamp);
419 break;
420 case RECORD_FUNCTION_TAG:
421 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
422 break;
423 case RECORD_FUNCTON_TAG_LEN:
424 rc = seq_write_gcov_u32(seq, 2);
425 break;
426 case RECORD_FUNCTION_IDENT:
427 rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
428 break;
429 case RECORD_FUNCTION_CHECK:
430 rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
431 break;
432 case RECORD_COUNT_TAG:
433 rc = seq_write_gcov_u32(seq,
434 GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
435 break;
436 case RECORD_COUNT_LEN:
437 rc = seq_write_gcov_u32(seq,
438 get_func(iter)->n_ctrs[iter->type] * 2);
439 break;
440 case RECORD_COUNT:
441 rc = seq_write_gcov_u64(seq,
442 iter->info->counts[iter->type].
443 values[iter->count + get_type(iter)->offset]);
444 break;
445 }
446 return rc;
447}
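As a quick check of the on-disk number format implemented by seq_write_gcov_u64() above, this stand-alone sketch splits a 64-bit value into two 32-bit words, low part first; gcov_u64_to_words() is a made-up helper name, not part of the patch.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void gcov_u64_to_words(uint64_t v, uint32_t out[2])
{
        out[0] = (uint32_t)(v & 0xffffffffUL);  /* low word is written first */
        out[1] = (uint32_t)(v >> 32);
}

int main(void)
{
        uint32_t w[2];

        gcov_u64_to_words(0x1122334455667788ULL, w);
        printf("low=0x%08" PRIx32 " high=0x%08" PRIx32 "\n", w[0], w[1]);
        /* prints: low=0x55667788 high=0x11223344 */
        return 0;
}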
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
new file mode 100644
index 000000000000..060073ebf7a6
--- /dev/null
+++ b/kernel/gcov/gcov.h
@@ -0,0 +1,128 @@
1/*
2 * Profiling infrastructure declarations.
3 *
4 * This file is based on gcc-internal definitions. Data structures are
5 * defined to be compatible with gcc counterparts. For a better
6 * understanding, refer to gcc source: gcc/gcov-io.h.
7 *
8 * Copyright IBM Corp. 2009
9 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
10 *
11 * Uses gcc-internal data definitions.
12 */
13
14#ifndef GCOV_H
15#define GCOV_H GCOV_H
16
17#include <linux/types.h>
18
19/*
20 * Profiling data types used for gcc 3.4 and above - these are defined by
21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible.
23 */
24#define GCOV_COUNTERS 5
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
28#define GCOV_TAG_FOR_COUNTER(count) \
29 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
30
31#if BITS_PER_LONG >= 64
32typedef long gcov_type;
33#else
34typedef long long gcov_type;
35#endif
36
37/**
38 * struct gcov_fn_info - profiling meta data per function
39 * @ident: object file-unique function identifier
40 * @checksum: function checksum
41 * @n_ctrs: number of values per counter type belonging to this function
42 *
43 * This data is generated by gcc during compilation and doesn't change
44 * at run-time.
45 */
46struct gcov_fn_info {
47 unsigned int ident;
48 unsigned int checksum;
49 unsigned int n_ctrs[0];
50};
51
52/**
53 * struct gcov_ctr_info - profiling data per counter type
54 * @num: number of counter values for this type
55 * @values: array of counter values for this type
56 * @merge: merge function for counter values of this type (unused)
57 *
58 * This data is generated by gcc during compilation and doesn't change
59 * at run-time with the exception of the values array.
60 */
61struct gcov_ctr_info {
62 unsigned int num;
63 gcov_type *values;
64 void (*merge)(gcov_type *, unsigned int);
65};
66
67/**
68 * struct gcov_info - profiling data per object file
69 * @version: gcov version magic indicating the gcc version used for compilation
70 * @next: list head for a singly-linked list
71 * @stamp: time stamp
72 * @filename: name of the associated gcov data file
73 * @n_functions: number of instrumented functions
74 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active
76 * @counts: counter data per counter type
77 *
78 * This data is generated by gcc during compilation and doesn't change
79 * at run-time with the exception of the next pointer.
80 */
81struct gcov_info {
82 unsigned int version;
83 struct gcov_info *next;
84 unsigned int stamp;
85 const char *filename;
86 unsigned int n_functions;
87 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask;
89 struct gcov_ctr_info counts[0];
90};
91
92/* Base interface. */
93enum gcov_action {
94 GCOV_ADD,
95 GCOV_REMOVE,
96};
97
98void gcov_event(enum gcov_action action, struct gcov_info *info);
99void gcov_enable_events(void);
100
101/* Iterator control. */
102struct seq_file;
103struct gcov_iterator;
104
105struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
106void gcov_iter_free(struct gcov_iterator *iter);
107void gcov_iter_start(struct gcov_iterator *iter);
108int gcov_iter_next(struct gcov_iterator *iter);
109int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
110struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
111
112/* gcov_info control. */
113void gcov_info_reset(struct gcov_info *info);
114int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
115void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
116struct gcov_info *gcov_info_dup(struct gcov_info *info);
117void gcov_info_free(struct gcov_info *info);
118
119struct gcov_link {
120 enum {
121 OBJ_TREE,
122 SRC_TREE,
123 } dir;
124 const char *ext;
125};
126extern const struct gcov_link gcov_link[];
127
128#endif /* GCOV_H */
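The counter tags used by the iterator code are pure arithmetic on GCOV_TAG_COUNTER_BASE. A small stand-alone sketch of that formula, with the values copied from the header above:

#include <stdio.h>

#define GCOV_TAG_COUNTER_BASE   ((unsigned int) 0x01a10000)
#define GCOV_TAG_FOR_COUNTER(count) \
        (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))

int main(void)
{
        unsigned int i;

        /* GCOV_COUNTERS is 5 for the gcc 3.4 format handled here. */
        for (i = 0; i < 5; i++)
                printf("counter %u -> tag 0x%08x\n", i,
                       GCOV_TAG_FOR_COUNTER(i));
        /* counter 0 -> tag 0x01a10000, counter 1 -> tag 0x01a30000, ... */
        return 0;
}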
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 000000000000..2b45b2ee3964
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,288 @@
1/*
2 * Supplementary group IDs
3 */
4#include <linux/cred.h>
5#include <linux/module.h>
6#include <linux/slab.h>
7#include <linux/security.h>
8#include <linux/syscalls.h>
9#include <asm/uaccess.h>
10
11/* init to 2 - one for init_task, one to ensure it is never freed */
12struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
13
14struct group_info *groups_alloc(int gidsetsize)
15{
16 struct group_info *group_info;
17 int nblocks;
18 int i;
19
20 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
21 /* Make sure we always allocate at least one indirect block pointer */
22 nblocks = nblocks ? : 1;
23 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
24 if (!group_info)
25 return NULL;
26 group_info->ngroups = gidsetsize;
27 group_info->nblocks = nblocks;
28 atomic_set(&group_info->usage, 1);
29
30 if (gidsetsize <= NGROUPS_SMALL)
31 group_info->blocks[0] = group_info->small_block;
32 else {
33 for (i = 0; i < nblocks; i++) {
34 gid_t *b;
35 b = (void *)__get_free_page(GFP_USER);
36 if (!b)
37 goto out_undo_partial_alloc;
38 group_info->blocks[i] = b;
39 }
40 }
41 return group_info;
42
43out_undo_partial_alloc:
44 while (--i >= 0) {
45 free_page((unsigned long)group_info->blocks[i]);
46 }
47 kfree(group_info);
48 return NULL;
49}
50
51EXPORT_SYMBOL(groups_alloc);
52
53void groups_free(struct group_info *group_info)
54{
55 if (group_info->blocks[0] != group_info->small_block) {
56 int i;
57 for (i = 0; i < group_info->nblocks; i++)
58 free_page((unsigned long)group_info->blocks[i]);
59 }
60 kfree(group_info);
61}
62
63EXPORT_SYMBOL(groups_free);
64
65/* export the group_info to a user-space array */
66static int groups_to_user(gid_t __user *grouplist,
67 const struct group_info *group_info)
68{
69 int i;
70 unsigned int count = group_info->ngroups;
71
72 for (i = 0; i < group_info->nblocks; i++) {
73 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
74 unsigned int len = cp_count * sizeof(*grouplist);
75
76 if (copy_to_user(grouplist, group_info->blocks[i], len))
77 return -EFAULT;
78
79 grouplist += NGROUPS_PER_BLOCK;
80 count -= cp_count;
81 }
82 return 0;
83}
84
85/* fill a group_info from a user-space array - it must be allocated already */
86static int groups_from_user(struct group_info *group_info,
87 gid_t __user *grouplist)
88{
89 int i;
90 unsigned int count = group_info->ngroups;
91
92 for (i = 0; i < group_info->nblocks; i++) {
93 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
94 unsigned int len = cp_count * sizeof(*grouplist);
95
96 if (copy_from_user(group_info->blocks[i], grouplist, len))
97 return -EFAULT;
98
99 grouplist += NGROUPS_PER_BLOCK;
100 count -= cp_count;
101 }
102 return 0;
103}
104
105/* a simple Shell sort */
106static void groups_sort(struct group_info *group_info)
107{
108 int base, max, stride;
109 int gidsetsize = group_info->ngroups;
110
111 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
112 ; /* nothing */
113 stride /= 3;
114
115 while (stride) {
116 max = gidsetsize - stride;
117 for (base = 0; base < max; base++) {
118 int left = base;
119 int right = left + stride;
120 gid_t tmp = GROUP_AT(group_info, right);
121
122 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
123 GROUP_AT(group_info, right) =
124 GROUP_AT(group_info, left);
125 right = left;
126 left -= stride;
127 }
128 GROUP_AT(group_info, right) = tmp;
129 }
130 stride /= 3;
131 }
132}
133
134/* a simple bsearch */
135int groups_search(const struct group_info *group_info, gid_t grp)
136{
137 unsigned int left, right;
138
139 if (!group_info)
140 return 0;
141
142 left = 0;
143 right = group_info->ngroups;
144 while (left < right) {
145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid);
147 if (cmp > 0)
148 left = mid + 1;
149 else if (cmp < 0)
150 right = mid;
151 else
152 return 1;
153 }
154 return 0;
155}
156
157/**
158 * set_groups - Change a group subscription in a set of credentials
159 * @new: The newly prepared set of credentials to alter
160 * @group_info: The group list to install
161 *
162 * Validate a group subscription and, if valid, insert it into a set
163 * of credentials.
164 */
165int set_groups(struct cred *new, struct group_info *group_info)
166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info);
174 groups_sort(group_info);
175 get_group_info(group_info);
176 new->group_info = group_info;
177 return 0;
178}
179
180EXPORT_SYMBOL(set_groups);
181
182/**
183 * set_current_groups - Change current's group subscription
184 * @group_info: The group list to impose
185 *
186 * Validate a group subscription and, if valid, impose it upon current's task
187 * security record.
188 */
189int set_current_groups(struct group_info *group_info)
190{
191 struct cred *new;
192 int ret;
193
194 new = prepare_creds();
195 if (!new)
196 return -ENOMEM;
197
198 ret = set_groups(new, group_info);
199 if (ret < 0) {
200 abort_creds(new);
201 return ret;
202 }
203
204 return commit_creds(new);
205}
206
207EXPORT_SYMBOL(set_current_groups);
208
209SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
210{
211 const struct cred *cred = current_cred();
212 int i;
213
214 if (gidsetsize < 0)
215 return -EINVAL;
216
217 /* no need to grab task_lock here; it cannot change */
218 i = cred->group_info->ngroups;
219 if (gidsetsize) {
220 if (i > gidsetsize) {
221 i = -EINVAL;
222 goto out;
223 }
224 if (groups_to_user(grouplist, cred->group_info)) {
225 i = -EFAULT;
226 goto out;
227 }
228 }
229out:
230 return i;
231}
232
233/*
234 * SMP: Our groups are copy-on-write. We can set them safely
235 * without another task interfering.
236 */
237
238SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
239{
240 struct group_info *group_info;
241 int retval;
242
243 if (!capable(CAP_SETGID))
244 return -EPERM;
245 if ((unsigned)gidsetsize > NGROUPS_MAX)
246 return -EINVAL;
247
248 group_info = groups_alloc(gidsetsize);
249 if (!group_info)
250 return -ENOMEM;
251 retval = groups_from_user(group_info, grouplist);
252 if (retval) {
253 put_group_info(group_info);
254 return retval;
255 }
256
257 retval = set_current_groups(group_info);
258 put_group_info(group_info);
259
260 return retval;
261}
262
263/*
264 * Check whether the given group ID is our fsgid/egid or one of our supplementary groups.
265 */
266int in_group_p(gid_t grp)
267{
268 const struct cred *cred = current_cred();
269 int retval = 1;
270
271 if (grp != cred->fsgid)
272 retval = groups_search(cred->group_info, grp);
273 return retval;
274}
275
276EXPORT_SYMBOL(in_group_p);
277
278int in_egroup_p(gid_t grp)
279{
280 const struct cred *cred = current_cred();
281 int retval = 1;
282
283 if (grp != cred->egid)
284 retval = groups_search(cred->group_info, grp);
285 return retval;
286}
287
288EXPORT_SYMBOL(in_egroup_p);
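groups_search() above relies on the group list being sorted, which set_groups() guarantees by calling groups_sort() first. A stand-alone sketch of the same search logic over a plain sorted array (plain unsigned ints stand in for gid_t and struct group_info):

#include <stdio.h>

static int gid_search(const unsigned int *gids, unsigned int ngroups,
                      unsigned int grp)
{
        unsigned int left = 0, right = ngroups;

        while (left < right) {
                unsigned int mid = (left + right) / 2;

                if (grp > gids[mid])
                        left = mid + 1;
                else if (grp < gids[mid])
                        right = mid;
                else
                        return 1;       /* found */
        }
        return 0;                       /* not a member */
}

int main(void)
{
        unsigned int groups[] = { 4, 20, 24, 27, 100 };  /* already sorted */

        /* prints: 1 0 */
        printf("%d %d\n", gid_search(groups, 5, 27), gid_search(groups, 5, 5));
        return 0;
}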
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cb8a15c19583..49da79ab8486 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,8 @@
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/err.h> 44#include <linux/err.h>
45#include <linux/debugobjects.h> 45#include <linux/debugobjects.h>
46#include <linux/sched.h>
47#include <linux/timer.h>
46 48
47#include <asm/uaccess.h> 49#include <asm/uaccess.h>
48 50
@@ -189,21 +191,65 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
189 } 191 }
190} 192}
191 193
194
195/*
196 * Get the preferred target CPU for NOHZ
197 */
198static int hrtimer_get_target(int this_cpu, int pinned)
199{
200#ifdef CONFIG_NO_HZ
201 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
202 int preferred_cpu = get_nohz_load_balancer();
203
204 if (preferred_cpu >= 0)
205 return preferred_cpu;
206 }
207#endif
208 return this_cpu;
209}
210
211/*
212 * With HIGHRES=y we do not migrate the timer when it is expiring
213 * before the next event on the target cpu because we cannot reprogram
214 * the target cpu hardware and we would cause it to fire late.
215 *
216 * Called with cpu_base->lock of target cpu held.
217 */
218static int
219hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
220{
221#ifdef CONFIG_HIGH_RES_TIMERS
222 ktime_t expires;
223
224 if (!new_base->cpu_base->hres_active)
225 return 0;
226
227 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
228 return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
229#else
230 return 0;
231#endif
232}
233
192/* 234/*
193 * Switch the timer base to the current CPU when possible. 235 * Switch the timer base to the current CPU when possible.
194 */ 236 */
195static inline struct hrtimer_clock_base * 237static inline struct hrtimer_clock_base *
196switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) 238switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
239 int pinned)
197{ 240{
198 struct hrtimer_clock_base *new_base; 241 struct hrtimer_clock_base *new_base;
199 struct hrtimer_cpu_base *new_cpu_base; 242 struct hrtimer_cpu_base *new_cpu_base;
243 int this_cpu = smp_processor_id();
244 int cpu = hrtimer_get_target(this_cpu, pinned);
200 245
201 new_cpu_base = &__get_cpu_var(hrtimer_bases); 246again:
247 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
202 new_base = &new_cpu_base->clock_base[base->index]; 248 new_base = &new_cpu_base->clock_base[base->index];
203 249
204 if (base != new_base) { 250 if (base != new_base) {
205 /* 251 /*
206 * We are trying to schedule the timer on the local CPU. 252 * We are trying to move timer to new_base.
207 * However we can't change timer's base while it is running, 253 * However we can't change timer's base while it is running,
208 * so we keep it on the same CPU. No hassle vs. reprogramming 254 * so we keep it on the same CPU. No hassle vs. reprogramming
209 * the event source in the high resolution case. The softirq 255 * the event source in the high resolution case. The softirq
@@ -218,6 +264,14 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
218 timer->base = NULL; 264 timer->base = NULL;
219 spin_unlock(&base->cpu_base->lock); 265 spin_unlock(&base->cpu_base->lock);
220 spin_lock(&new_base->cpu_base->lock); 266 spin_lock(&new_base->cpu_base->lock);
267
268 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
269 cpu = this_cpu;
270 spin_unlock(&new_base->cpu_base->lock);
271 spin_lock(&base->cpu_base->lock);
272 timer->base = base;
273 goto again;
274 }
221 timer->base = new_base; 275 timer->base = new_base;
222 } 276 }
223 return new_base; 277 return new_base;
@@ -235,7 +289,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
235 return base; 289 return base;
236} 290}
237 291
238# define switch_hrtimer_base(t, b) (b) 292# define switch_hrtimer_base(t, b, p) (b)
239 293
240#endif /* !CONFIG_SMP */ 294#endif /* !CONFIG_SMP */
241 295
@@ -332,6 +386,8 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
332 return res; 386 return res;
333} 387}
334 388
389EXPORT_SYMBOL_GPL(ktime_add_safe);
390
335#ifdef CONFIG_DEBUG_OBJECTS_TIMERS 391#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
336 392
337static struct debug_obj_descr hrtimer_debug_descr; 393static struct debug_obj_descr hrtimer_debug_descr;
@@ -907,9 +963,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
907 ret = remove_hrtimer(timer, base); 963 ret = remove_hrtimer(timer, base);
908 964
909 /* Switch the timer base, if necessary: */ 965 /* Switch the timer base, if necessary: */
910 new_base = switch_hrtimer_base(timer, base); 966 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
911 967
912 if (mode == HRTIMER_MODE_REL) { 968 if (mode & HRTIMER_MODE_REL) {
913 tim = ktime_add_safe(tim, new_base->get_time()); 969 tim = ktime_add_safe(tim, new_base->get_time());
914 /* 970 /*
915 * CONFIG_TIME_LOW_RES is a temporary way for architectures 971 * CONFIG_TIME_LOW_RES is a temporary way for architectures
@@ -1226,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1226 1282
1227 expires_next.tv64 = KTIME_MAX; 1283 expires_next.tv64 = KTIME_MAX;
1228 1284
1285 spin_lock(&cpu_base->lock);
1286 /*
1287 * We set expires_next to KTIME_MAX here with cpu_base->lock
1288 * held to prevent that a timer is enqueued in our queue via
1289 * the migration code. This does not affect enqueueing of
1290 * timers which run their callback and need to be requeued on
1291 * this CPU.
1292 */
1293 cpu_base->expires_next.tv64 = KTIME_MAX;
1294
1229 base = cpu_base->clock_base; 1295 base = cpu_base->clock_base;
1230 1296
1231 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1297 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1232 ktime_t basenow; 1298 ktime_t basenow;
1233 struct rb_node *node; 1299 struct rb_node *node;
1234 1300
1235 spin_lock(&cpu_base->lock);
1236
1237 basenow = ktime_add(now, base->offset); 1301 basenow = ktime_add(now, base->offset);
1238 1302
1239 while ((node = base->first)) { 1303 while ((node = base->first)) {
@@ -1266,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1266 1330
1267 __run_hrtimer(timer); 1331 __run_hrtimer(timer);
1268 } 1332 }
1269 spin_unlock(&cpu_base->lock);
1270 base++; 1333 base++;
1271 } 1334 }
1272 1335
1336 /*
1337 * Store the new expiry value so the migration code can verify
1338 * against it.
1339 */
1273 cpu_base->expires_next = expires_next; 1340 cpu_base->expires_next = expires_next;
1341 spin_unlock(&cpu_base->lock);
1274 1342
1275 /* Reprogramming necessary ? */ 1343 /* Reprogramming necessary ? */
1276 if (expires_next.tv64 != KTIME_MAX) { 1344 if (expires_next.tv64 != KTIME_MAX) {
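One detail in the hrtimer hunks above is easy to miss: the check changes from "mode == HRTIMER_MODE_REL" to "mode & HRTIMER_MODE_REL" because the mode argument can now carry an additional PINNED bit. A stand-alone sketch of why the bitmask test is needed; the numeric values here are illustrative, not copied from the header:

#include <stdio.h>

enum demo_hrtimer_mode {
        DEMO_HRTIMER_MODE_ABS    = 0x0,
        DEMO_HRTIMER_MODE_REL    = 0x1,
        DEMO_HRTIMER_MODE_PINNED = 0x2,
};

int main(void)
{
        unsigned int mode = DEMO_HRTIMER_MODE_REL | DEMO_HRTIMER_MODE_PINNED;

        printf("equality test: %d\n", mode == DEMO_HRTIMER_MODE_REL);      /* 0 */
        printf("bitmask test:  %d\n", !!(mode & DEMO_HRTIMER_MODE_REL));    /* 1 */
        printf("pinned bit:    %d\n", !!(mode & DEMO_HRTIMER_MODE_PINNED)); /* 1 */
        return 0;
}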
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 3394f8f52964..7d047808419d 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o 6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 7obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c687ba4363f2..13c68e71b726 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
359 359
360 spin_lock(&desc->lock); 360 spin_lock(&desc->lock);
361 mask_ack_irq(desc, irq); 361 mask_ack_irq(desc, irq);
362 desc = irq_remap_to_desc(irq, desc);
363 362
364 if (unlikely(desc->status & IRQ_INPROGRESS)) 363 if (unlikely(desc->status & IRQ_INPROGRESS))
365 goto out_unlock; 364 goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
438 desc->status &= ~IRQ_INPROGRESS; 437 desc->status &= ~IRQ_INPROGRESS;
439out: 438out:
440 desc->chip->eoi(irq); 439 desc->chip->eoi(irq);
441 desc = irq_remap_to_desc(irq, desc);
442 440
443 spin_unlock(&desc->lock); 441 spin_unlock(&desc->lock);
444} 442}
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
475 !desc->action)) { 473 !desc->action)) {
476 desc->status |= (IRQ_PENDING | IRQ_MASKED); 474 desc->status |= (IRQ_PENDING | IRQ_MASKED);
477 mask_ack_irq(desc, irq); 475 mask_ack_irq(desc, irq);
478 desc = irq_remap_to_desc(irq, desc);
479 goto out_unlock; 476 goto out_unlock;
480 } 477 }
481 kstat_incr_irqs_this_cpu(irq, desc); 478 kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
483 /* Start handling the irq */ 480 /* Start handling the irq */
484 if (desc->chip->ack) 481 if (desc->chip->ack)
485 desc->chip->ack(irq); 482 desc->chip->ack(irq);
486 desc = irq_remap_to_desc(irq, desc);
487 483
488 /* Mark the IRQ currently in progress.*/ 484 /* Mark the IRQ currently in progress.*/
489 desc->status |= IRQ_INPROGRESS; 485 desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
544 if (!noirqdebug) 540 if (!noirqdebug)
545 note_interrupt(irq, desc, action_ret); 541 note_interrupt(irq, desc, action_ret);
546 542
547 if (desc->chip->eoi) { 543 if (desc->chip->eoi)
548 desc->chip->eoi(irq); 544 desc->chip->eoi(irq);
549 desc = irq_remap_to_desc(irq, desc);
550 }
551} 545}
552 546
553void 547void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
582 576
583 /* Uninstall? */ 577 /* Uninstall? */
584 if (handle == handle_bad_irq) { 578 if (handle == handle_bad_irq) {
585 if (desc->chip != &no_irq_chip) { 579 if (desc->chip != &no_irq_chip)
586 mask_ack_irq(desc, irq); 580 mask_ack_irq(desc, irq);
587 desc = irq_remap_to_desc(irq, desc);
588 }
589 desc->status |= IRQ_DISABLED; 581 desc->status |= IRQ_DISABLED;
590 desc->depth = 1; 582 desc->depth = 1;
591 } 583 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 26e08754744f..065205bdd920 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,14 +11,15 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/slab.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/random.h> 16#include <linux/random.h>
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
18#include <linux/rculist.h> 19#include <linux/rculist.h>
19#include <linux/hash.h> 20#include <linux/hash.h>
20#include <trace/irq.h>
21#include <linux/bootmem.h> 21#include <linux/bootmem.h>
22#include <trace/events/irq.h>
22 23
23#include "internals.h" 24#include "internals.h"
24 25
@@ -44,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
44#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) 45#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
45static void __init init_irq_default_affinity(void) 46static void __init init_irq_default_affinity(void)
46{ 47{
47 alloc_bootmem_cpumask_var(&irq_default_affinity); 48 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
48 cpumask_setall(irq_default_affinity); 49 cpumask_setall(irq_default_affinity);
49} 50}
50#else 51#else
@@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = {
81 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 82 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
82}; 83};
83 84
84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 85void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
85{ 86{
86 int node;
87 void *ptr; 87 void *ptr;
88 88
89 node = cpu_to_node(cpu); 89 if (slab_is_available())
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92 else
93 ptr = alloc_bootmem_node(NODE_DATA(node),
94 nr * sizeof(*desc->kstat_irqs));
91 95
92 /* 96 /*
 93 * don't overwrite if we cannot get a new one 97 * don't overwrite if we cannot get a new one
94 * init_copy_kstat_irqs() could still use old one 98 * init_copy_kstat_irqs() could still use old one
95 */ 99 */
96 if (ptr) { 100 if (ptr) {
97 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", 101 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
98 cpu, node);
99 desc->kstat_irqs = ptr; 102 desc->kstat_irqs = ptr;
100 } 103 }
101} 104}
102 105
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) 106static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{ 107{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 108 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106 109
107 spin_lock_init(&desc->lock); 110 spin_lock_init(&desc->lock);
108 desc->irq = irq; 111 desc->irq = irq;
109#ifdef CONFIG_SMP 112#ifdef CONFIG_SMP
110 desc->cpu = cpu; 113 desc->node = node;
111#endif 114#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 115 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, cpu, nr_cpu_ids); 116 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) { 117 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n"); 118 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1); 119 BUG_ON(1);
117 } 120 }
118 if (!init_alloc_desc_masks(desc, cpu, false)) { 121 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); 122 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1); 123 BUG_ON(1);
121 } 124 }
122 arch_init_chip_data(desc, cpu); 125 init_desc_masks(desc);
126 arch_init_chip_data(desc, node);
123} 127}
124 128
125/* 129/*
@@ -146,6 +150,7 @@ int __init early_irq_init(void)
146{ 150{
147 struct irq_desc *desc; 151 struct irq_desc *desc;
148 int legacy_count; 152 int legacy_count;
153 int node;
149 int i; 154 int i;
150 155
151 init_irq_default_affinity(); 156 init_irq_default_affinity();
@@ -156,20 +161,21 @@ int __init early_irq_init(void)
156 161
157 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
158 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node;
159 165
160 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
161 irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
162 168
163 /* allocate based on nr_cpu_ids */ 169 /* allocate based on nr_cpu_ids */
164 /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ 170 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
165 kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * 171 sizeof(int), GFP_NOWAIT, node);
166 sizeof(int));
167 172
168 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
169 desc[i].irq = i; 174 desc[i].irq = i;
170 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
171 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
172 init_alloc_desc_masks(&desc[i], 0, true); 177 alloc_desc_masks(&desc[i], node, true);
178 init_desc_masks(&desc[i]);
173 irq_desc_ptrs[i] = desc + i; 179 irq_desc_ptrs[i] = desc + i;
174 } 180 }
175 181
@@ -187,11 +193,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
187 return NULL; 193 return NULL;
188} 194}
189 195
190struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 196struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
191{ 197{
192 struct irq_desc *desc; 198 struct irq_desc *desc;
193 unsigned long flags; 199 unsigned long flags;
194 int node;
195 200
196 if (irq >= nr_irqs) { 201 if (irq >= nr_irqs) {
197 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", 202 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -210,15 +215,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
210 if (desc) 215 if (desc)
211 goto out_unlock; 216 goto out_unlock;
212 217
213 node = cpu_to_node(cpu); 218 if (slab_is_available())
214 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 219 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
215 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", 220 else
216 irq, cpu, node); 221 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
222
223 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
217 if (!desc) { 224 if (!desc) {
218 printk(KERN_ERR "can not alloc irq_desc\n"); 225 printk(KERN_ERR "can not alloc irq_desc\n");
219 BUG_ON(1); 226 BUG_ON(1);
220 } 227 }
221 init_one_irq_desc(irq, desc, cpu); 228 init_one_irq_desc(irq, desc, node);
222 229
223 irq_desc_ptrs[irq] = desc; 230 irq_desc_ptrs[irq] = desc;
224 231
@@ -256,7 +263,8 @@ int __init early_irq_init(void)
256 263
257 for (i = 0; i < count; i++) { 264 for (i = 0; i < count; i++) {
258 desc[i].irq = i; 265 desc[i].irq = i;
259 init_alloc_desc_masks(&desc[i], 0, true); 266 alloc_desc_masks(&desc[i], 0, true);
267 init_desc_masks(&desc[i]);
260 desc[i].kstat_irqs = kstat_irqs_all[i]; 268 desc[i].kstat_irqs = kstat_irqs_all[i];
261 } 269 }
262 return arch_early_irq_init(); 270 return arch_early_irq_init();
@@ -267,7 +275,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
267 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 275 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
268} 276}
269 277
270struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 278struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
271{ 279{
272 return irq_to_desc(irq); 280 return irq_to_desc(irq);
273} 281}
@@ -348,9 +356,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
348 "but no thread function available.", irq, action->name); 356 "but no thread function available.", irq, action->name);
349} 357}
350 358
351DEFINE_TRACE(irq_handler_entry);
352DEFINE_TRACE(irq_handler_exit);
353
354/** 359/**
355 * handle_IRQ_event - irq action chain handler 360 * handle_IRQ_event - irq action chain handler
356 * @irq: the interrupt number 361 * @irq: the interrupt number
@@ -453,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq)
453 /* 458 /*
454 * No locking required for CPU-local interrupts: 459 * No locking required for CPU-local interrupts:
455 */ 460 */
456 if (desc->chip->ack) { 461 if (desc->chip->ack)
457 desc->chip->ack(irq); 462 desc->chip->ack(irq);
458 /* get new one */
459 desc = irq_remap_to_desc(irq, desc);
460 }
461 if (likely(!(desc->status & IRQ_DISABLED))) { 463 if (likely(!(desc->status & IRQ_DISABLED))) {
462 action_ret = handle_IRQ_event(irq, desc->action); 464 action_ret = handle_IRQ_event(irq, desc->action);
463 if (!noirqdebug) 465 if (!noirqdebug)
@@ -468,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq)
468 } 470 }
469 471
470 spin_lock(&desc->lock); 472 spin_lock(&desc->lock);
471 if (desc->chip->ack) { 473 if (desc->chip->ack)
472 desc->chip->ack(irq); 474 desc->chip->ack(irq);
473 desc = irq_remap_to_desc(irq, desc);
474 }
475 /* 475 /*
476 * REPLAY is when Linux resends an IRQ that was dropped earlier 476 * REPLAY is when Linux resends an IRQ that was dropped earlier
477 * WAITING is used by probe to mark irqs that are being tested 477 * WAITING is used by probe to mark irqs that are being tested
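The handle.c hunks above repeat one pattern: allocate with kzalloc_node() once the slab allocator is available, otherwise fall back to the boot memory allocator. A kernel-context sketch of that pattern; early_kzalloc_node() is a made-up name for illustration and is not part of the patch:

#include <linux/bootmem.h>
#include <linux/slab.h>

static void *early_kzalloc_node(size_t size, int node)
{
        if (slab_is_available())
                return kzalloc_node(size, GFP_ATOMIC, node);

        /* The bootmem allocator returns zeroed memory, so no memset here. */
        return alloc_bootmem_node(NODE_DATA(node), size);
}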
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 01ce20eab38f..e70ed5592eb9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 17
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern spinlock_t sparse_irq_lock;
22 22
@@ -42,6 +42,8 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void irq_set_thread_affinity(struct irq_desc *desc);
46
45/* 47/*
46 * Debugging printout: 48 * Debugging printout:
47 */ 49 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2734eca59243..0ec9ed831737 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83static void 83/**
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84 * irq_set_thread_affinity - Notify irq threads to adjust affinity
 85 * @desc: irq descriptor whose affinity has changed
86 *
87 * We just set IRQTF_AFFINITY and delegate the affinity setting
88 * to the interrupt thread itself. We can not call
89 * set_cpus_allowed_ptr() here as we hold desc->lock and this
90 * code can be called from hard interrupt context.
91 */
92void irq_set_thread_affinity(struct irq_desc *desc)
85{ 93{
86 struct irqaction *action = desc->action; 94 struct irqaction *action = desc->action;
87 95
88 while (action) { 96 while (action) {
89 if (action->thread) 97 if (action->thread)
90 set_cpus_allowed_ptr(action->thread, cpumask); 98 set_bit(IRQTF_AFFINITY, &action->thread_flags);
91 action = action->next; 99 action = action->next;
92 } 100 }
93} 101}
@@ -109,17 +117,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
109 spin_lock_irqsave(&desc->lock, flags); 117 spin_lock_irqsave(&desc->lock, flags);
110 118
111#ifdef CONFIG_GENERIC_PENDING_IRQ 119#ifdef CONFIG_GENERIC_PENDING_IRQ
112 if (desc->status & IRQ_MOVE_PCNTXT) 120 if (desc->status & IRQ_MOVE_PCNTXT) {
113 desc->chip->set_affinity(irq, cpumask); 121 if (!desc->chip->set_affinity(irq, cpumask)) {
122 cpumask_copy(desc->affinity, cpumask);
123 irq_set_thread_affinity(desc);
124 }
125 }
114 else { 126 else {
115 desc->status |= IRQ_MOVE_PENDING; 127 desc->status |= IRQ_MOVE_PENDING;
116 cpumask_copy(desc->pending_mask, cpumask); 128 cpumask_copy(desc->pending_mask, cpumask);
117 } 129 }
118#else 130#else
119 cpumask_copy(desc->affinity, cpumask); 131 if (!desc->chip->set_affinity(irq, cpumask)) {
120 desc->chip->set_affinity(irq, cpumask); 132 cpumask_copy(desc->affinity, cpumask);
133 irq_set_thread_affinity(desc);
134 }
121#endif 135#endif
122 irq_set_thread_affinity(desc, cpumask);
123 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
124 spin_unlock_irqrestore(&desc->lock, flags); 137 spin_unlock_irqrestore(&desc->lock, flags);
125 return 0; 138 return 0;
@@ -171,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq)
171 spin_lock_irqsave(&desc->lock, flags); 184 spin_lock_irqsave(&desc->lock, flags);
172 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
173 if (!ret) 186 if (!ret)
174 irq_set_thread_affinity(desc, desc->affinity); 187 irq_set_thread_affinity(desc);
175 spin_unlock_irqrestore(&desc->lock, flags); 188 spin_unlock_irqrestore(&desc->lock, flags);
176 189
177 return ret; 190 return ret;
@@ -438,6 +451,39 @@ static int irq_wait_for_interrupt(struct irqaction *action)
438 return -1; 451 return -1;
439} 452}
440 453
454#ifdef CONFIG_SMP
455/*
456 * Check whether we need to change the affinity of the interrupt thread.
457 */
458static void
459irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
460{
461 cpumask_var_t mask;
462
463 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
464 return;
465
466 /*
467 * In case we are out of memory we set IRQTF_AFFINITY again and
468 * try again next time
469 */
470 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
471 set_bit(IRQTF_AFFINITY, &action->thread_flags);
472 return;
473 }
474
475 spin_lock_irq(&desc->lock);
476 cpumask_copy(mask, desc->affinity);
477 spin_unlock_irq(&desc->lock);
478
479 set_cpus_allowed_ptr(current, mask);
480 free_cpumask_var(mask);
481}
482#else
483static inline void
484irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
485#endif
486
441/* 487/*
442 * Interrupt handler thread 488 * Interrupt handler thread
443 */ 489 */
@@ -453,6 +499,8 @@ static int irq_thread(void *data)
453 499
454 while (!irq_wait_for_interrupt(action)) { 500 while (!irq_wait_for_interrupt(action)) {
455 501
502 irq_thread_check_affinity(desc, action);
503
456 atomic_inc(&desc->threads_active); 504 atomic_inc(&desc->threads_active);
457 505
458 spin_lock_irq(&desc->lock); 506 spin_lock_irq(&desc->lock);
@@ -559,7 +607,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
559 */ 607 */
560 get_task_struct(t); 608 get_task_struct(t);
561 new->thread = t; 609 new->thread = t;
562 wake_up_process(t);
563 } 610 }
564 611
565 /* 612 /*
@@ -642,6 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
642 (int)(new->flags & IRQF_TRIGGER_MASK)); 689 (int)(new->flags & IRQF_TRIGGER_MASK));
643 } 690 }
644 691
692 new->irq = irq;
645 *old_ptr = new; 693 *old_ptr = new;
646 694
647 /* Reset broken irq detection when installing new handler */ 695 /* Reset broken irq detection when installing new handler */
@@ -659,7 +707,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
659 707
660 spin_unlock_irqrestore(&desc->lock, flags); 708 spin_unlock_irqrestore(&desc->lock, flags);
661 709
662 new->irq = irq; 710 /*
711 * Strictly no need to wake it up, but hung_task complains
712 * when no hard interrupt wakes the thread up.
713 */
714 if (new->thread)
715 wake_up_process(new->thread);
716
663 register_irq_proc(irq, desc); 717 register_irq_proc(irq, desc);
664 new->dir = NULL; 718 new->dir = NULL;
665 register_handler_proc(irq, new); 719 register_handler_proc(irq, new);
@@ -713,7 +767,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
713{ 767{
714 struct irq_desc *desc = irq_to_desc(irq); 768 struct irq_desc *desc = irq_to_desc(irq);
715 struct irqaction *action, **action_ptr; 769 struct irqaction *action, **action_ptr;
716 struct task_struct *irqthread;
717 unsigned long flags; 770 unsigned long flags;
718 771
719 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); 772 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -761,9 +814,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
761 desc->chip->disable(irq); 814 desc->chip->disable(irq);
762 } 815 }
763 816
764 irqthread = action->thread;
765 action->thread = NULL;
766
767 spin_unlock_irqrestore(&desc->lock, flags); 817 spin_unlock_irqrestore(&desc->lock, flags);
768 818
769 unregister_handler_proc(irq, action); 819 unregister_handler_proc(irq, action);
@@ -771,12 +821,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
771 /* Make sure it's not being used on another CPU: */ 821 /* Make sure it's not being used on another CPU: */
772 synchronize_irq(irq); 822 synchronize_irq(irq);
773 823
774 if (irqthread) {
775 if (!test_bit(IRQTF_DIED, &action->thread_flags))
776 kthread_stop(irqthread);
777 put_task_struct(irqthread);
778 }
779
780#ifdef CONFIG_DEBUG_SHIRQ 824#ifdef CONFIG_DEBUG_SHIRQ
781 /* 825 /*
782 * It's a shared IRQ -- the driver ought to be prepared for an IRQ 826 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -792,6 +836,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
792 local_irq_restore(flags); 836 local_irq_restore(flags);
793 } 837 }
794#endif 838#endif
839
840 if (action->thread) {
841 if (!test_bit(IRQTF_DIED, &action->thread_flags))
842 kthread_stop(action->thread);
843 put_task_struct(action->thread);
844 }
845
795 return action; 846 return action;
796} 847}
797 848
@@ -851,7 +902,7 @@ EXPORT_SYMBOL(free_irq);
851 * still called in hard interrupt context and has to check 902 * still called in hard interrupt context and has to check
852 * whether the interrupt originates from the device. If yes it 903 * whether the interrupt originates from the device. If yes it
853 * needs to disable the interrupt on the device and return 904 * needs to disable the interrupt on the device and return
854 * IRQ_THREAD_WAKE which will wake up the handler thread and run 905 * IRQ_WAKE_THREAD which will wake up the handler thread and run
855 * @thread_fn. This split handler design is necessary to support 906 * @thread_fn. This split handler design is necessary to support
856 * shared interrupts. 907 * shared interrupts.
857 * 908 *
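
Note on the affinity changes above: irq_set_affinity() and the irq thread now split the work — the setter only records the new mask and sets the IRQTF_AFFINITY thread flag, because it runs under desc->lock and possibly in hard interrupt context where set_cpus_allowed_ptr() must not be called, and irq_thread_check_affinity() applies the mask from thread context. The snippet below is a minimal userspace sketch of that defer-to-thread pattern using pthreads and C11 atomics; every name in it (set_affinity, thread_check_affinity, irq_thread) is invented for illustration and is not kernel API.

/* Userspace sketch of the IRQTF_AFFINITY pattern: the non-sleeping side only
 * records the request and sets a flag; the thread applies the new affinity
 * the next time it runs, where sleeping is allowed.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t desc_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long desc_affinity;		/* protected by desc_lock */
static atomic_bool affinity_pending = false;	/* analogue of IRQTF_AFFINITY */
static atomic_bool should_stop = false;

/* Called from a context that must not sleep: just record the request. */
static void set_affinity(unsigned long mask)
{
	pthread_mutex_lock(&desc_lock);
	desc_affinity = mask;
	pthread_mutex_unlock(&desc_lock);
	atomic_store(&affinity_pending, true);
}

/* Runs in thread context; the real code calls set_cpus_allowed_ptr() here. */
static void thread_check_affinity(void)
{
	unsigned long mask;

	if (!atomic_exchange(&affinity_pending, false))
		return;

	pthread_mutex_lock(&desc_lock);
	mask = desc_affinity;
	pthread_mutex_unlock(&desc_lock);

	printf("thread applies affinity mask %#lx\n", mask);
}

static void *irq_thread(void *unused)
{
	(void)unused;
	while (!atomic_load(&should_stop)) {
		thread_check_affinity();
		usleep(10000);	/* stands in for irq_wait_for_interrupt() */
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, irq_thread, NULL);
	set_affinity(0x3);
	usleep(50000);
	atomic_store(&should_stop, true);
	pthread_join(t, NULL);
	return 0;
}
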
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index e05ad9be43b7..fcb6c96f2627 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
1 1
2#include <linux/irq.h> 2#include <linux/irq.h>
3#include <linux/interrupt.h>
4
5#include "internals.h"
3 6
4void move_masked_irq(int irq) 7void move_masked_irq(int irq)
5{ 8{
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
39 * masking the irqs. 42 * masking the irqs.
40 */ 43 */
41 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
42 < nr_cpu_ids)) { 45 < nr_cpu_ids))
43 cpumask_and(desc->affinity, 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
44 desc->pending_mask, cpu_online_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
45 desc->chip->set_affinity(irq, desc->affinity); 48 irq_set_thread_affinity(desc);
46 } 49 }
50
47 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
48} 52}
49 53
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 44bbdcbaf8d2..3fd30197da2e 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,9 +15,9 @@
15 15
16static void init_copy_kstat_irqs(struct irq_desc *old_desc, 16static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc, 17 struct irq_desc *desc,
18 int cpu, int nr) 18 int node, int nr)
19{ 19{
20 init_kstat_irqs(desc, cpu, nr); 20 init_kstat_irqs(desc, node, nr);
21 21
22 if (desc->kstat_irqs != old_desc->kstat_irqs) 22 if (desc->kstat_irqs != old_desc->kstat_irqs)
23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, 23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
34} 34}
35 35
36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, 36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
37 struct irq_desc *desc, int cpu) 37 struct irq_desc *desc, int node)
38{ 38{
39 memcpy(desc, old_desc, sizeof(struct irq_desc)); 39 memcpy(desc, old_desc, sizeof(struct irq_desc));
40 if (!init_alloc_desc_masks(desc, cpu, false)) { 40 if (!alloc_desc_masks(desc, node, false)) {
41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " 41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 spin_lock_init(&desc->lock);
46 desc->cpu = cpu; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
49 init_copy_desc_masks(old_desc, desc); 49 init_copy_desc_masks(old_desc, desc);
50 arch_init_copy_chip_data(old_desc, desc, cpu); 50 arch_init_copy_chip_data(old_desc, desc, node);
51 return true; 51 return true;
52} 52}
53 53
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
59} 59}
60 60
61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, 61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
62 int cpu) 62 int node)
63{ 63{
64 struct irq_desc *desc; 64 struct irq_desc *desc;
65 unsigned int irq; 65 unsigned int irq;
66 unsigned long flags; 66 unsigned long flags;
67 int node;
68 67
69 irq = old_desc->irq; 68 irq = old_desc->irq;
70 69
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
76 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
77 goto out_unlock; 76 goto out_unlock;
78 77
79 node = cpu_to_node(cpu);
80 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 78 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
81 if (!desc) { 79 if (!desc) {
82 printk(KERN_ERR "irq %d: can not get new irq_desc " 80 printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
85 desc = old_desc; 83 desc = old_desc;
86 goto out_unlock; 84 goto out_unlock;
87 } 85 }
88 if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { 86 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
89 /* still use old one */ 87 /* still use old one */
90 kfree(desc); 88 kfree(desc);
91 desc = old_desc; 89 desc = old_desc;
@@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
97 95
98 /* free the old one */ 96 /* free the old one */
99 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
100 spin_unlock(&old_desc->lock);
101 kfree(old_desc); 98 kfree(old_desc);
102 spin_lock(&desc->lock);
103 99
104 return desc; 100 return desc;
105 101
@@ -109,24 +105,14 @@ out_unlock:
109 return desc; 105 return desc;
110} 106}
111 107
112struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
113{ 109{
114 int old_cpu; 110 /* those static or target node is -1, do not move them */
115 int node, old_node; 111 if (desc->irq < NR_IRQS_LEGACY || node == -1)
116
117 /* those all static, do move them */
118 if (desc->irq < NR_IRQS_LEGACY)
119 return desc; 112 return desc;
120 113
121 old_cpu = desc->cpu; 114 if (desc->node != node)
122 if (old_cpu != cpu) { 115 desc = __real_move_irq_desc(desc, node);
123 node = cpu_to_node(cpu);
124 old_node = cpu_to_node(old_cpu);
125 if (old_node != node)
126 desc = __real_move_irq_desc(desc, cpu);
127 else
128 desc->cpu = cpu;
129 }
130 116
131 return desc; 117 return desc;
132} 118}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 374faf9bfdc7..3a29dbe7898e 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,12 +30,16 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33/* These will be re-linked against their real values during the second link stage */ 33/*
34 * These will be re-linked against their real values
35 * during the second link stage.
36 */
34extern const unsigned long kallsyms_addresses[] __attribute__((weak)); 37extern const unsigned long kallsyms_addresses[] __attribute__((weak));
35extern const u8 kallsyms_names[] __attribute__((weak)); 38extern const u8 kallsyms_names[] __attribute__((weak));
36 39
37/* tell the compiler that the count isn't in the small data section if the arch 40/*
38 * has one (eg: FRV) 41 * Tell the compiler that the count isn't in the small data section if the arch
42 * has one (eg: FRV).
39 */ 43 */
40extern const unsigned long kallsyms_num_syms 44extern const unsigned long kallsyms_num_syms
41__attribute__((weak, section(".rodata"))); 45__attribute__((weak, section(".rodata")));
@@ -75,31 +79,37 @@ static int is_ksym_addr(unsigned long addr)
75 return is_kernel_text(addr) || is_kernel_inittext(addr); 79 return is_kernel_text(addr) || is_kernel_inittext(addr);
76} 80}
77 81
78/* expand a compressed symbol data into the resulting uncompressed string, 82/*
79 given the offset to where the symbol is in the compressed stream */ 83 * Expand a compressed symbol data into the resulting uncompressed string,
84 * given the offset to where the symbol is in the compressed stream.
85 */
80static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 86static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
81{ 87{
82 int len, skipped_first = 0; 88 int len, skipped_first = 0;
83 const u8 *tptr, *data; 89 const u8 *tptr, *data;
84 90
85 /* get the compressed symbol length from the first symbol byte */ 91 /* Get the compressed symbol length from the first symbol byte. */
86 data = &kallsyms_names[off]; 92 data = &kallsyms_names[off];
87 len = *data; 93 len = *data;
88 data++; 94 data++;
89 95
90 /* update the offset to return the offset for the next symbol on 96 /*
91 * the compressed stream */ 97 * Update the offset to return the offset for the next symbol on
98 * the compressed stream.
99 */
92 off += len + 1; 100 off += len + 1;
93 101
94 /* for every byte on the compressed symbol data, copy the table 102 /*
95 entry for that byte */ 103 * For every byte on the compressed symbol data, copy the table
96 while(len) { 104 * entry for that byte.
97 tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ]; 105 */
106 while (len) {
107 tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
98 data++; 108 data++;
99 len--; 109 len--;
100 110
101 while (*tptr) { 111 while (*tptr) {
102 if(skipped_first) { 112 if (skipped_first) {
103 *result = *tptr; 113 *result = *tptr;
104 result++; 114 result++;
105 } else 115 } else
@@ -110,36 +120,46 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
110 120
111 *result = '\0'; 121 *result = '\0';
112 122
113 /* return to offset to the next symbol */ 123 /* Return to offset to the next symbol. */
114 return off; 124 return off;
115} 125}
116 126
117/* get symbol type information. This is encoded as a single char at the 127/*
118 * begining of the symbol name */ 128 * Get symbol type information. This is encoded as a single char at the
129 * beginning of the symbol name.
130 */
119static char kallsyms_get_symbol_type(unsigned int off) 131static char kallsyms_get_symbol_type(unsigned int off)
120{ 132{
121 /* get just the first code, look it up in the token table, and return the 133 /*
122 * first char from this token */ 134 * Get just the first code, look it up in the token table,
123 return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ]; 135 * and return the first char from this token.
136 */
137 return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
124} 138}
125 139
126 140
127/* find the offset on the compressed stream given and index in the 141/*
128 * kallsyms array */ 142 * Find the offset on the compressed stream given and index in the
143 * kallsyms array.
144 */
129static unsigned int get_symbol_offset(unsigned long pos) 145static unsigned int get_symbol_offset(unsigned long pos)
130{ 146{
131 const u8 *name; 147 const u8 *name;
132 int i; 148 int i;
133 149
134 /* use the closest marker we have. We have markers every 256 positions, 150 /*
135 * so that should be close enough */ 151 * Use the closest marker we have. We have markers every 256 positions,
136 name = &kallsyms_names[ kallsyms_markers[pos>>8] ]; 152 * so that should be close enough.
153 */
154 name = &kallsyms_names[kallsyms_markers[pos >> 8]];
137 155
138 /* sequentially scan all the symbols up to the point we're searching for. 156 /*
139 * Every symbol is stored in a [<len>][<len> bytes of data] format, so we 157 * Sequentially scan all the symbols up to the point we're searching
140 * just need to add the len to the current pointer for every symbol we 158 * for. Every symbol is stored in a [<len>][<len> bytes of data] format,
141 * wish to skip */ 159 * so we just need to add the len to the current pointer for every
142 for(i = 0; i < (pos&0xFF); i++) 160 * symbol we wish to skip.
161 */
162 for (i = 0; i < (pos & 0xFF); i++)
143 name = name + (*name) + 1; 163 name = name + (*name) + 1;
144 164
145 return name - kallsyms_names; 165 return name - kallsyms_names;
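
The comment reflowed above describes how get_symbol_offset() stays cheap: kallsyms_markers[] records the stream offset of every 256th symbol, so a lookup jumps to the nearest marker and then skips at most 255 length-prefixed entries. Below is a toy, self-contained version of that lookup over invented sample data, with the marker spacing reduced to 4 so the stream stays short.

/* Toy get_symbol_offset(): markers record the stream offset of every
 * MARKER_STEP-th symbol; we jump to the closest marker and skip the rest.
 * Stream format is [len][len bytes], as in kallsyms_names.  The data below
 * is made up for the example.
 */
#include <stdio.h>

#define MARKER_STEP 4	/* the kernel uses 256 (pos >> 8, pos & 0xFF) */

static const unsigned char names[] = {
	3, 'f','o','o',  2, 'h','i',  4, 'i','n','i','t',  1, 'x',
	3, 'b','a','r',  2, 'o','k',
};
/* Stream offsets of symbol 0 and symbol 4. */
static const unsigned int markers[] = { 0, 14 };

static unsigned int symbol_offset(unsigned int pos)
{
	const unsigned char *name = &names[markers[pos / MARKER_STEP]];
	unsigned int i;

	for (i = 0; i < pos % MARKER_STEP; i++)
		name = name + *name + 1;	/* skip one [len][data] entry */

	return (unsigned int)(name - names);
}

int main(void)
{
	printf("symbol 2 starts at offset %u\n", symbol_offset(2));	/* 7  */
	printf("symbol 5 starts at offset %u\n", symbol_offset(5));	/* 18 */
	return 0;
}
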
@@ -190,7 +210,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
190 /* This kernel should never had been booted. */ 210 /* This kernel should never had been booted. */
191 BUG_ON(!kallsyms_addresses); 211 BUG_ON(!kallsyms_addresses);
192 212
193 /* do a binary search on the sorted kallsyms_addresses array */ 213 /* Do a binary search on the sorted kallsyms_addresses array. */
194 low = 0; 214 low = 0;
195 high = kallsyms_num_syms; 215 high = kallsyms_num_syms;
196 216
@@ -203,15 +223,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
203 } 223 }
204 224
205 /* 225 /*
206 * search for the first aliased symbol. Aliased 226 * Search for the first aliased symbol. Aliased
207 * symbols are symbols with the same address 227 * symbols are symbols with the same address.
208 */ 228 */
209 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) 229 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
210 --low; 230 --low;
211 231
212 symbol_start = kallsyms_addresses[low]; 232 symbol_start = kallsyms_addresses[low];
213 233
214 /* Search for next non-aliased symbol */ 234 /* Search for next non-aliased symbol. */
215 for (i = low + 1; i < kallsyms_num_syms; i++) { 235 for (i = low + 1; i < kallsyms_num_syms; i++) {
216 if (kallsyms_addresses[i] > symbol_start) { 236 if (kallsyms_addresses[i] > symbol_start) {
217 symbol_end = kallsyms_addresses[i]; 237 symbol_end = kallsyms_addresses[i];
@@ -219,7 +239,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
219 } 239 }
220 } 240 }
221 241
222 /* if we found no next symbol, we use the end of the section */ 242 /* If we found no next symbol, we use the end of the section. */
223 if (!symbol_end) { 243 if (!symbol_end) {
224 if (is_kernel_inittext(addr)) 244 if (is_kernel_inittext(addr))
225 symbol_end = (unsigned long)_einittext; 245 symbol_end = (unsigned long)_einittext;
@@ -252,10 +272,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
252 272
253/* 273/*
254 * Lookup an address 274 * Lookup an address
255 * - modname is set to NULL if it's in the kernel 275 * - modname is set to NULL if it's in the kernel.
256 * - we guarantee that the returned name is valid until we reschedule even if 276 * - We guarantee that the returned name is valid until we reschedule even if.
257 * it resides in a module 277 * It resides in a module.
258 * - we also guarantee that modname will be valid until rescheduled 278 * - We also guarantee that modname will be valid until rescheduled.
259 */ 279 */
260const char *kallsyms_lookup(unsigned long addr, 280const char *kallsyms_lookup(unsigned long addr,
261 unsigned long *symbolsize, 281 unsigned long *symbolsize,
@@ -276,7 +296,7 @@ const char *kallsyms_lookup(unsigned long addr,
276 return namebuf; 296 return namebuf;
277 } 297 }
278 298
279 /* see if it's in a module */ 299 /* See if it's in a module. */
280 return module_address_lookup(addr, symbolsize, offset, modname, 300 return module_address_lookup(addr, symbolsize, offset, modname,
281 namebuf); 301 namebuf);
282} 302}
@@ -294,7 +314,7 @@ int lookup_symbol_name(unsigned long addr, char *symname)
294 kallsyms_expand_symbol(get_symbol_offset(pos), symname); 314 kallsyms_expand_symbol(get_symbol_offset(pos), symname);
295 return 0; 315 return 0;
296 } 316 }
297 /* see if it's in a module */ 317 /* See if it's in a module. */
298 return lookup_module_symbol_name(addr, symname); 318 return lookup_module_symbol_name(addr, symname);
299} 319}
300 320
@@ -313,7 +333,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
313 modname[0] = '\0'; 333 modname[0] = '\0';
314 return 0; 334 return 0;
315 } 335 }
316 /* see if it's in a module */ 336 /* See if it's in a module. */
317 return lookup_module_symbol_attrs(addr, size, offset, modname, name); 337 return lookup_module_symbol_attrs(addr, size, offset, modname, name);
318} 338}
319 339
@@ -342,6 +362,7 @@ int sprint_symbol(char *buffer, unsigned long address)
342 362
343 return len; 363 return len;
344} 364}
365EXPORT_SYMBOL_GPL(sprint_symbol);
345 366
346/* Look up a kernel symbol and print it to the kernel messages. */ 367/* Look up a kernel symbol and print it to the kernel messages. */
347void __print_symbol(const char *fmt, unsigned long address) 368void __print_symbol(const char *fmt, unsigned long address)
@@ -352,13 +373,13 @@ void __print_symbol(const char *fmt, unsigned long address)
352 373
353 printk(fmt, buffer); 374 printk(fmt, buffer);
354} 375}
376EXPORT_SYMBOL(__print_symbol);
355 377
356/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ 378/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
357struct kallsym_iter 379struct kallsym_iter {
358{
359 loff_t pos; 380 loff_t pos;
360 unsigned long value; 381 unsigned long value;
361 unsigned int nameoff; /* If iterating in core kernel symbols */ 382 unsigned int nameoff; /* If iterating in core kernel symbols. */
362 char type; 383 char type;
363 char name[KSYM_NAME_LEN]; 384 char name[KSYM_NAME_LEN];
364 char module_name[MODULE_NAME_LEN]; 385 char module_name[MODULE_NAME_LEN];
@@ -404,7 +425,7 @@ static int update_iter(struct kallsym_iter *iter, loff_t pos)
404 iter->pos = pos; 425 iter->pos = pos;
405 return get_ksymbol_mod(iter); 426 return get_ksymbol_mod(iter);
406 } 427 }
407 428
408 /* If we're not on the desired position, reset to new position. */ 429 /* If we're not on the desired position, reset to new position. */
409 if (pos != iter->pos) 430 if (pos != iter->pos)
410 reset_iter(iter, pos); 431 reset_iter(iter, pos);
@@ -439,23 +460,25 @@ static int s_show(struct seq_file *m, void *p)
439{ 460{
440 struct kallsym_iter *iter = m->private; 461 struct kallsym_iter *iter = m->private;
441 462
442 /* Some debugging symbols have no name. Ignore them. */ 463 /* Some debugging symbols have no name. Ignore them. */
443 if (!iter->name[0]) 464 if (!iter->name[0])
444 return 0; 465 return 0;
445 466
446 if (iter->module_name[0]) { 467 if (iter->module_name[0]) {
447 char type; 468 char type;
448 469
449 /* Label it "global" if it is exported, 470 /*
450 * "local" if not exported. */ 471 * Label it "global" if it is exported,
472 * "local" if not exported.
473 */
451 type = iter->exported ? toupper(iter->type) : 474 type = iter->exported ? toupper(iter->type) :
452 tolower(iter->type); 475 tolower(iter->type);
453 seq_printf(m, "%0*lx %c %s\t[%s]\n", 476 seq_printf(m, "%0*lx %c %s\t[%s]\n",
454 (int)(2*sizeof(void*)), 477 (int)(2 * sizeof(void *)),
455 iter->value, type, iter->name, iter->module_name); 478 iter->value, type, iter->name, iter->module_name);
456 } else 479 } else
457 seq_printf(m, "%0*lx %c %s\n", 480 seq_printf(m, "%0*lx %c %s\n",
458 (int)(2*sizeof(void*)), 481 (int)(2 * sizeof(void *)),
459 iter->value, iter->type, iter->name); 482 iter->value, iter->type, iter->name);
460 return 0; 483 return 0;
461} 484}
@@ -469,9 +492,11 @@ static const struct seq_operations kallsyms_op = {
469 492
470static int kallsyms_open(struct inode *inode, struct file *file) 493static int kallsyms_open(struct inode *inode, struct file *file)
471{ 494{
472 /* We keep iterator in m->private, since normal case is to 495 /*
496 * We keep iterator in m->private, since normal case is to
473 * s_start from where we left off, so we avoid doing 497 * s_start from where we left off, so we avoid doing
474 * using get_symbol_offset for every symbol */ 498 * using get_symbol_offset for every symbol.
499 */
475 struct kallsym_iter *iter; 500 struct kallsym_iter *iter;
476 int ret; 501 int ret;
477 502
@@ -500,7 +525,4 @@ static int __init kallsyms_init(void)
500 proc_create("kallsyms", 0444, NULL, &kallsyms_operations); 525 proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
501 return 0; 526 return 0;
502} 527}
503__initcall(kallsyms_init); 528device_initcall(kallsyms_init);
504
505EXPORT_SYMBOL(__print_symbol);
506EXPORT_SYMBOL_GPL(sprint_symbol);
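
For reference, the name stream that kallsyms_expand_symbol() walks stores each symbol as [len][len token indices], and each index expands through kallsyms_token_index/kallsyms_token_table into a short string. The decoder below is a simplified standalone illustration of that format over invented sample tables; it omits the leading type character that the real code skips via skipped_first.

/* Toy decoder for the kallsyms name stream: [len][len token indices], where
 * each index selects a NUL-terminated string in a token table.  The tables
 * and stream below are invented sample data, not kernel output.
 */
#include <stdio.h>

/* token_table holds the strings back to back; token_index points into it. */
static const char token_table[] = "prin\0tk\0sys_\0open\0";
static const unsigned short token_index[] = { 0, 5, 8, 13 };

/* Compressed stream: two symbols, "printk" and "sys_open". */
static const unsigned char names[] = {
	2, 0, 1,	/* len=2: "prin" + "tk"   */
	2, 2, 3,	/* len=2: "sys_" + "open" */
};

/* Expand the symbol at offset 'off'; return the offset of the next one. */
static unsigned int expand_symbol(unsigned int off, char *result)
{
	int len = names[off++];

	while (len--) {
		const char *tok = &token_table[token_index[names[off++]]];

		while (*tok)
			*result++ = *tok++;
	}
	*result = '\0';
	return off;
}

int main(void)
{
	char buf[64];
	unsigned int off = 0;

	off = expand_symbol(off, buf);
	printf("%s\n", buf);		/* printk   */
	expand_symbol(off, buf);
	printf("%s\n", buf);		/* sys_open */
	return 0;
}
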
diff --git a/kernel/kexec.c b/kernel/kexec.c
index e4983770913b..f336e2107f98 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1228 } while (*cur++ == ','); 1228 } while (*cur++ == ',');
1229 1229
1230 if (*crash_size > 0) { 1230 if (*crash_size > 0) {
1231 while (*cur != ' ' && *cur != '@') 1231 while (*cur && *cur != ' ' && *cur != '@')
1232 cur++; 1232 cur++;
1233 if (*cur == '@') { 1233 if (*cur == '@') {
1234 cur++; 1234 cur++;
@@ -1448,17 +1448,17 @@ int kernel_kexec(void)
1448 goto Restore_console; 1448 goto Restore_console;
1449 } 1449 }
1450 suspend_console(); 1450 suspend_console();
1451 error = device_suspend(PMSG_FREEZE); 1451 error = dpm_suspend_start(PMSG_FREEZE);
1452 if (error) 1452 if (error)
1453 goto Resume_console; 1453 goto Resume_console;
1454 /* At this point, device_suspend() has been called, 1454 /* At this point, dpm_suspend_start() has been called,
1455 * but *not* device_power_down(). We *must* 1455 * but *not* dpm_suspend_noirq(). We *must* call
1456 * device_power_down() now. Otherwise, drivers for 1456 * dpm_suspend_noirq() now. Otherwise, drivers for
1457 * some devices (e.g. interrupt controllers) become 1457 * some devices (e.g. interrupt controllers) become
1458 * desynchronized with the actual state of the 1458 * desynchronized with the actual state of the
1459 * hardware at resume time, and evil weirdness ensues. 1459 * hardware at resume time, and evil weirdness ensues.
1460 */ 1460 */
1461 error = device_power_down(PMSG_FREEZE); 1461 error = dpm_suspend_noirq(PMSG_FREEZE);
1462 if (error) 1462 if (error)
1463 goto Resume_devices; 1463 goto Resume_devices;
1464 error = disable_nonboot_cpus(); 1464 error = disable_nonboot_cpus();
@@ -1486,9 +1486,9 @@ int kernel_kexec(void)
1486 local_irq_enable(); 1486 local_irq_enable();
1487 Enable_cpus: 1487 Enable_cpus:
1488 enable_nonboot_cpus(); 1488 enable_nonboot_cpus();
1489 device_power_up(PMSG_RESTORE); 1489 dpm_resume_noirq(PMSG_RESTORE);
1490 Resume_devices: 1490 Resume_devices:
1491 device_resume(PMSG_RESTORE); 1491 dpm_resume_end(PMSG_RESTORE);
1492 Resume_console: 1492 Resume_console:
1493 resume_console(); 1493 resume_console();
1494 thaw_processes(); 1494 thaw_processes();
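
The parse_crashkernel_mem() hunk adds the *cur check so the scan for ' ' or '@' stops at the terminating NUL instead of running past the end of strings that end right after the size. A minimal standalone sketch of the fixed scan is below; parse_mem_spec() is an invented stand-in that handles neither memory suffix scaling nor the full "size:range@offset" grammar.

/* Sketch of the fixed scan in parse_crashkernel_mem(): stop at NUL, space,
 * or '@'.  Without the *cur test, input that ends right after the size
 * (no offset, no trailing space) would be scanned past its terminator.
 * Size suffixes such as 'M' are skipped here, not scaled.
 */
#include <stdio.h>
#include <stdlib.h>

static int parse_mem_spec(const char *spec,
			  unsigned long long *size,
			  unsigned long long *base)
{
	char *cur;

	*base = 0;
	*size = strtoull(spec, &cur, 10);
	if (cur == spec)
		return -1;			/* no number at all */

	/* Skip up to an optional "@offset", but never past the NUL. */
	while (*cur && *cur != ' ' && *cur != '@')
		cur++;
	if (*cur == '@')
		*base = strtoull(cur + 1, &cur, 10);
	return 0;
}

int main(void)
{
	unsigned long long size, base;

	if (!parse_mem_spec("64M", &size, &base))
		printf("size=%llu base=%llu\n", size, base);
	if (!parse_mem_spec("64@16", &size, &base))
		printf("size=%llu base=%llu\n", size, base);
	return 0;
}
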
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index bc41ad0f24f8..26539e3228e5 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -72,9 +72,9 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
72 72
73 /* 73 /*
74 * round up to the next power of 2, since our 'let the indices 74 * round up to the next power of 2, since our 'let the indices
75 * wrap' tachnique works only in this case. 75 * wrap' technique works only in this case.
76 */ 76 */
77 if (size & (size - 1)) { 77 if (!is_power_of_2(size)) {
78 BUG_ON(size > 0x80000000); 78 BUG_ON(size > 0x80000000);
79 size = roundup_pow_of_two(size); 79 size = roundup_pow_of_two(size);
80 } 80 }
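
kfifo's "let the indices wrap" technique keeps the in and out counters free-running and derives the buffer offset with index & (size - 1), which is only correct when size is a power of two; that is why the allocation path rounds the size up and why the open-coded test becomes is_power_of_2(). The helpers below are small userspace equivalents for illustration, not the kernel's implementations.

/* Why kfifo rounds its size up to a power of two: with size == 2^n the
 * buffer offset is simply (counter & (size - 1)), even after the unsigned
 * counters wrap around.  Minimal userspace helpers, not the kernel's.
 */
#include <stdbool.h>
#include <stdio.h>

static bool is_power_of_2(unsigned int n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static unsigned int roundup_pow_of_two(unsigned int n)
{
	unsigned int r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned int size = 100;

	if (!is_power_of_2(size))
		size = roundup_pow_of_two(size);	/* 100 -> 128 */

	/* Free-running index: masking keeps working across the wrap. */
	unsigned int in = 0xfffffff0u;
	printf("size=%u offset=%u\n", size, in & (size - 1));
	return 0;
}
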
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bfc..385c31a1bdbf 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 27#include <linux/completion.h>
29#include <linux/file.h> 28#include <linux/file.h>
30#include <linux/fdtable.h> 29#include <linux/fdtable.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c0fa54b276d9..0540948e29ab 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void)
237{ 237{
238 struct kprobe_insn_page *kip; 238 struct kprobe_insn_page *kip;
239 struct hlist_node *pos, *next; 239 struct hlist_node *pos, *next;
240 int safety;
241 240
242 /* Ensure no-one is preepmted on the garbages */ 241 /* Ensure no-one is preepmted on the garbages */
243 mutex_unlock(&kprobe_insn_mutex); 242 if (check_safety())
244 safety = check_safety();
245 mutex_lock(&kprobe_insn_mutex);
246 if (safety != 0)
247 return -EAGAIN; 243 return -EAGAIN;
248 244
249 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 245 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -698,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p)
698 p->addr = addr; 694 p->addr = addr;
699 695
700 preempt_disable(); 696 preempt_disable();
701 if (!__kernel_text_address((unsigned long) p->addr) || 697 if (!kernel_text_address((unsigned long) p->addr) ||
702 in_kprobes_functions((unsigned long) p->addr)) { 698 in_kprobes_functions((unsigned long) p->addr)) {
703 preempt_enable(); 699 preempt_enable();
704 return -EINVAL; 700 return -EINVAL;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ebaf8519abf..eb8751aa0418 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,11 +9,12 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/completion.h> 10#include <linux/completion.h>
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/cpuset.h>
12#include <linux/unistd.h> 13#include <linux/unistd.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
16#include <trace/sched.h> 17#include <trace/events/sched.h>
17 18
18#define KTHREAD_NICE_LEVEL (-5) 19#define KTHREAD_NICE_LEVEL (-5)
19 20
@@ -21,15 +22,11 @@ static DEFINE_SPINLOCK(kthread_create_lock);
21static LIST_HEAD(kthread_create_list); 22static LIST_HEAD(kthread_create_list);
22struct task_struct *kthreadd_task; 23struct task_struct *kthreadd_task;
23 24
24DEFINE_TRACE(sched_kthread_stop);
25DEFINE_TRACE(sched_kthread_stop_ret);
26
27struct kthread_create_info 25struct kthread_create_info
28{ 26{
29 /* Information passed to kthread() from kthreadd. */ 27 /* Information passed to kthread() from kthreadd. */
30 int (*threadfn)(void *data); 28 int (*threadfn)(void *data);
31 void *data; 29 void *data;
32 struct completion started;
33 30
34 /* Result passed back to kthread_create() from kthreadd. */ 31 /* Result passed back to kthread_create() from kthreadd. */
35 struct task_struct *result; 32 struct task_struct *result;
@@ -38,17 +35,13 @@ struct kthread_create_info
38 struct list_head list; 35 struct list_head list;
39}; 36};
40 37
41struct kthread_stop_info 38struct kthread {
42{ 39 int should_stop;
43 struct task_struct *k; 40 struct completion exited;
44 int err;
45 struct completion done;
46}; 41};
47 42
48/* Thread stopping is done by setthing this var: lock serializes 43#define to_kthread(tsk) \
49 * multiple kthread_stop calls. */ 44 container_of((tsk)->vfork_done, struct kthread, exited)
50static DEFINE_MUTEX(kthread_stop_lock);
51static struct kthread_stop_info kthread_stop_info;
52 45
53/** 46/**
54 * kthread_should_stop - should this kthread return now? 47 * kthread_should_stop - should this kthread return now?
@@ -59,36 +52,35 @@ static struct kthread_stop_info kthread_stop_info;
59 */ 52 */
60int kthread_should_stop(void) 53int kthread_should_stop(void)
61{ 54{
62 return (kthread_stop_info.k == current); 55 return to_kthread(current)->should_stop;
63} 56}
64EXPORT_SYMBOL(kthread_should_stop); 57EXPORT_SYMBOL(kthread_should_stop);
65 58
66static int kthread(void *_create) 59static int kthread(void *_create)
67{ 60{
61 /* Copy data: it's on kthread's stack */
68 struct kthread_create_info *create = _create; 62 struct kthread_create_info *create = _create;
69 int (*threadfn)(void *data); 63 int (*threadfn)(void *data) = create->threadfn;
70 void *data; 64 void *data = create->data;
71 int ret = -EINTR; 65 struct kthread self;
66 int ret;
72 67
73 /* Copy data: it's on kthread's stack */ 68 self.should_stop = 0;
74 threadfn = create->threadfn; 69 init_completion(&self.exited);
75 data = create->data; 70 current->vfork_done = &self.exited;
76 71
77 /* OK, tell user we're spawned, wait for stop or wakeup */ 72 /* OK, tell user we're spawned, wait for stop or wakeup */
78 __set_current_state(TASK_UNINTERRUPTIBLE); 73 __set_current_state(TASK_UNINTERRUPTIBLE);
79 create->result = current; 74 create->result = current;
80 complete(&create->started); 75 complete(&create->done);
81 schedule(); 76 schedule();
82 77
83 if (!kthread_should_stop()) 78 ret = -EINTR;
79 if (!self.should_stop)
84 ret = threadfn(data); 80 ret = threadfn(data);
85 81
86 /* It might have exited on its own, w/o kthread_stop. Check. */ 82 /* we can't just return, we must preserve "self" on stack */
87 if (kthread_should_stop()) { 83 do_exit(ret);
88 kthread_stop_info.err = ret;
89 complete(&kthread_stop_info.done);
90 }
91 return 0;
92} 84}
93 85
94static void create_kthread(struct kthread_create_info *create) 86static void create_kthread(struct kthread_create_info *create)
@@ -97,11 +89,10 @@ static void create_kthread(struct kthread_create_info *create)
97 89
98 /* We want our own signal handler (we take no signals by default). */ 90 /* We want our own signal handler (we take no signals by default). */
99 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 91 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
100 if (pid < 0) 92 if (pid < 0) {
101 create->result = ERR_PTR(pid); 93 create->result = ERR_PTR(pid);
102 else 94 complete(&create->done);
103 wait_for_completion(&create->started); 95 }
104 complete(&create->done);
105} 96}
106 97
107/** 98/**
@@ -132,7 +123,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
132 123
133 create.threadfn = threadfn; 124 create.threadfn = threadfn;
134 create.data = data; 125 create.data = data;
135 init_completion(&create.started);
136 init_completion(&create.done); 126 init_completion(&create.done);
137 127
138 spin_lock(&kthread_create_lock); 128 spin_lock(&kthread_create_lock);
@@ -190,40 +180,34 @@ EXPORT_SYMBOL(kthread_bind);
190 * @k: thread created by kthread_create(). 180 * @k: thread created by kthread_create().
191 * 181 *
192 * Sets kthread_should_stop() for @k to return true, wakes it, and 182 * Sets kthread_should_stop() for @k to return true, wakes it, and
193 * waits for it to exit. Your threadfn() must not call do_exit() 183 * waits for it to exit. This can also be called after kthread_create()
194 * itself if you use this function! This can also be called after 184 * instead of calling wake_up_process(): the thread will exit without
195 * kthread_create() instead of calling wake_up_process(): the thread 185 * calling threadfn().
196 * will exit without calling threadfn(). 186 *
187 * If threadfn() may call do_exit() itself, the caller must ensure
188 * task_struct can't go away.
197 * 189 *
198 * Returns the result of threadfn(), or %-EINTR if wake_up_process() 190 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
199 * was never called. 191 * was never called.
200 */ 192 */
201int kthread_stop(struct task_struct *k) 193int kthread_stop(struct task_struct *k)
202{ 194{
195 struct kthread *kthread;
203 int ret; 196 int ret;
204 197
205 mutex_lock(&kthread_stop_lock);
206
207 /* It could exit after stop_info.k set, but before wake_up_process. */
208 get_task_struct(k);
209
210 trace_sched_kthread_stop(k); 198 trace_sched_kthread_stop(k);
199 get_task_struct(k);
211 200
212 /* Must init completion *before* thread sees kthread_stop_info.k */ 201 kthread = to_kthread(k);
213 init_completion(&kthread_stop_info.done); 202 barrier(); /* it might have exited */
214 smp_wmb(); 203 if (k->vfork_done != NULL) {
204 kthread->should_stop = 1;
205 wake_up_process(k);
206 wait_for_completion(&kthread->exited);
207 }
208 ret = k->exit_code;
215 209
216 /* Now set kthread_should_stop() to true, and wake it up. */
217 kthread_stop_info.k = k;
218 wake_up_process(k);
219 put_task_struct(k); 210 put_task_struct(k);
220
221 /* Once it dies, reset stop ptr, gather result and we're done. */
222 wait_for_completion(&kthread_stop_info.done);
223 kthread_stop_info.k = NULL;
224 ret = kthread_stop_info.err;
225 mutex_unlock(&kthread_stop_lock);
226
227 trace_sched_kthread_stop_ret(ret); 211 trace_sched_kthread_stop_ret(ret);
228 212
229 return ret; 213 return ret;
@@ -239,6 +223,7 @@ int kthreadd(void *unused)
239 ignore_signals(tsk); 223 ignore_signals(tsk);
240 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 224 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
241 set_cpus_allowed_ptr(tsk, cpu_all_mask); 225 set_cpus_allowed_ptr(tsk, cpu_all_mask);
226 set_mems_allowed(node_possible_map);
242 227
243 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 228 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
244 229
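
The kthread rework above drops the global kthread_stop_info/kthread_stop_lock pair: each thread now carries its own struct kthread (reached through vfork_done), so kthread_stop() simply sets should_stop, wakes the thread and waits on its per-thread exited completion. The sketch below mimics that handshake in userspace with pthreads; struct worker, worker_stop() and the semaphore standing in for the completion are illustrative only, and the per-thread state is passed explicitly instead of being found through the task.

/* Userspace analogue of the reworked kthread_stop() handshake: the stop flag
 * and the "exited" completion are per-thread state, and the stopper waits on
 * that completion instead of on a global structure.  Names are illustrative.
 */
#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct worker {
	atomic_int should_stop;
	sem_t exited;			/* stands in for struct completion */
	int exit_code;
};

static void *worker_fn(void *arg)
{
	struct worker *self = arg;
	int ret = 0;

	while (!atomic_load(&self->should_stop)) {
		/* ... do the thread's work ... */
		usleep(10000);
	}

	self->exit_code = ret;
	sem_post(&self->exited);	/* complete(&self.exited) analogue */
	return NULL;
}

static int worker_stop(struct worker *w, pthread_t t)
{
	atomic_store(&w->should_stop, 1);	/* kthread->should_stop = 1 */
	sem_wait(&w->exited);			/* wait_for_completion()    */
	pthread_join(t, NULL);
	return w->exit_code;
}

int main(void)
{
	struct worker w = { 0 };
	pthread_t t;

	sem_init(&w.exited, 0, 0);
	pthread_create(&t, NULL, worker_fn, &w);
	usleep(50000);
	printf("worker returned %d\n", worker_stop(&w, t));
	sem_destroy(&w.exited);
	return 0;
}
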
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index accb40cdb12a..8bbeef996c76 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,12 +42,14 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <trace/lockdep.h>
46 45
47#include <asm/sections.h> 46#include <asm/sections.h>
48 47
49#include "lockdep_internals.h" 48#include "lockdep_internals.h"
50 49
50#define CREATE_TRACE_POINTS
51#include <trace/events/lockdep.h>
52
51#ifdef CONFIG_PROVE_LOCKING 53#ifdef CONFIG_PROVE_LOCKING
52int prove_locking = 1; 54int prove_locking = 1;
53module_param(prove_locking, int, 0644); 55module_param(prove_locking, int, 0644);
@@ -2935,8 +2937,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
2935} 2937}
2936EXPORT_SYMBOL_GPL(lock_set_class); 2938EXPORT_SYMBOL_GPL(lock_set_class);
2937 2939
2938DEFINE_TRACE(lock_acquire);
2939
2940/* 2940/*
2941 * We are not always called with irqs disabled - do that here, 2941 * We are not always called with irqs disabled - do that here,
2942 * and also avoid lockdep recursion: 2942 * and also avoid lockdep recursion:
@@ -2963,8 +2963,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2963} 2963}
2964EXPORT_SYMBOL_GPL(lock_acquire); 2964EXPORT_SYMBOL_GPL(lock_acquire);
2965 2965
2966DEFINE_TRACE(lock_release);
2967
2968void lock_release(struct lockdep_map *lock, int nested, 2966void lock_release(struct lockdep_map *lock, int nested,
2969 unsigned long ip) 2967 unsigned long ip)
2970{ 2968{
@@ -3105,6 +3103,8 @@ found_it:
3105 hlock->holdtime_stamp = now; 3103 hlock->holdtime_stamp = now;
3106 } 3104 }
3107 3105
3106 trace_lock_acquired(lock, ip, waittime);
3107
3108 stats = get_lock_stats(hlock_class(hlock)); 3108 stats = get_lock_stats(hlock_class(hlock));
3109 if (waittime) { 3109 if (waittime) {
3110 if (hlock->read) 3110 if (hlock->read)
@@ -3120,8 +3120,6 @@ found_it:
3120 lock->ip = ip; 3120 lock->ip = ip;
3121} 3121}
3122 3122
3123DEFINE_TRACE(lock_contended);
3124
3125void lock_contended(struct lockdep_map *lock, unsigned long ip) 3123void lock_contended(struct lockdep_map *lock, unsigned long ip)
3126{ 3124{
3127 unsigned long flags; 3125 unsigned long flags;
@@ -3143,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3143} 3141}
3144EXPORT_SYMBOL_GPL(lock_contended); 3142EXPORT_SYMBOL_GPL(lock_contended);
3145 3143
3146DEFINE_TRACE(lock_acquired);
3147
3148void lock_acquired(struct lockdep_map *lock, unsigned long ip) 3144void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3149{ 3145{
3150 unsigned long flags; 3146 unsigned long flags;
3151 3147
3152 trace_lock_acquired(lock, ip);
3153
3154 if (unlikely(!lock_stat)) 3148 if (unlikely(!lock_stat))
3155 return; 3149 return;
3156 3150
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa2d2c4..e94caa666dba 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void)
758 &proc_lockdep_stats_operations); 758 &proc_lockdep_stats_operations);
759 759
760#ifdef CONFIG_LOCK_STAT 760#ifdef CONFIG_LOCK_STAT
761 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); 761 proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
762 &proc_lock_stat_operations);
762#endif 763#endif
763 764
764 return 0; 765 return 0;
diff --git a/kernel/module.c b/kernel/module.c
index e797812a4d95..eccb561dd8a3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,6 +18,7 @@
18*/ 18*/
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/ftrace_event.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
23#include <linux/fs.h> 24#include <linux/fs.h>
@@ -52,6 +53,7 @@
52#include <linux/ftrace.h> 53#include <linux/ftrace.h>
53#include <linux/async.h> 54#include <linux/async.h>
54#include <linux/percpu.h> 55#include <linux/percpu.h>
56#include <linux/kmemleak.h>
55 57
56#if 0 58#if 0
57#define DEBUGP printk 59#define DEBUGP printk
@@ -72,6 +74,9 @@ DEFINE_MUTEX(module_mutex);
72EXPORT_SYMBOL_GPL(module_mutex); 74EXPORT_SYMBOL_GPL(module_mutex);
73static LIST_HEAD(modules); 75static LIST_HEAD(modules);
74 76
77/* Block module loading/unloading? */
78int modules_disabled = 0;
79
75/* Waiting for a module to finish initializing? */ 80/* Waiting for a module to finish initializing? */
76static DECLARE_WAIT_QUEUE_HEAD(module_wq); 81static DECLARE_WAIT_QUEUE_HEAD(module_wq);
77 82
@@ -429,6 +434,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
429 unsigned long extra; 434 unsigned long extra;
430 unsigned int i; 435 unsigned int i;
431 void *ptr; 436 void *ptr;
437 int cpu;
432 438
433 if (align > PAGE_SIZE) { 439 if (align > PAGE_SIZE) {
434 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 440 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
@@ -458,6 +464,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
458 if (!split_block(i, size)) 464 if (!split_block(i, size))
459 return NULL; 465 return NULL;
460 466
467 /* add the per-cpu scanning areas */
468 for_each_possible_cpu(cpu)
469 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
470 GFP_KERNEL);
471
461 /* Mark allocated */ 472 /* Mark allocated */
462 pcpu_size[i] = -pcpu_size[i]; 473 pcpu_size[i] = -pcpu_size[i];
463 return ptr; 474 return ptr;
@@ -472,6 +483,7 @@ static void percpu_modfree(void *freeme)
472{ 483{
473 unsigned int i; 484 unsigned int i;
474 void *ptr = __per_cpu_start + block_size(pcpu_size[0]); 485 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
486 int cpu;
475 487
476 /* First entry is core kernel percpu data. */ 488 /* First entry is core kernel percpu data. */
477 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { 489 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -483,6 +495,10 @@ static void percpu_modfree(void *freeme)
483 BUG(); 495 BUG();
484 496
485 free: 497 free:
498 /* remove the per-cpu scanning areas */
499 for_each_possible_cpu(cpu)
500 kmemleak_free(freeme + per_cpu_offset(cpu));
501
486 /* Merge with previous? */ 502 /* Merge with previous? */
487 if (pcpu_size[i-1] >= 0) { 503 if (pcpu_size[i-1] >= 0) {
488 pcpu_size[i-1] += pcpu_size[i]; 504 pcpu_size[i-1] += pcpu_size[i];
@@ -777,7 +793,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
777 char name[MODULE_NAME_LEN]; 793 char name[MODULE_NAME_LEN];
778 int ret, forced = 0; 794 int ret, forced = 0;
779 795
780 if (!capable(CAP_SYS_MODULE)) 796 if (!capable(CAP_SYS_MODULE) || modules_disabled)
781 return -EPERM; 797 return -EPERM;
782 798
783 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) 799 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
@@ -893,16 +909,18 @@ void __symbol_put(const char *symbol)
893} 909}
894EXPORT_SYMBOL(__symbol_put); 910EXPORT_SYMBOL(__symbol_put);
895 911
912/* Note this assumes addr is a function, which it currently always is. */
896void symbol_put_addr(void *addr) 913void symbol_put_addr(void *addr)
897{ 914{
898 struct module *modaddr; 915 struct module *modaddr;
916 unsigned long a = (unsigned long)dereference_function_descriptor(addr);
899 917
900 if (core_kernel_text((unsigned long)addr)) 918 if (core_kernel_text(a))
901 return; 919 return;
902 920
903 /* module_text_address is safe here: we're supposed to have reference 921 /* module_text_address is safe here: we're supposed to have reference
904 * to module from symbol_get, so it can't go away. */ 922 * to module from symbol_get, so it can't go away. */
905 modaddr = __module_text_address((unsigned long)addr); 923 modaddr = __module_text_address(a);
906 BUG_ON(!modaddr); 924 BUG_ON(!modaddr);
907 module_put(modaddr); 925 module_put(modaddr);
908} 926}
@@ -1052,7 +1070,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1052{ 1070{
1053 const unsigned long *crc; 1071 const unsigned long *crc;
1054 1072
1055 if (!find_symbol("module_layout", NULL, &crc, true, false)) 1073 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1074 &crc, true, false))
1056 BUG(); 1075 BUG();
1057 return check_version(sechdrs, versindex, "module_layout", mod, crc); 1076 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1058} 1077}
@@ -1489,9 +1508,6 @@ static void free_module(struct module *mod)
1489 /* Free any allocated parameters. */ 1508 /* Free any allocated parameters. */
1490 destroy_params(mod->kp, mod->num_kp); 1509 destroy_params(mod->kp, mod->num_kp);
1491 1510
1492 /* release any pointers to mcount in this module */
1493 ftrace_release(mod->module_core, mod->core_size);
1494
1495 /* This may be NULL, but that's OK */ 1511 /* This may be NULL, but that's OK */
1496 module_free(mod, mod->module_init); 1512 module_free(mod, mod->module_init);
1497 kfree(mod->args); 1513 kfree(mod->args);
@@ -1878,6 +1894,36 @@ static void *module_alloc_update_bounds(unsigned long size)
1878 return ret; 1894 return ret;
1879} 1895}
1880 1896
1897#ifdef CONFIG_DEBUG_KMEMLEAK
1898static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1899 Elf_Shdr *sechdrs, char *secstrings)
1900{
1901 unsigned int i;
1902
1903 /* only scan the sections containing data */
1904 kmemleak_scan_area(mod->module_core, (unsigned long)mod -
1905 (unsigned long)mod->module_core,
1906 sizeof(struct module), GFP_KERNEL);
1907
1908 for (i = 1; i < hdr->e_shnum; i++) {
1909 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
1910 continue;
1911 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
1912 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
1913 continue;
1914
1915 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
1916 (unsigned long)mod->module_core,
1917 sechdrs[i].sh_size, GFP_KERNEL);
1918 }
1919}
1920#else
1921static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1922 Elf_Shdr *sechdrs, char *secstrings)
1923{
1924}
1925#endif
1926
1881/* Allocate and load the module: note that size of section 0 is always 1927/* Allocate and load the module: note that size of section 0 is always
1882 zero, and we rely on this for optional sections. */ 1928 zero, and we rely on this for optional sections. */
1883static noinline struct module *load_module(void __user *umod, 1929static noinline struct module *load_module(void __user *umod,
@@ -1892,11 +1938,9 @@ static noinline struct module *load_module(void __user *umod,
1892 unsigned int symindex = 0; 1938 unsigned int symindex = 0;
1893 unsigned int strindex = 0; 1939 unsigned int strindex = 0;
1894 unsigned int modindex, versindex, infoindex, pcpuindex; 1940 unsigned int modindex, versindex, infoindex, pcpuindex;
1895 unsigned int num_mcount;
1896 struct module *mod; 1941 struct module *mod;
1897 long err = 0; 1942 long err = 0;
1898 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1943 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1899 unsigned long *mseg;
1900 mm_segment_t old_fs; 1944 mm_segment_t old_fs;
1901 1945
1902 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1946 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2050,6 +2094,12 @@ static noinline struct module *load_module(void __user *umod,
2050 2094
2051 /* Do the allocs. */ 2095 /* Do the allocs. */
2052 ptr = module_alloc_update_bounds(mod->core_size); 2096 ptr = module_alloc_update_bounds(mod->core_size);
2097 /*
2098 * The pointer to this block is stored in the module structure
2099 * which is inside the block. Just mark it as not being a
2100 * leak.
2101 */
2102 kmemleak_not_leak(ptr);
2053 if (!ptr) { 2103 if (!ptr) {
2054 err = -ENOMEM; 2104 err = -ENOMEM;
2055 goto free_percpu; 2105 goto free_percpu;
@@ -2058,6 +2108,13 @@ static noinline struct module *load_module(void __user *umod,
2058 mod->module_core = ptr; 2108 mod->module_core = ptr;
2059 2109
2060 ptr = module_alloc_update_bounds(mod->init_size); 2110 ptr = module_alloc_update_bounds(mod->init_size);
2111 /*
2112 * The pointer to this block is stored in the module structure
2113 * which is inside the block. This block doesn't need to be
2114 * scanned as it contains data and code that will be freed
2115 * after the module is initialized.
2116 */
2117 kmemleak_ignore(ptr);
2061 if (!ptr && mod->init_size) { 2118 if (!ptr && mod->init_size) {
2062 err = -ENOMEM; 2119 err = -ENOMEM;
2063 goto free_core; 2120 goto free_core;
@@ -2088,6 +2145,7 @@ static noinline struct module *load_module(void __user *umod,
2088 } 2145 }
2089 /* Module has been moved. */ 2146 /* Module has been moved. */
2090 mod = (void *)sechdrs[modindex].sh_addr; 2147 mod = (void *)sechdrs[modindex].sh_addr;
2148 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2091 2149
2092#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2150#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2093 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2151 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
@@ -2161,6 +2219,10 @@ static noinline struct module *load_module(void __user *umod,
2161 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, 2219 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2162 "__kcrctab_unused_gpl"); 2220 "__kcrctab_unused_gpl");
2163#endif 2221#endif
2222#ifdef CONFIG_CONSTRUCTORS
2223 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2224 sizeof(*mod->ctors), &mod->num_ctors);
2225#endif
2164 2226
2165#ifdef CONFIG_MARKERS 2227#ifdef CONFIG_MARKERS
2166 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", 2228 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
@@ -2172,7 +2234,19 @@ static noinline struct module *load_module(void __user *umod,
2172 sizeof(*mod->tracepoints), 2234 sizeof(*mod->tracepoints),
2173 &mod->num_tracepoints); 2235 &mod->num_tracepoints);
2174#endif 2236#endif
2175 2237#ifdef CONFIG_EVENT_TRACING
2238 mod->trace_events = section_objs(hdr, sechdrs, secstrings,
2239 "_ftrace_events",
2240 sizeof(*mod->trace_events),
2241 &mod->num_trace_events);
2242#endif
2243#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2244 /* sechdrs[0].sh_size is always zero */
2245 mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
2246 "__mcount_loc",
2247 sizeof(*mod->ftrace_callsites),
2248 &mod->num_ftrace_callsites);
2249#endif
2176#ifdef CONFIG_MODVERSIONS 2250#ifdef CONFIG_MODVERSIONS
2177 if ((mod->num_syms && !mod->crcs) 2251 if ((mod->num_syms && !mod->crcs)
2178 || (mod->num_gpl_syms && !mod->gpl_crcs) 2252 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2237,11 +2311,6 @@ static noinline struct module *load_module(void __user *umod,
2237 dynamic_debug_setup(debug, num_debug); 2311 dynamic_debug_setup(debug, num_debug);
2238 } 2312 }
2239 2313
2240 /* sechdrs[0].sh_size is always zero */
2241 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2242 sizeof(*mseg), &num_mcount);
2243 ftrace_init_module(mod, mseg, mseg + num_mcount);
2244
2245 err = module_finalize(hdr, sechdrs, mod); 2314 err = module_finalize(hdr, sechdrs, mod);
2246 if (err < 0) 2315 if (err < 0)
2247 goto cleanup; 2316 goto cleanup;
@@ -2286,7 +2355,8 @@ static noinline struct module *load_module(void __user *umod,
2286 if (err < 0) 2355 if (err < 0)
2287 goto unlink; 2356 goto unlink;
2288 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2357 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2289 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2358 if (mod->sect_attrs)
2359 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2290 2360
2291 /* Get rid of temporary copy */ 2361 /* Get rid of temporary copy */
2292 vfree(hdr); 2362 vfree(hdr);
@@ -2302,7 +2372,6 @@ static noinline struct module *load_module(void __user *umod,
2302 cleanup: 2372 cleanup:
2303 kobject_del(&mod->mkobj.kobj); 2373 kobject_del(&mod->mkobj.kobj);
2304 kobject_put(&mod->mkobj.kobj); 2374 kobject_put(&mod->mkobj.kobj);
2305 ftrace_release(mod->module_core, mod->core_size);
2306 free_unload: 2375 free_unload:
2307 module_unload_free(mod); 2376 module_unload_free(mod);
2308#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2377#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
@@ -2328,6 +2397,17 @@ static noinline struct module *load_module(void __user *umod,
2328 goto free_hdr; 2397 goto free_hdr;
2329} 2398}
2330 2399
2400/* Call module constructors. */
2401static void do_mod_ctors(struct module *mod)
2402{
2403#ifdef CONFIG_CONSTRUCTORS
2404 unsigned long i;
2405
2406 for (i = 0; i < mod->num_ctors; i++)
2407 mod->ctors[i]();
2408#endif
2409}
2410
2331/* This is where the real work happens */ 2411/* This is where the real work happens */
2332SYSCALL_DEFINE3(init_module, void __user *, umod, 2412SYSCALL_DEFINE3(init_module, void __user *, umod,
2333 unsigned long, len, const char __user *, uargs) 2413 unsigned long, len, const char __user *, uargs)
@@ -2336,7 +2416,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2336 int ret = 0; 2416 int ret = 0;
2337 2417
2338 /* Must have permission */ 2418 /* Must have permission */
2339 if (!capable(CAP_SYS_MODULE)) 2419 if (!capable(CAP_SYS_MODULE) || modules_disabled)
2340 return -EPERM; 2420 return -EPERM;
2341 2421
2342 /* Only one module load at a time, please */ 2422 /* Only one module load at a time, please */
@@ -2356,6 +2436,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2356 blocking_notifier_call_chain(&module_notify_list, 2436 blocking_notifier_call_chain(&module_notify_list,
2357 MODULE_STATE_COMING, mod); 2437 MODULE_STATE_COMING, mod);
2358 2438
2439 do_mod_ctors(mod);
2359 /* Start the module */ 2440 /* Start the module */
2360 if (mod->init != NULL) 2441 if (mod->init != NULL)
2361 ret = do_one_initcall(mod->init); 2442 ret = do_one_initcall(mod->init);
@@ -2374,9 +2455,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2374 return ret; 2455 return ret;
2375 } 2456 }
2376 if (ret > 0) { 2457 if (ret > 0) {
2377 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " 2458 printk(KERN_WARNING
2378 "it should follow 0/-E convention\n" 2459"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2379 KERN_WARNING "%s: loading module anyway...\n", 2460"%s: loading module anyway...\n",
2380 __func__, mod->name, ret, 2461 __func__, mod->name, ret,
2381 __func__); 2462 __func__);
2382 dump_stack(); 2463 dump_stack();
@@ -2394,6 +2475,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2394 mutex_lock(&module_mutex); 2475 mutex_lock(&module_mutex);
2395 /* Drop initial reference. */ 2476 /* Drop initial reference. */
2396 module_put(mod); 2477 module_put(mod);
2478 trim_init_extable(mod);
2397 module_free(mod, mod->module_init); 2479 module_free(mod, mod->module_init);
2398 mod->module_init = NULL; 2480 mod->module_init = NULL;
2399 mod->init_size = 0; 2481 mod->init_size = 0;
@@ -2837,7 +2919,7 @@ void print_modules(void)
2837 struct module *mod; 2919 struct module *mod;
2838 char buf[8]; 2920 char buf[8];
2839 2921
2840 printk("Modules linked in:"); 2922 printk(KERN_DEFAULT "Modules linked in:");
2841 /* Most callers should already have preempt disabled, but make sure */ 2923 /* Most callers should already have preempt disabled, but make sure */
2842 preempt_disable(); 2924 preempt_disable();
2843 list_for_each_entry_rcu(mod, &modules, list) 2925 list_for_each_entry_rcu(mod, &modules, list)
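
Among the changes above, do_mod_ctors() simply walks the module's .ctors section, an array of void (*)(void) pointers collected by the toolchain, and calls each constructor before mod->init() runs. The fragment below shows the same calling pattern with a hand-built array; ctor_a, ctor_b and module_init_fn are invented stand-ins, not anything the kernel defines.

/* Sketch of what do_mod_ctors() does: the .ctors section is just an array of
 * void (*)(void) pointers, called in order before the module's init routine.
 * Here the array is built by hand instead of being collected by the linker.
 */
#include <stdio.h>

typedef void (*ctor_t)(void);

static void ctor_a(void) { puts("ctor_a"); }
static void ctor_b(void) { puts("ctor_b"); }

static ctor_t ctors[] = { ctor_a, ctor_b };
static unsigned long num_ctors = sizeof(ctors) / sizeof(ctors[0]);

static void do_ctors(void)
{
	unsigned long i;

	for (i = 0; i < num_ctors; i++)
		ctors[i]();
}

static int module_init_fn(void)
{
	puts("init");
	return 0;
}

int main(void)
{
	do_ctors();			/* constructors run first ... */
	return module_init_fn();	/* ... then the init routine  */
}
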
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..947b3ad551f8 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
89 * 89 *
90 * This function is similar to (but not equivalent to) down(). 90 * This function is similar to (but not equivalent to) down().
91 */ 91 */
92void inline __sched mutex_lock(struct mutex *lock) 92void __sched mutex_lock(struct mutex *lock)
93{ 93{
94 might_sleep(); 94 might_sleep();
95 /* 95 /*
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
249 249
250 /* didnt get the lock, go to sleep: */ 250 /* didnt get the lock, go to sleep: */
251 spin_unlock_mutex(&lock->wait_lock, flags); 251 spin_unlock_mutex(&lock->wait_lock, flags);
252 __schedule(); 252 preempt_enable_no_resched();
253 schedule();
254 preempt_disable();
253 spin_lock_mutex(&lock->wait_lock, flags); 255 spin_lock_mutex(&lock->wait_lock, flags);
254 } 256 }
255 257
@@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock)
471 473
472 return ret; 474 return ret;
473} 475}
474
475EXPORT_SYMBOL(mutex_trylock); 476EXPORT_SYMBOL(mutex_trylock);
477
478/**
479 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
480 * @cnt: the atomic counter to decrement
481 * @lock: the mutex to take and hold if the count reaches 0
482 *
483 * Returns true with the mutex held if the decrement hit 0, false otherwise.
484 */
485int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
486{
487 /* dec if we can't possibly hit 0 */
488 if (atomic_add_unless(cnt, -1, 1))
489 return 0;
490 /* we might hit 0, so take the lock */
491 mutex_lock(lock);
492 if (!atomic_dec_and_test(cnt)) {
493 /* when we actually did the dec, we didn't hit 0 */
494 mutex_unlock(lock);
495 return 0;
496 }
497 /* we hit 0, and we hold the lock */
498 return 1;
499}
500EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
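A hedged usage sketch for the new atomic_dec_and_mutex_lock() helper: the typical pattern drops a reference lock-free and only takes the mutex on the final put. The widget structure, list and lock names are made up for illustration.

/* Hypothetical refcounted object kept on a mutex-protected list. */
struct widget {
	atomic_t refcount;
	struct list_head node;
};

static DEFINE_MUTEX(widget_list_lock);

static void widget_put(struct widget *w)
{
	/* Fast path: not the last reference, no lock taken. */
	if (!atomic_dec_and_mutex_lock(&w->refcount, &widget_list_lock))
		return;

	/* Slow path: the count hit 0 and widget_list_lock is now held. */
	list_del(&w->node);
	mutex_unlock(&widget_list_lock);
	kfree(w);
}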
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 63598dca2d0c..09b4ff9711b2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,14 @@ static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
28 28
29/* 29static inline struct nsproxy *create_nsproxy(void)
30 * creates a copy of "orig" with refcount 1.
31 */
32static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
33{ 30{
34 struct nsproxy *ns; 31 struct nsproxy *nsproxy;
35 32
36 ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); 33 nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
37 if (ns) { 34 if (nsproxy)
38 memcpy(ns, orig, sizeof(struct nsproxy)); 35 atomic_set(&nsproxy->count, 1);
39 atomic_set(&ns->count, 1); 36 return nsproxy;
40 }
41 return ns;
42} 37}
43 38
44/* 39/*
@@ -52,7 +47,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
52 struct nsproxy *new_nsp; 47 struct nsproxy *new_nsp;
53 int err; 48 int err;
54 49
55 new_nsp = clone_nsproxy(tsk->nsproxy); 50 new_nsp = create_nsproxy();
56 if (!new_nsp) 51 if (!new_nsp)
57 return ERR_PTR(-ENOMEM); 52 return ERR_PTR(-ENOMEM);
58 53
diff --git a/kernel/panic.c b/kernel/panic.c
index 984b3ecbd72c..512ab73b0ca3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -301,6 +301,7 @@ int oops_may_print(void)
301 */ 301 */
302void oops_enter(void) 302void oops_enter(void)
303{ 303{
304 tracing_off();
304 /* can't trust the integrity of the kernel anymore: */ 305 /* can't trust the integrity of the kernel anymore: */
305 debug_locks_off(); 306 debug_locks_off();
306 do_oops_enter_exit(); 307 do_oops_enter_exit();
diff --git a/kernel/params.c b/kernel/params.c
index de273ec85bd2..7f6912ced2ba 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,9 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26 26
27/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
28#define KPARAM_KMALLOCED 0x80000000
29
30#if 0 27#if 0
31#define DEBUGP printk 28#define DEBUGP printk
32#else 29#else
@@ -220,13 +217,13 @@ int param_set_charp(const char *val, struct kernel_param *kp)
220 return -ENOSPC; 217 return -ENOSPC;
221 } 218 }
222 219
223 if (kp->perm & KPARAM_KMALLOCED) 220 if (kp->flags & KPARAM_KMALLOCED)
224 kfree(*(char **)kp->arg); 221 kfree(*(char **)kp->arg);
225 222
 226 /* This is a hack. We can't strdup in early boot, and we 223 /* This is a hack. We can't strdup in early boot, and we
227 * don't need to; this mangled commandline is preserved. */ 224 * don't need to; this mangled commandline is preserved. */
228 if (slab_is_available()) { 225 if (slab_is_available()) {
229 kp->perm |= KPARAM_KMALLOCED; 226 kp->flags |= KPARAM_KMALLOCED;
230 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 227 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
231 if (!kp->arg) 228 if (!kp->arg)
232 return -ENOMEM; 229 return -ENOMEM;
@@ -241,44 +238,63 @@ int param_get_charp(char *buffer, struct kernel_param *kp)
241 return sprintf(buffer, "%s", *((char **)kp->arg)); 238 return sprintf(buffer, "%s", *((char **)kp->arg));
242} 239}
243 240
241/* Actually could be a bool or an int, for historical reasons. */
244int param_set_bool(const char *val, struct kernel_param *kp) 242int param_set_bool(const char *val, struct kernel_param *kp)
245{ 243{
244 bool v;
245
246 /* No equals means "set"... */ 246 /* No equals means "set"... */
247 if (!val) val = "1"; 247 if (!val) val = "1";
248 248
249 /* One of =[yYnN01] */ 249 /* One of =[yYnN01] */
250 switch (val[0]) { 250 switch (val[0]) {
251 case 'y': case 'Y': case '1': 251 case 'y': case 'Y': case '1':
252 *(int *)kp->arg = 1; 252 v = true;
253 return 0; 253 break;
254 case 'n': case 'N': case '0': 254 case 'n': case 'N': case '0':
255 *(int *)kp->arg = 0; 255 v = false;
256 return 0; 256 break;
257 default:
258 return -EINVAL;
257 } 259 }
258 return -EINVAL; 260
261 if (kp->flags & KPARAM_ISBOOL)
262 *(bool *)kp->arg = v;
263 else
264 *(int *)kp->arg = v;
265 return 0;
259} 266}
260 267
261int param_get_bool(char *buffer, struct kernel_param *kp) 268int param_get_bool(char *buffer, struct kernel_param *kp)
262{ 269{
270 bool val;
271 if (kp->flags & KPARAM_ISBOOL)
272 val = *(bool *)kp->arg;
273 else
274 val = *(int *)kp->arg;
275
263 /* Y and N chosen as being relatively non-coder friendly */ 276 /* Y and N chosen as being relatively non-coder friendly */
264 return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); 277 return sprintf(buffer, "%c", val ? 'Y' : 'N');
265} 278}
266 279
280/* This one must be bool. */
267int param_set_invbool(const char *val, struct kernel_param *kp) 281int param_set_invbool(const char *val, struct kernel_param *kp)
268{ 282{
269 int boolval, ret; 283 int ret;
284 bool boolval;
270 struct kernel_param dummy; 285 struct kernel_param dummy;
271 286
272 dummy.arg = &boolval; 287 dummy.arg = &boolval;
288 dummy.flags = KPARAM_ISBOOL;
273 ret = param_set_bool(val, &dummy); 289 ret = param_set_bool(val, &dummy);
274 if (ret == 0) 290 if (ret == 0)
275 *(int *)kp->arg = !boolval; 291 *(bool *)kp->arg = !boolval;
276 return ret; 292 return ret;
277} 293}
278 294
279int param_get_invbool(char *buffer, struct kernel_param *kp) 295int param_get_invbool(char *buffer, struct kernel_param *kp)
280{ 296{
281 return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y'); 297 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
282} 298}
283 299
284/* We break the rule and mangle the string. */ 300/* We break the rule and mangle the string. */
@@ -591,7 +607,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
591 unsigned int i; 607 unsigned int i;
592 608
593 for (i = 0; i < num; i++) 609 for (i = 0; i < num; i++)
594 if (params[i].perm & KPARAM_KMALLOCED) 610 if (params[i].flags & KPARAM_KMALLOCED)
595 kfree(*(char **)params[i].arg); 611 kfree(*(char **)params[i].arg);
596} 612}
597 613
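With KPARAM_ISBOOL in place a module can back a bool parameter with an actual bool rather than an int; a minimal sketch with a made-up parameter name, assuming module_param() accepts bool storage in this tree.

/* Hypothetical bool module parameter: param_set_bool() accepts y/Y/1 and
 * n/N/0, and param_get_bool() reports the value as Y or N via sysfs. */
static bool enable_debug;
module_param(enable_debug, bool, 0644);
MODULE_PARM_DESC(enable_debug, "Enable extra debug output (default: N)");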
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..f274e1959885
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,4860 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
46
47/*
48 * perf counter paranoia level:
49 * 0 - not paranoid
50 * 1 - disallow cpu counters to unpriv
51 * 2 - disallow kernel profiling to unpriv
52 */
53int sysctl_perf_counter_paranoid __read_mostly;
54
55static inline bool perf_paranoid_cpu(void)
56{
57 return sysctl_perf_counter_paranoid > 0;
58}
59
60static inline bool perf_paranoid_kernel(void)
61{
62 return sysctl_perf_counter_paranoid > 1;
63}
64
65int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
66
67/*
68 * max perf counter sample rate
69 */
70int sysctl_perf_counter_sample_rate __read_mostly = 100000;
71
72static atomic64_t perf_counter_id;
73
74/*
75 * Lock for (sysadmin-configurable) counter reservations:
76 */
77static DEFINE_SPINLOCK(perf_resource_lock);
78
79/*
80 * Architecture provided APIs - weak aliases:
81 */
82extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
83{
84 return NULL;
85}
86
87void __weak hw_perf_disable(void) { barrier(); }
88void __weak hw_perf_enable(void) { barrier(); }
89
90void __weak hw_perf_counter_setup(int cpu) { barrier(); }
91void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
92
93int __weak
94hw_perf_group_sched_in(struct perf_counter *group_leader,
95 struct perf_cpu_context *cpuctx,
96 struct perf_counter_context *ctx, int cpu)
97{
98 return 0;
99}
100
101void __weak perf_counter_print_debug(void) { }
102
103static DEFINE_PER_CPU(int, disable_count);
104
105void __perf_disable(void)
106{
107 __get_cpu_var(disable_count)++;
108}
109
110bool __perf_enable(void)
111{
112 return !--__get_cpu_var(disable_count);
113}
114
115void perf_disable(void)
116{
117 __perf_disable();
118 hw_perf_disable();
119}
120
121void perf_enable(void)
122{
123 if (__perf_enable())
124 hw_perf_enable();
125}
126
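A short sketch of how the pair above nests through the per-CPU disable_count, which is why callers such as __perf_counter_remove_from_context() can bracket list surgery without knowing whether the PMU is already disabled further up the call chain:

/* Illustration only: nesting behaviour of perf_disable()/perf_enable(). */
static void example_nested_disable(void)
{
	perf_disable();  /* disable_count 0 -> 1: hw_perf_disable() runs  */
	perf_disable();  /* disable_count 1 -> 2: hardware stays disabled */
	/* ... counter lists can be reorganized safely here ... */
	perf_enable();   /* disable_count 2 -> 1: hardware still disabled */
	perf_enable();   /* disable_count 1 -> 0: hw_perf_enable() runs   */
}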
127static void get_ctx(struct perf_counter_context *ctx)
128{
129 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
130}
131
132static void free_ctx(struct rcu_head *head)
133{
134 struct perf_counter_context *ctx;
135
136 ctx = container_of(head, struct perf_counter_context, rcu_head);
137 kfree(ctx);
138}
139
140static void put_ctx(struct perf_counter_context *ctx)
141{
142 if (atomic_dec_and_test(&ctx->refcount)) {
143 if (ctx->parent_ctx)
144 put_ctx(ctx->parent_ctx);
145 if (ctx->task)
146 put_task_struct(ctx->task);
147 call_rcu(&ctx->rcu_head, free_ctx);
148 }
149}
150
151static void unclone_ctx(struct perf_counter_context *ctx)
152{
153 if (ctx->parent_ctx) {
154 put_ctx(ctx->parent_ctx);
155 ctx->parent_ctx = NULL;
156 }
157}
158
159/*
160 * If we inherit counters we want to return the parent counter id
161 * to userspace.
162 */
163static u64 primary_counter_id(struct perf_counter *counter)
164{
165 u64 id = counter->id;
166
167 if (counter->parent)
168 id = counter->parent->id;
169
170 return id;
171}
172
173/*
174 * Get the perf_counter_context for a task and lock it.
 175 * This has to cope with the fact that until it is locked,
176 * the context could get moved to another task.
177 */
178static struct perf_counter_context *
179perf_lock_task_context(struct task_struct *task, unsigned long *flags)
180{
181 struct perf_counter_context *ctx;
182
183 rcu_read_lock();
184 retry:
185 ctx = rcu_dereference(task->perf_counter_ctxp);
186 if (ctx) {
187 /*
188 * If this context is a clone of another, it might
189 * get swapped for another underneath us by
190 * perf_counter_task_sched_out, though the
191 * rcu_read_lock() protects us from any context
192 * getting freed. Lock the context and check if it
193 * got swapped before we could get the lock, and retry
194 * if so. If we locked the right context, then it
195 * can't get swapped on us any more.
196 */
197 spin_lock_irqsave(&ctx->lock, *flags);
198 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
199 spin_unlock_irqrestore(&ctx->lock, *flags);
200 goto retry;
201 }
202
203 if (!atomic_inc_not_zero(&ctx->refcount)) {
204 spin_unlock_irqrestore(&ctx->lock, *flags);
205 ctx = NULL;
206 }
207 }
208 rcu_read_unlock();
209 return ctx;
210}
211
212/*
213 * Get the context for a task and increment its pin_count so it
214 * can't get swapped to another task. This also increments its
215 * reference count so that the context can't get freed.
216 */
217static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
218{
219 struct perf_counter_context *ctx;
220 unsigned long flags;
221
222 ctx = perf_lock_task_context(task, &flags);
223 if (ctx) {
224 ++ctx->pin_count;
225 spin_unlock_irqrestore(&ctx->lock, flags);
226 }
227 return ctx;
228}
229
230static void perf_unpin_context(struct perf_counter_context *ctx)
231{
232 unsigned long flags;
233
234 spin_lock_irqsave(&ctx->lock, flags);
235 --ctx->pin_count;
236 spin_unlock_irqrestore(&ctx->lock, flags);
237 put_ctx(ctx);
238}
239
240/*
 241 * Add a counter to the lists for its context.
242 * Must be called with ctx->mutex and ctx->lock held.
243 */
244static void
245list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
246{
247 struct perf_counter *group_leader = counter->group_leader;
248
249 /*
250 * Depending on whether it is a standalone or sibling counter,
251 * add it straight to the context's counter list, or to the group
252 * leader's sibling list:
253 */
254 if (group_leader == counter)
255 list_add_tail(&counter->list_entry, &ctx->counter_list);
256 else {
257 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
258 group_leader->nr_siblings++;
259 }
260
261 list_add_rcu(&counter->event_entry, &ctx->event_list);
262 ctx->nr_counters++;
263 if (counter->attr.inherit_stat)
264 ctx->nr_stat++;
265}
266
267/*
268 * Remove a counter from the lists for its context.
269 * Must be called with ctx->mutex and ctx->lock held.
270 */
271static void
272list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
273{
274 struct perf_counter *sibling, *tmp;
275
276 if (list_empty(&counter->list_entry))
277 return;
278 ctx->nr_counters--;
279 if (counter->attr.inherit_stat)
280 ctx->nr_stat--;
281
282 list_del_init(&counter->list_entry);
283 list_del_rcu(&counter->event_entry);
284
285 if (counter->group_leader != counter)
286 counter->group_leader->nr_siblings--;
287
288 /*
289 * If this was a group counter with sibling counters then
290 * upgrade the siblings to singleton counters by adding them
291 * to the context list directly:
292 */
293 list_for_each_entry_safe(sibling, tmp,
294 &counter->sibling_list, list_entry) {
295
296 list_move_tail(&sibling->list_entry, &ctx->counter_list);
297 sibling->group_leader = sibling;
298 }
299}
300
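An informal picture of the structure the two list helpers above maintain (counter names are illustrative):

/*
 * ctx->counter_list:  [leader A] -> [counter B] -> [leader C]
 *                        |                            |
 *  A->sibling_list:   [A1] -> [A2]   C->sibling_list: [C1]
 *
 * list_del_counter(A, ctx) unlinks A and then moves A1 and A2 onto
 * ctx->counter_list as standalone counters (each becomes its own
 * group_leader), so the group's members are not silently lost.
 */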
301static void
302counter_sched_out(struct perf_counter *counter,
303 struct perf_cpu_context *cpuctx,
304 struct perf_counter_context *ctx)
305{
306 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
307 return;
308
309 counter->state = PERF_COUNTER_STATE_INACTIVE;
310 if (counter->pending_disable) {
311 counter->pending_disable = 0;
312 counter->state = PERF_COUNTER_STATE_OFF;
313 }
314 counter->tstamp_stopped = ctx->time;
315 counter->pmu->disable(counter);
316 counter->oncpu = -1;
317
318 if (!is_software_counter(counter))
319 cpuctx->active_oncpu--;
320 ctx->nr_active--;
321 if (counter->attr.exclusive || !cpuctx->active_oncpu)
322 cpuctx->exclusive = 0;
323}
324
325static void
326group_sched_out(struct perf_counter *group_counter,
327 struct perf_cpu_context *cpuctx,
328 struct perf_counter_context *ctx)
329{
330 struct perf_counter *counter;
331
332 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
333 return;
334
335 counter_sched_out(group_counter, cpuctx, ctx);
336
337 /*
338 * Schedule out siblings (if any):
339 */
340 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
341 counter_sched_out(counter, cpuctx, ctx);
342
343 if (group_counter->attr.exclusive)
344 cpuctx->exclusive = 0;
345}
346
347/*
348 * Cross CPU call to remove a performance counter
349 *
350 * We disable the counter on the hardware level first. After that we
351 * remove it from the context list.
352 */
353static void __perf_counter_remove_from_context(void *info)
354{
355 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
356 struct perf_counter *counter = info;
357 struct perf_counter_context *ctx = counter->ctx;
358
359 /*
360 * If this is a task context, we need to check whether it is
361 * the current task context of this cpu. If not it has been
362 * scheduled out before the smp call arrived.
363 */
364 if (ctx->task && cpuctx->task_ctx != ctx)
365 return;
366
367 spin_lock(&ctx->lock);
368 /*
369 * Protect the list operation against NMI by disabling the
370 * counters on a global level.
371 */
372 perf_disable();
373
374 counter_sched_out(counter, cpuctx, ctx);
375
376 list_del_counter(counter, ctx);
377
378 if (!ctx->task) {
379 /*
380 * Allow more per task counters with respect to the
381 * reservation:
382 */
383 cpuctx->max_pertask =
384 min(perf_max_counters - ctx->nr_counters,
385 perf_max_counters - perf_reserved_percpu);
386 }
387
388 perf_enable();
389 spin_unlock(&ctx->lock);
390}
391
392
393/*
394 * Remove the counter from a task's (or a CPU's) list of counters.
395 *
396 * Must be called with ctx->mutex held.
397 *
398 * CPU counters are removed with a smp call. For task counters we only
399 * call when the task is on a CPU.
400 *
401 * If counter->ctx is a cloned context, callers must make sure that
402 * every task struct that counter->ctx->task could possibly point to
403 * remains valid. This is OK when called from perf_release since
404 * that only calls us on the top-level context, which can't be a clone.
405 * When called from perf_counter_exit_task, it's OK because the
406 * context has been detached from its task.
407 */
408static void perf_counter_remove_from_context(struct perf_counter *counter)
409{
410 struct perf_counter_context *ctx = counter->ctx;
411 struct task_struct *task = ctx->task;
412
413 if (!task) {
414 /*
415 * Per cpu counters are removed via an smp call and
 416 * the removal is always successful.
417 */
418 smp_call_function_single(counter->cpu,
419 __perf_counter_remove_from_context,
420 counter, 1);
421 return;
422 }
423
424retry:
425 task_oncpu_function_call(task, __perf_counter_remove_from_context,
426 counter);
427
428 spin_lock_irq(&ctx->lock);
429 /*
430 * If the context is active we need to retry the smp call.
431 */
432 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
433 spin_unlock_irq(&ctx->lock);
434 goto retry;
435 }
436
437 /*
 438 * The lock prevents this context from being scheduled in, so we
 439 * can remove the counter safely if the call above did not
440 * succeed.
441 */
442 if (!list_empty(&counter->list_entry)) {
443 list_del_counter(counter, ctx);
444 }
445 spin_unlock_irq(&ctx->lock);
446}
447
448static inline u64 perf_clock(void)
449{
450 return cpu_clock(smp_processor_id());
451}
452
453/*
454 * Update the record of the current time in a context.
455 */
456static void update_context_time(struct perf_counter_context *ctx)
457{
458 u64 now = perf_clock();
459
460 ctx->time += now - ctx->timestamp;
461 ctx->timestamp = now;
462}
463
464/*
465 * Update the total_time_enabled and total_time_running fields for a counter.
466 */
467static void update_counter_times(struct perf_counter *counter)
468{
469 struct perf_counter_context *ctx = counter->ctx;
470 u64 run_end;
471
472 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
473 return;
474
475 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
476
477 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
478 run_end = counter->tstamp_stopped;
479 else
480 run_end = ctx->time;
481
482 counter->total_time_running = run_end - counter->tstamp_running;
483}
484
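A worked example of this bookkeeping, with made-up timestamps:

/*
 * Made-up timeline for one counter (values in ctx->time units):
 *
 *   t=100  added to the context     -> tstamp_enabled = tstamp_running
 *                                      = tstamp_stopped = 100
 *   t=120  scheduled in             -> tstamp_running += 120 - 100 = 120
 *   t=170  scheduled out (INACTIVE) -> tstamp_stopped = 170
 *   t=200  update_counter_times()   -> total_time_enabled = 200 - 100 = 100
 *                                      total_time_running = 170 - 120 = 50
 *
 * i.e. the counter was enabled for 100 units but only on the PMU for 50.
 */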
485/*
486 * Update total_time_enabled and total_time_running for all counters in a group.
487 */
488static void update_group_times(struct perf_counter *leader)
489{
490 struct perf_counter *counter;
491
492 update_counter_times(leader);
493 list_for_each_entry(counter, &leader->sibling_list, list_entry)
494 update_counter_times(counter);
495}
496
497/*
498 * Cross CPU call to disable a performance counter
499 */
500static void __perf_counter_disable(void *info)
501{
502 struct perf_counter *counter = info;
503 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
504 struct perf_counter_context *ctx = counter->ctx;
505
506 /*
507 * If this is a per-task counter, need to check whether this
508 * counter's task is the current task on this cpu.
509 */
510 if (ctx->task && cpuctx->task_ctx != ctx)
511 return;
512
513 spin_lock(&ctx->lock);
514
515 /*
516 * If the counter is on, turn it off.
517 * If it is in error state, leave it in error state.
518 */
519 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
520 update_context_time(ctx);
521 update_counter_times(counter);
522 if (counter == counter->group_leader)
523 group_sched_out(counter, cpuctx, ctx);
524 else
525 counter_sched_out(counter, cpuctx, ctx);
526 counter->state = PERF_COUNTER_STATE_OFF;
527 }
528
529 spin_unlock(&ctx->lock);
530}
531
532/*
533 * Disable a counter.
534 *
535 * If counter->ctx is a cloned context, callers must make sure that
536 * every task struct that counter->ctx->task could possibly point to
 537 * remains valid. This condition is satisfied when called through
538 * perf_counter_for_each_child or perf_counter_for_each because they
539 * hold the top-level counter's child_mutex, so any descendant that
540 * goes to exit will block in sync_child_counter.
541 * When called from perf_pending_counter it's OK because counter->ctx
542 * is the current context on this CPU and preemption is disabled,
543 * hence we can't get into perf_counter_task_sched_out for this context.
544 */
545static void perf_counter_disable(struct perf_counter *counter)
546{
547 struct perf_counter_context *ctx = counter->ctx;
548 struct task_struct *task = ctx->task;
549
550 if (!task) {
551 /*
552 * Disable the counter on the cpu that it's on
553 */
554 smp_call_function_single(counter->cpu, __perf_counter_disable,
555 counter, 1);
556 return;
557 }
558
559 retry:
560 task_oncpu_function_call(task, __perf_counter_disable, counter);
561
562 spin_lock_irq(&ctx->lock);
563 /*
564 * If the counter is still active, we need to retry the cross-call.
565 */
566 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
567 spin_unlock_irq(&ctx->lock);
568 goto retry;
569 }
570
571 /*
572 * Since we have the lock this context can't be scheduled
573 * in, so we can change the state safely.
574 */
575 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
576 update_counter_times(counter);
577 counter->state = PERF_COUNTER_STATE_OFF;
578 }
579
580 spin_unlock_irq(&ctx->lock);
581}
582
583static int
584counter_sched_in(struct perf_counter *counter,
585 struct perf_cpu_context *cpuctx,
586 struct perf_counter_context *ctx,
587 int cpu)
588{
589 if (counter->state <= PERF_COUNTER_STATE_OFF)
590 return 0;
591
592 counter->state = PERF_COUNTER_STATE_ACTIVE;
593 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
594 /*
595 * The new state must be visible before we turn it on in the hardware:
596 */
597 smp_wmb();
598
599 if (counter->pmu->enable(counter)) {
600 counter->state = PERF_COUNTER_STATE_INACTIVE;
601 counter->oncpu = -1;
602 return -EAGAIN;
603 }
604
605 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
606
607 if (!is_software_counter(counter))
608 cpuctx->active_oncpu++;
609 ctx->nr_active++;
610
611 if (counter->attr.exclusive)
612 cpuctx->exclusive = 1;
613
614 return 0;
615}
616
617static int
618group_sched_in(struct perf_counter *group_counter,
619 struct perf_cpu_context *cpuctx,
620 struct perf_counter_context *ctx,
621 int cpu)
622{
623 struct perf_counter *counter, *partial_group;
624 int ret;
625
626 if (group_counter->state == PERF_COUNTER_STATE_OFF)
627 return 0;
628
629 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
630 if (ret)
631 return ret < 0 ? ret : 0;
632
633 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
634 return -EAGAIN;
635
636 /*
637 * Schedule in siblings as one group (if any):
638 */
639 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
640 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
641 partial_group = counter;
642 goto group_error;
643 }
644 }
645
646 return 0;
647
648group_error:
649 /*
650 * Groups can be scheduled in as one unit only, so undo any
651 * partial group before returning:
652 */
653 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
654 if (counter == partial_group)
655 break;
656 counter_sched_out(counter, cpuctx, ctx);
657 }
658 counter_sched_out(group_counter, cpuctx, ctx);
659
660 return -EAGAIN;
661}
662
663/*
664 * Return 1 for a group consisting entirely of software counters,
665 * 0 if the group contains any hardware counters.
666 */
667static int is_software_only_group(struct perf_counter *leader)
668{
669 struct perf_counter *counter;
670
671 if (!is_software_counter(leader))
672 return 0;
673
674 list_for_each_entry(counter, &leader->sibling_list, list_entry)
675 if (!is_software_counter(counter))
676 return 0;
677
678 return 1;
679}
680
681/*
682 * Work out whether we can put this counter group on the CPU now.
683 */
684static int group_can_go_on(struct perf_counter *counter,
685 struct perf_cpu_context *cpuctx,
686 int can_add_hw)
687{
688 /*
689 * Groups consisting entirely of software counters can always go on.
690 */
691 if (is_software_only_group(counter))
692 return 1;
693 /*
694 * If an exclusive group is already on, no other hardware
695 * counters can go on.
696 */
697 if (cpuctx->exclusive)
698 return 0;
699 /*
700 * If this group is exclusive and there are already
701 * counters on the CPU, it can't go on.
702 */
703 if (counter->attr.exclusive && cpuctx->active_oncpu)
704 return 0;
705 /*
706 * Otherwise, try to add it if all previous groups were able
707 * to go on.
708 */
709 return can_add_hw;
710}
711
712static void add_counter_to_ctx(struct perf_counter *counter,
713 struct perf_counter_context *ctx)
714{
715 list_add_counter(counter, ctx);
716 counter->tstamp_enabled = ctx->time;
717 counter->tstamp_running = ctx->time;
718 counter->tstamp_stopped = ctx->time;
719}
720
721/*
722 * Cross CPU call to install and enable a performance counter
723 *
724 * Must be called with ctx->mutex held
725 */
726static void __perf_install_in_context(void *info)
727{
728 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
729 struct perf_counter *counter = info;
730 struct perf_counter_context *ctx = counter->ctx;
731 struct perf_counter *leader = counter->group_leader;
732 int cpu = smp_processor_id();
733 int err;
734
735 /*
736 * If this is a task context, we need to check whether it is
737 * the current task context of this cpu. If not it has been
738 * scheduled out before the smp call arrived.
739 * Or possibly this is the right context but it isn't
740 * on this cpu because it had no counters.
741 */
742 if (ctx->task && cpuctx->task_ctx != ctx) {
743 if (cpuctx->task_ctx || ctx->task != current)
744 return;
745 cpuctx->task_ctx = ctx;
746 }
747
748 spin_lock(&ctx->lock);
749 ctx->is_active = 1;
750 update_context_time(ctx);
751
752 /*
753 * Protect the list operation against NMI by disabling the
754 * counters on a global level. NOP for non NMI based counters.
755 */
756 perf_disable();
757
758 add_counter_to_ctx(counter, ctx);
759
760 /*
761 * Don't put the counter on if it is disabled or if
762 * it is in a group and the group isn't on.
763 */
764 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
765 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
766 goto unlock;
767
768 /*
769 * An exclusive counter can't go on if there are already active
770 * hardware counters, and no hardware counter can go on if there
771 * is already an exclusive counter on.
772 */
773 if (!group_can_go_on(counter, cpuctx, 1))
774 err = -EEXIST;
775 else
776 err = counter_sched_in(counter, cpuctx, ctx, cpu);
777
778 if (err) {
779 /*
780 * This counter couldn't go on. If it is in a group
781 * then we have to pull the whole group off.
782 * If the counter group is pinned then put it in error state.
783 */
784 if (leader != counter)
785 group_sched_out(leader, cpuctx, ctx);
786 if (leader->attr.pinned) {
787 update_group_times(leader);
788 leader->state = PERF_COUNTER_STATE_ERROR;
789 }
790 }
791
792 if (!err && !ctx->task && cpuctx->max_pertask)
793 cpuctx->max_pertask--;
794
795 unlock:
796 perf_enable();
797
798 spin_unlock(&ctx->lock);
799}
800
801/*
802 * Attach a performance counter to a context
803 *
804 * First we add the counter to the list with the hardware enable bit
805 * in counter->hw_config cleared.
806 *
807 * If the counter is attached to a task which is on a CPU we use a smp
808 * call to enable it in the task context. The task might have been
809 * scheduled away, but we check this in the smp call again.
810 *
811 * Must be called with ctx->mutex held.
812 */
813static void
814perf_install_in_context(struct perf_counter_context *ctx,
815 struct perf_counter *counter,
816 int cpu)
817{
818 struct task_struct *task = ctx->task;
819
820 if (!task) {
821 /*
822 * Per cpu counters are installed via an smp call and
 823 * the install is always successful.
824 */
825 smp_call_function_single(cpu, __perf_install_in_context,
826 counter, 1);
827 return;
828 }
829
830retry:
831 task_oncpu_function_call(task, __perf_install_in_context,
832 counter);
833
834 spin_lock_irq(&ctx->lock);
835 /*
836 * we need to retry the smp call.
837 */
838 if (ctx->is_active && list_empty(&counter->list_entry)) {
839 spin_unlock_irq(&ctx->lock);
840 goto retry;
841 }
842
843 /*
 844 * The lock prevents this context from being scheduled in, so we
 845 * can add the counter safely if the call above did not
846 * succeed.
847 */
848 if (list_empty(&counter->list_entry))
849 add_counter_to_ctx(counter, ctx);
850 spin_unlock_irq(&ctx->lock);
851}
852
853/*
854 * Cross CPU call to enable a performance counter
855 */
856static void __perf_counter_enable(void *info)
857{
858 struct perf_counter *counter = info;
859 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
860 struct perf_counter_context *ctx = counter->ctx;
861 struct perf_counter *leader = counter->group_leader;
862 int err;
863
864 /*
865 * If this is a per-task counter, need to check whether this
866 * counter's task is the current task on this cpu.
867 */
868 if (ctx->task && cpuctx->task_ctx != ctx) {
869 if (cpuctx->task_ctx || ctx->task != current)
870 return;
871 cpuctx->task_ctx = ctx;
872 }
873
874 spin_lock(&ctx->lock);
875 ctx->is_active = 1;
876 update_context_time(ctx);
877
878 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
879 goto unlock;
880 counter->state = PERF_COUNTER_STATE_INACTIVE;
881 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
882
883 /*
884 * If the counter is in a group and isn't the group leader,
885 * then don't put it on unless the group is on.
886 */
887 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
888 goto unlock;
889
890 if (!group_can_go_on(counter, cpuctx, 1)) {
891 err = -EEXIST;
892 } else {
893 perf_disable();
894 if (counter == leader)
895 err = group_sched_in(counter, cpuctx, ctx,
896 smp_processor_id());
897 else
898 err = counter_sched_in(counter, cpuctx, ctx,
899 smp_processor_id());
900 perf_enable();
901 }
902
903 if (err) {
904 /*
905 * If this counter can't go on and it's part of a
906 * group, then the whole group has to come off.
907 */
908 if (leader != counter)
909 group_sched_out(leader, cpuctx, ctx);
910 if (leader->attr.pinned) {
911 update_group_times(leader);
912 leader->state = PERF_COUNTER_STATE_ERROR;
913 }
914 }
915
916 unlock:
917 spin_unlock(&ctx->lock);
918}
919
920/*
921 * Enable a counter.
922 *
923 * If counter->ctx is a cloned context, callers must make sure that
924 * every task struct that counter->ctx->task could possibly point to
925 * remains valid. This condition is satisfied when called through
926 * perf_counter_for_each_child or perf_counter_for_each as described
927 * for perf_counter_disable.
928 */
929static void perf_counter_enable(struct perf_counter *counter)
930{
931 struct perf_counter_context *ctx = counter->ctx;
932 struct task_struct *task = ctx->task;
933
934 if (!task) {
935 /*
936 * Enable the counter on the cpu that it's on
937 */
938 smp_call_function_single(counter->cpu, __perf_counter_enable,
939 counter, 1);
940 return;
941 }
942
943 spin_lock_irq(&ctx->lock);
944 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
945 goto out;
946
947 /*
948 * If the counter is in error state, clear that first.
949 * That way, if we see the counter in error state below, we
950 * know that it has gone back into error state, as distinct
951 * from the task having been scheduled away before the
952 * cross-call arrived.
953 */
954 if (counter->state == PERF_COUNTER_STATE_ERROR)
955 counter->state = PERF_COUNTER_STATE_OFF;
956
957 retry:
958 spin_unlock_irq(&ctx->lock);
959 task_oncpu_function_call(task, __perf_counter_enable, counter);
960
961 spin_lock_irq(&ctx->lock);
962
963 /*
964 * If the context is active and the counter is still off,
965 * we need to retry the cross-call.
966 */
967 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
968 goto retry;
969
970 /*
971 * Since we have the lock this context can't be scheduled
972 * in, so we can change the state safely.
973 */
974 if (counter->state == PERF_COUNTER_STATE_OFF) {
975 counter->state = PERF_COUNTER_STATE_INACTIVE;
976 counter->tstamp_enabled =
977 ctx->time - counter->total_time_enabled;
978 }
979 out:
980 spin_unlock_irq(&ctx->lock);
981}
982
983static int perf_counter_refresh(struct perf_counter *counter, int refresh)
984{
985 /*
986 * not supported on inherited counters
987 */
988 if (counter->attr.inherit)
989 return -EINVAL;
990
991 atomic_add(refresh, &counter->event_limit);
992 perf_counter_enable(counter);
993
994 return 0;
995}
996
997void __perf_counter_sched_out(struct perf_counter_context *ctx,
998 struct perf_cpu_context *cpuctx)
999{
1000 struct perf_counter *counter;
1001
1002 spin_lock(&ctx->lock);
1003 ctx->is_active = 0;
1004 if (likely(!ctx->nr_counters))
1005 goto out;
1006 update_context_time(ctx);
1007
1008 perf_disable();
1009 if (ctx->nr_active) {
1010 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1011 if (counter != counter->group_leader)
1012 counter_sched_out(counter, cpuctx, ctx);
1013 else
1014 group_sched_out(counter, cpuctx, ctx);
1015 }
1016 }
1017 perf_enable();
1018 out:
1019 spin_unlock(&ctx->lock);
1020}
1021
1022/*
1023 * Test whether two contexts are equivalent, i.e. whether they
1024 * have both been cloned from the same version of the same context
1025 * and they both have the same number of enabled counters.
1026 * If the number of enabled counters is the same, then the set
1027 * of enabled counters should be the same, because these are both
1028 * inherited contexts, therefore we can't access individual counters
1029 * in them directly with an fd; we can only enable/disable all
1030 * counters via prctl, or enable/disable all counters in a family
1031 * via ioctl, which will have the same effect on both contexts.
1032 */
1033static int context_equiv(struct perf_counter_context *ctx1,
1034 struct perf_counter_context *ctx2)
1035{
1036 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1037 && ctx1->parent_gen == ctx2->parent_gen
1038 && !ctx1->pin_count && !ctx2->pin_count;
1039}
1040
1041static void __perf_counter_read(void *counter);
1042
1043static void __perf_counter_sync_stat(struct perf_counter *counter,
1044 struct perf_counter *next_counter)
1045{
1046 u64 value;
1047
1048 if (!counter->attr.inherit_stat)
1049 return;
1050
1051 /*
1052 * Update the counter value, we cannot use perf_counter_read()
1053 * because we're in the middle of a context switch and have IRQs
1054 * disabled, which upsets smp_call_function_single(), however
1055 * we know the counter must be on the current CPU, therefore we
1056 * don't need to use it.
1057 */
1058 switch (counter->state) {
1059 case PERF_COUNTER_STATE_ACTIVE:
1060 __perf_counter_read(counter);
1061 break;
1062
1063 case PERF_COUNTER_STATE_INACTIVE:
1064 update_counter_times(counter);
1065 break;
1066
1067 default:
1068 break;
1069 }
1070
1071 /*
1072 * In order to keep per-task stats reliable we need to flip the counter
1073 * values when we flip the contexts.
1074 */
1075 value = atomic64_read(&next_counter->count);
1076 value = atomic64_xchg(&counter->count, value);
1077 atomic64_set(&next_counter->count, value);
1078
1079 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1080 swap(counter->total_time_running, next_counter->total_time_running);
1081
1082 /*
1083 * Since we swizzled the values, update the user visible data too.
1084 */
1085 perf_counter_update_userpage(counter);
1086 perf_counter_update_userpage(next_counter);
1087}
1088
1089#define list_next_entry(pos, member) \
1090 list_entry(pos->member.next, typeof(*pos), member)
1091
1092static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1093 struct perf_counter_context *next_ctx)
1094{
1095 struct perf_counter *counter, *next_counter;
1096
1097 if (!ctx->nr_stat)
1098 return;
1099
1100 counter = list_first_entry(&ctx->event_list,
1101 struct perf_counter, event_entry);
1102
1103 next_counter = list_first_entry(&next_ctx->event_list,
1104 struct perf_counter, event_entry);
1105
1106 while (&counter->event_entry != &ctx->event_list &&
1107 &next_counter->event_entry != &next_ctx->event_list) {
1108
1109 __perf_counter_sync_stat(counter, next_counter);
1110
1111 counter = list_next_entry(counter, event_entry);
1112 next_counter = list_next_entry(next_counter, event_entry);
1113 }
1114}
1115
1116/*
1117 * Called from scheduler to remove the counters of the current task,
1118 * with interrupts disabled.
1119 *
1120 * We stop each counter and update the counter value in counter->count.
1121 *
1122 * This does not protect us against NMI, but disable()
1123 * sets the disabled bit in the control field of counter _before_
 1124 * accessing the counter control register. If an NMI hits, then it will
1125 * not restart the counter.
1126 */
1127void perf_counter_task_sched_out(struct task_struct *task,
1128 struct task_struct *next, int cpu)
1129{
1130 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1131 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1132 struct perf_counter_context *next_ctx;
1133 struct perf_counter_context *parent;
1134 struct pt_regs *regs;
1135 int do_switch = 1;
1136
1137 regs = task_pt_regs(task);
1138 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1139
1140 if (likely(!ctx || !cpuctx->task_ctx))
1141 return;
1142
1143 update_context_time(ctx);
1144
1145 rcu_read_lock();
1146 parent = rcu_dereference(ctx->parent_ctx);
1147 next_ctx = next->perf_counter_ctxp;
1148 if (parent && next_ctx &&
1149 rcu_dereference(next_ctx->parent_ctx) == parent) {
1150 /*
1151 * Looks like the two contexts are clones, so we might be
1152 * able to optimize the context switch. We lock both
1153 * contexts and check that they are clones under the
1154 * lock (including re-checking that neither has been
1155 * uncloned in the meantime). It doesn't matter which
1156 * order we take the locks because no other cpu could
1157 * be trying to lock both of these tasks.
1158 */
1159 spin_lock(&ctx->lock);
1160 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1161 if (context_equiv(ctx, next_ctx)) {
1162 /*
1163 * XXX do we need a memory barrier of sorts
1164 * wrt to rcu_dereference() of perf_counter_ctxp
1165 */
1166 task->perf_counter_ctxp = next_ctx;
1167 next->perf_counter_ctxp = ctx;
1168 ctx->task = next;
1169 next_ctx->task = task;
1170 do_switch = 0;
1171
1172 perf_counter_sync_stat(ctx, next_ctx);
1173 }
1174 spin_unlock(&next_ctx->lock);
1175 spin_unlock(&ctx->lock);
1176 }
1177 rcu_read_unlock();
1178
1179 if (do_switch) {
1180 __perf_counter_sched_out(ctx, cpuctx);
1181 cpuctx->task_ctx = NULL;
1182 }
1183}
1184
1185/*
1186 * Called with IRQs disabled
1187 */
1188static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1189{
1190 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1191
1192 if (!cpuctx->task_ctx)
1193 return;
1194
1195 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1196 return;
1197
1198 __perf_counter_sched_out(ctx, cpuctx);
1199 cpuctx->task_ctx = NULL;
1200}
1201
1202/*
1203 * Called with IRQs disabled
1204 */
1205static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1206{
1207 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1208}
1209
1210static void
1211__perf_counter_sched_in(struct perf_counter_context *ctx,
1212 struct perf_cpu_context *cpuctx, int cpu)
1213{
1214 struct perf_counter *counter;
1215 int can_add_hw = 1;
1216
1217 spin_lock(&ctx->lock);
1218 ctx->is_active = 1;
1219 if (likely(!ctx->nr_counters))
1220 goto out;
1221
1222 ctx->timestamp = perf_clock();
1223
1224 perf_disable();
1225
1226 /*
1227 * First go through the list and put on any pinned groups
1228 * in order to give them the best chance of going on.
1229 */
1230 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1231 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1232 !counter->attr.pinned)
1233 continue;
1234 if (counter->cpu != -1 && counter->cpu != cpu)
1235 continue;
1236
1237 if (counter != counter->group_leader)
1238 counter_sched_in(counter, cpuctx, ctx, cpu);
1239 else {
1240 if (group_can_go_on(counter, cpuctx, 1))
1241 group_sched_in(counter, cpuctx, ctx, cpu);
1242 }
1243
1244 /*
1245 * If this pinned group hasn't been scheduled,
1246 * put it in error state.
1247 */
1248 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1249 update_group_times(counter);
1250 counter->state = PERF_COUNTER_STATE_ERROR;
1251 }
1252 }
1253
1254 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1255 /*
1256 * Ignore counters in OFF or ERROR state, and
1257 * ignore pinned counters since we did them already.
1258 */
1259 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1260 counter->attr.pinned)
1261 continue;
1262
1263 /*
1264 * Listen to the 'cpu' scheduling filter constraint
1265 * of counters:
1266 */
1267 if (counter->cpu != -1 && counter->cpu != cpu)
1268 continue;
1269
1270 if (counter != counter->group_leader) {
1271 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1272 can_add_hw = 0;
1273 } else {
1274 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1275 if (group_sched_in(counter, cpuctx, ctx, cpu))
1276 can_add_hw = 0;
1277 }
1278 }
1279 }
1280 perf_enable();
1281 out:
1282 spin_unlock(&ctx->lock);
1283}
1284
1285/*
1286 * Called from scheduler to add the counters of the current task
1287 * with interrupts disabled.
1288 *
1289 * We restore the counter value and then enable it.
1290 *
1291 * This does not protect us against NMI, but enable()
1292 * sets the enabled bit in the control field of counter _before_
 1293 * accessing the counter control register. If an NMI hits, then it will
1294 * keep the counter running.
1295 */
1296void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1297{
1298 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1299 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1300
1301 if (likely(!ctx))
1302 return;
1303 if (cpuctx->task_ctx == ctx)
1304 return;
1305 __perf_counter_sched_in(ctx, cpuctx, cpu);
1306 cpuctx->task_ctx = ctx;
1307}
1308
1309static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1310{
1311 struct perf_counter_context *ctx = &cpuctx->ctx;
1312
1313 __perf_counter_sched_in(ctx, cpuctx, cpu);
1314}
1315
1316#define MAX_INTERRUPTS (~0ULL)
1317
1318static void perf_log_throttle(struct perf_counter *counter, int enable);
1319
1320static void perf_adjust_period(struct perf_counter *counter, u64 events)
1321{
1322 struct hw_perf_counter *hwc = &counter->hw;
1323 u64 period, sample_period;
1324 s64 delta;
1325
1326 events *= hwc->sample_period;
1327 period = div64_u64(events, counter->attr.sample_freq);
1328
1329 delta = (s64)(period - hwc->sample_period);
1330 delta = (delta + 7) / 8; /* low pass filter */
1331
1332 sample_period = hwc->sample_period + delta;
1333
1334 if (!sample_period)
1335 sample_period = 1;
1336
1337 hwc->sample_period = sample_period;
1338}
1339
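A worked example of the adjustment above, using made-up numbers:

/*
 * Example: attr.sample_freq = 1000 Hz, current sample_period = 100000,
 * and the counter actually fired about 2000 times in the last second,
 * so perf_ctx_adjust_freq() calls perf_adjust_period(counter, 2000):
 *
 *   events        = 2000 * 100000              = 200000000
 *   period        = 200000000 / 1000           = 200000  (ideal period)
 *   delta         = (200000 - 100000 + 7) / 8  = 12500   (low-pass filter)
 *   sample_period = 100000 + 12500             = 112500
 *
 * so each tick moves the period roughly 1/8th of the way toward the
 * value that would produce the requested sampling frequency.
 */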
1340static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1341{
1342 struct perf_counter *counter;
1343 struct hw_perf_counter *hwc;
1344 u64 interrupts, freq;
1345
1346 spin_lock(&ctx->lock);
1347 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1348 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1349 continue;
1350
1351 hwc = &counter->hw;
1352
1353 interrupts = hwc->interrupts;
1354 hwc->interrupts = 0;
1355
1356 /*
1357 * unthrottle counters on the tick
1358 */
1359 if (interrupts == MAX_INTERRUPTS) {
1360 perf_log_throttle(counter, 1);
1361 counter->pmu->unthrottle(counter);
1362 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1363 }
1364
1365 if (!counter->attr.freq || !counter->attr.sample_freq)
1366 continue;
1367
1368 /*
1369 * if the specified freq < HZ then we need to skip ticks
1370 */
1371 if (counter->attr.sample_freq < HZ) {
1372 freq = counter->attr.sample_freq;
1373
1374 hwc->freq_count += freq;
1375 hwc->freq_interrupts += interrupts;
1376
1377 if (hwc->freq_count < HZ)
1378 continue;
1379
1380 interrupts = hwc->freq_interrupts;
1381 hwc->freq_interrupts = 0;
1382 hwc->freq_count -= HZ;
1383 } else
1384 freq = HZ;
1385
1386 perf_adjust_period(counter, freq * interrupts);
1387
1388 /*
1389 * In order to avoid being stalled by an (accidental) huge
1390 * sample period, force reset the sample period if we didn't
1391 * get any events in this freq period.
1392 */
1393 if (!interrupts) {
1394 perf_disable();
1395 counter->pmu->disable(counter);
1396 atomic64_set(&hwc->period_left, 0);
1397 counter->pmu->enable(counter);
1398 perf_enable();
1399 }
1400 }
1401 spin_unlock(&ctx->lock);
1402}
1403
1404/*
1405 * Round-robin a context's counters:
1406 */
1407static void rotate_ctx(struct perf_counter_context *ctx)
1408{
1409 struct perf_counter *counter;
1410
1411 if (!ctx->nr_counters)
1412 return;
1413
1414 spin_lock(&ctx->lock);
1415 /*
1416 * Rotate the first entry last (works just fine for group counters too):
1417 */
1418 perf_disable();
1419 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1420 list_move_tail(&counter->list_entry, &ctx->counter_list);
1421 break;
1422 }
1423 perf_enable();
1424
1425 spin_unlock(&ctx->lock);
1426}
1427
1428void perf_counter_task_tick(struct task_struct *curr, int cpu)
1429{
1430 struct perf_cpu_context *cpuctx;
1431 struct perf_counter_context *ctx;
1432
1433 if (!atomic_read(&nr_counters))
1434 return;
1435
1436 cpuctx = &per_cpu(perf_cpu_context, cpu);
1437 ctx = curr->perf_counter_ctxp;
1438
1439 perf_ctx_adjust_freq(&cpuctx->ctx);
1440 if (ctx)
1441 perf_ctx_adjust_freq(ctx);
1442
1443 perf_counter_cpu_sched_out(cpuctx);
1444 if (ctx)
1445 __perf_counter_task_sched_out(ctx);
1446
1447 rotate_ctx(&cpuctx->ctx);
1448 if (ctx)
1449 rotate_ctx(ctx);
1450
1451 perf_counter_cpu_sched_in(cpuctx, cpu);
1452 if (ctx)
1453 perf_counter_task_sched_in(curr, cpu);
1454}
1455
1456/*
1457 * Enable all of a task's counters that have been marked enable-on-exec.
1458 * This expects task == current.
1459 */
1460static void perf_counter_enable_on_exec(struct task_struct *task)
1461{
1462 struct perf_counter_context *ctx;
1463 struct perf_counter *counter;
1464 unsigned long flags;
1465 int enabled = 0;
1466
1467 local_irq_save(flags);
1468 ctx = task->perf_counter_ctxp;
1469 if (!ctx || !ctx->nr_counters)
1470 goto out;
1471
1472 __perf_counter_task_sched_out(ctx);
1473
1474 spin_lock(&ctx->lock);
1475
1476 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1477 if (!counter->attr.enable_on_exec)
1478 continue;
1479 counter->attr.enable_on_exec = 0;
1480 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1481 continue;
1482 counter->state = PERF_COUNTER_STATE_INACTIVE;
1483 counter->tstamp_enabled =
1484 ctx->time - counter->total_time_enabled;
1485 enabled = 1;
1486 }
1487
1488 /*
1489 * Unclone this context if we enabled any counter.
1490 */
1491 if (enabled)
1492 unclone_ctx(ctx);
1493
1494 spin_unlock(&ctx->lock);
1495
1496 perf_counter_task_sched_in(task, smp_processor_id());
1497 out:
1498 local_irq_restore(flags);
1499}
1500
1501/*
1502 * Cross CPU call to read the hardware counter
1503 */
1504static void __perf_counter_read(void *info)
1505{
1506 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1507 struct perf_counter *counter = info;
1508 struct perf_counter_context *ctx = counter->ctx;
1509 unsigned long flags;
1510
1511 /*
1512 * If this is a task context, we need to check whether it is
1513 * the current task context of this cpu. If not it has been
1514 * scheduled out before the smp call arrived. In that case
1515 * counter->count would have been updated to a recent sample
1516 * when the counter was scheduled out.
1517 */
1518 if (ctx->task && cpuctx->task_ctx != ctx)
1519 return;
1520
1521 local_irq_save(flags);
1522 if (ctx->is_active)
1523 update_context_time(ctx);
1524 counter->pmu->read(counter);
1525 update_counter_times(counter);
1526 local_irq_restore(flags);
1527}
1528
1529static u64 perf_counter_read(struct perf_counter *counter)
1530{
1531 /*
1532 * If counter is enabled and currently active on a CPU, update the
1533 * value in the counter structure:
1534 */
1535 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1536 smp_call_function_single(counter->oncpu,
1537 __perf_counter_read, counter, 1);
1538 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1539 update_counter_times(counter);
1540 }
1541
1542 return atomic64_read(&counter->count);
1543}
1544
1545/*
1546 * Initialize the perf_counter context in a task_struct:
1547 */
1548static void
1549__perf_counter_init_context(struct perf_counter_context *ctx,
1550 struct task_struct *task)
1551{
1552 memset(ctx, 0, sizeof(*ctx));
1553 spin_lock_init(&ctx->lock);
1554 mutex_init(&ctx->mutex);
1555 INIT_LIST_HEAD(&ctx->counter_list);
1556 INIT_LIST_HEAD(&ctx->event_list);
1557 atomic_set(&ctx->refcount, 1);
1558 ctx->task = task;
1559}
1560
1561static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1562{
1563 struct perf_counter_context *ctx;
1564 struct perf_cpu_context *cpuctx;
1565 struct task_struct *task;
1566 unsigned long flags;
1567 int err;
1568
1569 /*
1570 * If cpu is not a wildcard then this is a percpu counter:
1571 */
1572 if (cpu != -1) {
1573 /* Must be root to operate on a CPU counter: */
1574 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1575 return ERR_PTR(-EACCES);
1576
1577 if (cpu < 0 || cpu > num_possible_cpus())
1578 return ERR_PTR(-EINVAL);
1579
1580 /*
1581 * We could be clever and allow to attach a counter to an
1582 * offline CPU and activate it when the CPU comes up, but
1583 * that's for later.
1584 */
1585 if (!cpu_isset(cpu, cpu_online_map))
1586 return ERR_PTR(-ENODEV);
1587
1588 cpuctx = &per_cpu(perf_cpu_context, cpu);
1589 ctx = &cpuctx->ctx;
1590 get_ctx(ctx);
1591
1592 return ctx;
1593 }
1594
1595 rcu_read_lock();
1596 if (!pid)
1597 task = current;
1598 else
1599 task = find_task_by_vpid(pid);
1600 if (task)
1601 get_task_struct(task);
1602 rcu_read_unlock();
1603
1604 if (!task)
1605 return ERR_PTR(-ESRCH);
1606
1607 /*
1608 * Can't attach counters to a dying task.
1609 */
1610 err = -ESRCH;
1611 if (task->flags & PF_EXITING)
1612 goto errout;
1613
1614 /* Reuse ptrace permission checks for now. */
1615 err = -EACCES;
1616 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1617 goto errout;
1618
1619 retry:
1620 ctx = perf_lock_task_context(task, &flags);
1621 if (ctx) {
1622 unclone_ctx(ctx);
1623 spin_unlock_irqrestore(&ctx->lock, flags);
1624 }
1625
1626 if (!ctx) {
1627 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1628 err = -ENOMEM;
1629 if (!ctx)
1630 goto errout;
1631 __perf_counter_init_context(ctx, task);
1632 get_ctx(ctx);
1633 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1634 /*
1635 * We raced with some other task; use
1636 * the context they set.
1637 */
1638 kfree(ctx);
1639 goto retry;
1640 }
1641 get_task_struct(task);
1642 }
1643
1644 put_task_struct(task);
1645 return ctx;
1646
1647 errout:
1648 put_task_struct(task);
1649 return ERR_PTR(err);
1650}
1651
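For context, a hedged user-space sketch of the open path that lands in find_get_context(); the syscall number, header and constant names below are assumptions about this tree's uapi (they mirror how counter->attr is used above) rather than something shown in this diff. pid = 0, cpu = -1 asks for a per-task counter on the current task; pid = -1, cpu = N asks for a CPU-wide counter, which is refused with EACCES for unprivileged users once the paranoia level disallows it.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

/* Hypothetical helper: open one hardware cycle counter. */
static int open_cycles_counter(pid_t pid, int cpu)
{
	struct perf_counter_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type   = PERF_TYPE_HARDWARE;
	attr.size   = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	return syscall(__NR_perf_counter_open, &attr, pid, cpu,
		       -1 /* group_fd */, 0 /* flags */);
}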
1652static void free_counter_rcu(struct rcu_head *head)
1653{
1654 struct perf_counter *counter;
1655
1656 counter = container_of(head, struct perf_counter, rcu_head);
1657 if (counter->ns)
1658 put_pid_ns(counter->ns);
1659 kfree(counter);
1660}
1661
1662static void perf_pending_sync(struct perf_counter *counter);
1663
1664static void free_counter(struct perf_counter *counter)
1665{
1666 perf_pending_sync(counter);
1667
1668 if (!counter->parent) {
1669 atomic_dec(&nr_counters);
1670 if (counter->attr.mmap)
1671 atomic_dec(&nr_mmap_counters);
1672 if (counter->attr.comm)
1673 atomic_dec(&nr_comm_counters);
1674 if (counter->attr.task)
1675 atomic_dec(&nr_task_counters);
1676 }
1677
1678 if (counter->destroy)
1679 counter->destroy(counter);
1680
1681 put_ctx(counter->ctx);
1682 call_rcu(&counter->rcu_head, free_counter_rcu);
1683}
1684
1685/*
1686 * Called when the last reference to the file is gone.
1687 */
1688static int perf_release(struct inode *inode, struct file *file)
1689{
1690 struct perf_counter *counter = file->private_data;
1691 struct perf_counter_context *ctx = counter->ctx;
1692
1693 file->private_data = NULL;
1694
1695 WARN_ON_ONCE(ctx->parent_ctx);
1696 mutex_lock(&ctx->mutex);
1697 perf_counter_remove_from_context(counter);
1698 mutex_unlock(&ctx->mutex);
1699
1700 mutex_lock(&counter->owner->perf_counter_mutex);
1701 list_del_init(&counter->owner_entry);
1702 mutex_unlock(&counter->owner->perf_counter_mutex);
1703 put_task_struct(counter->owner);
1704
1705 free_counter(counter);
1706
1707 return 0;
1708}
1709
1710static int perf_counter_read_size(struct perf_counter *counter)
1711{
1712 int entry = sizeof(u64); /* value */
1713 int size = 0;
1714 int nr = 1;
1715
1716 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1717 size += sizeof(u64);
1718
1719 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1720 size += sizeof(u64);
1721
1722 if (counter->attr.read_format & PERF_FORMAT_ID)
1723 entry += sizeof(u64);
1724
1725 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1726 nr += counter->group_leader->nr_siblings;
1727 size += sizeof(u64);
1728 }
1729
1730 size += entry * nr;
1731
1732 return size;
1733}
1734
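A worked example of the size computation, with made-up settings:

/*
 * Example: a group leader with 2 siblings read with
 * read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID |
 *               PERF_FORMAT_GROUP:
 *
 *   entry = 8 (value) + 8 (id)                   = 16 bytes per counter
 *   size  = 8 (time_enabled) + 8 (nr, for GROUP) = 16 bytes of header
 *   nr    = 1 leader + 2 siblings                = 3 counters
 *   total = 16 + 16 * 3                          = 64 bytes
 *
 * which matches what perf_counter_read_group() below writes out.
 */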
1735static u64 perf_counter_read_value(struct perf_counter *counter)
1736{
1737 struct perf_counter *child;
1738 u64 total = 0;
1739
1740 total += perf_counter_read(counter);
1741 list_for_each_entry(child, &counter->child_list, child_list)
1742 total += perf_counter_read(child);
1743
1744 return total;
1745}
1746
1747static int perf_counter_read_entry(struct perf_counter *counter,
1748 u64 read_format, char __user *buf)
1749{
1750 int n = 0, count = 0;
1751 u64 values[2];
1752
1753 values[n++] = perf_counter_read_value(counter);
1754 if (read_format & PERF_FORMAT_ID)
1755 values[n++] = primary_counter_id(counter);
1756
1757 count = n * sizeof(u64);
1758
1759 if (copy_to_user(buf, values, count))
1760 return -EFAULT;
1761
1762 return count;
1763}
1764
1765static int perf_counter_read_group(struct perf_counter *counter,
1766 u64 read_format, char __user *buf)
1767{
1768 struct perf_counter *leader = counter->group_leader, *sub;
1769 int n = 0, size = 0, err = -EFAULT;
1770 u64 values[3];
1771
1772 values[n++] = 1 + leader->nr_siblings;
1773 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1774 values[n++] = leader->total_time_enabled +
1775 atomic64_read(&leader->child_total_time_enabled);
1776 }
1777 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1778 values[n++] = leader->total_time_running +
1779 atomic64_read(&leader->child_total_time_running);
1780 }
1781
1782 size = n * sizeof(u64);
1783
1784 if (copy_to_user(buf, values, size))
1785 return -EFAULT;
1786
1787 err = perf_counter_read_entry(leader, read_format, buf + size);
1788 if (err < 0)
1789 return err;
1790
1791 size += err;
1792
1793 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1794 err = perf_counter_read_entry(sub, read_format,
1795 buf + size);
1796 if (err < 0)
1797 return err;
1798
1799 size += err;
1800 }
1801
1802 return size;
1803}
1804
1805static int perf_counter_read_one(struct perf_counter *counter,
1806 u64 read_format, char __user *buf)
1807{
1808 u64 values[4];
1809 int n = 0;
1810
1811 values[n++] = perf_counter_read_value(counter);
1812 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1813 values[n++] = counter->total_time_enabled +
1814 atomic64_read(&counter->child_total_time_enabled);
1815 }
1816 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1817 values[n++] = counter->total_time_running +
1818 atomic64_read(&counter->child_total_time_running);
1819 }
1820 if (read_format & PERF_FORMAT_ID)
1821 values[n++] = primary_counter_id(counter);
1822
1823 if (copy_to_user(buf, values, n * sizeof(u64)))
1824 return -EFAULT;
1825
1826 return n * sizeof(u64);
1827}
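
/*
 * What userspace sees for a read() on a counter fd, as emitted by
 * perf_counter_read_one() above (illustrative sketch only -- the ABI is
 * just a sequence of u64s selected by attr.read_format):
 *
 *	u64 value;		-- sum over the counter and its inherited children
 *	u64 time_enabled;	-- only if PERF_FORMAT_TOTAL_TIME_ENABLED
 *	u64 time_running;	-- only if PERF_FORMAT_TOTAL_TIME_RUNNING
 *	u64 id;			-- only if PERF_FORMAT_ID
 *
 * With PERF_FORMAT_GROUP, perf_counter_read_group() instead emits the
 * group size, the optional time fields of the leader, and then one
 * { value [, id] } pair per group member, leader first.
 */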
1828
1829/*
1830 * Read the performance counter - simple non-blocking version for now
1831 */
1832static ssize_t
1833perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1834{
1835 u64 read_format = counter->attr.read_format;
1836 int ret;
1837
1838 /*
1839 * Return end-of-file for a read on a counter that is in
1840 * error state (i.e. because it was pinned but it couldn't be
1841 * scheduled on to the CPU at some point).
1842 */
1843 if (counter->state == PERF_COUNTER_STATE_ERROR)
1844 return 0;
1845
1846 if (count < perf_counter_read_size(counter))
1847 return -ENOSPC;
1848
1849 WARN_ON_ONCE(counter->ctx->parent_ctx);
1850 mutex_lock(&counter->child_mutex);
1851 if (read_format & PERF_FORMAT_GROUP)
1852 ret = perf_counter_read_group(counter, read_format, buf);
1853 else
1854 ret = perf_counter_read_one(counter, read_format, buf);
1855 mutex_unlock(&counter->child_mutex);
1856
1857 return ret;
1858}
1859
1860static ssize_t
1861perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1862{
1863 struct perf_counter *counter = file->private_data;
1864
1865 return perf_read_hw(counter, buf, count);
1866}
1867
1868static unsigned int perf_poll(struct file *file, poll_table *wait)
1869{
1870 struct perf_counter *counter = file->private_data;
1871 struct perf_mmap_data *data;
1872	unsigned int events = POLLHUP;
1873
1874 rcu_read_lock();
1875 data = rcu_dereference(counter->data);
1876 if (data)
1877 events = atomic_xchg(&data->poll, 0);
1878 rcu_read_unlock();
1879
1880 poll_wait(file, &counter->waitq, wait);
1881
1882 return events;
1883}
1884
1885static void perf_counter_reset(struct perf_counter *counter)
1886{
1887 (void)perf_counter_read(counter);
1888 atomic64_set(&counter->count, 0);
1889 perf_counter_update_userpage(counter);
1890}
1891
1892/*
1893 * Holding the top-level counter's child_mutex means that any
1894 * descendant process that has inherited this counter will block
1895 * in sync_child_counter if it goes to exit, thus satisfying the
1896 * task existence requirements of perf_counter_enable/disable.
1897 */
1898static void perf_counter_for_each_child(struct perf_counter *counter,
1899 void (*func)(struct perf_counter *))
1900{
1901 struct perf_counter *child;
1902
1903 WARN_ON_ONCE(counter->ctx->parent_ctx);
1904 mutex_lock(&counter->child_mutex);
1905 func(counter);
1906 list_for_each_entry(child, &counter->child_list, child_list)
1907 func(child);
1908 mutex_unlock(&counter->child_mutex);
1909}
1910
1911static void perf_counter_for_each(struct perf_counter *counter,
1912 void (*func)(struct perf_counter *))
1913{
1914 struct perf_counter_context *ctx = counter->ctx;
1915 struct perf_counter *sibling;
1916
1917 WARN_ON_ONCE(ctx->parent_ctx);
1918 mutex_lock(&ctx->mutex);
1919 counter = counter->group_leader;
1920
1921 perf_counter_for_each_child(counter, func);
1922 func(counter);
1923 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1924 perf_counter_for_each_child(counter, func);
1925 mutex_unlock(&ctx->mutex);
1926}
1927
1928static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1929{
1930 struct perf_counter_context *ctx = counter->ctx;
1931 unsigned long size;
1932 int ret = 0;
1933 u64 value;
1934
1935 if (!counter->attr.sample_period)
1936 return -EINVAL;
1937
1938 size = copy_from_user(&value, arg, sizeof(value));
1939	if (size)
1940 return -EFAULT;
1941
1942 if (!value)
1943 return -EINVAL;
1944
1945 spin_lock_irq(&ctx->lock);
1946 if (counter->attr.freq) {
1947 if (value > sysctl_perf_counter_sample_rate) {
1948 ret = -EINVAL;
1949 goto unlock;
1950 }
1951
1952 counter->attr.sample_freq = value;
1953 } else {
1954 counter->attr.sample_period = value;
1955 counter->hw.sample_period = value;
1956 }
1957unlock:
1958 spin_unlock_irq(&ctx->lock);
1959
1960 return ret;
1961}
1962
1963static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1964{
1965 struct perf_counter *counter = file->private_data;
1966 void (*func)(struct perf_counter *);
1967 u32 flags = arg;
1968
1969 switch (cmd) {
1970 case PERF_COUNTER_IOC_ENABLE:
1971 func = perf_counter_enable;
1972 break;
1973 case PERF_COUNTER_IOC_DISABLE:
1974 func = perf_counter_disable;
1975 break;
1976 case PERF_COUNTER_IOC_RESET:
1977 func = perf_counter_reset;
1978 break;
1979
1980 case PERF_COUNTER_IOC_REFRESH:
1981 return perf_counter_refresh(counter, arg);
1982
1983 case PERF_COUNTER_IOC_PERIOD:
1984 return perf_counter_period(counter, (u64 __user *)arg);
1985
1986 default:
1987 return -ENOTTY;
1988 }
1989
1990 if (flags & PERF_IOC_FLAG_GROUP)
1991 perf_counter_for_each(counter, func);
1992 else
1993 perf_counter_for_each_child(counter, func);
1994
1995 return 0;
1996}
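
/*
 * Illustrative userspace use of the ioctls handled above (sketch only;
 * 'fd' is assumed to be a perf counter file descriptor obtained elsewhere):
 *
 *	u64 period = 100000;
 *
 *	ioctl(fd, PERF_COUNTER_IOC_PERIOD, &period);		-- new sample period
 *	ioctl(fd, PERF_COUNTER_IOC_RESET, 0);			-- not the whole group
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE, PERF_IOC_FLAG_GROUP);	-- whole group
 *
 * Without PERF_IOC_FLAG_GROUP the operation covers the counter and its
 * inherited children; with it, perf_counter_for_each() also walks the
 * group leader's siblings.
 */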
1997
1998int perf_counter_task_enable(void)
1999{
2000 struct perf_counter *counter;
2001
2002 mutex_lock(&current->perf_counter_mutex);
2003 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2004 perf_counter_for_each_child(counter, perf_counter_enable);
2005 mutex_unlock(&current->perf_counter_mutex);
2006
2007 return 0;
2008}
2009
2010int perf_counter_task_disable(void)
2011{
2012 struct perf_counter *counter;
2013
2014 mutex_lock(&current->perf_counter_mutex);
2015 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2016 perf_counter_for_each_child(counter, perf_counter_disable);
2017 mutex_unlock(&current->perf_counter_mutex);
2018
2019 return 0;
2020}
2021
2022#ifndef PERF_COUNTER_INDEX_OFFSET
2023# define PERF_COUNTER_INDEX_OFFSET 0
2024#endif
2025
2026static int perf_counter_index(struct perf_counter *counter)
2027{
2028 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2029 return 0;
2030
2031 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2032}
2033
2034/*
2035 * Callers need to ensure there can be no nesting of this function, otherwise
2036 * the seqlock logic goes bad. We can not serialize this because the arch
2037 * code calls this from NMI context.
2038 */
2039void perf_counter_update_userpage(struct perf_counter *counter)
2040{
2041 struct perf_counter_mmap_page *userpg;
2042 struct perf_mmap_data *data;
2043
2044 rcu_read_lock();
2045 data = rcu_dereference(counter->data);
2046 if (!data)
2047 goto unlock;
2048
2049 userpg = data->user_page;
2050
2051 /*
2052 * Disable preemption so as to not let the corresponding user-space
2053 * spin too long if we get preempted.
2054 */
2055 preempt_disable();
2056 ++userpg->lock;
2057 barrier();
2058 userpg->index = perf_counter_index(counter);
2059 userpg->offset = atomic64_read(&counter->count);
2060 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
2061 userpg->offset -= atomic64_read(&counter->hw.prev_count);
2062
2063 userpg->time_enabled = counter->total_time_enabled +
2064 atomic64_read(&counter->child_total_time_enabled);
2065
2066 userpg->time_running = counter->total_time_running +
2067 atomic64_read(&counter->child_total_time_running);
2068
2069 barrier();
2070 ++userpg->lock;
2071 preempt_enable();
2072unlock:
2073 rcu_read_unlock();
2074}
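
/*
 * The matching userspace side treats ->lock as a sequence count: the
 * update above bumps it to an odd value, writes the fields, and bumps it
 * back to even.  A reader therefore retries while the value is odd or
 * changes underneath it, roughly (sketch):
 *
 *	u32 seq;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		... read pc->index, pc->offset, pc->time_enabled, ... ;
 *		barrier();
 *	} while (pc->lock != seq || (seq & 1));
 */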
2075
2076static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2077{
2078 struct perf_counter *counter = vma->vm_file->private_data;
2079 struct perf_mmap_data *data;
2080 int ret = VM_FAULT_SIGBUS;
2081
2082 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2083 if (vmf->pgoff == 0)
2084 ret = 0;
2085 return ret;
2086 }
2087
2088 rcu_read_lock();
2089 data = rcu_dereference(counter->data);
2090 if (!data)
2091 goto unlock;
2092
2093 if (vmf->pgoff == 0) {
2094 vmf->page = virt_to_page(data->user_page);
2095 } else {
2096 int nr = vmf->pgoff - 1;
2097
2098 if ((unsigned)nr > data->nr_pages)
2099 goto unlock;
2100
2101 if (vmf->flags & FAULT_FLAG_WRITE)
2102 goto unlock;
2103
2104 vmf->page = virt_to_page(data->data_pages[nr]);
2105 }
2106
2107 get_page(vmf->page);
2108 vmf->page->mapping = vma->vm_file->f_mapping;
2109 vmf->page->index = vmf->pgoff;
2110
2111 ret = 0;
2112unlock:
2113 rcu_read_unlock();
2114
2115 return ret;
2116}
2117
2118static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
2119{
2120 struct perf_mmap_data *data;
2121 unsigned long size;
2122 int i;
2123
2124 WARN_ON(atomic_read(&counter->mmap_count));
2125
2126 size = sizeof(struct perf_mmap_data);
2127 size += nr_pages * sizeof(void *);
2128
2129 data = kzalloc(size, GFP_KERNEL);
2130 if (!data)
2131 goto fail;
2132
2133 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2134 if (!data->user_page)
2135 goto fail_user_page;
2136
2137 for (i = 0; i < nr_pages; i++) {
2138 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2139 if (!data->data_pages[i])
2140 goto fail_data_pages;
2141 }
2142
2143 data->nr_pages = nr_pages;
2144 atomic_set(&data->lock, -1);
2145
2146 rcu_assign_pointer(counter->data, data);
2147
2148 return 0;
2149
2150fail_data_pages:
2151 for (i--; i >= 0; i--)
2152 free_page((unsigned long)data->data_pages[i]);
2153
2154 free_page((unsigned long)data->user_page);
2155
2156fail_user_page:
2157 kfree(data);
2158
2159fail:
2160 return -ENOMEM;
2161}
2162
2163static void perf_mmap_free_page(unsigned long addr)
2164{
2165 struct page *page = virt_to_page((void *)addr);
2166
2167 page->mapping = NULL;
2168 __free_page(page);
2169}
2170
2171static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2172{
2173 struct perf_mmap_data *data;
2174 int i;
2175
2176 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2177
2178 perf_mmap_free_page((unsigned long)data->user_page);
2179 for (i = 0; i < data->nr_pages; i++)
2180 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2181
2182 kfree(data);
2183}
2184
2185static void perf_mmap_data_free(struct perf_counter *counter)
2186{
2187 struct perf_mmap_data *data = counter->data;
2188
2189 WARN_ON(atomic_read(&counter->mmap_count));
2190
2191 rcu_assign_pointer(counter->data, NULL);
2192 call_rcu(&data->rcu_head, __perf_mmap_data_free);
2193}
2194
2195static void perf_mmap_open(struct vm_area_struct *vma)
2196{
2197 struct perf_counter *counter = vma->vm_file->private_data;
2198
2199 atomic_inc(&counter->mmap_count);
2200}
2201
2202static void perf_mmap_close(struct vm_area_struct *vma)
2203{
2204 struct perf_counter *counter = vma->vm_file->private_data;
2205
2206 WARN_ON_ONCE(counter->ctx->parent_ctx);
2207 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
2208 struct user_struct *user = current_user();
2209
2210 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
2211 vma->vm_mm->locked_vm -= counter->data->nr_locked;
2212 perf_mmap_data_free(counter);
2213 mutex_unlock(&counter->mmap_mutex);
2214 }
2215}
2216
2217static struct vm_operations_struct perf_mmap_vmops = {
2218 .open = perf_mmap_open,
2219 .close = perf_mmap_close,
2220 .fault = perf_mmap_fault,
2221 .page_mkwrite = perf_mmap_fault,
2222};
2223
2224static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2225{
2226 struct perf_counter *counter = file->private_data;
2227 unsigned long user_locked, user_lock_limit;
2228 struct user_struct *user = current_user();
2229 unsigned long locked, lock_limit;
2230 unsigned long vma_size;
2231 unsigned long nr_pages;
2232 long user_extra, extra;
2233 int ret = 0;
2234
2235 if (!(vma->vm_flags & VM_SHARED))
2236 return -EINVAL;
2237
2238 vma_size = vma->vm_end - vma->vm_start;
2239 nr_pages = (vma_size / PAGE_SIZE) - 1;
2240
2241 /*
2242	 * If we have data pages, ensure their number is a power of two so
2243	 * we can use bitmasks instead of modulo.
2244 */
2245 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2246 return -EINVAL;
2247
2248 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2249 return -EINVAL;
2250
2251 if (vma->vm_pgoff != 0)
2252 return -EINVAL;
2253
2254 WARN_ON_ONCE(counter->ctx->parent_ctx);
2255 mutex_lock(&counter->mmap_mutex);
2256 if (atomic_inc_not_zero(&counter->mmap_count)) {
2257 if (nr_pages != counter->data->nr_pages)
2258 ret = -EINVAL;
2259 goto unlock;
2260 }
2261
2262 user_extra = nr_pages + 1;
2263 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
2264
2265 /*
2266 * Increase the limit linearly with more CPUs:
2267 */
2268 user_lock_limit *= num_online_cpus();
2269
2270 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2271
2272 extra = 0;
2273 if (user_locked > user_lock_limit)
2274 extra = user_locked - user_lock_limit;
2275
2276 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2277 lock_limit >>= PAGE_SHIFT;
2278 locked = vma->vm_mm->locked_vm + extra;
2279
2280 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
2281 ret = -EPERM;
2282 goto unlock;
2283 }
2284
2285 WARN_ON(counter->data);
2286 ret = perf_mmap_data_alloc(counter, nr_pages);
2287 if (ret)
2288 goto unlock;
2289
2290 atomic_set(&counter->mmap_count, 1);
2291 atomic_long_add(user_extra, &user->locked_vm);
2292 vma->vm_mm->locked_vm += extra;
2293 counter->data->nr_locked = extra;
2294 if (vma->vm_flags & VM_WRITE)
2295 counter->data->writable = 1;
2296
2297unlock:
2298 mutex_unlock(&counter->mmap_mutex);
2299
2300 vma->vm_flags |= VM_RESERVED;
2301 vma->vm_ops = &perf_mmap_vmops;
2302
2303 return ret;
2304}
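
/*
 * Layout enforced above: the mapping must be MAP_SHARED, start at file
 * offset 0 and span 1 + 2^n pages.  Page 0 is the perf_counter_mmap_page
 * control page kept up to date by perf_counter_update_userpage(); the
 * remaining 2^n pages are the sample ring buffer, so head/offset
 * arithmetic can use masking instead of modulo.  Minimal userspace sketch
 * (error handling omitted; page_size from sysconf(_SC_PAGESIZE)):
 *
 *	size_t len = (1 + 8) * page_size;	-- control page + 8 data pages
 *	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Mapping with PROT_WRITE marks the buffer writable, which lets userspace
 * report consumption via data_tail (see perf_output_space()).
 */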
2305
2306static int perf_fasync(int fd, struct file *filp, int on)
2307{
2308 struct inode *inode = filp->f_path.dentry->d_inode;
2309 struct perf_counter *counter = filp->private_data;
2310 int retval;
2311
2312 mutex_lock(&inode->i_mutex);
2313 retval = fasync_helper(fd, filp, on, &counter->fasync);
2314 mutex_unlock(&inode->i_mutex);
2315
2316 if (retval < 0)
2317 return retval;
2318
2319 return 0;
2320}
2321
2322static const struct file_operations perf_fops = {
2323 .release = perf_release,
2324 .read = perf_read,
2325 .poll = perf_poll,
2326 .unlocked_ioctl = perf_ioctl,
2327 .compat_ioctl = perf_ioctl,
2328 .mmap = perf_mmap,
2329 .fasync = perf_fasync,
2330};
2331
2332/*
2333 * Perf counter wakeup
2334 *
2335 * If there's data, ensure we set the poll() state and publish everything
2336 * to user-space before waking everybody up.
2337 */
2338
2339void perf_counter_wakeup(struct perf_counter *counter)
2340{
2341 wake_up_all(&counter->waitq);
2342
2343 if (counter->pending_kill) {
2344 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2345 counter->pending_kill = 0;
2346 }
2347}
2348
2349/*
2350 * Pending wakeups
2351 *
2352 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2353 *
2354 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2355 * single linked list and use cmpxchg() to add entries lockless.
2356 */
2357
2358static void perf_pending_counter(struct perf_pending_entry *entry)
2359{
2360 struct perf_counter *counter = container_of(entry,
2361 struct perf_counter, pending);
2362
2363 if (counter->pending_disable) {
2364 counter->pending_disable = 0;
2365 __perf_counter_disable(counter);
2366 }
2367
2368 if (counter->pending_wakeup) {
2369 counter->pending_wakeup = 0;
2370 perf_counter_wakeup(counter);
2371 }
2372}
2373
2374#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2375
2376static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2377 PENDING_TAIL,
2378};
2379
2380static void perf_pending_queue(struct perf_pending_entry *entry,
2381 void (*func)(struct perf_pending_entry *))
2382{
2383 struct perf_pending_entry **head;
2384
2385 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2386 return;
2387
2388 entry->func = func;
2389
2390 head = &get_cpu_var(perf_pending_head);
2391
2392 do {
2393 entry->next = *head;
2394 } while (cmpxchg(head, entry->next, entry) != entry->next);
2395
2396 set_perf_counter_pending();
2397
2398 put_cpu_var(perf_pending_head);
2399}
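
/*
 * The initial cmpxchg() on entry->next does double duty: the entry is only
 * queued if ->next is still NULL, so an entry that is already in flight is
 * not queued twice, and a successful cmpxchg() marks it busy by pointing
 * ->next at PENDING_TAIL (or, after the push loop, at its successor).
 * __perf_pending_run() sets ->next back to NULL once the callback has run,
 * which is the condition perf_not_pending() waits for.
 */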
2400
2401static int __perf_pending_run(void)
2402{
2403 struct perf_pending_entry *list;
2404 int nr = 0;
2405
2406 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2407 while (list != PENDING_TAIL) {
2408 void (*func)(struct perf_pending_entry *);
2409 struct perf_pending_entry *entry = list;
2410
2411 list = list->next;
2412
2413 func = entry->func;
2414 entry->next = NULL;
2415 /*
2416 * Ensure we observe the unqueue before we issue the wakeup,
2417 * so that we won't be waiting forever.
2418 * -- see perf_not_pending().
2419 */
2420 smp_wmb();
2421
2422 func(entry);
2423 nr++;
2424 }
2425
2426 return nr;
2427}
2428
2429static inline int perf_not_pending(struct perf_counter *counter)
2430{
2431 /*
2432 * If we flush on whatever cpu we run, there is a chance we don't
2433 * need to wait.
2434 */
2435 get_cpu();
2436 __perf_pending_run();
2437 put_cpu();
2438
2439 /*
2440 * Ensure we see the proper queue state before going to sleep
2441	 * so that we do not miss the wakeup. -- see __perf_pending_run()
2442 */
2443 smp_rmb();
2444 return counter->pending.next == NULL;
2445}
2446
2447static void perf_pending_sync(struct perf_counter *counter)
2448{
2449 wait_event(counter->waitq, perf_not_pending(counter));
2450}
2451
2452void perf_counter_do_pending(void)
2453{
2454 __perf_pending_run();
2455}
2456
2457/*
2458 * Callchain support -- arch specific
2459 */
2460
2461__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2462{
2463 return NULL;
2464}
2465
2466/*
2467 * Output
2468 */
2469
2470struct perf_output_handle {
2471 struct perf_counter *counter;
2472 struct perf_mmap_data *data;
2473 unsigned long head;
2474 unsigned long offset;
2475 int nmi;
2476 int sample;
2477 int locked;
2478 unsigned long flags;
2479};
2480
2481static bool perf_output_space(struct perf_mmap_data *data,
2482 unsigned int offset, unsigned int head)
2483{
2484 unsigned long tail;
2485 unsigned long mask;
2486
2487 if (!data->writable)
2488 return true;
2489
2490 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2491 /*
2492	 * Userspace could choose to issue an mb() before updating the tail
2493	 * pointer, so that all reads will be completed before the write is
2494	 * issued.
2495 */
2496 tail = ACCESS_ONCE(data->user_page->data_tail);
2497 smp_rmb();
2498
2499 offset = (offset - tail) & mask;
2500 head = (head - tail) & mask;
2501
2502 if ((int)(head - offset) < 0)
2503 return false;
2504
2505 return true;
2506}
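
/*
 * Note the asymmetry: a read-only mapping (!data->writable) has no way to
 * publish a data_tail, so it is treated as always having room and old,
 * unread records are simply overwritten.  Only a PROT_WRITE mapping gets
 * the "no room, account a lost record" behaviour that perf_output_begin()
 * implements below.
 */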
2507
2508static void perf_output_wakeup(struct perf_output_handle *handle)
2509{
2510	atomic_set(&handle->data->poll, POLLIN);
2511
2512 if (handle->nmi) {
2513 handle->counter->pending_wakeup = 1;
2514 perf_pending_queue(&handle->counter->pending,
2515 perf_pending_counter);
2516 } else
2517 perf_counter_wakeup(handle->counter);
2518}
2519
2520/*
2521 * Curious locking construct.
2522 *
2523 * We need to ensure a later event doesn't publish a head when a former
2524 * event isn't done writing. However since we need to deal with NMIs we
2525 * cannot fully serialize things.
2526 *
2527 * What we do is serialize between CPUs so we only have to deal with NMI
2528 * nesting on a single CPU.
2529 *
2530 * We only publish the head (and generate a wakeup) when the outer-most
2531 * event completes.
2532 */
2533static void perf_output_lock(struct perf_output_handle *handle)
2534{
2535 struct perf_mmap_data *data = handle->data;
2536 int cpu;
2537
2538 handle->locked = 0;
2539
2540 local_irq_save(handle->flags);
2541 cpu = smp_processor_id();
2542
2543 if (in_nmi() && atomic_read(&data->lock) == cpu)
2544 return;
2545
2546 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2547 cpu_relax();
2548
2549 handle->locked = 1;
2550}
2551
2552static void perf_output_unlock(struct perf_output_handle *handle)
2553{
2554 struct perf_mmap_data *data = handle->data;
2555 unsigned long head;
2556 int cpu;
2557
2558 data->done_head = data->head;
2559
2560 if (!handle->locked)
2561 goto out;
2562
2563again:
2564 /*
2565 * The xchg implies a full barrier that ensures all writes are done
2566 * before we publish the new head, matched by a rmb() in userspace when
2567 * reading this position.
2568 */
2569 while ((head = atomic_long_xchg(&data->done_head, 0)))
2570 data->user_page->data_head = head;
2571
2572 /*
2573 * NMI can happen here, which means we can miss a done_head update.
2574 */
2575
2576 cpu = atomic_xchg(&data->lock, -1);
2577 WARN_ON_ONCE(cpu != smp_processor_id());
2578
2579 /*
2580	 * Therefore we have to validate that we did not in fact miss one.
2581 */
2582 if (unlikely(atomic_long_read(&data->done_head))) {
2583 /*
2584 * Since we had it locked, we can lock it again.
2585 */
2586 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2587 cpu_relax();
2588
2589 goto again;
2590 }
2591
2592 if (atomic_xchg(&data->wakeup, 0))
2593 perf_output_wakeup(handle);
2594out:
2595 local_irq_restore(handle->flags);
2596}
2597
2598static void perf_output_copy(struct perf_output_handle *handle,
2599 const void *buf, unsigned int len)
2600{
2601 unsigned int pages_mask;
2602 unsigned int offset;
2603 unsigned int size;
2604 void **pages;
2605
2606 offset = handle->offset;
2607 pages_mask = handle->data->nr_pages - 1;
2608 pages = handle->data->data_pages;
2609
2610 do {
2611 unsigned int page_offset;
2612 int nr;
2613
2614 nr = (offset >> PAGE_SHIFT) & pages_mask;
2615 page_offset = offset & (PAGE_SIZE - 1);
2616 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2617
2618 memcpy(pages[nr] + page_offset, buf, size);
2619
2620 len -= size;
2621 buf += size;
2622 offset += size;
2623 } while (len);
2624
2625 handle->offset = offset;
2626
2627 /*
2628 * Check we didn't copy past our reservation window, taking the
2629 * possible unsigned int wrap into account.
2630 */
2631 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2632}
2633
2634#define perf_output_put(handle, x) \
2635 perf_output_copy((handle), &(x), sizeof(x))
2636
2637static int perf_output_begin(struct perf_output_handle *handle,
2638 struct perf_counter *counter, unsigned int size,
2639 int nmi, int sample)
2640{
2641 struct perf_mmap_data *data;
2642 unsigned int offset, head;
2643 int have_lost;
2644 struct {
2645 struct perf_event_header header;
2646 u64 id;
2647 u64 lost;
2648 } lost_event;
2649
2650 /*
2651 * For inherited counters we send all the output towards the parent.
2652 */
2653 if (counter->parent)
2654 counter = counter->parent;
2655
2656 rcu_read_lock();
2657 data = rcu_dereference(counter->data);
2658 if (!data)
2659 goto out;
2660
2661 handle->data = data;
2662 handle->counter = counter;
2663 handle->nmi = nmi;
2664 handle->sample = sample;
2665
2666 if (!data->nr_pages)
2667 goto fail;
2668
2669 have_lost = atomic_read(&data->lost);
2670 if (have_lost)
2671 size += sizeof(lost_event);
2672
2673 perf_output_lock(handle);
2674
2675 do {
2676 offset = head = atomic_long_read(&data->head);
2677 head += size;
2678 if (unlikely(!perf_output_space(data, offset, head)))
2679 goto fail;
2680 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2681
2682 handle->offset = offset;
2683 handle->head = head;
2684
2685 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2686 atomic_set(&data->wakeup, 1);
2687
2688 if (have_lost) {
2689 lost_event.header.type = PERF_EVENT_LOST;
2690 lost_event.header.misc = 0;
2691 lost_event.header.size = sizeof(lost_event);
2692 lost_event.id = counter->id;
2693 lost_event.lost = atomic_xchg(&data->lost, 0);
2694
2695 perf_output_put(handle, lost_event);
2696 }
2697
2698 return 0;
2699
2700fail:
2701 atomic_inc(&data->lost);
2702 perf_output_unlock(handle);
2703out:
2704 rcu_read_unlock();
2705
2706 return -ENOSPC;
2707}
2708
2709static void perf_output_end(struct perf_output_handle *handle)
2710{
2711 struct perf_counter *counter = handle->counter;
2712 struct perf_mmap_data *data = handle->data;
2713
2714 int wakeup_events = counter->attr.wakeup_events;
2715
2716 if (handle->sample && wakeup_events) {
2717 int events = atomic_inc_return(&data->events);
2718 if (events >= wakeup_events) {
2719 atomic_sub(wakeup_events, &data->events);
2720 atomic_set(&data->wakeup, 1);
2721 }
2722 }
2723
2724 perf_output_unlock(handle);
2725 rcu_read_unlock();
2726}
2727
2728static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2729{
2730 /*
2731 * only top level counters have the pid namespace they were created in
2732 */
2733 if (counter->parent)
2734 counter = counter->parent;
2735
2736 return task_tgid_nr_ns(p, counter->ns);
2737}
2738
2739static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2740{
2741 /*
2742 * only top level counters have the pid namespace they were created in
2743 */
2744 if (counter->parent)
2745 counter = counter->parent;
2746
2747 return task_pid_nr_ns(p, counter->ns);
2748}
2749
2750static void perf_output_read_one(struct perf_output_handle *handle,
2751 struct perf_counter *counter)
2752{
2753 u64 read_format = counter->attr.read_format;
2754 u64 values[4];
2755 int n = 0;
2756
2757 values[n++] = atomic64_read(&counter->count);
2758 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2759 values[n++] = counter->total_time_enabled +
2760 atomic64_read(&counter->child_total_time_enabled);
2761 }
2762 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2763 values[n++] = counter->total_time_running +
2764 atomic64_read(&counter->child_total_time_running);
2765 }
2766 if (read_format & PERF_FORMAT_ID)
2767 values[n++] = primary_counter_id(counter);
2768
2769 perf_output_copy(handle, values, n * sizeof(u64));
2770}
2771
2772/*
2773 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2774 */
2775static void perf_output_read_group(struct perf_output_handle *handle,
2776 struct perf_counter *counter)
2777{
2778 struct perf_counter *leader = counter->group_leader, *sub;
2779 u64 read_format = counter->attr.read_format;
2780 u64 values[5];
2781 int n = 0;
2782
2783 values[n++] = 1 + leader->nr_siblings;
2784
2785 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2786 values[n++] = leader->total_time_enabled;
2787
2788 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2789 values[n++] = leader->total_time_running;
2790
2791 if (leader != counter)
2792 leader->pmu->read(leader);
2793
2794 values[n++] = atomic64_read(&leader->count);
2795 if (read_format & PERF_FORMAT_ID)
2796 values[n++] = primary_counter_id(leader);
2797
2798 perf_output_copy(handle, values, n * sizeof(u64));
2799
2800 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2801 n = 0;
2802
2803 if (sub != counter)
2804 sub->pmu->read(sub);
2805
2806 values[n++] = atomic64_read(&sub->count);
2807 if (read_format & PERF_FORMAT_ID)
2808 values[n++] = primary_counter_id(sub);
2809
2810 perf_output_copy(handle, values, n * sizeof(u64));
2811 }
2812}
2813
2814static void perf_output_read(struct perf_output_handle *handle,
2815 struct perf_counter *counter)
2816{
2817 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2818 perf_output_read_group(handle, counter);
2819 else
2820 perf_output_read_one(handle, counter);
2821}
2822
2823void perf_counter_output(struct perf_counter *counter, int nmi,
2824 struct perf_sample_data *data)
2825{
2826 int ret;
2827 u64 sample_type = counter->attr.sample_type;
2828 struct perf_output_handle handle;
2829 struct perf_event_header header;
2830 u64 ip;
2831 struct {
2832 u32 pid, tid;
2833 } tid_entry;
2834 struct perf_callchain_entry *callchain = NULL;
2835 int callchain_size = 0;
2836 u64 time;
2837 struct {
2838 u32 cpu, reserved;
2839 } cpu_entry;
2840
2841 header.type = PERF_EVENT_SAMPLE;
2842 header.size = sizeof(header);
2843
2844 header.misc = 0;
2845 header.misc |= perf_misc_flags(data->regs);
2846
2847 if (sample_type & PERF_SAMPLE_IP) {
2848 ip = perf_instruction_pointer(data->regs);
2849 header.size += sizeof(ip);
2850 }
2851
2852 if (sample_type & PERF_SAMPLE_TID) {
2853 /* namespace issues */
2854 tid_entry.pid = perf_counter_pid(counter, current);
2855 tid_entry.tid = perf_counter_tid(counter, current);
2856
2857 header.size += sizeof(tid_entry);
2858 }
2859
2860 if (sample_type & PERF_SAMPLE_TIME) {
2861 /*
2862 * Maybe do better on x86 and provide cpu_clock_nmi()
2863 */
2864 time = sched_clock();
2865
2866 header.size += sizeof(u64);
2867 }
2868
2869 if (sample_type & PERF_SAMPLE_ADDR)
2870 header.size += sizeof(u64);
2871
2872 if (sample_type & PERF_SAMPLE_ID)
2873 header.size += sizeof(u64);
2874
2875 if (sample_type & PERF_SAMPLE_STREAM_ID)
2876 header.size += sizeof(u64);
2877
2878 if (sample_type & PERF_SAMPLE_CPU) {
2879 header.size += sizeof(cpu_entry);
2880
2881 cpu_entry.cpu = raw_smp_processor_id();
2882 cpu_entry.reserved = 0;
2883 }
2884
2885 if (sample_type & PERF_SAMPLE_PERIOD)
2886 header.size += sizeof(u64);
2887
2888 if (sample_type & PERF_SAMPLE_READ)
2889 header.size += perf_counter_read_size(counter);
2890
2891 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2892 callchain = perf_callchain(data->regs);
2893
2894 if (callchain) {
2895 callchain_size = (1 + callchain->nr) * sizeof(u64);
2896 header.size += callchain_size;
2897 } else
2898 header.size += sizeof(u64);
2899 }
2900
2901 if (sample_type & PERF_SAMPLE_RAW) {
2902 int size = sizeof(u32);
2903
2904 if (data->raw)
2905 size += data->raw->size;
2906 else
2907 size += sizeof(u32);
2908
2909 WARN_ON_ONCE(size & (sizeof(u64)-1));
2910 header.size += size;
2911 }
2912
2913 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2914 if (ret)
2915 return;
2916
2917 perf_output_put(&handle, header);
2918
2919 if (sample_type & PERF_SAMPLE_IP)
2920 perf_output_put(&handle, ip);
2921
2922 if (sample_type & PERF_SAMPLE_TID)
2923 perf_output_put(&handle, tid_entry);
2924
2925 if (sample_type & PERF_SAMPLE_TIME)
2926 perf_output_put(&handle, time);
2927
2928 if (sample_type & PERF_SAMPLE_ADDR)
2929 perf_output_put(&handle, data->addr);
2930
2931 if (sample_type & PERF_SAMPLE_ID) {
2932 u64 id = primary_counter_id(counter);
2933
2934 perf_output_put(&handle, id);
2935 }
2936
2937 if (sample_type & PERF_SAMPLE_STREAM_ID)
2938 perf_output_put(&handle, counter->id);
2939
2940 if (sample_type & PERF_SAMPLE_CPU)
2941 perf_output_put(&handle, cpu_entry);
2942
2943 if (sample_type & PERF_SAMPLE_PERIOD)
2944 perf_output_put(&handle, data->period);
2945
2946 if (sample_type & PERF_SAMPLE_READ)
2947 perf_output_read(&handle, counter);
2948
2949 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2950 if (callchain)
2951 perf_output_copy(&handle, callchain, callchain_size);
2952 else {
2953 u64 nr = 0;
2954 perf_output_put(&handle, nr);
2955 }
2956 }
2957
2958 if (sample_type & PERF_SAMPLE_RAW) {
2959 if (data->raw) {
2960 perf_output_put(&handle, data->raw->size);
2961 perf_output_copy(&handle, data->raw->data, data->raw->size);
2962 } else {
2963 struct {
2964 u32 size;
2965 u32 data;
2966 } raw = {
2967 .size = sizeof(u32),
2968 .data = 0,
2969 };
2970 perf_output_put(&handle, raw);
2971 }
2972 }
2973
2974 perf_output_end(&handle);
2975}
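
/*
 * The resulting PERF_EVENT_SAMPLE record thus consists of the header
 * followed by exactly the fields selected in attr.sample_type, in the
 * order emitted above: IP, TID, TIME, ADDR, ID, STREAM_ID, CPU, PERIOD,
 * the READ block, the callchain (its u64 length first, zero when no
 * callchain is available) and the raw data (its u32 size first, with the
 * total expected to stay u64 aligned).
 */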
2976
2977/*
2978 * read event
2979 */
2980
2981struct perf_read_event {
2982 struct perf_event_header header;
2983
2984 u32 pid;
2985 u32 tid;
2986};
2987
2988static void
2989perf_counter_read_event(struct perf_counter *counter,
2990 struct task_struct *task)
2991{
2992 struct perf_output_handle handle;
2993 struct perf_read_event event = {
2994 .header = {
2995 .type = PERF_EVENT_READ,
2996 .misc = 0,
2997 .size = sizeof(event) + perf_counter_read_size(counter),
2998 },
2999 .pid = perf_counter_pid(counter, task),
3000 .tid = perf_counter_tid(counter, task),
3001 };
3002 int ret;
3003
3004 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3005 if (ret)
3006 return;
3007
3008 perf_output_put(&handle, event);
3009 perf_output_read(&handle, counter);
3010
3011 perf_output_end(&handle);
3012}
3013
3014/*
3015 * task tracking -- fork/exit
3016 *
3017 * enabled by: attr.comm | attr.mmap | attr.task
3018 */
3019
3020struct perf_task_event {
3021 struct task_struct *task;
3022 struct perf_counter_context *task_ctx;
3023
3024 struct {
3025 struct perf_event_header header;
3026
3027 u32 pid;
3028 u32 ppid;
3029 u32 tid;
3030 u32 ptid;
3031 } event;
3032};
3033
3034static void perf_counter_task_output(struct perf_counter *counter,
3035 struct perf_task_event *task_event)
3036{
3037 struct perf_output_handle handle;
3038 int size = task_event->event.header.size;
3039 struct task_struct *task = task_event->task;
3040 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3041
3042 if (ret)
3043 return;
3044
3045 task_event->event.pid = perf_counter_pid(counter, task);
3046 task_event->event.ppid = perf_counter_pid(counter, current);
3047
3048 task_event->event.tid = perf_counter_tid(counter, task);
3049 task_event->event.ptid = perf_counter_tid(counter, current);
3050
3051 perf_output_put(&handle, task_event->event);
3052 perf_output_end(&handle);
3053}
3054
3055static int perf_counter_task_match(struct perf_counter *counter)
3056{
3057 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
3058 return 1;
3059
3060 return 0;
3061}
3062
3063static void perf_counter_task_ctx(struct perf_counter_context *ctx,
3064 struct perf_task_event *task_event)
3065{
3066 struct perf_counter *counter;
3067
3068 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3069 return;
3070
3071 rcu_read_lock();
3072 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3073 if (perf_counter_task_match(counter))
3074 perf_counter_task_output(counter, task_event);
3075 }
3076 rcu_read_unlock();
3077}
3078
3079static void perf_counter_task_event(struct perf_task_event *task_event)
3080{
3081 struct perf_cpu_context *cpuctx;
3082 struct perf_counter_context *ctx = task_event->task_ctx;
3083
3084 cpuctx = &get_cpu_var(perf_cpu_context);
3085 perf_counter_task_ctx(&cpuctx->ctx, task_event);
3086 put_cpu_var(perf_cpu_context);
3087
3088 rcu_read_lock();
3089 if (!ctx)
3090 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
3091 if (ctx)
3092 perf_counter_task_ctx(ctx, task_event);
3093 rcu_read_unlock();
3094}
3095
3096static void perf_counter_task(struct task_struct *task,
3097 struct perf_counter_context *task_ctx,
3098 int new)
3099{
3100 struct perf_task_event task_event;
3101
3102 if (!atomic_read(&nr_comm_counters) &&
3103 !atomic_read(&nr_mmap_counters) &&
3104 !atomic_read(&nr_task_counters))
3105 return;
3106
3107 task_event = (struct perf_task_event){
3108 .task = task,
3109 .task_ctx = task_ctx,
3110 .event = {
3111 .header = {
3112 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
3113 .misc = 0,
3114 .size = sizeof(task_event.event),
3115 },
3116 /* .pid */
3117 /* .ppid */
3118 /* .tid */
3119 /* .ptid */
3120 },
3121 };
3122
3123 perf_counter_task_event(&task_event);
3124}
3125
3126void perf_counter_fork(struct task_struct *task)
3127{
3128 perf_counter_task(task, NULL, 1);
3129}
3130
3131/*
3132 * comm tracking
3133 */
3134
3135struct perf_comm_event {
3136 struct task_struct *task;
3137 char *comm;
3138 int comm_size;
3139
3140 struct {
3141 struct perf_event_header header;
3142
3143 u32 pid;
3144 u32 tid;
3145 } event;
3146};
3147
3148static void perf_counter_comm_output(struct perf_counter *counter,
3149 struct perf_comm_event *comm_event)
3150{
3151 struct perf_output_handle handle;
3152 int size = comm_event->event.header.size;
3153 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3154
3155 if (ret)
3156 return;
3157
3158 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
3159 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
3160
3161 perf_output_put(&handle, comm_event->event);
3162 perf_output_copy(&handle, comm_event->comm,
3163 comm_event->comm_size);
3164 perf_output_end(&handle);
3165}
3166
3167static int perf_counter_comm_match(struct perf_counter *counter)
3168{
3169 if (counter->attr.comm)
3170 return 1;
3171
3172 return 0;
3173}
3174
3175static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
3176 struct perf_comm_event *comm_event)
3177{
3178 struct perf_counter *counter;
3179
3180 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3181 return;
3182
3183 rcu_read_lock();
3184 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3185 if (perf_counter_comm_match(counter))
3186 perf_counter_comm_output(counter, comm_event);
3187 }
3188 rcu_read_unlock();
3189}
3190
3191static void perf_counter_comm_event(struct perf_comm_event *comm_event)
3192{
3193 struct perf_cpu_context *cpuctx;
3194 struct perf_counter_context *ctx;
3195 unsigned int size;
3196 char comm[TASK_COMM_LEN];
3197
3198 memset(comm, 0, sizeof(comm));
3199 strncpy(comm, comm_event->task->comm, sizeof(comm));
3200 size = ALIGN(strlen(comm)+1, sizeof(u64));
3201
3202 comm_event->comm = comm;
3203 comm_event->comm_size = size;
3204
3205 comm_event->event.header.size = sizeof(comm_event->event) + size;
3206
3207 cpuctx = &get_cpu_var(perf_cpu_context);
3208 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
3209 put_cpu_var(perf_cpu_context);
3210
3211 rcu_read_lock();
3212 /*
3213	 * It doesn't really matter which of the child contexts the
3214	 * event ends up in.
3215 */
3216 ctx = rcu_dereference(current->perf_counter_ctxp);
3217 if (ctx)
3218 perf_counter_comm_ctx(ctx, comm_event);
3219 rcu_read_unlock();
3220}
3221
3222void perf_counter_comm(struct task_struct *task)
3223{
3224 struct perf_comm_event comm_event;
3225
3226 if (task->perf_counter_ctxp)
3227 perf_counter_enable_on_exec(task);
3228
3229 if (!atomic_read(&nr_comm_counters))
3230 return;
3231
3232 comm_event = (struct perf_comm_event){
3233 .task = task,
3234 /* .comm */
3235 /* .comm_size */
3236 .event = {
3237 .header = {
3238 .type = PERF_EVENT_COMM,
3239 .misc = 0,
3240 /* .size */
3241 },
3242 /* .pid */
3243 /* .tid */
3244 },
3245 };
3246
3247 perf_counter_comm_event(&comm_event);
3248}
3249
3250/*
3251 * mmap tracking
3252 */
3253
3254struct perf_mmap_event {
3255 struct vm_area_struct *vma;
3256
3257 const char *file_name;
3258 int file_size;
3259
3260 struct {
3261 struct perf_event_header header;
3262
3263 u32 pid;
3264 u32 tid;
3265 u64 start;
3266 u64 len;
3267 u64 pgoff;
3268 } event;
3269};
3270
3271static void perf_counter_mmap_output(struct perf_counter *counter,
3272 struct perf_mmap_event *mmap_event)
3273{
3274 struct perf_output_handle handle;
3275 int size = mmap_event->event.header.size;
3276 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3277
3278 if (ret)
3279 return;
3280
3281 mmap_event->event.pid = perf_counter_pid(counter, current);
3282 mmap_event->event.tid = perf_counter_tid(counter, current);
3283
3284 perf_output_put(&handle, mmap_event->event);
3285 perf_output_copy(&handle, mmap_event->file_name,
3286 mmap_event->file_size);
3287 perf_output_end(&handle);
3288}
3289
3290static int perf_counter_mmap_match(struct perf_counter *counter,
3291 struct perf_mmap_event *mmap_event)
3292{
3293 if (counter->attr.mmap)
3294 return 1;
3295
3296 return 0;
3297}
3298
3299static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
3300 struct perf_mmap_event *mmap_event)
3301{
3302 struct perf_counter *counter;
3303
3304 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3305 return;
3306
3307 rcu_read_lock();
3308 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3309 if (perf_counter_mmap_match(counter, mmap_event))
3310 perf_counter_mmap_output(counter, mmap_event);
3311 }
3312 rcu_read_unlock();
3313}
3314
3315static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
3316{
3317 struct perf_cpu_context *cpuctx;
3318 struct perf_counter_context *ctx;
3319 struct vm_area_struct *vma = mmap_event->vma;
3320 struct file *file = vma->vm_file;
3321 unsigned int size;
3322 char tmp[16];
3323 char *buf = NULL;
3324 const char *name;
3325
3326 memset(tmp, 0, sizeof(tmp));
3327
3328 if (file) {
3329 /*
3330 * d_path works from the end of the buffer backwards, so we
3331 * need to add enough zero bytes after the string to handle
3332 * the 64bit alignment we do later.
3333 */
3334 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3335 if (!buf) {
3336 name = strncpy(tmp, "//enomem", sizeof(tmp));
3337 goto got_name;
3338 }
3339 name = d_path(&file->f_path, buf, PATH_MAX);
3340 if (IS_ERR(name)) {
3341 name = strncpy(tmp, "//toolong", sizeof(tmp));
3342 goto got_name;
3343 }
3344 } else {
3345 if (arch_vma_name(mmap_event->vma)) {
3346 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3347 sizeof(tmp));
3348 goto got_name;
3349 }
3350
3351 if (!vma->vm_mm) {
3352 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3353 goto got_name;
3354 }
3355
3356 name = strncpy(tmp, "//anon", sizeof(tmp));
3357 goto got_name;
3358 }
3359
3360got_name:
3361 size = ALIGN(strlen(name)+1, sizeof(u64));
3362
3363 mmap_event->file_name = name;
3364 mmap_event->file_size = size;
3365
3366 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
3367
3368 cpuctx = &get_cpu_var(perf_cpu_context);
3369 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
3370 put_cpu_var(perf_cpu_context);
3371
3372 rcu_read_lock();
3373 /*
3374	 * It doesn't really matter which of the child contexts the
3375	 * event ends up in.
3376 */
3377 ctx = rcu_dereference(current->perf_counter_ctxp);
3378 if (ctx)
3379 perf_counter_mmap_ctx(ctx, mmap_event);
3380 rcu_read_unlock();
3381
3382 kfree(buf);
3383}
3384
3385void __perf_counter_mmap(struct vm_area_struct *vma)
3386{
3387 struct perf_mmap_event mmap_event;
3388
3389 if (!atomic_read(&nr_mmap_counters))
3390 return;
3391
3392 mmap_event = (struct perf_mmap_event){
3393 .vma = vma,
3394 /* .file_name */
3395 /* .file_size */
3396 .event = {
3397 .header = {
3398 .type = PERF_EVENT_MMAP,
3399 .misc = 0,
3400 /* .size */
3401 },
3402 /* .pid */
3403 /* .tid */
3404 .start = vma->vm_start,
3405 .len = vma->vm_end - vma->vm_start,
3406 .pgoff = vma->vm_pgoff,
3407 },
3408 };
3409
3410 perf_counter_mmap_event(&mmap_event);
3411}
3412
3413/*
3414 * IRQ throttle logging
3415 */
3416
3417static void perf_log_throttle(struct perf_counter *counter, int enable)
3418{
3419 struct perf_output_handle handle;
3420 int ret;
3421
3422 struct {
3423 struct perf_event_header header;
3424 u64 time;
3425 u64 id;
3426 u64 stream_id;
3427 } throttle_event = {
3428 .header = {
3429 .type = PERF_EVENT_THROTTLE,
3430 .misc = 0,
3431 .size = sizeof(throttle_event),
3432 },
3433 .time = sched_clock(),
3434 .id = primary_counter_id(counter),
3435 .stream_id = counter->id,
3436 };
3437
3438 if (enable)
3439 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3440
3441 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3442 if (ret)
3443 return;
3444
3445 perf_output_put(&handle, throttle_event);
3446 perf_output_end(&handle);
3447}
3448
3449/*
3450 * Generic counter overflow handling, sampling.
3451 */
3452
3453int perf_counter_overflow(struct perf_counter *counter, int nmi,
3454 struct perf_sample_data *data)
3455{
3456 int events = atomic_read(&counter->event_limit);
3457 int throttle = counter->pmu->unthrottle != NULL;
3458 struct hw_perf_counter *hwc = &counter->hw;
3459 int ret = 0;
3460
3461 if (!throttle) {
3462 hwc->interrupts++;
3463 } else {
3464 if (hwc->interrupts != MAX_INTERRUPTS) {
3465 hwc->interrupts++;
3466 if (HZ * hwc->interrupts >
3467 (u64)sysctl_perf_counter_sample_rate) {
3468 hwc->interrupts = MAX_INTERRUPTS;
3469 perf_log_throttle(counter, 0);
3470 ret = 1;
3471 }
3472 } else {
3473 /*
3474	 * Keep re-disabling the counter even though we disabled it on the
3475	 * previous pass - just in case we raced with a
3476 * sched-in and the counter got enabled again:
3477 */
3478 ret = 1;
3479 }
3480 }
3481
3482 if (counter->attr.freq) {
3483 u64 now = sched_clock();
3484 s64 delta = now - hwc->freq_stamp;
3485
3486 hwc->freq_stamp = now;
3487
3488 if (delta > 0 && delta < TICK_NSEC)
3489 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3490 }
3491
3492 /*
3493 * XXX event_limit might not quite work as expected on inherited
3494 * counters
3495 */
3496
3497 counter->pending_kill = POLL_IN;
3498 if (events && atomic_dec_and_test(&counter->event_limit)) {
3499 ret = 1;
3500 counter->pending_kill = POLL_HUP;
3501 if (nmi) {
3502 counter->pending_disable = 1;
3503 perf_pending_queue(&counter->pending,
3504 perf_pending_counter);
3505 } else
3506 perf_counter_disable(counter);
3507 }
3508
3509 perf_counter_output(counter, nmi, data);
3510 return ret;
3511}
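
/*
 * The throttling test above, HZ * hwc->interrupts > sysctl_perf_counter_sample_rate,
 * is equivalent to "more than sysctl_perf_counter_sample_rate / HZ overflow
 * interrupts in the current interval".  Purely as an illustration: with
 * HZ == 1000 and a sample rate sysctl of 100000, the 101st overflow taken
 * before hwc->interrupts is reset again on the next timer tick marks the
 * counter MAX_INTERRUPTS, logs a throttle event and makes the caller stop
 * the counter until it is unthrottled.
 */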
3512
3513/*
3514 * Generic software counter infrastructure
3515 */
3516
3517/*
3518 * We directly increment counter->count and keep a second value in
3519 * counter->hw.period_left to count intervals. This period counter
3520 * is kept in the range [-sample_period, 0] so that we can use the
3521 * sign as trigger.
3522 */
3523
3524static u64 perf_swcounter_set_period(struct perf_counter *counter)
3525{
3526 struct hw_perf_counter *hwc = &counter->hw;
3527 u64 period = hwc->last_period;
3528 u64 nr, offset;
3529 s64 old, val;
3530
3531 hwc->last_period = hwc->sample_period;
3532
3533again:
3534 old = val = atomic64_read(&hwc->period_left);
3535 if (val < 0)
3536 return 0;
3537
3538 nr = div64_u64(period + val, period);
3539 offset = nr * period;
3540 val -= offset;
3541 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3542 goto again;
3543
3544 return nr;
3545}
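
/*
 * Worked example of the arithmetic above, with illustrative numbers: for a
 * period of 100 and period_left at +20 (the boundary was crossed and 20
 * events were counted past it), nr = (100 + 20) / 100 = 1 overflow is
 * reported and period_left becomes 20 - 1 * 100 = -80, back inside
 * [-sample_period, 0).  Had period_left reached +250, nr would be 3 and
 * period_left would end up at -50, so missed periods show up as multiple
 * overflows from perf_swcounter_overflow().
 */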
3546
3547static void perf_swcounter_overflow(struct perf_counter *counter,
3548 int nmi, struct perf_sample_data *data)
3549{
3550 struct hw_perf_counter *hwc = &counter->hw;
3551 u64 overflow;
3552
3553 data->period = counter->hw.last_period;
3554 overflow = perf_swcounter_set_period(counter);
3555
3556 if (hwc->interrupts == MAX_INTERRUPTS)
3557 return;
3558
3559 for (; overflow; overflow--) {
3560 if (perf_counter_overflow(counter, nmi, data)) {
3561 /*
3562 * We inhibit the overflow from happening when
3563 * hwc->interrupts == MAX_INTERRUPTS.
3564 */
3565 break;
3566 }
3567 }
3568}
3569
3570static void perf_swcounter_unthrottle(struct perf_counter *counter)
3571{
3572 /*
3573	 * Nothing to do; we already reset hwc->interrupts.
3574 */
3575}
3576
3577static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3578 int nmi, struct perf_sample_data *data)
3579{
3580 struct hw_perf_counter *hwc = &counter->hw;
3581
3582 atomic64_add(nr, &counter->count);
3583
3584 if (!hwc->sample_period)
3585 return;
3586
3587 if (!data->regs)
3588 return;
3589
3590 if (!atomic64_add_negative(nr, &hwc->period_left))
3591 perf_swcounter_overflow(counter, nmi, data);
3592}
3593
3594static int perf_swcounter_is_counting(struct perf_counter *counter)
3595{
3596 /*
3597 * The counter is active, we're good!
3598 */
3599 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3600 return 1;
3601
3602 /*
3603 * The counter is off/error, not counting.
3604 */
3605 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3606 return 0;
3607
3608 /*
3609	 * The counter is inactive; if the context is active we're part
3610	 * of a group that didn't make it onto the 'pmu', so we're
3611	 * not counting.
3612 */
3613 if (counter->ctx->is_active)
3614 return 0;
3615
3616 /*
3617	 * We're inactive and the context is too: the task is scheduled
3618	 * out, so we're counting events that happen to us, like
3619	 * migration events.
3620 */
3621 return 1;
3622}
3623
3624static int perf_swcounter_match(struct perf_counter *counter,
3625 enum perf_type_id type,
3626 u32 event, struct pt_regs *regs)
3627{
3628 if (!perf_swcounter_is_counting(counter))
3629 return 0;
3630
3631 if (counter->attr.type != type)
3632 return 0;
3633 if (counter->attr.config != event)
3634 return 0;
3635
3636 if (regs) {
3637 if (counter->attr.exclude_user && user_mode(regs))
3638 return 0;
3639
3640 if (counter->attr.exclude_kernel && !user_mode(regs))
3641 return 0;
3642 }
3643
3644 return 1;
3645}
3646
3647static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3648 enum perf_type_id type,
3649 u32 event, u64 nr, int nmi,
3650 struct perf_sample_data *data)
3651{
3652 struct perf_counter *counter;
3653
3654 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3655 return;
3656
3657 rcu_read_lock();
3658 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3659 if (perf_swcounter_match(counter, type, event, data->regs))
3660 perf_swcounter_add(counter, nr, nmi, data);
3661 }
3662 rcu_read_unlock();
3663}
3664
3665static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3666{
3667 if (in_nmi())
3668 return &cpuctx->recursion[3];
3669
3670 if (in_irq())
3671 return &cpuctx->recursion[2];
3672
3673 if (in_softirq())
3674 return &cpuctx->recursion[1];
3675
3676 return &cpuctx->recursion[0];
3677}
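
/*
 * The four recursion slots map onto the four contexts a software counter
 * event can fire from -- task, softirq, hardirq and NMI.  An event raised
 * while one is already being processed at the same level is dropped by
 * do_perf_swcounter_event() below, while an NMI interrupting an
 * IRQ-context event still gets counted because it uses a different slot.
 */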
3678
3679static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3680 u64 nr, int nmi,
3681 struct perf_sample_data *data)
3682{
3683 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3684 int *recursion = perf_swcounter_recursion_context(cpuctx);
3685 struct perf_counter_context *ctx;
3686
3687 if (*recursion)
3688 goto out;
3689
3690 (*recursion)++;
3691 barrier();
3692
3693 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3694 nr, nmi, data);
3695 rcu_read_lock();
3696 /*
3697	 * It doesn't really matter which of the child contexts the
3698	 * event ends up in.
3699 */
3700 ctx = rcu_dereference(current->perf_counter_ctxp);
3701 if (ctx)
3702 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3703 rcu_read_unlock();
3704
3705 barrier();
3706 (*recursion)--;
3707
3708out:
3709 put_cpu_var(perf_cpu_context);
3710}
3711
3712void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3713 struct pt_regs *regs, u64 addr)
3714{
3715 struct perf_sample_data data = {
3716 .regs = regs,
3717 .addr = addr,
3718 };
3719
3720 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3721}
3722
3723static void perf_swcounter_read(struct perf_counter *counter)
3724{
3725}
3726
3727static int perf_swcounter_enable(struct perf_counter *counter)
3728{
3729 struct hw_perf_counter *hwc = &counter->hw;
3730
3731 if (hwc->sample_period) {
3732 hwc->last_period = hwc->sample_period;
3733 perf_swcounter_set_period(counter);
3734 }
3735 return 0;
3736}
3737
3738static void perf_swcounter_disable(struct perf_counter *counter)
3739{
3740}
3741
3742static const struct pmu perf_ops_generic = {
3743 .enable = perf_swcounter_enable,
3744 .disable = perf_swcounter_disable,
3745 .read = perf_swcounter_read,
3746 .unthrottle = perf_swcounter_unthrottle,
3747};
3748
3749/*
3750 * hrtimer based swcounter callback
3751 */
3752
3753static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3754{
3755 enum hrtimer_restart ret = HRTIMER_RESTART;
3756 struct perf_sample_data data;
3757 struct perf_counter *counter;
3758 u64 period;
3759
3760 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3761 counter->pmu->read(counter);
3762
3763 data.addr = 0;
3764 data.regs = get_irq_regs();
3765 /*
3766 * In case we exclude kernel IPs or are somehow not in interrupt
3767 * context, provide the next best thing, the user IP.
3768 */
3769 if ((counter->attr.exclude_kernel || !data.regs) &&
3770 !counter->attr.exclude_user)
3771 data.regs = task_pt_regs(current);
3772
3773 if (data.regs) {
3774 if (perf_counter_overflow(counter, 0, &data))
3775 ret = HRTIMER_NORESTART;
3776 }
3777
3778 period = max_t(u64, 10000, counter->hw.sample_period);
3779 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3780
3781 return ret;
3782}
3783
3784/*
3785 * Software counter: cpu wall time clock
3786 */
3787
3788static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3789{
3790 int cpu = raw_smp_processor_id();
3791 s64 prev;
3792 u64 now;
3793
3794 now = cpu_clock(cpu);
3795 prev = atomic64_read(&counter->hw.prev_count);
3796 atomic64_set(&counter->hw.prev_count, now);
3797 atomic64_add(now - prev, &counter->count);
3798}
3799
3800static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3801{
3802 struct hw_perf_counter *hwc = &counter->hw;
3803 int cpu = raw_smp_processor_id();
3804
3805 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3806 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3807 hwc->hrtimer.function = perf_swcounter_hrtimer;
3808 if (hwc->sample_period) {
3809 u64 period = max_t(u64, 10000, hwc->sample_period);
3810 __hrtimer_start_range_ns(&hwc->hrtimer,
3811 ns_to_ktime(period), 0,
3812 HRTIMER_MODE_REL, 0);
3813 }
3814
3815 return 0;
3816}
3817
3818static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3819{
3820 if (counter->hw.sample_period)
3821 hrtimer_cancel(&counter->hw.hrtimer);
3822 cpu_clock_perf_counter_update(counter);
3823}
3824
3825static void cpu_clock_perf_counter_read(struct perf_counter *counter)
3826{
3827 cpu_clock_perf_counter_update(counter);
3828}
3829
3830static const struct pmu perf_ops_cpu_clock = {
3831 .enable = cpu_clock_perf_counter_enable,
3832 .disable = cpu_clock_perf_counter_disable,
3833 .read = cpu_clock_perf_counter_read,
3834};
3835
3836/*
3837 * Software counter: task time clock
3838 */
3839
3840static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3841{
3842 u64 prev;
3843 s64 delta;
3844
3845 prev = atomic64_xchg(&counter->hw.prev_count, now);
3846 delta = now - prev;
3847 atomic64_add(delta, &counter->count);
3848}
3849
3850static int task_clock_perf_counter_enable(struct perf_counter *counter)
3851{
3852 struct hw_perf_counter *hwc = &counter->hw;
3853 u64 now;
3854
3855 now = counter->ctx->time;
3856
3857 atomic64_set(&hwc->prev_count, now);
3858 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3859 hwc->hrtimer.function = perf_swcounter_hrtimer;
3860 if (hwc->sample_period) {
3861 u64 period = max_t(u64, 10000, hwc->sample_period);
3862 __hrtimer_start_range_ns(&hwc->hrtimer,
3863 ns_to_ktime(period), 0,
3864 HRTIMER_MODE_REL, 0);
3865 }
3866
3867 return 0;
3868}
3869
3870static void task_clock_perf_counter_disable(struct perf_counter *counter)
3871{
3872 if (counter->hw.sample_period)
3873 hrtimer_cancel(&counter->hw.hrtimer);
3874 task_clock_perf_counter_update(counter, counter->ctx->time);
3875
3876}
3877
3878static void task_clock_perf_counter_read(struct perf_counter *counter)
3879{
3880 u64 time;
3881
3882 if (!in_nmi()) {
3883 update_context_time(counter->ctx);
3884 time = counter->ctx->time;
3885 } else {
3886 u64 now = perf_clock();
3887 u64 delta = now - counter->ctx->timestamp;
3888 time = counter->ctx->time + delta;
3889 }
3890
3891 task_clock_perf_counter_update(counter, time);
3892}
3893
3894static const struct pmu perf_ops_task_clock = {
3895 .enable = task_clock_perf_counter_enable,
3896 .disable = task_clock_perf_counter_disable,
3897 .read = task_clock_perf_counter_read,
3898};
3899
3900#ifdef CONFIG_EVENT_PROFILE
3901void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3902 int entry_size)
3903{
3904 struct perf_raw_record raw = {
3905 .size = entry_size,
3906 .data = record,
3907 };
3908
3909 struct perf_sample_data data = {
3910 .regs = get_irq_regs(),
3911 .addr = addr,
3912 .raw = &raw,
3913 };
3914
3915 if (!data.regs)
3916 data.regs = task_pt_regs(current);
3917
3918 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3919}
3920EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3921
3922extern int ftrace_profile_enable(int);
3923extern void ftrace_profile_disable(int);
3924
3925static void tp_perf_counter_destroy(struct perf_counter *counter)
3926{
3927 ftrace_profile_disable(counter->attr.config);
3928}
3929
3930static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3931{
3932 /*
3933	 * Raw tracepoint data is a severe data leak; only allow root to
3934	 * have it.
3935 */
3936 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3937 !capable(CAP_SYS_ADMIN))
3938 return ERR_PTR(-EPERM);
3939
3940 if (ftrace_profile_enable(counter->attr.config))
3941 return NULL;
3942
3943 counter->destroy = tp_perf_counter_destroy;
3944
3945 return &perf_ops_generic;
3946}
3947#else
3948static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3949{
3950 return NULL;
3951}
3952#endif
3953
3954atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3955
3956static void sw_perf_counter_destroy(struct perf_counter *counter)
3957{
3958 u64 event = counter->attr.config;
3959
3960 WARN_ON(counter->parent);
3961
3962 atomic_dec(&perf_swcounter_enabled[event]);
3963}
3964
3965static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3966{
3967 const struct pmu *pmu = NULL;
3968 u64 event = counter->attr.config;
3969
3970 /*
3971 * Software counters (currently) can't in general distinguish
3972 * between user, kernel and hypervisor events.
3973 * However, context switches and cpu migrations are considered
3974 * to be kernel events, and page faults are never hypervisor
3975 * events.
3976 */
3977 switch (event) {
3978 case PERF_COUNT_SW_CPU_CLOCK:
3979 pmu = &perf_ops_cpu_clock;
3980
3981 break;
3982 case PERF_COUNT_SW_TASK_CLOCK:
3983 /*
3984 * If the user instantiates this as a per-cpu counter,
3985 * use the cpu_clock counter instead.
3986 */
3987 if (counter->ctx->task)
3988 pmu = &perf_ops_task_clock;
3989 else
3990 pmu = &perf_ops_cpu_clock;
3991
3992 break;
3993 case PERF_COUNT_SW_PAGE_FAULTS:
3994 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
3995 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3996 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3997 case PERF_COUNT_SW_CPU_MIGRATIONS:
3998 if (!counter->parent) {
3999 atomic_inc(&perf_swcounter_enabled[event]);
4000 counter->destroy = sw_perf_counter_destroy;
4001 }
4002 pmu = &perf_ops_generic;
4003 break;
4004 }
4005
4006 return pmu;
4007}
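
/*
 * Hypothetical sketch (not from the original source): the generic software
 * pmu selected above is driven by perf_swcounter_event() calls placed at the
 * event sites themselves.  An architecture's page-fault handler, for example,
 * might report a fault roughly like this; the function name and its placement
 * are assumptions made for illustration only.
 */
#include <linux/perf_counter.h>
#include <linux/ptrace.h>

static void example_report_page_fault(struct pt_regs *regs,
				      unsigned long address)
{
	/*
	 * One PERF_COUNT_SW_PAGE_FAULTS event, not in NMI context,
	 * with the faulting address passed as the event's addr field.
	 */
	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
}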
4008
4009/*
4010 * Allocate and initialize a counter structure
4011 */
4012static struct perf_counter *
4013perf_counter_alloc(struct perf_counter_attr *attr,
4014 int cpu,
4015 struct perf_counter_context *ctx,
4016 struct perf_counter *group_leader,
4017 struct perf_counter *parent_counter,
4018 gfp_t gfpflags)
4019{
4020 const struct pmu *pmu;
4021 struct perf_counter *counter;
4022 struct hw_perf_counter *hwc;
4023 long err;
4024
4025 counter = kzalloc(sizeof(*counter), gfpflags);
4026 if (!counter)
4027 return ERR_PTR(-ENOMEM);
4028
4029 /*
4030 * Single counters are their own group leaders, with an
4031 * empty sibling list:
4032 */
4033 if (!group_leader)
4034 group_leader = counter;
4035
4036 mutex_init(&counter->child_mutex);
4037 INIT_LIST_HEAD(&counter->child_list);
4038
4039 INIT_LIST_HEAD(&counter->list_entry);
4040 INIT_LIST_HEAD(&counter->event_entry);
4041 INIT_LIST_HEAD(&counter->sibling_list);
4042 init_waitqueue_head(&counter->waitq);
4043
4044 mutex_init(&counter->mmap_mutex);
4045
4046 counter->cpu = cpu;
4047 counter->attr = *attr;
4048 counter->group_leader = group_leader;
4049 counter->pmu = NULL;
4050 counter->ctx = ctx;
4051 counter->oncpu = -1;
4052
4053 counter->parent = parent_counter;
4054
4055 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
4056 counter->id = atomic64_inc_return(&perf_counter_id);
4057
4058 counter->state = PERF_COUNTER_STATE_INACTIVE;
4059
4060 if (attr->disabled)
4061 counter->state = PERF_COUNTER_STATE_OFF;
4062
4063 pmu = NULL;
4064
4065 hwc = &counter->hw;
4066 hwc->sample_period = attr->sample_period;
4067 if (attr->freq && attr->sample_freq)
4068 hwc->sample_period = 1;
4069
4070 atomic64_set(&hwc->period_left, hwc->sample_period);
4071
4072 /*
4073 * we currently do not support PERF_FORMAT_GROUP on inherited counters
4074 */
4075 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4076 goto done;
4077
4078 switch (attr->type) {
4079 case PERF_TYPE_RAW:
4080 case PERF_TYPE_HARDWARE:
4081 case PERF_TYPE_HW_CACHE:
4082 pmu = hw_perf_counter_init(counter);
4083 break;
4084
4085 case PERF_TYPE_SOFTWARE:
4086 pmu = sw_perf_counter_init(counter);
4087 break;
4088
4089 case PERF_TYPE_TRACEPOINT:
4090 pmu = tp_perf_counter_init(counter);
4091 break;
4092
4093 default:
4094 break;
4095 }
4096done:
4097 err = 0;
4098 if (!pmu)
4099 err = -EINVAL;
4100 else if (IS_ERR(pmu))
4101 err = PTR_ERR(pmu);
4102
4103 if (err) {
4104 if (counter->ns)
4105 put_pid_ns(counter->ns);
4106 kfree(counter);
4107 return ERR_PTR(err);
4108 }
4109
4110 counter->pmu = pmu;
4111
4112 if (!counter->parent) {
4113 atomic_inc(&nr_counters);
4114 if (counter->attr.mmap)
4115 atomic_inc(&nr_mmap_counters);
4116 if (counter->attr.comm)
4117 atomic_inc(&nr_comm_counters);
4118 if (counter->attr.task)
4119 atomic_inc(&nr_task_counters);
4120 }
4121
4122 return counter;
4123}
4124
4125static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4126 struct perf_counter_attr *attr)
4127{
4128 int ret;
4129 u32 size;
4130
4131 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4132 return -EFAULT;
4133
4134 /*
4135 * zero the full structure, so that a short copy from user space leaves the rest zeroed.
4136 */
4137 memset(attr, 0, sizeof(*attr));
4138
4139 ret = get_user(size, &uattr->size);
4140 if (ret)
4141 return ret;
4142
4143 if (size > PAGE_SIZE) /* silly large */
4144 goto err_size;
4145
4146 if (!size) /* abi compat */
4147 size = PERF_ATTR_SIZE_VER0;
4148
4149 if (size < PERF_ATTR_SIZE_VER0)
4150 goto err_size;
4151
4152 /*
4153 * If we're handed a bigger struct than we know of,
4154 * ensure all the unknown bits are 0.
4155 */
4156 if (size > sizeof(*attr)) {
4157 unsigned long val;
4158 unsigned long __user *addr;
4159 unsigned long __user *end;
4160
4161 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
4162 sizeof(unsigned long));
4163 end = PTR_ALIGN((void __user *)uattr + size,
4164 sizeof(unsigned long));
4165
4166 for (; addr < end; addr += sizeof(unsigned long)) {
4167 ret = get_user(val, addr);
4168 if (ret)
4169 return ret;
4170 if (val)
4171 goto err_size;
4172 }
4173 }
4174
4175 ret = copy_from_user(attr, uattr, size);
4176 if (ret)
4177 return -EFAULT;
4178
4179 /*
4180 * If the type exists, the corresponding creation will verify
4181 * the attr->config.
4182 */
4183 if (attr->type >= PERF_TYPE_MAX)
4184 return -EINVAL;
4185
4186 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4187 return -EINVAL;
4188
4189 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4190 return -EINVAL;
4191
4192 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4193 return -EINVAL;
4194
4195out:
4196 return ret;
4197
4198err_size:
4199 put_user(sizeof(*attr), &uattr->size);
4200 ret = -E2BIG;
4201 goto out;
4202}
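
/*
 * Hypothetical user-space sketch (not from the original source) of the size
 * handshake implemented above: the caller zeroes the structure, sets .size to
 * the layout it was built against, and may rely on the kernel accepting
 * smaller (older) or larger, zero-padded (newer) layouts; on -E2BIG the
 * kernel writes the size it supports back into uattr->size.  The helper name
 * below is an assumption for the sketch.
 */
#include <string.h>
#include <linux/perf_counter.h>

static void example_init_attr(struct perf_counter_attr *attr)
{
	memset(attr, 0, sizeof(*attr));	/* newer/unknown fields stay zero */
	attr->size = sizeof(*attr);	/* advertise our view of the layout */
	attr->type = PERF_TYPE_SOFTWARE;
	attr->config = PERF_COUNT_SW_CONTEXT_SWITCHES;
	attr->disabled = 1;		/* create disabled; enable explicitly later */
}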
4203
4204/**
4205 * sys_perf_counter_open - open a performance counter, associate it with a task/cpu
4206 *
4207 * @attr_uptr: event type attributes for monitoring/sampling
4208 * @pid: target pid
4209 * @cpu: target cpu
4210 * @group_fd: group leader counter fd
4211 */
4212SYSCALL_DEFINE5(perf_counter_open,
4213 struct perf_counter_attr __user *, attr_uptr,
4214 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4215{
4216 struct perf_counter *counter, *group_leader;
4217 struct perf_counter_attr attr;
4218 struct perf_counter_context *ctx;
4219 struct file *counter_file = NULL;
4220 struct file *group_file = NULL;
4221 int fput_needed = 0;
4222 int fput_needed2 = 0;
4223 int ret;
4224
4225 /* for future expandability... */
4226 if (flags)
4227 return -EINVAL;
4228
4229 ret = perf_copy_attr(attr_uptr, &attr);
4230 if (ret)
4231 return ret;
4232
4233 if (!attr.exclude_kernel) {
4234 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4235 return -EACCES;
4236 }
4237
4238 if (attr.freq) {
4239 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
4240 return -EINVAL;
4241 }
4242
4243 /*
4244 * Get the target context (task or percpu):
4245 */
4246 ctx = find_get_context(pid, cpu);
4247 if (IS_ERR(ctx))
4248 return PTR_ERR(ctx);
4249
4250 /*
4251 * Look up the group leader (we will attach this counter to it):
4252 */
4253 group_leader = NULL;
4254 if (group_fd != -1) {
4255 ret = -EINVAL;
4256 group_file = fget_light(group_fd, &fput_needed);
4257 if (!group_file)
4258 goto err_put_context;
4259 if (group_file->f_op != &perf_fops)
4260 goto err_put_context;
4261
4262 group_leader = group_file->private_data;
4263 /*
4264 * Do not allow a recursive hierarchy (the group leader we attach to
4265 * must itself be a leader, not a sibling in another group):
4266 */
4267 if (group_leader->group_leader != group_leader)
4268 goto err_put_context;
4269 /*
4270 * Do not allow attaching to a group in a different
4271 * task or CPU context:
4272 */
4273 if (group_leader->ctx != ctx)
4274 goto err_put_context;
4275 /*
4276 * Only a group leader can be exclusive or pinned
4277 */
4278 if (attr.exclusive || attr.pinned)
4279 goto err_put_context;
4280 }
4281
4282 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4283 NULL, GFP_KERNEL);
4284 ret = PTR_ERR(counter);
4285 if (IS_ERR(counter))
4286 goto err_put_context;
4287
4288 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4289 if (ret < 0)
4290 goto err_free_put_context;
4291
4292 counter_file = fget_light(ret, &fput_needed2);
4293 if (!counter_file)
4294 goto err_free_put_context;
4295
4296 counter->filp = counter_file;
4297 WARN_ON_ONCE(ctx->parent_ctx);
4298 mutex_lock(&ctx->mutex);
4299 perf_install_in_context(ctx, counter, cpu);
4300 ++ctx->generation;
4301 mutex_unlock(&ctx->mutex);
4302
4303 counter->owner = current;
4304 get_task_struct(current);
4305 mutex_lock(&current->perf_counter_mutex);
4306 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4307 mutex_unlock(&current->perf_counter_mutex);
4308
4309 fput_light(counter_file, fput_needed2);
4310
4311out_fput:
4312 fput_light(group_file, fput_needed);
4313
4314 return ret;
4315
4316err_free_put_context:
4317 kfree(counter);
4318
4319err_put_context:
4320 put_ctx(ctx);
4321
4322 goto out_fput;
4323}
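
/*
 * Hypothetical user-space sketch (not from the original source) of the
 * syscall above: open a task-clock software counter on the current task, run
 * some work, then read back the 64-bit count (nanoseconds for this counter).
 * __NR_perf_counter_open is assumed to be provided by the kernel headers in
 * use; no specific syscall number is hard-coded here.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

int main(void)
{
	struct perf_counter_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	/* pid == 0: current task, cpu == -1: any cpu, no group leader, flags must be 0 */
	fd = syscall(__NR_perf_counter_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	/* ... the workload to be measured runs here ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("task clock: %llu ns\n", (unsigned long long)count);

	close(fd);
	return 0;
}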
4324
4325/*
4326 * inherit a counter from parent task to child task:
4327 */
4328static struct perf_counter *
4329inherit_counter(struct perf_counter *parent_counter,
4330 struct task_struct *parent,
4331 struct perf_counter_context *parent_ctx,
4332 struct task_struct *child,
4333 struct perf_counter *group_leader,
4334 struct perf_counter_context *child_ctx)
4335{
4336 struct perf_counter *child_counter;
4337
4338 /*
4339 * Instead of creating recursive hierarchies of counters,
4340 * we link inherited counters back to the original parent,
4341 * which is guaranteed to have a filp that we use as the
4342 * reference count:
4343 */
4344 if (parent_counter->parent)
4345 parent_counter = parent_counter->parent;
4346
4347 child_counter = perf_counter_alloc(&parent_counter->attr,
4348 parent_counter->cpu, child_ctx,
4349 group_leader, parent_counter,
4350 GFP_KERNEL);
4351 if (IS_ERR(child_counter))
4352 return child_counter;
4353 get_ctx(child_ctx);
4354
4355 /*
4356 * Make the child state follow the state of the parent counter,
4357 * not its attr.disabled bit. We hold the parent's mutex,
4358 * so we won't race with perf_counter_{en, dis}able_family.
4359 */
4360 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
4361 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
4362 else
4363 child_counter->state = PERF_COUNTER_STATE_OFF;
4364
4365 if (parent_counter->attr.freq)
4366 child_counter->hw.sample_period = parent_counter->hw.sample_period;
4367
4368 /*
4369 * Link it up in the child's context:
4370 */
4371 add_counter_to_ctx(child_counter, child_ctx);
4372
4373 /*
4374 * Get a reference to the parent filp - we will fput it
4375 * when the child counter exits. This is safe to do because
4376 * we are in the parent and we know that the filp still
4377 * exists and has a nonzero count:
4378 */
4379 atomic_long_inc(&parent_counter->filp->f_count);
4380
4381 /*
4382 * Link this into the parent counter's child list
4383 */
4384 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4385 mutex_lock(&parent_counter->child_mutex);
4386 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
4387 mutex_unlock(&parent_counter->child_mutex);
4388
4389 return child_counter;
4390}
4391
4392static int inherit_group(struct perf_counter *parent_counter,
4393 struct task_struct *parent,
4394 struct perf_counter_context *parent_ctx,
4395 struct task_struct *child,
4396 struct perf_counter_context *child_ctx)
4397{
4398 struct perf_counter *leader;
4399 struct perf_counter *sub;
4400 struct perf_counter *child_ctr;
4401
4402 leader = inherit_counter(parent_counter, parent, parent_ctx,
4403 child, NULL, child_ctx);
4404 if (IS_ERR(leader))
4405 return PTR_ERR(leader);
4406 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
4407 child_ctr = inherit_counter(sub, parent, parent_ctx,
4408 child, leader, child_ctx);
4409 if (IS_ERR(child_ctr))
4410 return PTR_ERR(child_ctr);
4411 }
4412 return 0;
4413}
4414
4415static void sync_child_counter(struct perf_counter *child_counter,
4416 struct task_struct *child)
4417{
4418 struct perf_counter *parent_counter = child_counter->parent;
4419 u64 child_val;
4420
4421 if (child_counter->attr.inherit_stat)
4422 perf_counter_read_event(child_counter, child);
4423
4424 child_val = atomic64_read(&child_counter->count);
4425
4426 /*
4427 * Add back the child's count to the parent's count:
4428 */
4429 atomic64_add(child_val, &parent_counter->count);
4430 atomic64_add(child_counter->total_time_enabled,
4431 &parent_counter->child_total_time_enabled);
4432 atomic64_add(child_counter->total_time_running,
4433 &parent_counter->child_total_time_running);
4434
4435 /*
4436 * Remove this counter from the parent's list
4437 */
4438 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4439 mutex_lock(&parent_counter->child_mutex);
4440 list_del_init(&child_counter->child_list);
4441 mutex_unlock(&parent_counter->child_mutex);
4442
4443 /*
4444 * Release the parent counter, if this was the last
4445 * reference to it.
4446 */
4447 fput(parent_counter->filp);
4448}
4449
4450static void
4451__perf_counter_exit_task(struct perf_counter *child_counter,
4452 struct perf_counter_context *child_ctx,
4453 struct task_struct *child)
4454{
4455 struct perf_counter *parent_counter;
4456
4457 update_counter_times(child_counter);
4458 perf_counter_remove_from_context(child_counter);
4459
4460 parent_counter = child_counter->parent;
4461 /*
4462 * It can happen that parent exits first, and has counters
4463 * that are still around due to the child reference. These
4464 * counters need to be zapped - but otherwise linger.
4465 */
4466 if (parent_counter) {
4467 sync_child_counter(child_counter, child);
4468 free_counter(child_counter);
4469 }
4470}
4471
4472/*
4473 * When a child task exits, feed back counter values to parent counters.
4474 */
4475void perf_counter_exit_task(struct task_struct *child)
4476{
4477 struct perf_counter *child_counter, *tmp;
4478 struct perf_counter_context *child_ctx;
4479 unsigned long flags;
4480
4481 if (likely(!child->perf_counter_ctxp)) {
4482 perf_counter_task(child, NULL, 0);
4483 return;
4484 }
4485
4486 local_irq_save(flags);
4487 /*
4488 * We can't reschedule here because interrupts are disabled,
4489 * and either child is current or it is a task that can't be
4490 * scheduled, so we are now safe from rescheduling changing
4491 * our context.
4492 */
4493 child_ctx = child->perf_counter_ctxp;
4494 __perf_counter_task_sched_out(child_ctx);
4495
4496 /*
4497 * Take the context lock here so that if find_get_context is
4498 * reading child->perf_counter_ctxp, we wait until it has
4499 * incremented the context's refcount before we do put_ctx below.
4500 */
4501 spin_lock(&child_ctx->lock);
4502 child->perf_counter_ctxp = NULL;
4503 /*
4504 * If this context is a clone; unclone it so it can't get
4505 * swapped to another process while we're removing all
4506 * the counters from it.
4507 */
4508 unclone_ctx(child_ctx);
4509 spin_unlock_irqrestore(&child_ctx->lock, flags);
4510
4511 /*
4512 * Report the task dead after unscheduling the counters so that we
4513 * won't get any samples after PERF_EVENT_EXIT. We can however still
4514 * get a few PERF_EVENT_READ events.
4515 */
4516 perf_counter_task(child, child_ctx, 0);
4517
4518 /*
4519 * We can recurse on the same lock type through:
4520 *
4521 * __perf_counter_exit_task()
4522 * sync_child_counter()
4523 * fput(parent_counter->filp)
4524 * perf_release()
4525 * mutex_lock(&ctx->mutex)
4526 *
4527 * But since it's the parent context, it won't be the same instance.
4528 */
4529 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4530
4531again:
4532 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4533 list_entry)
4534 __perf_counter_exit_task(child_counter, child_ctx, child);
4535
4536 /*
4537 * If the last counter was a group counter, it will have appended all
4538 * its siblings to the list, but we obtained 'tmp' before that which
4539 * will still point to the list head terminating the iteration.
4540 */
4541 if (!list_empty(&child_ctx->counter_list))
4542 goto again;
4543
4544 mutex_unlock(&child_ctx->mutex);
4545
4546 put_ctx(child_ctx);
4547}
4548
4549/*
4550 * Free an unexposed, unused context, as created by inheritance in
4551 * perf_counter_init_task() below; used by fork() in case of failure.
4552 */
4553void perf_counter_free_task(struct task_struct *task)
4554{
4555 struct perf_counter_context *ctx = task->perf_counter_ctxp;
4556 struct perf_counter *counter, *tmp;
4557
4558 if (!ctx)
4559 return;
4560
4561 mutex_lock(&ctx->mutex);
4562again:
4563 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4564 struct perf_counter *parent = counter->parent;
4565
4566 if (WARN_ON_ONCE(!parent))
4567 continue;
4568
4569 mutex_lock(&parent->child_mutex);
4570 list_del_init(&counter->child_list);
4571 mutex_unlock(&parent->child_mutex);
4572
4573 fput(parent->filp);
4574
4575 list_del_counter(counter, ctx);
4576 free_counter(counter);
4577 }
4578
4579 if (!list_empty(&ctx->counter_list))
4580 goto again;
4581
4582 mutex_unlock(&ctx->mutex);
4583
4584 put_ctx(ctx);
4585}
4586
4587/*
4588 * Initialize the perf_counter context in task_struct
4589 */
4590int perf_counter_init_task(struct task_struct *child)
4591{
4592 struct perf_counter_context *child_ctx, *parent_ctx;
4593 struct perf_counter_context *cloned_ctx;
4594 struct perf_counter *counter;
4595 struct task_struct *parent = current;
4596 int inherited_all = 1;
4597 int ret = 0;
4598
4599 child->perf_counter_ctxp = NULL;
4600
4601 mutex_init(&child->perf_counter_mutex);
4602 INIT_LIST_HEAD(&child->perf_counter_list);
4603
4604 if (likely(!parent->perf_counter_ctxp))
4605 return 0;
4606
4607 /*
4608 * This is executed from the parent task context, so inherit
4609 * counters that have been marked for cloning.
4610 * First allocate and initialize a context for the child.
4611 */
4612
4613 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4614 if (!child_ctx)
4615 return -ENOMEM;
4616
4617 __perf_counter_init_context(child_ctx, child);
4618 child->perf_counter_ctxp = child_ctx;
4619 get_task_struct(child);
4620
4621 /*
4622 * If the parent's context is a clone, pin it so it won't get
4623 * swapped under us.
4624 */
4625 parent_ctx = perf_pin_task_context(parent);
4626
4627 /*
4628 * No need to check if parent_ctx != NULL here; since we saw
4629 * it non-NULL earlier, the only reason for it to become NULL
4630 * is if we exit, and since we're currently in the middle of
4631 * a fork we can't be exiting at the same time.
4632 */
4633
4634 /*
4635 * Lock the parent list. No need to lock the child - not PID
4636 * hashed yet and not running, so nobody can access it.
4637 */
4638 mutex_lock(&parent_ctx->mutex);
4639
4640 /*
4641 * We don't have to disable NMIs - we are only looking at
4642 * the list, not manipulating it:
4643 */
4644 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4645 if (counter != counter->group_leader)
4646 continue;
4647
4648 if (!counter->attr.inherit) {
4649 inherited_all = 0;
4650 continue;
4651 }
4652
4653 ret = inherit_group(counter, parent, parent_ctx,
4654 child, child_ctx);
4655 if (ret) {
4656 inherited_all = 0;
4657 break;
4658 }
4659 }
4660
4661 if (inherited_all) {
4662 /*
4663 * Mark the child context as a clone of the parent
4664 * context, or of whatever the parent is a clone of.
4665 * Note that if the parent is a clone, it could get
4666 * uncloned at any point, but that doesn't matter
4667 * because the list of counters and the generation
4668 * count can't have changed since we took the mutex.
4669 */
4670 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4671 if (cloned_ctx) {
4672 child_ctx->parent_ctx = cloned_ctx;
4673 child_ctx->parent_gen = parent_ctx->parent_gen;
4674 } else {
4675 child_ctx->parent_ctx = parent_ctx;
4676 child_ctx->parent_gen = parent_ctx->generation;
4677 }
4678 get_ctx(child_ctx->parent_ctx);
4679 }
4680
4681 mutex_unlock(&parent_ctx->mutex);
4682
4683 perf_unpin_context(parent_ctx);
4684
4685 return ret;
4686}
4687
4688static void __cpuinit perf_counter_init_cpu(int cpu)
4689{
4690 struct perf_cpu_context *cpuctx;
4691
4692 cpuctx = &per_cpu(perf_cpu_context, cpu);
4693 __perf_counter_init_context(&cpuctx->ctx, NULL);
4694
4695 spin_lock(&perf_resource_lock);
4696 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4697 spin_unlock(&perf_resource_lock);
4698
4699 hw_perf_counter_setup(cpu);
4700}
4701
4702#ifdef CONFIG_HOTPLUG_CPU
4703static void __perf_counter_exit_cpu(void *info)
4704{
4705 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4706 struct perf_counter_context *ctx = &cpuctx->ctx;
4707 struct perf_counter *counter, *tmp;
4708
4709 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4710 __perf_counter_remove_from_context(counter);
4711}
4712static void perf_counter_exit_cpu(int cpu)
4713{
4714 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4715 struct perf_counter_context *ctx = &cpuctx->ctx;
4716
4717 mutex_lock(&ctx->mutex);
4718 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4719 mutex_unlock(&ctx->mutex);
4720}
4721#else
4722static inline void perf_counter_exit_cpu(int cpu) { }
4723#endif
4724
4725static int __cpuinit
4726perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4727{
4728 unsigned int cpu = (long)hcpu;
4729
4730 switch (action) {
4731
4732 case CPU_UP_PREPARE:
4733 case CPU_UP_PREPARE_FROZEN:
4734 perf_counter_init_cpu(cpu);
4735 break;
4736
4737 case CPU_ONLINE:
4738 case CPU_ONLINE_FROZEN:
4739 hw_perf_counter_setup_online(cpu);
4740 break;
4741
4742 case CPU_DOWN_PREPARE:
4743 case CPU_DOWN_PREPARE_FROZEN:
4744 perf_counter_exit_cpu(cpu);
4745 break;
4746
4747 default:
4748 break;
4749 }
4750
4751 return NOTIFY_OK;
4752}
4753
4754/*
4755 * This has to have a higher priority than migration_notifier in sched.c.
4756 */
4757static struct notifier_block __cpuinitdata perf_cpu_nb = {
4758 .notifier_call = perf_cpu_notify,
4759 .priority = 20,
4760};
4761
4762void __init perf_counter_init(void)
4763{
4764 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4765 (void *)(long)smp_processor_id());
4766 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4767 (void *)(long)smp_processor_id());
4768 register_cpu_notifier(&perf_cpu_nb);
4769}
4770
4771static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4772{
4773 return sprintf(buf, "%d\n", perf_reserved_percpu);
4774}
4775
4776static ssize_t
4777perf_set_reserve_percpu(struct sysdev_class *class,
4778 const char *buf,
4779 size_t count)
4780{
4781 struct perf_cpu_context *cpuctx;
4782 unsigned long val;
4783 int err, cpu, mpt;
4784
4785 err = strict_strtoul(buf, 10, &val);
4786 if (err)
4787 return err;
4788 if (val > perf_max_counters)
4789 return -EINVAL;
4790
4791 spin_lock(&perf_resource_lock);
4792 perf_reserved_percpu = val;
4793 for_each_online_cpu(cpu) {
4794 cpuctx = &per_cpu(perf_cpu_context, cpu);
4795 spin_lock_irq(&cpuctx->ctx.lock);
4796 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4797 perf_max_counters - perf_reserved_percpu);
4798 cpuctx->max_pertask = mpt;
4799 spin_unlock_irq(&cpuctx->ctx.lock);
4800 }
4801 spin_unlock(&perf_resource_lock);
4802
4803 return count;
4804}
4805
4806static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4807{
4808 return sprintf(buf, "%d\n", perf_overcommit);
4809}
4810
4811static ssize_t
4812perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4813{
4814 unsigned long val;
4815 int err;
4816
4817 err = strict_strtoul(buf, 10, &val);
4818 if (err)
4819 return err;
4820 if (val > 1)
4821 return -EINVAL;
4822
4823 spin_lock(&perf_resource_lock);
4824 perf_overcommit = val;
4825 spin_unlock(&perf_resource_lock);
4826
4827 return count;
4828}
4829
4830static SYSDEV_CLASS_ATTR(
4831 reserve_percpu,
4832 0644,
4833 perf_show_reserve_percpu,
4834 perf_set_reserve_percpu
4835 );
4836
4837static SYSDEV_CLASS_ATTR(
4838 overcommit,
4839 0644,
4840 perf_show_overcommit,
4841 perf_set_overcommit
4842 );
4843
4844static struct attribute *perfclass_attrs[] = {
4845 &attr_reserve_percpu.attr,
4846 &attr_overcommit.attr,
4847 NULL
4848};
4849
4850static struct attribute_group perfclass_attr_group = {
4851 .attrs = perfclass_attrs,
4852 .name = "perf_counters",
4853};
4854
4855static int __init perf_counter_sysfs_init(void)
4856{
4857 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4858 &perfclass_attr_group);
4859}
4860device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index b2e5f78fd281..31310b5d3f50 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -378,26 +378,15 @@ EXPORT_SYMBOL(pid_task);
378/* 378/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 379 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
380 */ 380 */
381struct task_struct *find_task_by_pid_type_ns(int type, int nr, 381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382 struct pid_namespace *ns)
383{ 382{
384 return pid_task(find_pid_ns(nr, ns), type); 383 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
385} 384}
386 385
387EXPORT_SYMBOL(find_task_by_pid_type_ns);
388
389struct task_struct *find_task_by_vpid(pid_t vnr) 386struct task_struct *find_task_by_vpid(pid_t vnr)
390{ 387{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, 388 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
392 current->nsproxy->pid_ns);
393}
394EXPORT_SYMBOL(find_task_by_vpid);
395
396struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
397{
398 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
399} 389}
400EXPORT_SYMBOL(find_task_by_pid_ns);
401 390
402struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 391struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
403{ 392{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 2d1001b4858d..821722ae58a7 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -67,9 +67,10 @@ err_alloc:
67 return NULL; 67 return NULL;
68} 68}
69 69
70static struct pid_namespace *create_pid_namespace(unsigned int level) 70static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
71{ 71{
72 struct pid_namespace *ns; 72 struct pid_namespace *ns;
73 unsigned int level = parent_pid_ns->level + 1;
73 int i; 74 int i;
74 75
75 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 76 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
@@ -86,6 +87,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
86 87
87 kref_init(&ns->kref); 88 kref_init(&ns->kref);
88 ns->level = level; 89 ns->level = level;
90 ns->parent = get_pid_ns(parent_pid_ns);
89 91
90 set_bit(0, ns->pidmap[0].page); 92 set_bit(0, ns->pidmap[0].page);
91 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 93 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -114,25 +116,11 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
114 116
115struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 117struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
116{ 118{
117 struct pid_namespace *new_ns;
118
119 BUG_ON(!old_ns);
120 new_ns = get_pid_ns(old_ns);
121 if (!(flags & CLONE_NEWPID)) 119 if (!(flags & CLONE_NEWPID))
122 goto out; 120 return get_pid_ns(old_ns);
123
124 new_ns = ERR_PTR(-EINVAL);
125 if (flags & CLONE_THREAD) 121 if (flags & CLONE_THREAD)
126 goto out_put; 122 return ERR_PTR(-EINVAL);
127 123 return create_pid_namespace(old_ns);
128 new_ns = create_pid_namespace(old_ns->level + 1);
129 if (!IS_ERR(new_ns))
130 new_ns->parent = get_pid_ns(old_ns);
131
132out_put:
133 put_pid_ns(old_ns);
134out:
135 return new_ns;
136} 124}
137 125
138void free_pid_ns(struct kref *kref) 126void free_pid_ns(struct kref *kref)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0b67b2..e33a21cb9407 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
521} 521}
522void posix_cpu_timers_exit_group(struct task_struct *tsk) 522void posix_cpu_timers_exit_group(struct task_struct *tsk)
523{ 523{
524 struct task_cputime cputime; 524 struct signal_struct *const sig = tsk->signal;
525 525
526 thread_group_cputimer(tsk, &cputime);
527 cleanup_timers(tsk->signal->cpu_timers, 526 cleanup_timers(tsk->signal->cpu_timers,
528 cputime.utime, cputime.stime, cputime.sum_exec_runtime); 527 cputime_add(tsk->utime, sig->utime),
528 cputime_add(tsk->stime, sig->stime),
529 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
529} 530}
530 531
531static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 532static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c7..d089d052c4a9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
202 return -EOPNOTSUPP; 202 return -EOPNOTSUPP;
203} 203}
204 204
205static int no_nsleep(const clockid_t which_clock, int flags,
206 struct timespec *tsave, struct timespec __user *rmtp)
207{
208 return -EOPNOTSUPP;
209}
210
205/* 211/*
206 * Return nonzero if we know a priori this clockid_t value is bogus. 212 * Return nonzero if we know a priori this clockid_t value is bogus.
207 */ 213 */
@@ -254,6 +260,7 @@ static __init int init_posix_timers(void)
254 .clock_get = posix_get_monotonic_raw, 260 .clock_get = posix_get_monotonic_raw,
255 .clock_set = do_posix_clock_nosettime, 261 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create, 262 .timer_create = no_timer_create,
263 .nsleep = no_nsleep,
257 }; 264 };
258 265
259 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 266 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 23bd4daeb96b..72067cbdb37f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -116,9 +116,13 @@ config SUSPEND_FREEZER
116 116
117 Turning OFF this setting is NOT recommended! If in doubt, say Y. 117 Turning OFF this setting is NOT recommended! If in doubt, say Y.
118 118
119config HIBERNATION_NVS
120 bool
121
119config HIBERNATION 122config HIBERNATION
120 bool "Hibernation (aka 'suspend to disk')" 123 bool "Hibernation (aka 'suspend to disk')"
121 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 124 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
125 select HIBERNATION_NVS if HAS_IOMEM
122 ---help--- 126 ---help---
123 Enable the suspend to disk (STD) functionality, which is usually 127 Enable the suspend to disk (STD) functionality, which is usually
124 called "hibernation" in user interfaces. STD checkpoints the 128 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 720ea4f781bd..c3b81c30e5d5 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -6,6 +6,9 @@ endif
6obj-$(CONFIG_PM) += main.o 6obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += console.o 7obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
10 13
11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/hibernate.c
index 5cb080e7eebd..81d2e7464893 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/hibernate.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * kernel/power/disk.c - Suspend-to-disk support. 2 * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
7 * 8 *
8 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
9 *
10 */ 10 */
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
@@ -215,13 +215,13 @@ static int create_image(int platform_mode)
215 if (error) 215 if (error)
216 return error; 216 return error;
217 217
218 /* At this point, device_suspend() has been called, but *not* 218 /* At this point, dpm_suspend_start() has been called, but *not*
219 * device_power_down(). We *must* call device_power_down() now. 219 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
220 * Otherwise, drivers for some devices (e.g. interrupt controllers) 220 * Otherwise, drivers for some devices (e.g. interrupt controllers)
221 * become desynchronized with the actual state of the hardware 221 * become desynchronized with the actual state of the hardware
222 * at resume time, and evil weirdness ensues. 222 * at resume time, and evil weirdness ensues.
223 */ 223 */
224 error = device_power_down(PMSG_FREEZE); 224 error = dpm_suspend_noirq(PMSG_FREEZE);
225 if (error) { 225 if (error) {
226 printk(KERN_ERR "PM: Some devices failed to power down, " 226 printk(KERN_ERR "PM: Some devices failed to power down, "
227 "aborting hibernation\n"); 227 "aborting hibernation\n");
@@ -262,7 +262,7 @@ static int create_image(int platform_mode)
262 262
263 Power_up: 263 Power_up:
264 sysdev_resume(); 264 sysdev_resume();
265 /* NOTE: device_power_up() is just a resume() for devices 265 /* NOTE: dpm_resume_noirq() is just a resume() for devices
266 * that suspended with irqs off ... no overall powerup. 266 * that suspended with irqs off ... no overall powerup.
267 */ 267 */
268 268
@@ -275,7 +275,7 @@ static int create_image(int platform_mode)
275 Platform_finish: 275 Platform_finish:
276 platform_finish(platform_mode); 276 platform_finish(platform_mode);
277 277
278 device_power_up(in_suspend ? 278 dpm_resume_noirq(in_suspend ?
279 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 279 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
280 280
281 return error; 281 return error;
@@ -304,7 +304,7 @@ int hibernation_snapshot(int platform_mode)
304 goto Close; 304 goto Close;
305 305
306 suspend_console(); 306 suspend_console();
307 error = device_suspend(PMSG_FREEZE); 307 error = dpm_suspend_start(PMSG_FREEZE);
308 if (error) 308 if (error)
309 goto Recover_platform; 309 goto Recover_platform;
310 310
@@ -315,7 +315,7 @@ int hibernation_snapshot(int platform_mode)
315 /* Control returns here after successful restore */ 315 /* Control returns here after successful restore */
316 316
317 Resume_devices: 317 Resume_devices:
318 device_resume(in_suspend ? 318 dpm_resume_end(in_suspend ?
319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
320 resume_console(); 320 resume_console();
321 Close: 321 Close:
@@ -339,7 +339,7 @@ static int resume_target_kernel(bool platform_mode)
339{ 339{
340 int error; 340 int error;
341 341
342 error = device_power_down(PMSG_QUIESCE); 342 error = dpm_suspend_noirq(PMSG_QUIESCE);
343 if (error) { 343 if (error) {
344 printk(KERN_ERR "PM: Some devices failed to power down, " 344 printk(KERN_ERR "PM: Some devices failed to power down, "
345 "aborting resume\n"); 345 "aborting resume\n");
@@ -394,7 +394,7 @@ static int resume_target_kernel(bool platform_mode)
394 Cleanup: 394 Cleanup:
395 platform_restore_cleanup(platform_mode); 395 platform_restore_cleanup(platform_mode);
396 396
397 device_power_up(PMSG_RECOVER); 397 dpm_resume_noirq(PMSG_RECOVER);
398 398
399 return error; 399 return error;
400} 400}
@@ -414,10 +414,10 @@ int hibernation_restore(int platform_mode)
414 414
415 pm_prepare_console(); 415 pm_prepare_console();
416 suspend_console(); 416 suspend_console();
417 error = device_suspend(PMSG_QUIESCE); 417 error = dpm_suspend_start(PMSG_QUIESCE);
418 if (!error) { 418 if (!error) {
419 error = resume_target_kernel(platform_mode); 419 error = resume_target_kernel(platform_mode);
420 device_resume(PMSG_RECOVER); 420 dpm_resume_end(PMSG_RECOVER);
421 } 421 }
422 resume_console(); 422 resume_console();
423 pm_restore_console(); 423 pm_restore_console();
@@ -447,14 +447,14 @@ int hibernation_platform_enter(void)
447 447
448 entering_platform_hibernation = true; 448 entering_platform_hibernation = true;
449 suspend_console(); 449 suspend_console();
450 error = device_suspend(PMSG_HIBERNATE); 450 error = dpm_suspend_start(PMSG_HIBERNATE);
451 if (error) { 451 if (error) {
452 if (hibernation_ops->recover) 452 if (hibernation_ops->recover)
453 hibernation_ops->recover(); 453 hibernation_ops->recover();
454 goto Resume_devices; 454 goto Resume_devices;
455 } 455 }
456 456
457 error = device_power_down(PMSG_HIBERNATE); 457 error = dpm_suspend_noirq(PMSG_HIBERNATE);
458 if (error) 458 if (error)
459 goto Resume_devices; 459 goto Resume_devices;
460 460
@@ -479,11 +479,11 @@ int hibernation_platform_enter(void)
479 Platofrm_finish: 479 Platofrm_finish:
480 hibernation_ops->finish(); 480 hibernation_ops->finish();
481 481
482 device_power_up(PMSG_RESTORE); 482 dpm_resume_noirq(PMSG_RESTORE);
483 483
484 Resume_devices: 484 Resume_devices:
485 entering_platform_hibernation = false; 485 entering_platform_hibernation = false;
486 device_resume(PMSG_RESTORE); 486 dpm_resume_end(PMSG_RESTORE);
487 resume_console(); 487 resume_console();
488 488
489 Close: 489 Close:
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
new file mode 100644
index 000000000000..39ac698ef836
--- /dev/null
+++ b/kernel/power/hibernate_nvs.c
@@ -0,0 +1,135 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/suspend.h>
14
15/*
16 * Platforms, like ACPI, may want us to save some memory used by them during
17 * hibernation and to restore the contents of this memory during the subsequent
18 * resume. The code below implements a mechanism allowing us to do that.
19 */
20
21struct nvs_page {
22 unsigned long phys_start;
23 unsigned int size;
24 void *kaddr;
25 void *data;
26 struct list_head node;
27};
28
29static LIST_HEAD(nvs_list);
30
31/**
32 * hibernate_nvs_register - register platform NVS memory region to save
33 * @start - physical address of the region
34 * @size - size of the region
35 *
36 * The NVS region need not be page-aligned (both ends) and we arrange
37 * things so that the data from page-aligned addresses in this region will
38 * be copied into separate RAM pages.
39 */
40int hibernate_nvs_register(unsigned long start, unsigned long size)
41{
42 struct nvs_page *entry, *next;
43
44 while (size > 0) {
45 unsigned int nr_bytes;
46
47 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
48 if (!entry)
49 goto Error;
50
51 list_add_tail(&entry->node, &nvs_list);
52 entry->phys_start = start;
53 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
54 entry->size = (size < nr_bytes) ? size : nr_bytes;
55
56 start += entry->size;
57 size -= entry->size;
58 }
59 return 0;
60
61 Error:
62 list_for_each_entry_safe(entry, next, &nvs_list, node) {
63 list_del(&entry->node);
64 kfree(entry);
65 }
66 return -ENOMEM;
67}
68
69/**
70 * hibernate_nvs_free - free data pages allocated for saving NVS regions
71 */
72void hibernate_nvs_free(void)
73{
74 struct nvs_page *entry;
75
76 list_for_each_entry(entry, &nvs_list, node)
77 if (entry->data) {
78 free_page((unsigned long)entry->data);
79 entry->data = NULL;
80 if (entry->kaddr) {
81 iounmap(entry->kaddr);
82 entry->kaddr = NULL;
83 }
84 }
85}
86
87/**
88 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
89 */
90int hibernate_nvs_alloc(void)
91{
92 struct nvs_page *entry;
93
94 list_for_each_entry(entry, &nvs_list, node) {
95 entry->data = (void *)__get_free_page(GFP_KERNEL);
96 if (!entry->data) {
97 hibernate_nvs_free();
98 return -ENOMEM;
99 }
100 }
101 return 0;
102}
103
104/**
105 * hibernate_nvs_save - save NVS memory regions
106 */
107void hibernate_nvs_save(void)
108{
109 struct nvs_page *entry;
110
111 printk(KERN_INFO "PM: Saving platform NVS memory\n");
112
113 list_for_each_entry(entry, &nvs_list, node)
114 if (entry->data) {
115 entry->kaddr = ioremap(entry->phys_start, entry->size);
116 memcpy(entry->data, entry->kaddr, entry->size);
117 }
118}
119
120/**
121 * hibernate_nvs_restore - restore NVS memory regions
122 *
123 * This function is going to be called with interrupts disabled, so it
124 * cannot iounmap the virtual addresses used to access the NVS region.
125 */
126void hibernate_nvs_restore(void)
127{
128 struct nvs_page *entry;
129
130 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
131
132 list_for_each_entry(entry, &nvs_list, node)
133 if (entry->data)
134 memcpy(entry->kaddr, entry->data, entry->size);
135}
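
/*
 * Hypothetical sketch (not from the original source) of the intended calling
 * sequence for platform code (ACPI is the expected in-tree user): register
 * the firmware NVS range at init time, allocate backing pages and save the
 * contents before the hibernation image is created, and restore them after
 * resume.  The region address/size and the hook names below are assumptions;
 * the declarations are expected to come from <linux/suspend.h>.
 */
#include <linux/init.h>
#include <linux/suspend.h>

#define EXAMPLE_NVS_START	0xdf000000UL	/* hypothetical firmware NVS region */
#define EXAMPLE_NVS_SIZE	0x00010000UL

static int __init example_platform_nvs_init(void)
{
	/* tell the hibernation core which physical range must be preserved */
	return hibernate_nvs_register(EXAMPLE_NVS_START, EXAMPLE_NVS_SIZE);
}

static int example_pre_snapshot(void)
{
	int error = hibernate_nvs_alloc();	/* RAM pages for the copy */

	if (!error)
		hibernate_nvs_save();		/* copy the NVS contents away */
	return error;
}

static void example_post_restore(void)
{
	hibernate_nvs_restore();		/* write the saved contents back */
	hibernate_nvs_free();
}
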
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 868028280d13..f710e36930cc 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,20 +8,9 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/module.h>
12#include <linux/suspend.h>
13#include <linux/kobject.h> 11#include <linux/kobject.h>
14#include <linux/string.h> 12#include <linux/string.h>
15#include <linux/delay.h>
16#include <linux/errno.h>
17#include <linux/kmod.h>
18#include <linux/init.h>
19#include <linux/console.h>
20#include <linux/cpu.h>
21#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
22#include <linux/freezer.h>
23#include <linux/vmstat.h>
24#include <linux/syscalls.h>
25 14
26#include "power.h" 15#include "power.h"
27 16
@@ -119,373 +108,6 @@ power_attr(pm_test);
119 108
120#endif /* CONFIG_PM_SLEEP */ 109#endif /* CONFIG_PM_SLEEP */
121 110
122#ifdef CONFIG_SUSPEND
123
124static int suspend_test(int level)
125{
126#ifdef CONFIG_PM_DEBUG
127 if (pm_test_level == level) {
128 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
129 mdelay(5000);
130 return 1;
131 }
132#endif /* !CONFIG_PM_DEBUG */
133 return 0;
134}
135
136#ifdef CONFIG_PM_TEST_SUSPEND
137
138/*
139 * We test the system suspend code by setting an RTC wakealarm a short
140 * time in the future, then suspending. Suspending the devices won't
141 * normally take long ... some systems only need a few milliseconds.
142 *
143 * The time it takes is system-specific though, so when we test this
144 * during system bootup we allow a LOT of time.
145 */
146#define TEST_SUSPEND_SECONDS 5
147
148static unsigned long suspend_test_start_time;
149
150static void suspend_test_start(void)
151{
152 /* FIXME Use better timebase than "jiffies", ideally a clocksource.
153 * What we want is a hardware counter that will work correctly even
154 * during the irqs-are-off stages of the suspend/resume cycle...
155 */
156 suspend_test_start_time = jiffies;
157}
158
159static void suspend_test_finish(const char *label)
160{
161 long nj = jiffies - suspend_test_start_time;
162 unsigned msec;
163
164 msec = jiffies_to_msecs(abs(nj));
165 pr_info("PM: %s took %d.%03d seconds\n", label,
166 msec / 1000, msec % 1000);
167
168 /* Warning on suspend means the RTC alarm period needs to be
169 * larger -- the system was sooo slooowwww to suspend that the
170 * alarm (should have) fired before the system went to sleep!
171 *
172 * Warning on either suspend or resume also means the system
173 * has some performance issues. The stack dump of a WARN_ON
174 * is more likely to get the right attention than a printk...
175 */
176 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
177}
178
179#else
180
181static void suspend_test_start(void)
182{
183}
184
185static void suspend_test_finish(const char *label)
186{
187}
188
189#endif
190
191/* This is just an arbitrary number */
192#define FREE_PAGE_NUMBER (100)
193
194static struct platform_suspend_ops *suspend_ops;
195
196/**
197 * suspend_set_ops - Set the global suspend method table.
198 * @ops: Pointer to ops structure.
199 */
200
201void suspend_set_ops(struct platform_suspend_ops *ops)
202{
203 mutex_lock(&pm_mutex);
204 suspend_ops = ops;
205 mutex_unlock(&pm_mutex);
206}
207
208/**
209 * suspend_valid_only_mem - generic memory-only valid callback
210 *
211 * Platform drivers that implement mem suspend only and only need
212 * to check for that in their .valid callback can use this instead
213 * of rolling their own .valid callback.
214 */
215int suspend_valid_only_mem(suspend_state_t state)
216{
217 return state == PM_SUSPEND_MEM;
218}
219
220/**
221 * suspend_prepare - Do prep work before entering low-power state.
222 *
223 * This is common code that is called for each state that we're entering.
224 * Run suspend notifiers, allocate a console and stop all processes.
225 */
226static int suspend_prepare(void)
227{
228 int error;
229 unsigned int free_pages;
230
231 if (!suspend_ops || !suspend_ops->enter)
232 return -EPERM;
233
234 pm_prepare_console();
235
236 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
237 if (error)
238 goto Finish;
239
240 error = usermodehelper_disable();
241 if (error)
242 goto Finish;
243
244 if (suspend_freeze_processes()) {
245 error = -EAGAIN;
246 goto Thaw;
247 }
248
249 free_pages = global_page_state(NR_FREE_PAGES);
250 if (free_pages < FREE_PAGE_NUMBER) {
251 pr_debug("PM: free some memory\n");
252 shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
253 if (nr_free_pages() < FREE_PAGE_NUMBER) {
254 error = -ENOMEM;
255 printk(KERN_ERR "PM: No enough memory\n");
256 }
257 }
258 if (!error)
259 return 0;
260
261 Thaw:
262 suspend_thaw_processes();
263 usermodehelper_enable();
264 Finish:
265 pm_notifier_call_chain(PM_POST_SUSPEND);
266 pm_restore_console();
267 return error;
268}
269
270/* default implementation */
271void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
272{
273 local_irq_disable();
274}
275
276/* default implementation */
277void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
278{
279 local_irq_enable();
280}
281
282/**
283 * suspend_enter - enter the desired system sleep state.
284 * @state: state to enter
285 *
286 * This function should be called after devices have been suspended.
287 */
288static int suspend_enter(suspend_state_t state)
289{
290 int error;
291
292 if (suspend_ops->prepare) {
293 error = suspend_ops->prepare();
294 if (error)
295 return error;
296 }
297
298 error = device_power_down(PMSG_SUSPEND);
299 if (error) {
300 printk(KERN_ERR "PM: Some devices failed to power down\n");
301 goto Platfrom_finish;
302 }
303
304 if (suspend_ops->prepare_late) {
305 error = suspend_ops->prepare_late();
306 if (error)
307 goto Power_up_devices;
308 }
309
310 if (suspend_test(TEST_PLATFORM))
311 goto Platform_wake;
312
313 error = disable_nonboot_cpus();
314 if (error || suspend_test(TEST_CPUS))
315 goto Enable_cpus;
316
317 arch_suspend_disable_irqs();
318 BUG_ON(!irqs_disabled());
319
320 error = sysdev_suspend(PMSG_SUSPEND);
321 if (!error) {
322 if (!suspend_test(TEST_CORE))
323 error = suspend_ops->enter(state);
324 sysdev_resume();
325 }
326
327 arch_suspend_enable_irqs();
328 BUG_ON(irqs_disabled());
329
330 Enable_cpus:
331 enable_nonboot_cpus();
332
333 Platform_wake:
334 if (suspend_ops->wake)
335 suspend_ops->wake();
336
337 Power_up_devices:
338 device_power_up(PMSG_RESUME);
339
340 Platfrom_finish:
341 if (suspend_ops->finish)
342 suspend_ops->finish();
343
344 return error;
345}
346
347/**
348 * suspend_devices_and_enter - suspend devices and enter the desired system
349 * sleep state.
350 * @state: state to enter
351 */
352int suspend_devices_and_enter(suspend_state_t state)
353{
354 int error;
355
356 if (!suspend_ops)
357 return -ENOSYS;
358
359 if (suspend_ops->begin) {
360 error = suspend_ops->begin(state);
361 if (error)
362 goto Close;
363 }
364 suspend_console();
365 suspend_test_start();
366 error = device_suspend(PMSG_SUSPEND);
367 if (error) {
368 printk(KERN_ERR "PM: Some devices failed to suspend\n");
369 goto Recover_platform;
370 }
371 suspend_test_finish("suspend devices");
372 if (suspend_test(TEST_DEVICES))
373 goto Recover_platform;
374
375 suspend_enter(state);
376
377 Resume_devices:
378 suspend_test_start();
379 device_resume(PMSG_RESUME);
380 suspend_test_finish("resume devices");
381 resume_console();
382 Close:
383 if (suspend_ops->end)
384 suspend_ops->end();
385 return error;
386
387 Recover_platform:
388 if (suspend_ops->recover)
389 suspend_ops->recover();
390 goto Resume_devices;
391}
392
393/**
394 * suspend_finish - Do final work before exiting suspend sequence.
395 *
396 * Call platform code to clean up, restart processes, and free the
397 * console that we've allocated. This is not called for suspend-to-disk.
398 */
399static void suspend_finish(void)
400{
401 suspend_thaw_processes();
402 usermodehelper_enable();
403 pm_notifier_call_chain(PM_POST_SUSPEND);
404 pm_restore_console();
405}
406
407
408
409
410static const char * const pm_states[PM_SUSPEND_MAX] = {
411 [PM_SUSPEND_STANDBY] = "standby",
412 [PM_SUSPEND_MEM] = "mem",
413};
414
415static inline int valid_state(suspend_state_t state)
416{
417 /* All states need lowlevel support and need to be valid
418 * to the lowlevel implementation, no valid callback
419 * implies that none are valid. */
420 if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state))
421 return 0;
422 return 1;
423}
424
425
426/**
427 * enter_state - Do common work of entering low-power state.
428 * @state: pm_state structure for state we're entering.
429 *
430 * Make sure we're the only ones trying to enter a sleep state. Fail
431 * if someone has beat us to it, since we don't want anything weird to
432 * happen when we wake up.
433 * Then, do the setup for suspend, enter the state, and cleaup (after
434 * we've woken up).
435 */
436static int enter_state(suspend_state_t state)
437{
438 int error;
439
440 if (!valid_state(state))
441 return -ENODEV;
442
443 if (!mutex_trylock(&pm_mutex))
444 return -EBUSY;
445
446 printk(KERN_INFO "PM: Syncing filesystems ... ");
447 sys_sync();
448 printk("done.\n");
449
450 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
451 error = suspend_prepare();
452 if (error)
453 goto Unlock;
454
455 if (suspend_test(TEST_FREEZER))
456 goto Finish;
457
458 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
459 error = suspend_devices_and_enter(state);
460
461 Finish:
462 pr_debug("PM: Finishing wakeup.\n");
463 suspend_finish();
464 Unlock:
465 mutex_unlock(&pm_mutex);
466 return error;
467}
468
469
470/**
471 * pm_suspend - Externally visible function for suspending system.
472 * @state: Enumerated value of state to enter.
473 *
474 * Determine whether or not value is within range, get state
475 * structure, and enter (above).
476 */
477
478int pm_suspend(suspend_state_t state)
479{
480 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
481 return enter_state(state);
482 return -EINVAL;
483}
484
485EXPORT_SYMBOL(pm_suspend);
486
487#endif /* CONFIG_SUSPEND */
488
489struct kobject *power_kobj; 111struct kobject *power_kobj;
490 112
491/** 113/**
@@ -498,7 +120,6 @@ struct kobject *power_kobj;
498 * store() accepts one of those strings, translates it into the 120 * store() accepts one of those strings, translates it into the
499 * proper enumerated value, and initiates a suspend transition. 121 * proper enumerated value, and initiates a suspend transition.
500 */ 122 */
501
502static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 123static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
503 char *buf) 124 char *buf)
504{ 125{
@@ -596,7 +217,6 @@ static struct attribute_group attr_group = {
596 .attrs = g, 217 .attrs = g,
597}; 218};
598 219
599
600static int __init pm_init(void) 220static int __init pm_init(void)
601{ 221{
602 power_kobj = kobject_create_and_add("power", NULL); 222 power_kobj = kobject_create_and_add("power", NULL);
@@ -606,144 +226,3 @@ static int __init pm_init(void)
606} 226}
607 227
608core_initcall(pm_init); 228core_initcall(pm_init);
609
610
611#ifdef CONFIG_PM_TEST_SUSPEND
612
613#include <linux/rtc.h>
614
615/*
616 * To test system suspend, we need a hands-off mechanism to resume the
617 * system. RTCs wake alarms are a common self-contained mechanism.
618 */
619
620static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
621{
622 static char err_readtime[] __initdata =
623 KERN_ERR "PM: can't read %s time, err %d\n";
624 static char err_wakealarm [] __initdata =
625 KERN_ERR "PM: can't set %s wakealarm, err %d\n";
626 static char err_suspend[] __initdata =
627 KERN_ERR "PM: suspend test failed, error %d\n";
628 static char info_test[] __initdata =
629 KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
630
631 unsigned long now;
632 struct rtc_wkalrm alm;
633 int status;
634
635 /* this may fail if the RTC hasn't been initialized */
636 status = rtc_read_time(rtc, &alm.time);
637 if (status < 0) {
638 printk(err_readtime, dev_name(&rtc->dev), status);
639 return;
640 }
641 rtc_tm_to_time(&alm.time, &now);
642
643 memset(&alm, 0, sizeof alm);
644 rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
645 alm.enabled = true;
646
647 status = rtc_set_alarm(rtc, &alm);
648 if (status < 0) {
649 printk(err_wakealarm, dev_name(&rtc->dev), status);
650 return;
651 }
652
653 if (state == PM_SUSPEND_MEM) {
654 printk(info_test, pm_states[state]);
655 status = pm_suspend(state);
656 if (status == -ENODEV)
657 state = PM_SUSPEND_STANDBY;
658 }
659 if (state == PM_SUSPEND_STANDBY) {
660 printk(info_test, pm_states[state]);
661 status = pm_suspend(state);
662 }
663 if (status < 0)
664 printk(err_suspend, status);
665
666 /* Some platforms can't detect that the alarm triggered the
667 * wakeup, or (accordingly) disable it after it afterwards.
668 * It's supposed to give oneshot behavior; cope.
669 */
670 alm.enabled = false;
671 rtc_set_alarm(rtc, &alm);
672}
673
674static int __init has_wakealarm(struct device *dev, void *name_ptr)
675{
676 struct rtc_device *candidate = to_rtc_device(dev);
677
678 if (!candidate->ops->set_alarm)
679 return 0;
680 if (!device_may_wakeup(candidate->dev.parent))
681 return 0;
682
683 *(const char **)name_ptr = dev_name(dev);
684 return 1;
685}
686
687/*
688 * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
689 * at startup time. They're normally disabled, for faster boot and because
690 * we can't know which states really work on this particular system.
691 */
692static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
693
694static char warn_bad_state[] __initdata =
695 KERN_WARNING "PM: can't test '%s' suspend state\n";
696
697static int __init setup_test_suspend(char *value)
698{
699 unsigned i;
700
701 /* "=mem" ==> "mem" */
702 value++;
703 for (i = 0; i < PM_SUSPEND_MAX; i++) {
704 if (!pm_states[i])
705 continue;
706 if (strcmp(pm_states[i], value) != 0)
707 continue;
708 test_state = (__force suspend_state_t) i;
709 return 0;
710 }
711 printk(warn_bad_state, value);
712 return 0;
713}
714__setup("test_suspend", setup_test_suspend);
715
716static int __init test_suspend(void)
717{
718 static char warn_no_rtc[] __initdata =
719 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
720
721 char *pony = NULL;
722 struct rtc_device *rtc = NULL;
723
724 /* PM is initialized by now; is that state testable? */
725 if (test_state == PM_SUSPEND_ON)
726 goto done;
727 if (!valid_state(test_state)) {
728 printk(warn_bad_state, pm_states[test_state]);
729 goto done;
730 }
731
732 /* RTCs have initialized by now too ... can we use one? */
733 class_find_device(rtc_class, NULL, &pony, has_wakealarm);
734 if (pony)
735 rtc = rtc_class_open(pony);
736 if (!rtc) {
737 printk(warn_no_rtc);
738 goto done;
739 }
740
741 /* go for it */
742 test_wakealarm(rtc, test_state);
743 rtc_class_close(rtc);
744done:
745 return 0;
746}
747late_initcall(test_suspend);
748
749#endif /* CONFIG_PM_TEST_SUSPEND */
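
The boot-time suspend self-test removed from main.c here reappears below as the new kernel/power/suspend_test.c. At run time, the same suspend machinery is normally reached from userspace through the sysfs "state" attribute created under the power kobject set up in pm_init(). A minimal, hedged userspace sketch (not part of this series; it assumes the conventional sysfs layout):

	/* Userspace sketch: list the supported sleep states, then request
	 * suspend-to-RAM.  Assumes the usual /sys/power/state attribute.
	 */
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char states[64] = "";
		FILE *f = fopen("/sys/power/state", "r");

		if (f) {
			if (fgets(states, sizeof(states), f))
				printf("supported states: %s", states);
			fclose(f);
		}

		if (!strstr(states, "mem")) {
			fprintf(stderr, "suspend-to-RAM not supported here\n");
			return 1;
		}

		f = fopen("/sys/power/state", "w");
		if (!f) {
			perror("fopen");
			return 1;
		}
		fputs("mem\n", f);
		fclose(f);	/* the flushed write blocks until resume */
		return 0;
	}
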
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46b5ec7a3afb..26d5a26f82e3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -45,7 +45,7 @@ static inline char *check_image_kernel(struct swsusp_info *info)
45 */ 45 */
46#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 46#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
47 47
48/* kernel/power/disk.c */ 48/* kernel/power/hibernate.c */
49extern int hibernation_snapshot(int platform_mode); 49extern int hibernation_snapshot(int platform_mode);
50extern int hibernation_restore(int platform_mode); 50extern int hibernation_restore(int platform_mode);
51extern int hibernation_platform_enter(void); 51extern int hibernation_platform_enter(void);
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
74 74
75extern int create_basic_memory_bitmaps(void); 75extern int create_basic_memory_bitmaps(void);
76extern void free_basic_memory_bitmaps(void); 76extern void free_basic_memory_bitmaps(void);
77extern unsigned int count_data_pages(void); 77extern int swsusp_shrink_memory(void);
78 78
79/** 79/**
80 * Auxiliary structure used for reading the snapshot image data and 80 * Auxiliary structure used for reading the snapshot image data and
@@ -147,9 +147,8 @@ extern int swsusp_swap_in_use(void);
147 */ 147 */
148#define SF_PLATFORM_MODE 1 148#define SF_PLATFORM_MODE 1
149 149
150/* kernel/power/disk.c */ 150/* kernel/power/hibernate.c */
151extern int swsusp_check(void); 151extern int swsusp_check(void);
152extern int swsusp_shrink_memory(void);
153extern void swsusp_free(void); 152extern void swsusp_free(void);
154extern int swsusp_read(unsigned int *flags_p); 153extern int swsusp_read(unsigned int *flags_p);
155extern int swsusp_write(unsigned int flags); 154extern int swsusp_write(unsigned int flags);
@@ -161,22 +160,36 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
161 unsigned int, char *); 160 unsigned int, char *);
162 161
163#ifdef CONFIG_SUSPEND 162#ifdef CONFIG_SUSPEND
164/* kernel/power/main.c */ 163/* kernel/power/suspend.c */
164extern const char *const pm_states[];
165
166extern bool valid_state(suspend_state_t state);
165extern int suspend_devices_and_enter(suspend_state_t state); 167extern int suspend_devices_and_enter(suspend_state_t state);
168extern int enter_state(suspend_state_t state);
166#else /* !CONFIG_SUSPEND */ 169#else /* !CONFIG_SUSPEND */
167static inline int suspend_devices_and_enter(suspend_state_t state) 170static inline int suspend_devices_and_enter(suspend_state_t state)
168{ 171{
169 return -ENOSYS; 172 return -ENOSYS;
170} 173}
174static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
175static inline bool valid_state(suspend_state_t state) { return false; }
171#endif /* !CONFIG_SUSPEND */ 176#endif /* !CONFIG_SUSPEND */
172 177
178#ifdef CONFIG_PM_TEST_SUSPEND
179/* kernel/power/suspend_test.c */
180extern void suspend_test_start(void);
181extern void suspend_test_finish(const char *label);
182#else /* !CONFIG_PM_TEST_SUSPEND */
183static inline void suspend_test_start(void) {}
184static inline void suspend_test_finish(const char *label) {}
185#endif /* !CONFIG_PM_TEST_SUSPEND */
186
173#ifdef CONFIG_PM_SLEEP 187#ifdef CONFIG_PM_SLEEP
174/* kernel/power/main.c */ 188/* kernel/power/main.c */
175extern int pm_notifier_call_chain(unsigned long val); 189extern int pm_notifier_call_chain(unsigned long val);
176#endif 190#endif
177 191
178#ifdef CONFIG_HIGHMEM 192#ifdef CONFIG_HIGHMEM
179unsigned int count_highmem_pages(void);
180int restore_highmem(void); 193int restore_highmem(void);
181#else 194#else
182static inline unsigned int count_highmem_pages(void) { return 0; } 195static inline unsigned int count_highmem_pages(void) { return 0; }
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 97890831e1b5..e8b337006276 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -34,7 +34,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
34 .handler = handle_poweroff, 34 .handler = handle_poweroff,
35 .help_msg = "powerOff", 35 .help_msg = "powerOff",
36 .action_msg = "Power Off", 36 .action_msg = "Power Off",
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
39 39
40static int pm_sysrq_init(void) 40static int pm_sysrq_init(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ca634019497a..da2072d73811 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -117,9 +117,12 @@ int freeze_processes(void)
117 if (error) 117 if (error)
118 goto Exit; 118 goto Exit;
119 printk("done."); 119 printk("done.");
120
121 oom_killer_disable();
120 Exit: 122 Exit:
121 BUG_ON(in_atomic()); 123 BUG_ON(in_atomic());
122 printk("\n"); 124 printk("\n");
125
123 return error; 126 return error;
124} 127}
125 128
@@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only)
145 148
146void thaw_processes(void) 149void thaw_processes(void)
147{ 150{
151 oom_killer_enable();
152
148 printk("Restarting tasks ... "); 153 printk("Restarting tasks ... ");
149 thaw_tasks(true); 154 thaw_tasks(true);
150 thaw_tasks(false); 155 thaw_tasks(false);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 33e2e4a819f9..523a451b45d3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -39,6 +39,14 @@ static int swsusp_page_is_free(struct page *);
39static void swsusp_set_page_forbidden(struct page *); 39static void swsusp_set_page_forbidden(struct page *);
40static void swsusp_unset_page_forbidden(struct page *); 40static void swsusp_unset_page_forbidden(struct page *);
41 41
42/*
43 * Preferred image size in bytes (tunable via /sys/power/image_size).
44 * When it is set to N, swsusp will do its best to ensure the image
45 * size will not exceed N bytes, but if that is impossible, it will
46 * try to create the smallest image possible.
47 */
48unsigned long image_size = 500 * 1024 * 1024;
49
42/* List of PBEs needed for restoring the pages that were allocated before 50/* List of PBEs needed for restoring the pages that were allocated before
43 * the suspend and included in the suspend image, but have also been 51 * the suspend and included in the suspend image, but have also been
44 * allocated by the "resume" kernel, so their contents cannot be written 52 * allocated by the "resume" kernel, so their contents cannot be written
@@ -840,7 +848,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
840 * pages. 848 * pages.
841 */ 849 */
842 850
843unsigned int count_highmem_pages(void) 851static unsigned int count_highmem_pages(void)
844{ 852{
845 struct zone *zone; 853 struct zone *zone;
846 unsigned int n = 0; 854 unsigned int n = 0;
@@ -902,7 +910,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
902 * pages. 910 * pages.
903 */ 911 */
904 912
905unsigned int count_data_pages(void) 913static unsigned int count_data_pages(void)
906{ 914{
907 struct zone *zone; 915 struct zone *zone;
908 unsigned long pfn, max_zone_pfn; 916 unsigned long pfn, max_zone_pfn;
@@ -1058,6 +1066,74 @@ void swsusp_free(void)
1058 buffer = NULL; 1066 buffer = NULL;
1059} 1067}
1060 1068
1069/**
1070 * swsusp_shrink_memory - Try to free as much memory as needed
1071 *
1072 * ... but do not OOM-kill anyone
1073 *
1074 * Notice: all userland should be stopped before it is called, or
1075 * livelock is possible.
1076 */
1077
1078#define SHRINK_BITE 10000
1079static inline unsigned long __shrink_memory(long tmp)
1080{
1081 if (tmp > SHRINK_BITE)
1082 tmp = SHRINK_BITE;
1083 return shrink_all_memory(tmp);
1084}
1085
1086int swsusp_shrink_memory(void)
1087{
1088 long tmp;
1089 struct zone *zone;
1090 unsigned long pages = 0;
1091 unsigned int i = 0;
1092 char *p = "-\\|/";
1093 struct timeval start, stop;
1094
1095 printk(KERN_INFO "PM: Shrinking memory... ");
1096 do_gettimeofday(&start);
1097 do {
1098 long size, highmem_size;
1099
1100 highmem_size = count_highmem_pages();
1101 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
1102 tmp = size;
1103 size += highmem_size;
1104 for_each_populated_zone(zone) {
1105 tmp += snapshot_additional_pages(zone);
1106 if (is_highmem(zone)) {
1107 highmem_size -=
1108 zone_page_state(zone, NR_FREE_PAGES);
1109 } else {
1110 tmp -= zone_page_state(zone, NR_FREE_PAGES);
1111 tmp += zone->lowmem_reserve[ZONE_NORMAL];
1112 }
1113 }
1114
1115 if (highmem_size < 0)
1116 highmem_size = 0;
1117
1118 tmp += highmem_size;
1119 if (tmp > 0) {
1120 tmp = __shrink_memory(tmp);
1121 if (!tmp)
1122 return -ENOMEM;
1123 pages += tmp;
1124 } else if (size > image_size / PAGE_SIZE) {
1125 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
1126 pages += tmp;
1127 }
1128 printk("\b%c", p[i++%4]);
1129 } while (tmp > 0);
1130 do_gettimeofday(&stop);
1131 printk("\bdone (%lu pages freed)\n", pages);
1132 swsusp_show_speed(&start, &stop, pages, "Freed");
1133
1134 return 0;
1135}
1136
1061#ifdef CONFIG_HIGHMEM 1137#ifdef CONFIG_HIGHMEM
1062/** 1138/**
1063 * count_pages_for_highmem - compute the number of non-highmem pages 1139 * count_pages_for_highmem - compute the number of non-highmem pages
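
swsusp_shrink_memory() moves into snapshot.c together with the image_size tunable it honours; as the comment above notes, the target image size is exposed as /sys/power/image_size. A hedged userspace sketch of tuning it before hibernation (the 256 MB figure is arbitrary):

	/* Userspace sketch: cap the hibernation image at roughly 256 MB
	 * by writing to /sys/power/image_size (value is in bytes).
	 */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/power/image_size", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fprintf(f, "%lu\n", 256UL * 1024 * 1024);
		fclose(f);
		return 0;
	}
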
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
new file mode 100644
index 000000000000..6f10dfc2d3e9
--- /dev/null
+++ b/kernel/power/suspend.c
@@ -0,0 +1,300 @@
1/*
2 * kernel/power/suspend.c - Suspend to RAM and standby functionality.
3 *
4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/string.h>
12#include <linux/delay.h>
13#include <linux/errno.h>
14#include <linux/init.h>
15#include <linux/console.h>
16#include <linux/cpu.h>
17#include <linux/syscalls.h>
18
19#include "power.h"
20
21const char *const pm_states[PM_SUSPEND_MAX] = {
22 [PM_SUSPEND_STANDBY] = "standby",
23 [PM_SUSPEND_MEM] = "mem",
24};
25
26static struct platform_suspend_ops *suspend_ops;
27
28/**
29 * suspend_set_ops - Set the global suspend method table.
30 * @ops: Pointer to ops structure.
31 */
32void suspend_set_ops(struct platform_suspend_ops *ops)
33{
34 mutex_lock(&pm_mutex);
35 suspend_ops = ops;
36 mutex_unlock(&pm_mutex);
37}
38
39bool valid_state(suspend_state_t state)
40{
41 /*
42 * All states need lowlevel support and need to be valid to the lowlevel
43 * implementation, no valid callback implies that none are valid.
44 */
45 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
46}
47
48/**
49 * suspend_valid_only_mem - generic memory-only valid callback
50 *
51 * Platform drivers that implement mem suspend only and only need
52 * to check for that in their .valid callback can use this instead
53 * of rolling their own .valid callback.
54 */
55int suspend_valid_only_mem(suspend_state_t state)
56{
57 return state == PM_SUSPEND_MEM;
58}
59
60static int suspend_test(int level)
61{
62#ifdef CONFIG_PM_DEBUG
63 if (pm_test_level == level) {
64 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
65 mdelay(5000);
66 return 1;
67 }
68#endif /* !CONFIG_PM_DEBUG */
69 return 0;
70}
71
72/**
73 * suspend_prepare - Do prep work before entering low-power state.
74 *
75 * This is common code that is called for each state that we're entering.
76 * Run suspend notifiers, allocate a console and stop all processes.
77 */
78static int suspend_prepare(void)
79{
80 int error;
81
82 if (!suspend_ops || !suspend_ops->enter)
83 return -EPERM;
84
85 pm_prepare_console();
86
87 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
88 if (error)
89 goto Finish;
90
91 error = usermodehelper_disable();
92 if (error)
93 goto Finish;
94
95 error = suspend_freeze_processes();
96 if (!error)
97 return 0;
98
99 suspend_thaw_processes();
100 usermodehelper_enable();
101 Finish:
102 pm_notifier_call_chain(PM_POST_SUSPEND);
103 pm_restore_console();
104 return error;
105}
106
107/* default implementation */
108void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
109{
110 local_irq_disable();
111}
112
113/* default implementation */
114void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
115{
116 local_irq_enable();
117}
118
119/**
120 * suspend_enter - enter the desired system sleep state.
121 * @state: state to enter
122 *
123 * This function should be called after devices have been suspended.
124 */
125static int suspend_enter(suspend_state_t state)
126{
127 int error;
128
129 if (suspend_ops->prepare) {
130 error = suspend_ops->prepare();
131 if (error)
132 return error;
133 }
134
135 error = dpm_suspend_noirq(PMSG_SUSPEND);
136 if (error) {
137 printk(KERN_ERR "PM: Some devices failed to power down\n");
138 goto Platfrom_finish;
139 }
140
141 if (suspend_ops->prepare_late) {
142 error = suspend_ops->prepare_late();
143 if (error)
144 goto Power_up_devices;
145 }
146
147 if (suspend_test(TEST_PLATFORM))
148 goto Platform_wake;
149
150 error = disable_nonboot_cpus();
151 if (error || suspend_test(TEST_CPUS))
152 goto Enable_cpus;
153
154 arch_suspend_disable_irqs();
155 BUG_ON(!irqs_disabled());
156
157 error = sysdev_suspend(PMSG_SUSPEND);
158 if (!error) {
159 if (!suspend_test(TEST_CORE))
160 error = suspend_ops->enter(state);
161 sysdev_resume();
162 }
163
164 arch_suspend_enable_irqs();
165 BUG_ON(irqs_disabled());
166
167 Enable_cpus:
168 enable_nonboot_cpus();
169
170 Platform_wake:
171 if (suspend_ops->wake)
172 suspend_ops->wake();
173
174 Power_up_devices:
175 dpm_resume_noirq(PMSG_RESUME);
176
177 Platfrom_finish:
178 if (suspend_ops->finish)
179 suspend_ops->finish();
180
181 return error;
182}
183
184/**
185 * suspend_devices_and_enter - suspend devices and enter the desired system
186 * sleep state.
187 * @state: state to enter
188 */
189int suspend_devices_and_enter(suspend_state_t state)
190{
191 int error;
192
193 if (!suspend_ops)
194 return -ENOSYS;
195
196 if (suspend_ops->begin) {
197 error = suspend_ops->begin(state);
198 if (error)
199 goto Close;
200 }
201 suspend_console();
202 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) {
205 printk(KERN_ERR "PM: Some devices failed to suspend\n");
206 goto Recover_platform;
207 }
208 suspend_test_finish("suspend devices");
209 if (suspend_test(TEST_DEVICES))
210 goto Recover_platform;
211
212 suspend_enter(state);
213
214 Resume_devices:
215 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices");
218 resume_console();
219 Close:
220 if (suspend_ops->end)
221 suspend_ops->end();
222 return error;
223
224 Recover_platform:
225 if (suspend_ops->recover)
226 suspend_ops->recover();
227 goto Resume_devices;
228}
229
230/**
231 * suspend_finish - Do final work before exiting suspend sequence.
232 *
233 * Call platform code to clean up, restart processes, and free the
234 * console that we've allocated. This is not called for suspend-to-disk.
235 */
236static void suspend_finish(void)
237{
238 suspend_thaw_processes();
239 usermodehelper_enable();
240 pm_notifier_call_chain(PM_POST_SUSPEND);
241 pm_restore_console();
242}
243
244/**
245 * enter_state - Do common work of entering low-power state.
246 * @state: pm_state structure for state we're entering.
247 *
248 * Make sure we're the only ones trying to enter a sleep state. Fail
249 * if someone has beat us to it, since we don't want anything weird to
250 * happen when we wake up.
 251 * Then, do the setup for suspend, enter the state, and clean up (after
252 * we've woken up).
253 */
254int enter_state(suspend_state_t state)
255{
256 int error;
257
258 if (!valid_state(state))
259 return -ENODEV;
260
261 if (!mutex_trylock(&pm_mutex))
262 return -EBUSY;
263
264 printk(KERN_INFO "PM: Syncing filesystems ... ");
265 sys_sync();
266 printk("done.\n");
267
268 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
269 error = suspend_prepare();
270 if (error)
271 goto Unlock;
272
273 if (suspend_test(TEST_FREEZER))
274 goto Finish;
275
276 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
277 error = suspend_devices_and_enter(state);
278
279 Finish:
280 pr_debug("PM: Finishing wakeup.\n");
281 suspend_finish();
282 Unlock:
283 mutex_unlock(&pm_mutex);
284 return error;
285}
286
287/**
288 * pm_suspend - Externally visible function for suspending system.
289 * @state: Enumerated value of state to enter.
290 *
291 * Determine whether or not value is within range, get state
292 * structure, and enter (above).
293 */
294int pm_suspend(suspend_state_t state)
295{
296 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
297 return enter_state(state);
298 return -EINVAL;
299}
300EXPORT_SYMBOL(pm_suspend);
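
The new suspend.c only does anything once a platform has handed it a struct platform_suspend_ops via suspend_set_ops(): valid_state() defers to the .valid callback, and suspend_prepare()/suspend_enter() require .enter. A hedged kernel-side sketch of a minimal registration (the myplat_* names are made up; suspend_valid_only_mem is the helper defined above):

	/* Sketch of a platform hooking into kernel/power/suspend.c. */
	#include <linux/suspend.h>
	#include <linux/init.h>

	static int myplat_enter(suspend_state_t state)
	{
		/* Program the hardware to sleep; returns on wakeup. */
		return 0;
	}

	static struct platform_suspend_ops myplat_suspend_ops = {
		.valid	= suspend_valid_only_mem,	/* only "mem" supported */
		.enter	= myplat_enter,
	};

	static int __init myplat_pm_init(void)
	{
		suspend_set_ops(&myplat_suspend_ops);
		return 0;
	}
	late_initcall(myplat_pm_init);

The optional callbacks (.begin, .prepare, .prepare_late, .wake, .finish, .end, .recover) slot into the corresponding points of suspend_devices_and_enter() and suspend_enter() shown above.
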
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
new file mode 100644
index 000000000000..17d8bb1acf9c
--- /dev/null
+++ b/kernel/power/suspend_test.c
@@ -0,0 +1,187 @@
1/*
2 * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
3 *
4 * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/init.h>
10#include <linux/rtc.h>
11
12#include "power.h"
13
14/*
15 * We test the system suspend code by setting an RTC wakealarm a short
16 * time in the future, then suspending. Suspending the devices won't
17 * normally take long ... some systems only need a few milliseconds.
18 *
19 * The time it takes is system-specific though, so when we test this
20 * during system bootup we allow a LOT of time.
21 */
22#define TEST_SUSPEND_SECONDS 5
23
24static unsigned long suspend_test_start_time;
25
26void suspend_test_start(void)
27{
28 /* FIXME Use better timebase than "jiffies", ideally a clocksource.
29 * What we want is a hardware counter that will work correctly even
30 * during the irqs-are-off stages of the suspend/resume cycle...
31 */
32 suspend_test_start_time = jiffies;
33}
34
35void suspend_test_finish(const char *label)
36{
37 long nj = jiffies - suspend_test_start_time;
38 unsigned msec;
39
40 msec = jiffies_to_msecs(abs(nj));
41 pr_info("PM: %s took %d.%03d seconds\n", label,
42 msec / 1000, msec % 1000);
43
44 /* Warning on suspend means the RTC alarm period needs to be
45 * larger -- the system was sooo slooowwww to suspend that the
46 * alarm (should have) fired before the system went to sleep!
47 *
48 * Warning on either suspend or resume also means the system
49 * has some performance issues. The stack dump of a WARN_ON
50 * is more likely to get the right attention than a printk...
51 */
52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
53}
54
55/*
56 * To test system suspend, we need a hands-off mechanism to resume the
57 * system. RTCs wake alarms are a common self-contained mechanism.
58 */
59
60static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
61{
62 static char err_readtime[] __initdata =
63 KERN_ERR "PM: can't read %s time, err %d\n";
64 static char err_wakealarm [] __initdata =
65 KERN_ERR "PM: can't set %s wakealarm, err %d\n";
66 static char err_suspend[] __initdata =
67 KERN_ERR "PM: suspend test failed, error %d\n";
68 static char info_test[] __initdata =
69 KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
70
71 unsigned long now;
72 struct rtc_wkalrm alm;
73 int status;
74
75 /* this may fail if the RTC hasn't been initialized */
76 status = rtc_read_time(rtc, &alm.time);
77 if (status < 0) {
78 printk(err_readtime, dev_name(&rtc->dev), status);
79 return;
80 }
81 rtc_tm_to_time(&alm.time, &now);
82
83 memset(&alm, 0, sizeof alm);
84 rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
85 alm.enabled = true;
86
87 status = rtc_set_alarm(rtc, &alm);
88 if (status < 0) {
89 printk(err_wakealarm, dev_name(&rtc->dev), status);
90 return;
91 }
92
93 if (state == PM_SUSPEND_MEM) {
94 printk(info_test, pm_states[state]);
95 status = pm_suspend(state);
96 if (status == -ENODEV)
97 state = PM_SUSPEND_STANDBY;
98 }
99 if (state == PM_SUSPEND_STANDBY) {
100 printk(info_test, pm_states[state]);
101 status = pm_suspend(state);
102 }
103 if (status < 0)
104 printk(err_suspend, status);
105
106 /* Some platforms can't detect that the alarm triggered the
 107 * wakeup, or (accordingly) disable it afterwards.
108 * It's supposed to give oneshot behavior; cope.
109 */
110 alm.enabled = false;
111 rtc_set_alarm(rtc, &alm);
112}
113
114static int __init has_wakealarm(struct device *dev, void *name_ptr)
115{
116 struct rtc_device *candidate = to_rtc_device(dev);
117
118 if (!candidate->ops->set_alarm)
119 return 0;
120 if (!device_may_wakeup(candidate->dev.parent))
121 return 0;
122
123 *(const char **)name_ptr = dev_name(dev);
124 return 1;
125}
126
127/*
128 * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
129 * at startup time. They're normally disabled, for faster boot and because
130 * we can't know which states really work on this particular system.
131 */
132static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
133
134static char warn_bad_state[] __initdata =
135 KERN_WARNING "PM: can't test '%s' suspend state\n";
136
137static int __init setup_test_suspend(char *value)
138{
139 unsigned i;
140
141 /* "=mem" ==> "mem" */
142 value++;
143 for (i = 0; i < PM_SUSPEND_MAX; i++) {
144 if (!pm_states[i])
145 continue;
146 if (strcmp(pm_states[i], value) != 0)
147 continue;
148 test_state = (__force suspend_state_t) i;
149 return 0;
150 }
151 printk(warn_bad_state, value);
152 return 0;
153}
154__setup("test_suspend", setup_test_suspend);
155
156static int __init test_suspend(void)
157{
158 static char warn_no_rtc[] __initdata =
159 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
160
161 char *pony = NULL;
162 struct rtc_device *rtc = NULL;
163
164 /* PM is initialized by now; is that state testable? */
165 if (test_state == PM_SUSPEND_ON)
166 goto done;
167 if (!valid_state(test_state)) {
168 printk(warn_bad_state, pm_states[test_state]);
169 goto done;
170 }
171
172 /* RTCs have initialized by now too ... can we use one? */
173 class_find_device(rtc_class, NULL, &pony, has_wakealarm);
174 if (pony)
175 rtc = rtc_class_open(pony);
176 if (!rtc) {
177 printk(warn_no_rtc);
178 goto done;
179 }
180
181 /* go for it */
182 test_wakealarm(rtc, test_state);
183 rtc_class_close(rtc);
184done:
185 return 0;
186}
187late_initcall(test_suspend);
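
The test above runs unattended at boot, which is why it arms an RTC wakealarm before suspending. The same experiment can be repeated after boot from userspace via the RTC class's sysfs wakealarm attribute, which corresponds to the .set_alarm capability that has_wakealarm() checks for. A hedged sketch (rtc0 and the absolute-seconds alarm format are assumptions about the running system):

	/* Userspace analogue of test_wakealarm(): arm rtc0 to fire in 20 s,
	 * then suspend to RAM and rely on the alarm to resume the box.
	 */
	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		FILE *alarm = fopen("/sys/class/rtc/rtc0/wakealarm", "w");
		FILE *state = fopen("/sys/power/state", "w");

		if (!alarm || !state) {
			perror("fopen");
			return 1;
		}
		/* Absolute UNIX time, as expected by the wakealarm attribute. */
		fprintf(alarm, "%ld\n", (long)time(NULL) + 20);
		fclose(alarm);

		fprintf(state, "mem\n");
		fclose(state);		/* blocks until the alarm wakes us */
		return 0;
	}
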
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 78c35047586d..6a07f4dbf2f8 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -55,14 +55,6 @@
55 55
56#include "power.h" 56#include "power.h"
57 57
58/*
59 * Preferred image size in bytes (tunable via /sys/power/image_size).
60 * When it is set to N, swsusp will do its best to ensure the image
61 * size will not exceed N bytes, but if that is impossible, it will
62 * try to create the smallest image possible.
63 */
64unsigned long image_size = 500 * 1024 * 1024;
65
66int in_suspend __nosavedata = 0; 58int in_suspend __nosavedata = 0;
67 59
68/** 60/**
@@ -194,193 +186,3 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
194 centisecs / 100, centisecs % 100, 186 centisecs / 100, centisecs % 100,
195 kps / 1000, (kps % 1000) / 10); 187 kps / 1000, (kps % 1000) / 10);
196} 188}
197
198/**
199 * swsusp_shrink_memory - Try to free as much memory as needed
200 *
201 * ... but do not OOM-kill anyone
202 *
203 * Notice: all userland should be stopped before it is called, or
204 * livelock is possible.
205 */
206
207#define SHRINK_BITE 10000
208static inline unsigned long __shrink_memory(long tmp)
209{
210 if (tmp > SHRINK_BITE)
211 tmp = SHRINK_BITE;
212 return shrink_all_memory(tmp);
213}
214
215int swsusp_shrink_memory(void)
216{
217 long tmp;
218 struct zone *zone;
219 unsigned long pages = 0;
220 unsigned int i = 0;
221 char *p = "-\\|/";
222 struct timeval start, stop;
223
224 printk(KERN_INFO "PM: Shrinking memory... ");
225 do_gettimeofday(&start);
226 do {
227 long size, highmem_size;
228
229 highmem_size = count_highmem_pages();
230 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
231 tmp = size;
232 size += highmem_size;
233 for_each_populated_zone(zone) {
234 tmp += snapshot_additional_pages(zone);
235 if (is_highmem(zone)) {
236 highmem_size -=
237 zone_page_state(zone, NR_FREE_PAGES);
238 } else {
239 tmp -= zone_page_state(zone, NR_FREE_PAGES);
240 tmp += zone->lowmem_reserve[ZONE_NORMAL];
241 }
242 }
243
244 if (highmem_size < 0)
245 highmem_size = 0;
246
247 tmp += highmem_size;
248 if (tmp > 0) {
249 tmp = __shrink_memory(tmp);
250 if (!tmp)
251 return -ENOMEM;
252 pages += tmp;
253 } else if (size > image_size / PAGE_SIZE) {
254 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
255 pages += tmp;
256 }
257 printk("\b%c", p[i++%4]);
258 } while (tmp > 0);
259 do_gettimeofday(&stop);
260 printk("\bdone (%lu pages freed)\n", pages);
261 swsusp_show_speed(&start, &stop, pages, "Freed");
262
263 return 0;
264}
265
266/*
267 * Platforms, like ACPI, may want us to save some memory used by them during
268 * hibernation and to restore the contents of this memory during the subsequent
269 * resume. The code below implements a mechanism allowing us to do that.
270 */
271
272struct nvs_page {
273 unsigned long phys_start;
274 unsigned int size;
275 void *kaddr;
276 void *data;
277 struct list_head node;
278};
279
280static LIST_HEAD(nvs_list);
281
282/**
283 * hibernate_nvs_register - register platform NVS memory region to save
284 * @start - physical address of the region
285 * @size - size of the region
286 *
287 * The NVS region need not be page-aligned (both ends) and we arrange
288 * things so that the data from page-aligned addresses in this region will
289 * be copied into separate RAM pages.
290 */
291int hibernate_nvs_register(unsigned long start, unsigned long size)
292{
293 struct nvs_page *entry, *next;
294
295 while (size > 0) {
296 unsigned int nr_bytes;
297
298 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
299 if (!entry)
300 goto Error;
301
302 list_add_tail(&entry->node, &nvs_list);
303 entry->phys_start = start;
304 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
305 entry->size = (size < nr_bytes) ? size : nr_bytes;
306
307 start += entry->size;
308 size -= entry->size;
309 }
310 return 0;
311
312 Error:
313 list_for_each_entry_safe(entry, next, &nvs_list, node) {
314 list_del(&entry->node);
315 kfree(entry);
316 }
317 return -ENOMEM;
318}
319
320/**
321 * hibernate_nvs_free - free data pages allocated for saving NVS regions
322 */
323void hibernate_nvs_free(void)
324{
325 struct nvs_page *entry;
326
327 list_for_each_entry(entry, &nvs_list, node)
328 if (entry->data) {
329 free_page((unsigned long)entry->data);
330 entry->data = NULL;
331 if (entry->kaddr) {
332 iounmap(entry->kaddr);
333 entry->kaddr = NULL;
334 }
335 }
336}
337
338/**
339 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
340 */
341int hibernate_nvs_alloc(void)
342{
343 struct nvs_page *entry;
344
345 list_for_each_entry(entry, &nvs_list, node) {
346 entry->data = (void *)__get_free_page(GFP_KERNEL);
347 if (!entry->data) {
348 hibernate_nvs_free();
349 return -ENOMEM;
350 }
351 }
352 return 0;
353}
354
355/**
356 * hibernate_nvs_save - save NVS memory regions
357 */
358void hibernate_nvs_save(void)
359{
360 struct nvs_page *entry;
361
362 printk(KERN_INFO "PM: Saving platform NVS memory\n");
363
364 list_for_each_entry(entry, &nvs_list, node)
365 if (entry->data) {
366 entry->kaddr = ioremap(entry->phys_start, entry->size);
367 memcpy(entry->data, entry->kaddr, entry->size);
368 }
369}
370
371/**
372 * hibernate_nvs_restore - restore NVS memory regions
373 *
374 * This function is going to be called with interrupts disabled, so it
375 * cannot iounmap the virtual addresses used to access the NVS region.
376 */
377void hibernate_nvs_restore(void)
378{
379 struct nvs_page *entry;
380
381 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
382
383 list_for_each_entry(entry, &nvs_list, node)
384 if (entry->data)
385 memcpy(entry->kaddr, entry->data, entry->size);
386}
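
The hibernate_nvs_*() helpers deleted from swsusp.c above implement, per their comments, a save/restore cycle for platform NVS memory: register the region early, allocate backing pages before the snapshot, copy the region out just before the image is made, and copy it back with interrupts off during resume (hence no iounmap there). A hedged sketch of that call sequence from platform code, assuming the declarations stay reachable via <linux/suspend.h>; the physical range is a placeholder:

	/* Illustrative only: how a platform might drive the NVS helpers. */
	#include <linux/suspend.h>

	static int example_nvs_setup(void)
	{
		/* Boot time: describe the region that must survive hibernation. */
		return hibernate_nvs_register(0xdead0000UL, 0x2000UL);
	}

	static int example_pre_snapshot(void)
	{
		int error = hibernate_nvs_alloc();	/* grab backing pages */

		if (error)
			return error;
		hibernate_nvs_save();			/* copy NVS into RAM pages */
		return 0;
	}

	static void example_early_resume(void)
	{
		hibernate_nvs_restore();	/* irqs off, so no iounmap here */
		hibernate_nvs_free();		/* later, with irqs back on */
	}
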
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ed97375daae9..bf0014d6a5f0 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,6 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index 5052b5497c67..b4d97b54c1ec 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -687,20 +687,35 @@ asmlinkage int vprintk(const char *fmt, va_list args)
687 sizeof(printk_buf) - printed_len, fmt, args); 687 sizeof(printk_buf) - printed_len, fmt, args);
688 688
689 689
690 p = printk_buf;
691
692 /* Do we have a loglevel in the string? */
693 if (p[0] == '<') {
694 unsigned char c = p[1];
695 if (c && p[2] == '>') {
696 switch (c) {
697 case '0' ... '7': /* loglevel */
698 current_log_level = c - '0';
699 /* Fallthrough - make sure we're on a new line */
700 case 'd': /* KERN_DEFAULT */
701 if (!new_text_line) {
702 emit_log_char('\n');
703 new_text_line = 1;
704 }
705 /* Fallthrough - skip the loglevel */
706 case 'c': /* KERN_CONT */
707 p += 3;
708 break;
709 }
710 }
711 }
712
690 /* 713 /*
691 * Copy the output into log_buf. If the caller didn't provide 714 * Copy the output into log_buf. If the caller didn't provide
692 * appropriate log level tags, we insert them here 715 * appropriate log level tags, we insert them here
693 */ 716 */
694 for (p = printk_buf; *p; p++) { 717 for ( ; *p; p++) {
695 if (new_text_line) { 718 if (new_text_line) {
696 /* If a token, set current_log_level and skip over */
697 if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
698 p[2] == '>') {
699 current_log_level = p[1] - '0';
700 p += 3;
701 printed_len -= 3;
702 }
703
704 /* Always output the token */ 719 /* Always output the token */
705 emit_log_char('<'); 720 emit_log_char('<');
706 emit_log_char(current_log_level + '0'); 721 emit_log_char(current_log_level + '0');
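
The rewritten block in vprintk() parses the optional "<x>" prefix once, before the copy loop: '0'..'7' set the log level and force a new line, 'd' (KERN_DEFAULT) only forces a new line, and 'c' (KERN_CONT) is skipped so continuations are appended to the current record. In caller terms these are the usual KERN_* string prefixes; a short hedged sketch of the three cases the parser distinguishes:

	/* Kernel-side sketch of the prefixes the new parser handles. */
	#include <linux/kernel.h>

	static void printk_prefix_examples(void)
	{
		/* "<4>" -- explicit loglevel, starts a new record at WARNING. */
		printk(KERN_WARNING "disk %s reported errors\n", "sda");

		/* "<c>" -- continuation, appended to the previous record. */
		printk(KERN_INFO "frobnicating");
		printk(KERN_CONT " ... done\n");

		/* "<d>" -- default loglevel, but still forces a fresh line. */
		printk(KERN_DEFAULT "back at the default console level\n");
	}
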
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e0409bae..419250ebec4d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,23 +111,18 @@ int __ref profile_init(void)
111 /* only text is profiled */ 111 /* only text is profiled */
112 prof_len = (_etext - _stext) >> prof_shift; 112 prof_len = (_etext - _stext) >> prof_shift;
113 buffer_bytes = prof_len*sizeof(atomic_t); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
117 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
118 return 0;
119 }
120 114
121 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) 115 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
122 return -ENOMEM; 116 return -ENOMEM;
123 117
124 cpumask_copy(prof_cpu_mask, cpu_possible_mask); 118 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
125 119
126 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
127 if (prof_buffer) 121 if (prof_buffer)
128 return 0; 122 return 0;
129 123
130 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); 124 prof_buffer = alloc_pages_exact(buffer_bytes,
125 GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
131 if (prof_buffer) 126 if (prof_buffer)
132 return 0; 127 return 0;
133 128
@@ -371,7 +366,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
371 node = cpu_to_node(cpu); 366 node = cpu_to_node(cpu);
372 per_cpu(cpu_profile_flip, cpu) = 0; 367 per_cpu(cpu_profile_flip, cpu) = 0;
373 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 368 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
374 page = alloc_pages_node(node, 369 page = alloc_pages_exact_node(node,
375 GFP_KERNEL | __GFP_ZERO, 370 GFP_KERNEL | __GFP_ZERO,
376 0); 371 0);
377 if (!page) 372 if (!page)
@@ -379,7 +374,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
379 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 374 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
380 } 375 }
381 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 376 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
382 page = alloc_pages_node(node, 377 page = alloc_pages_exact_node(node,
383 GFP_KERNEL | __GFP_ZERO, 378 GFP_KERNEL | __GFP_ZERO,
384 0); 379 0);
385 if (!page) 380 if (!page)
@@ -570,14 +565,14 @@ static int create_hash_tables(void)
570 int node = cpu_to_node(cpu); 565 int node = cpu_to_node(cpu);
571 struct page *page; 566 struct page *page;
572 567
573 page = alloc_pages_node(node, 568 page = alloc_pages_exact_node(node,
574 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 569 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
575 0); 570 0);
576 if (!page) 571 if (!page)
577 goto out_cleanup; 572 goto out_cleanup;
578 per_cpu(cpu_profile_hits, cpu)[1] 573 per_cpu(cpu_profile_hits, cpu)[1]
579 = (struct profile_hit *)page_address(page); 574 = (struct profile_hit *)page_address(page);
580 page = alloc_pages_node(node, 575 page = alloc_pages_exact_node(node,
581 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 576 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
582 0); 577 0);
583 if (!page) 578 if (!page)
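
The profile_init() hunk above is an instance of a common allocation pattern: try the cheap allocator first with __GFP_NOWARN so an expected failure stays quiet, then fall back to a coarser allocator. A hedged generic sketch of the same idea (names and sizes are illustrative); a real caller must also remember which allocator succeeded so it can pair kfree() or free_pages_exact() on teardown:

	/* Generic sketch of the quiet-fallback allocation pattern. */
	#include <linux/slab.h>
	#include <linux/gfp.h>

	static void *alloc_big_buffer(size_t bytes)
	{
		void *buf;

		/* Let the slab try first, without a failure splat. */
		buf = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN);
		if (buf)
			return buf;

		/* Fall back to exact page allocation for large buffers. */
		return alloc_pages_exact(bytes,
					 GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
	}
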
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42c317874cfa..082c320e4dbf 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,16 +25,6 @@
25 25
26 26
27/* 27/*
28 * Initialize a new task whose father had been ptraced.
29 *
30 * Called from copy_process().
31 */
32void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
33{
34 arch_ptrace_fork(child, clone_flags);
35}
36
37/*
38 * ptrace a task: make the debugger its new parent and 28 * ptrace a task: make the debugger its new parent and
39 * move it to the ptrace list. 29 * move it to the ptrace list.
40 * 30 *
@@ -177,66 +167,82 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
177int ptrace_attach(struct task_struct *task) 167int ptrace_attach(struct task_struct *task)
178{ 168{
179 int retval; 169 int retval;
180 unsigned long flags;
181 170
182 audit_ptrace(task); 171 audit_ptrace(task);
183 172
184 retval = -EPERM; 173 retval = -EPERM;
174 if (unlikely(task->flags & PF_KTHREAD))
175 goto out;
185 if (same_thread_group(task, current)) 176 if (same_thread_group(task, current))
186 goto out; 177 goto out;
187 178
188 /* Protect exec's credential calculations against our interference; 179 /*
189 * SUID, SGID and LSM creds get determined differently under ptrace. 180 * Protect exec's credential calculations against our interference;
 181 * SUID, SGID and LSM creds get determined differently
182 * under ptrace.
190 */ 183 */
191 retval = mutex_lock_interruptible(&task->cred_exec_mutex); 184 retval = -ERESTARTNOINTR;
192 if (retval < 0) 185 if (mutex_lock_interruptible(&task->cred_guard_mutex))
193 goto out; 186 goto out;
194 187
195 retval = -EPERM;
196repeat:
197 /*
198 * Nasty, nasty.
199 *
200 * We want to hold both the task-lock and the
201 * tasklist_lock for writing at the same time.
202 * But that's against the rules (tasklist_lock
203 * is taken for reading by interrupts on other
204 * cpu's that may have task_lock).
205 */
206 task_lock(task); 188 task_lock(task);
207 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
208 task_unlock(task);
209 do {
210 cpu_relax();
211 } while (!write_can_lock(&tasklist_lock));
212 goto repeat;
213 }
214
215 if (!task->mm)
216 goto bad;
217 /* the same process cannot be attached many times */
218 if (task->ptrace & PT_PTRACED)
219 goto bad;
220 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); 189 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
190 task_unlock(task);
221 if (retval) 191 if (retval)
222 goto bad; 192 goto unlock_creds;
193
194 write_lock_irq(&tasklist_lock);
195 retval = -EPERM;
196 if (unlikely(task->exit_state))
197 goto unlock_tasklist;
198 if (task->ptrace)
199 goto unlock_tasklist;
223 200
224 /* Go */ 201 task->ptrace = PT_PTRACED;
225 task->ptrace |= PT_PTRACED;
226 if (capable(CAP_SYS_PTRACE)) 202 if (capable(CAP_SYS_PTRACE))
227 task->ptrace |= PT_PTRACE_CAP; 203 task->ptrace |= PT_PTRACE_CAP;
228 204
229 __ptrace_link(task, current); 205 __ptrace_link(task, current);
230
231 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 206 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
232bad: 207
233 write_unlock_irqrestore(&tasklist_lock, flags); 208 retval = 0;
234 task_unlock(task); 209unlock_tasklist:
235 mutex_unlock(&task->cred_exec_mutex); 210 write_unlock_irq(&tasklist_lock);
211unlock_creds:
212 mutex_unlock(&task->cred_guard_mutex);
236out: 213out:
237 return retval; 214 return retval;
238} 215}
239 216
217/**
218 * ptrace_traceme -- helper for PTRACE_TRACEME
219 *
220 * Performs checks and sets PT_PTRACED.
221 * Should be used by all ptrace implementations for PTRACE_TRACEME.
222 */
223int ptrace_traceme(void)
224{
225 int ret = -EPERM;
226
227 write_lock_irq(&tasklist_lock);
228 /* Are we already being traced? */
229 if (!current->ptrace) {
230 ret = security_ptrace_traceme(current->parent);
231 /*
232 * Check PF_EXITING to ensure ->real_parent has not passed
233 * exit_ptrace(). Otherwise we don't report the error but
234 * pretend ->real_parent untraces us right after return.
235 */
236 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
237 current->ptrace = PT_PTRACED;
238 __ptrace_link(current, current->real_parent);
239 }
240 }
241 write_unlock_irq(&tasklist_lock);
242
243 return ret;
244}
245
240/* 246/*
241 * Called with irqs disabled, returns true if childs should reap themselves. 247 * Called with irqs disabled, returns true if childs should reap themselves.
242 */ 248 */
@@ -418,37 +424,33 @@ static int ptrace_setoptions(struct task_struct *child, long data)
418 424
419static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 425static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
420{ 426{
427 unsigned long flags;
421 int error = -ESRCH; 428 int error = -ESRCH;
422 429
423 read_lock(&tasklist_lock); 430 if (lock_task_sighand(child, &flags)) {
424 if (likely(child->sighand != NULL)) {
425 error = -EINVAL; 431 error = -EINVAL;
426 spin_lock_irq(&child->sighand->siglock);
427 if (likely(child->last_siginfo != NULL)) { 432 if (likely(child->last_siginfo != NULL)) {
428 *info = *child->last_siginfo; 433 *info = *child->last_siginfo;
429 error = 0; 434 error = 0;
430 } 435 }
431 spin_unlock_irq(&child->sighand->siglock); 436 unlock_task_sighand(child, &flags);
432 } 437 }
433 read_unlock(&tasklist_lock);
434 return error; 438 return error;
435} 439}
436 440
437static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) 441static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
438{ 442{
443 unsigned long flags;
439 int error = -ESRCH; 444 int error = -ESRCH;
440 445
441 read_lock(&tasklist_lock); 446 if (lock_task_sighand(child, &flags)) {
442 if (likely(child->sighand != NULL)) {
443 error = -EINVAL; 447 error = -EINVAL;
444 spin_lock_irq(&child->sighand->siglock);
445 if (likely(child->last_siginfo != NULL)) { 448 if (likely(child->last_siginfo != NULL)) {
446 *child->last_siginfo = *info; 449 *child->last_siginfo = *info;
447 error = 0; 450 error = 0;
448 } 451 }
449 spin_unlock_irq(&child->sighand->siglock); 452 unlock_task_sighand(child, &flags);
450 } 453 }
451 read_unlock(&tasklist_lock);
452 return error; 454 return error;
453} 455}
454 456
@@ -575,72 +577,16 @@ int ptrace_request(struct task_struct *child, long request,
575 return ret; 577 return ret;
576} 578}
577 579
578/** 580static struct task_struct *ptrace_get_task_struct(pid_t pid)
579 * ptrace_traceme -- helper for PTRACE_TRACEME
580 *
581 * Performs checks and sets PT_PTRACED.
582 * Should be used by all ptrace implementations for PTRACE_TRACEME.
583 */
584int ptrace_traceme(void)
585{
586 int ret = -EPERM;
587
588 /*
589 * Are we already being traced?
590 */
591repeat:
592 task_lock(current);
593 if (!(current->ptrace & PT_PTRACED)) {
594 /*
595 * See ptrace_attach() comments about the locking here.
596 */
597 unsigned long flags;
598 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
599 task_unlock(current);
600 do {
601 cpu_relax();
602 } while (!write_can_lock(&tasklist_lock));
603 goto repeat;
604 }
605
606 ret = security_ptrace_traceme(current->parent);
607
608 /*
609 * Check PF_EXITING to ensure ->real_parent has not passed
610 * exit_ptrace(). Otherwise we don't report the error but
611 * pretend ->real_parent untraces us right after return.
612 */
613 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
614 current->ptrace |= PT_PTRACED;
615 __ptrace_link(current, current->real_parent);
616 }
617
618 write_unlock_irqrestore(&tasklist_lock, flags);
619 }
620 task_unlock(current);
621 return ret;
622}
623
624/**
625 * ptrace_get_task_struct -- grab a task struct reference for ptrace
626 * @pid: process id to grab a task_struct reference of
627 *
628 * This function is a helper for ptrace implementations. It checks
629 * permissions and then grabs a task struct for use of the actual
630 * ptrace implementation.
631 *
632 * Returns the task_struct for @pid or an ERR_PTR() on failure.
633 */
634struct task_struct *ptrace_get_task_struct(pid_t pid)
635{ 581{
636 struct task_struct *child; 582 struct task_struct *child;
637 583
638 read_lock(&tasklist_lock); 584 rcu_read_lock();
639 child = find_task_by_vpid(pid); 585 child = find_task_by_vpid(pid);
640 if (child) 586 if (child)
641 get_task_struct(child); 587 get_task_struct(child);
588 rcu_read_unlock();
642 589
643 read_unlock(&tasklist_lock);
644 if (!child) 590 if (!child)
645 return ERR_PTR(-ESRCH); 591 return ERR_PTR(-ESRCH);
646 return child; 592 return child;
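
The two kernel entry points reshuffled above map directly onto the PTRACE_TRACEME and PTRACE_ATTACH requests of ptrace(2): ptrace_traceme() links a willing child to its real parent, while ptrace_attach() now serializes against exec via cred_guard_mutex and takes tasklist_lock only for the final linking. A hedged userspace sketch of the two paths:

	/* Userspace view of the two attach paths touched above. */
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		pid_t child = fork();

		if (child == 0) {
			/* Child: PTRACE_TRACEME -> ptrace_traceme() in the
			 * kernel, which links us to our real parent. */
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			execlp("true", "true", (char *)NULL);
			_exit(127);
		}

		waitpid(child, NULL, 0);		/* stops after execve() */
		ptrace(PTRACE_CONT, child, NULL, 0);	/* let it run */
		waitpid(child, NULL, 0);

		/* Attaching to an already-running task instead would be
		 * ptrace(PTRACE_ATTACH, pid, NULL, NULL), i.e. ptrace_attach(). */
		return 0;
	}
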
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index ce97a4df64d3..beb0e659adcc 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg)
1356 1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; 1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); 1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; 1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq, 1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, 1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret); 1362 ret);
1363 1363
1364 /*
1365 * Signals would prevent us from sleeping, and we cannot
1366 * do much with them in any case. So flush them.
1367 */
1368 if (ret)
1369 flush_signals(current);
1370 couldsleepnext = 0; 1364 couldsleepnext = 0;
1371 1365
1372 } while (!kthread_should_stop()); 1366 } while (!kthread_should_stop());
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d2a372fb0b9b..7717b95c2027 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1259 check_cpu_stall(rsp, rdp); 1259 check_cpu_stall(rsp, rdp);
1260 1260
1261 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1261 /* Is the RCU core waiting for a quiescent state from this CPU? */
1262 if (rdp->qs_pending) 1262 if (rdp->qs_pending) {
1263 rdp->n_rp_qs_pending++;
1263 return 1; 1264 return 1;
1265 }
1264 1266
1265 /* Does this CPU have callbacks ready to invoke? */ 1267 /* Does this CPU have callbacks ready to invoke? */
1266 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1268 if (cpu_has_callbacks_ready_to_invoke(rdp)) {
1269 rdp->n_rp_cb_ready++;
1267 return 1; 1270 return 1;
1271 }
1268 1272
1269 /* Has RCU gone idle with this CPU needing another grace period? */ 1273 /* Has RCU gone idle with this CPU needing another grace period? */
1270 if (cpu_needs_another_gp(rsp, rdp)) 1274 if (cpu_needs_another_gp(rsp, rdp)) {
1275 rdp->n_rp_cpu_needs_gp++;
1271 return 1; 1276 return 1;
1277 }
1272 1278
1273 /* Has another RCU grace period completed? */ 1279 /* Has another RCU grace period completed? */
1274 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ 1280 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
1281 rdp->n_rp_gp_completed++;
1275 return 1; 1282 return 1;
1283 }
1276 1284
1277 /* Has a new RCU grace period started? */ 1285 /* Has a new RCU grace period started? */
1278 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ 1286 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
1287 rdp->n_rp_gp_started++;
1279 return 1; 1288 return 1;
1289 }
1280 1290
1281 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1291 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1282 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1292 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1283 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) 1293 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
1294 rdp->n_rp_need_fqs++;
1284 return 1; 1295 return 1;
1296 }
1285 1297
1286 /* nothing to do */ 1298 /* nothing to do */
1299 rdp->n_rp_need_nothing++;
1287 return 0; 1300 return 0;
1288} 1301}
1289 1302
@@ -1520,7 +1533,7 @@ void __init __rcu_init(void)
1520 int j; 1533 int j;
1521 struct rcu_node *rnp; 1534 struct rcu_node *rnp;
1522 1535
1523 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); 1536 printk(KERN_INFO "Hierarchical RCU implementation.\n");
1524#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1525 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1526#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
@@ -1533,7 +1546,6 @@ void __init __rcu_init(void)
1533 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); 1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1534 /* Register notifier for non-boot CPUs */ 1547 /* Register notifier for non-boot CPUs */
1535 register_cpu_notifier(&rcu_nb); 1548 register_cpu_notifier(&rcu_nb);
1536 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1537} 1549}
1538 1550
1539module_param(blimit, int, 0); 1551module_param(blimit, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b1875ba9404..fe1dcdbf1ca3 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = {
213 .release = single_release, 213 .release = single_release,
214}; 214};
215 215
216static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; 216static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
217{
218 seq_printf(m, "%3d%cnp=%ld "
219 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
220 rdp->cpu,
221 cpu_is_offline(rdp->cpu) ? '!' : ' ',
222 rdp->n_rcu_pending,
223 rdp->n_rp_qs_pending,
224 rdp->n_rp_cb_ready,
225 rdp->n_rp_cpu_needs_gp,
226 rdp->n_rp_gp_completed,
227 rdp->n_rp_gp_started,
228 rdp->n_rp_need_fqs,
229 rdp->n_rp_need_nothing);
230}
231
232static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
233{
234 int cpu;
235 struct rcu_data *rdp;
236
237 for_each_possible_cpu(cpu) {
238 rdp = rsp->rda[cpu];
239 if (rdp->beenonline)
240 print_one_rcu_pending(m, rdp);
241 }
242}
243
244static int show_rcu_pending(struct seq_file *m, void *unused)
245{
246 seq_puts(m, "rcu:\n");
247 print_rcu_pendings(m, &rcu_state);
248 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state);
250 return 0;
251}
252
253static int rcu_pending_open(struct inode *inode, struct file *file)
254{
255 return single_open(file, show_rcu_pending, NULL);
256}
257
258static struct file_operations rcu_pending_fops = {
259 .owner = THIS_MODULE,
260 .open = rcu_pending_open,
261 .read = seq_read,
262 .llseek = seq_lseek,
263 .release = single_release,
264};
265
266static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272
217static int __init rcuclassic_trace_init(void) 273static int __init rcuclassic_trace_init(void)
218{ 274{
219 rcudir = debugfs_create_dir("rcu", NULL); 275 rcudir = debugfs_create_dir("rcu", NULL);
@@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void)
238 NULL, &rcuhier_fops); 294 NULL, &rcuhier_fops);
239 if (!hierdir) 295 if (!hierdir)
240 goto free_out; 296 goto free_out;
297
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir)
301 goto free_out;
241 return 0; 302 return 0;
242free_out: 303free_out:
243 if (datadir) 304 if (datadir)
@@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void)
257 debugfs_remove(datadir_csv); 318 debugfs_remove(datadir_csv);
258 debugfs_remove(gpdir); 319 debugfs_remove(gpdir);
259 debugfs_remove(hierdir); 320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
260 debugfs_remove(rcudir); 322 debugfs_remove(rcudir);
261} 323}
262 324
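
The new rcu_pending file joins the other trace files in the "rcu" debugfs directory created above, printing one line per CPU with the counters added to __rcu_pending() earlier in this diff. With debugfs mounted at the customary /sys/kernel/debug (an assumption of the sketch, not something this patch sets up), it reads like any other seq_file:

	/* Dump the new per-CPU "why was RCU pending" counters. */
	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/sys/kernel/debug/rcu/rcu_pending", "r");

		if (!f) {
			perror("fopen");  /* debugfs not mounted or tracing off */
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}
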
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bf8e7534c803..e1338f074314 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -18,7 +18,7 @@
18void res_counter_init(struct res_counter *counter, struct res_counter *parent) 18void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = (unsigned long long)LLONG_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
@@ -133,6 +133,16 @@ int res_counter_memparse_write_strategy(const char *buf,
133 unsigned long long *res) 133 unsigned long long *res)
134{ 134{
135 char *end; 135 char *end;
136
137 /* return RESOURCE_MAX(unlimited) if "-1" is specified */
138 if (*buf == '-') {
139 *res = simple_strtoull(buf + 1, &end, 10);
140 if (*res != 1 || *end != '\0')
141 return -EINVAL;
142 *res = RESOURCE_MAX;
143 return 0;
144 }
145
136 /* FIXME - make memparse() take const char* args */ 146 /* FIXME - make memparse() take const char* args */
137 *res = memparse((char *)buf, &end); 147 *res = memparse((char *)buf, &end);
138 if (*end != '\0') 148 if (*end != '\0')
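
The new branch in res_counter_memparse_write_strategy() lets userspace write the literal string "-1" to mean RESOURCE_MAX, i.e. unlimited, instead of spelling out a huge byte count. With the memory controller as one likely consumer, that looks roughly like the hedged sketch below (the cgroup mount point and group name are placeholders):

	/* Lift a memcg limit by writing "-1", which the kernel now maps
	 * to RESOURCE_MAX in res_counter_memparse_write_strategy().
	 */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/cgroup/mygroup/memory.limit_in_bytes", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fputs("-1\n", f);
		fclose(f);
		return 0;
	}
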
diff --git a/kernel/resource.c b/kernel/resource.c
index ac5f3a36923f..78b087221c15 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -787,7 +787,7 @@ static int __init reserve_setup(char *str)
787 static struct resource reserve[MAXRESERVE]; 787 static struct resource reserve[MAXRESERVE];
788 788
789 for (;;) { 789 for (;;) {
790 int io_start, io_num; 790 unsigned int io_start, io_num;
791 int x = reserved; 791 int x = reserved;
792 792
793 if (get_option (&str, &io_start) != 2) 793 if (get_option (&str, &io_start) != 2)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ffa..29bd4baf9e75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
300 * assigned pending owner [which might not have taken the 300 * assigned pending owner [which might not have taken the
301 * lock yet]: 301 * lock yet]:
302 */ 302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock) 303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
304{ 305{
305 struct task_struct *pendowner = rt_mutex_owner(lock); 306 struct task_struct *pendowner = rt_mutex_owner(lock);
306 struct rt_mutex_waiter *next; 307 struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
309 if (!rt_mutex_owner_pending(lock)) 310 if (!rt_mutex_owner_pending(lock))
310 return 0; 311 return 0;
311 312
312 if (pendowner == current) 313 if (pendowner == task)
313 return 1; 314 return 1;
314 315
315 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 spin_lock_irqsave(&pendowner->pi_lock, flags);
316 if (current->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
317 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
318 return 0; 319 return 0;
319 } 320 }
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
338 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
339 * enqueued on the pending owners pi_waiters queue. So 340 * enqueued on the pending owners pi_waiters queue. So
340 * we have to enqueue this waiter into 341 * we have to enqueue this waiter into
341 * current->pi_waiters list. This covers the case, 342 * task->pi_waiters list. This covers the case,
342 * where current is boosted because it holds another 343 * where task is boosted because it holds another
343 * lock and gets unboosted because the booster is 344 * lock and gets unboosted because the booster is
344 * interrupted, so we would delay a waiter with higher 345 * interrupted, so we would delay a waiter with higher
345 * priority as current->normal_prio. 346 * priority as task->normal_prio.
346 * 347 *
347 * Note: in the rare case of a SCHED_OTHER task changing 348 * Note: in the rare case of a SCHED_OTHER task changing
348 * its priority and thus stealing the lock, next->task 349 * its priority and thus stealing the lock, next->task
349 * might be current: 350 * might be task:
350 */ 351 */
351 if (likely(next->task != current)) { 352 if (likely(next->task != task)) {
352 spin_lock_irqsave(&current->pi_lock, flags); 353 spin_lock_irqsave(&task->pi_lock, flags);
353 plist_add(&next->pi_list_entry, &current->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
354 __rt_mutex_adjust_prio(current); 355 __rt_mutex_adjust_prio(task);
355 spin_unlock_irqrestore(&current->pi_lock, flags); 356 spin_unlock_irqrestore(&task->pi_lock, flags);
356 } 357 }
357 return 1; 358 return 1;
358} 359}
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
389 */ 390 */
390 mark_rt_mutex_waiters(lock); 391 mark_rt_mutex_waiters(lock);
391 392
392 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) 393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
393 return 0; 394 return 0;
394 395
395 /* We got the lock. */ 396 /* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
411 */ 412 */
412static int task_blocks_on_rt_mutex(struct rt_mutex *lock, 413static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
413 struct rt_mutex_waiter *waiter, 414 struct rt_mutex_waiter *waiter,
415 struct task_struct *task,
414 int detect_deadlock) 416 int detect_deadlock)
415{ 417{
416 struct task_struct *owner = rt_mutex_owner(lock); 418 struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
418 unsigned long flags; 420 unsigned long flags;
419 int chain_walk = 0, res; 421 int chain_walk = 0, res;
420 422
421 spin_lock_irqsave(&current->pi_lock, flags); 423 spin_lock_irqsave(&task->pi_lock, flags);
422 __rt_mutex_adjust_prio(current); 424 __rt_mutex_adjust_prio(task);
423 waiter->task = current; 425 waiter->task = task;
424 waiter->lock = lock; 426 waiter->lock = lock;
425 plist_node_init(&waiter->list_entry, current->prio); 427 plist_node_init(&waiter->list_entry, task->prio);
426 plist_node_init(&waiter->pi_list_entry, current->prio); 428 plist_node_init(&waiter->pi_list_entry, task->prio);
427 429
428 /* Get the top priority waiter on the lock */ 430 /* Get the top priority waiter on the lock */
429 if (rt_mutex_has_waiters(lock)) 431 if (rt_mutex_has_waiters(lock))
430 top_waiter = rt_mutex_top_waiter(lock); 432 top_waiter = rt_mutex_top_waiter(lock);
431 plist_add(&waiter->list_entry, &lock->wait_list); 433 plist_add(&waiter->list_entry, &lock->wait_list);
432 434
433 current->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
434 436
435 spin_unlock_irqrestore(&current->pi_lock, flags); 437 spin_unlock_irqrestore(&task->pi_lock, flags);
436 438
437 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
438 spin_lock_irqsave(&owner->pi_lock, flags); 440 spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
460 spin_unlock(&lock->wait_lock); 462 spin_unlock(&lock->wait_lock);
461 463
462 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
463 current); 465 task);
464 466
465 spin_lock(&lock->wait_lock); 467 spin_lock(&lock->wait_lock);
466 468
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
605 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 607 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
606} 608}
607 609
608/* 610/**
609 * Slow path lock function: 611 * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
612 * @lock: the rt_mutex to take
613 * @state: the state the task should block in (TASK_INTERRUPTIBLE
614 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 *
619 * lock->wait_lock must be held by the caller.
610 */ 620 */
611static int __sched 621static int __sched
612rt_mutex_slowlock(struct rt_mutex *lock, int state, 622__rt_mutex_slowlock(struct rt_mutex *lock, int state,
613 struct hrtimer_sleeper *timeout, 623 struct hrtimer_sleeper *timeout,
614 int detect_deadlock) 624 struct rt_mutex_waiter *waiter,
625 int detect_deadlock)
615{ 626{
616 struct rt_mutex_waiter waiter;
617 int ret = 0; 627 int ret = 0;
618 628
619 debug_rt_mutex_init_waiter(&waiter);
620 waiter.task = NULL;
621
622 spin_lock(&lock->wait_lock);
623
624 /* Try to acquire the lock again: */
625 if (try_to_take_rt_mutex(lock)) {
626 spin_unlock(&lock->wait_lock);
627 return 0;
628 }
629
630 set_current_state(state);
631
632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) {
634 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
635 if (!hrtimer_active(&timeout->timer))
636 timeout->task = NULL;
637 }
638
639 for (;;) { 629 for (;;) {
640 /* Try to acquire the lock: */ 630 /* Try to acquire the lock: */
641 if (try_to_take_rt_mutex(lock)) 631 if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
656 } 646 }
657 647
658 /* 648 /*
659 * waiter.task is NULL the first time we come here and 649 * waiter->task is NULL the first time we come here and
660 * when we have been woken up by the previous owner 650 * when we have been woken up by the previous owner
661 * but the lock got stolen by a higher prio task. 651 * but the lock got stolen by a higher prio task.
662 */ 652 */
663 if (!waiter.task) { 653 if (!waiter->task) {
664 ret = task_blocks_on_rt_mutex(lock, &waiter, 654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
665 detect_deadlock); 655 detect_deadlock);
666 /* 656 /*
667 * If we got woken up by the owner then start loop 657 * If we got woken up by the owner then start loop
668 * all over without going into schedule to try 658 * all over without going into schedule to try
669 * to get the lock now: 659 * to get the lock now:
670 */ 660 */
671 if (unlikely(!waiter.task)) { 661 if (unlikely(!waiter->task)) {
672 /* 662 /*
673 * Reset the return value. We might 663 * Reset the return value. We might
674 * have returned with -EDEADLK and the 664 * have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
684 674
685 spin_unlock(&lock->wait_lock); 675 spin_unlock(&lock->wait_lock);
686 676
687 debug_rt_mutex_print_deadlock(&waiter); 677 debug_rt_mutex_print_deadlock(waiter);
688 678
689 if (waiter.task) 679 if (waiter->task)
690 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
691 681
692 spin_lock(&lock->wait_lock); 682 spin_lock(&lock->wait_lock);
693 set_current_state(state); 683 set_current_state(state);
694 } 684 }
695 685
686 return ret;
687}
688
689/*
690 * Slow path lock function:
691 */
692static int __sched
693rt_mutex_slowlock(struct rt_mutex *lock, int state,
694 struct hrtimer_sleeper *timeout,
695 int detect_deadlock)
696{
697 struct rt_mutex_waiter waiter;
698 int ret = 0;
699
700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702
703 spin_lock(&lock->wait_lock);
704
705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock);
708 return 0;
709 }
710
711 set_current_state(state);
712
713 /* Setup the timer, when timeout != NULL */
714 if (unlikely(timeout)) {
715 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
716 if (!hrtimer_active(&timeout->timer))
717 timeout->task = NULL;
718 }
719
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
721 detect_deadlock);
722
696 set_current_state(TASK_RUNNING); 723 set_current_state(TASK_RUNNING);
697 724
698 if (unlikely(waiter.task)) 725 if (unlikely(waiter.task))
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
864EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 891EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
865 892
866/** 893/**
867 * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible 894 * rt_mutex_timed_lock - lock a rt_mutex interruptible
868 * the timeout structure is provided 895 * the timeout structure is provided
869 * by the caller 896 * by the caller
870 * 897 *
871 * @lock: the rt_mutex to be locked 898 * @lock: the rt_mutex to be locked
872 * @timeout: timeout structure or NULL (no timeout) 899 * @timeout: timeout structure or NULL (no timeout)
@@ -875,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
875 * Returns: 902 * Returns:
876 * 0 on success 903 * 0 on success
877 * -EINTR when interrupted by a signal 904 * -EINTR when interrupted by a signal
878 * -ETIMEOUT when the timeout expired 905 * -ETIMEDOUT when the timeout expired
879 * -EDEADLK when the lock would deadlock (when deadlock detection is on) 906 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
880 */ 907 */
881int 908int
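The return values listed above are easiest to read next to a caller. Below is a minimal, hypothetical sketch; lock_with_deadline() and its arguments are invented rather than taken from this patch, and the hrtimer_sleeper setup simply follows the common pattern of this era. Note that the slow path shown earlier starts the timer itself, so the caller only has to set the expiry:

#include <linux/hrtimer.h>
#include <linux/rtmutex.h>
#include <linux/sched.h>

/* Hypothetical caller: take 'lock' or give up at the absolute time 'deadline'. */
static int lock_with_deadline(struct rt_mutex *lock, ktime_t deadline)
{
        struct hrtimer_sleeper to;
        int ret;

        hrtimer_init_on_stack(&to.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        hrtimer_init_sleeper(&to, current);
        hrtimer_set_expires(&to.timer, deadline);

        /* 0, -EINTR, -ETIMEDOUT or -EDEADLK, as documented above. */
        ret = rt_mutex_timed_lock(lock, &to, 0);

        destroy_hrtimer_on_stack(&to.timer);
        return ret;
}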
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
913} 940}
914EXPORT_SYMBOL_GPL(rt_mutex_unlock); 941EXPORT_SYMBOL_GPL(rt_mutex_unlock);
915 942
916/*** 943/**
917 * rt_mutex_destroy - mark a mutex unusable 944 * rt_mutex_destroy - mark a mutex unusable
918 * @lock: the mutex to be destroyed 945 * @lock: the mutex to be destroyed
919 * 946 *
@@ -986,6 +1013,57 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
986} 1013}
987 1014
988/** 1015/**
1016 * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1017 * @lock: the rt_mutex to take
1018 * @waiter: the pre-initialized rt_mutex_waiter
1019 * @task: the task to prepare
1020 * @detect_deadlock: perform deadlock detection (1) or not (0)
1021 *
1022 * Returns:
1023 * 0 - task blocked on lock
1024 * 1 - acquired the lock for task, caller should wake it up
1025 * <0 - error
1026 *
1027 * Special API call for FUTEX_REQUEUE_PI support.
1028 */
1029int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1030 struct rt_mutex_waiter *waiter,
1031 struct task_struct *task, int detect_deadlock)
1032{
1033 int ret;
1034
1035 spin_lock(&lock->wait_lock);
1036
1037 mark_rt_mutex_waiters(lock);
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0);
1043 spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1;
1046 }
1047
1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1049
1050 if (ret && !waiter->task) {
1051 /*
1052 * Reset the return value. We might have
1053 * returned with -EDEADLK and the owner
1054 * released the lock while we were walking the
1055 * pi chain. Let the waiter sort it out.
1056 */
1057 ret = 0;
1058 }
1059 spin_unlock(&lock->wait_lock);
1060
1061 debug_rt_mutex_print_deadlock(waiter);
1062
1063 return ret;
1064}
1065
1066/**
989 * rt_mutex_next_owner - return the next owner of the lock 1067 * rt_mutex_next_owner - return the next owner of the lock
990 * 1068 *
991 * @lock: the rt lock query 1069 * @lock: the rt lock query
@@ -1004,3 +1082,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
1004 1082
1005 return rt_mutex_top_waiter(lock)->task; 1083 return rt_mutex_top_waiter(lock)->task;
1006} 1084}
1085
1086/**
1087 * rt_mutex_finish_proxy_lock() - Complete lock acquisition
1088 * @lock: the rt_mutex we were woken on
1089 * @to: the timeout, null if none. hrtimer should already have
1090 * been started.
1091 * @waiter: the pre-initialized rt_mutex_waiter
1092 * @detect_deadlock: perform deadlock detection (1) or not (0)
1093 *
1094 * Complete the lock acquisition started on our behalf by another thread.

1095 *
1096 * Returns:
1097 * 0 - success
1098 * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
1099 *
1100 * Special API call for PI-futex requeue support
1101 */
1102int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1103 struct hrtimer_sleeper *to,
1104 struct rt_mutex_waiter *waiter,
1105 int detect_deadlock)
1106{
1107 int ret;
1108
1109 spin_lock(&lock->wait_lock);
1110
1111 set_current_state(TASK_INTERRUPTIBLE);
1112
1113 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
1114 detect_deadlock);
1115
1116 set_current_state(TASK_RUNNING);
1117
1118 if (unlikely(waiter->task))
1119 remove_waiter(lock, waiter);
1120
1121 /*
1122 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
1123 * have to fix that up.
1124 */
1125 fixup_rt_mutex_waiters(lock);
1126
1127 spin_unlock(&lock->wait_lock);
1128
1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been
1131 * the pending owner and boosted. Since we did not take the lock, the
1132 * PI boost has to go.
1133 */
1134 if (unlikely(ret))
1135 rt_mutex_adjust_prio(current);
1136
1137 return ret;
1138}
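To see how the two proxy-lock halves are meant to fit together, here is a hedged sketch of a requeue-style caller. It is not the FUTEX_REQUEUE_PI code itself (that is the intended in-tree user, per the "Special API call" comments above); the helper names start_acquire_for() and finish_acquire() are invented, error handling is trimmed, and it assumes the kernel-internal rtmutex_common.h is in scope:

#include <linux/sched.h>
#include "rtmutex_common.h"     /* kernel-internal header, as kernel/futex.c uses */

/* Setup side: runs in the task doing the requeue, not in 'task' itself. */
static int start_acquire_for(struct rt_mutex *lock,
                             struct rt_mutex_waiter *waiter,
                             struct task_struct *task)
{
        int ret;

        debug_rt_mutex_init_waiter(waiter);
        waiter->task = NULL;

        ret = rt_mutex_start_proxy_lock(lock, waiter, task, 0);
        if (ret == 1) {
                /* Lock was taken on task's behalf; the caller wakes it up. */
                wake_up_process(task);
                return 0;
        }
        return ret;             /* 0: task is now a waiter, <0: error */
}

/* Acquirer side: runs in 'task' after it has been woken up. */
static int finish_acquire(struct rt_mutex *lock,
                          struct rt_mutex_waiter *waiter,
                          struct hrtimer_sleeper *to)
{
        /*
         * Sleeps interruptibly until the lock is acquired, the timeout
         * expires or a signal arrives; removes the waiter and drops any
         * leftover PI boost on failure.
         */
        return rt_mutex_finish_proxy_lock(lock, to, waiter, 0);
}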
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e124bf5800ea..97a2f81866af 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 122 struct task_struct *proxy_owner);
123extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
124 struct rt_mutex_waiter *waiter,
125 struct task_struct *task,
126 int detect_deadlock);
127extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
128 struct hrtimer_sleeper *to,
129 struct rt_mutex_waiter *waiter,
130 int detect_deadlock);
123 131
124#ifdef CONFIG_DEBUG_RT_MUTEXES 132#ifdef CONFIG_DEBUG_RT_MUTEXES
125# include "rtmutex-debug.h" 133# include "rtmutex-debug.h"
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..1b59e265273b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h>
42#include <linux/security.h> 43#include <linux/security.h>
43#include <linux/notifier.h> 44#include <linux/notifier.h>
44#include <linux/profile.h> 45#include <linux/profile.h>
@@ -68,17 +69,18 @@
68#include <linux/pagemap.h> 69#include <linux/pagemap.h>
69#include <linux/hrtimer.h> 70#include <linux/hrtimer.h>
70#include <linux/tick.h> 71#include <linux/tick.h>
71#include <linux/bootmem.h>
72#include <linux/debugfs.h> 72#include <linux/debugfs.h>
73#include <linux/ctype.h> 73#include <linux/ctype.h>
74#include <linux/ftrace.h> 74#include <linux/ftrace.h>
75#include <trace/sched.h>
76 75
77#include <asm/tlb.h> 76#include <asm/tlb.h>
78#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
79 78
80#include "sched_cpupri.h" 79#include "sched_cpupri.h"
81 80
81#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h>
83
82/* 84/*
83 * Convert user-nice values [ -20 ... 0 ... 19 ] 85 * Convert user-nice values [ -20 ... 0 ... 19 ]
84 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 86 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
118 */ 120 */
119#define RUNTIME_INF ((u64)~0ULL) 121#define RUNTIME_INF ((u64)~0ULL)
120 122
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
127#ifdef CONFIG_SMP 123#ifdef CONFIG_SMP
128 124
129static void double_rq_lock(struct rq *rq1, struct rq *rq2); 125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -244,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
244 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 240 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
245 delta = ktime_to_ns(ktime_sub(hard, soft)); 241 delta = ktime_to_ns(ktime_sub(hard, soft));
246 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 242 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
247 HRTIMER_MODE_ABS, 0); 243 HRTIMER_MODE_ABS_PINNED, 0);
248 } 244 }
249 spin_unlock(&rt_b->rt_runtime_lock); 245 spin_unlock(&rt_b->rt_runtime_lock);
250} 246}
@@ -497,6 +493,7 @@ struct rt_rq {
497#endif 493#endif
498#ifdef CONFIG_SMP 494#ifdef CONFIG_SMP
499 unsigned long rt_nr_migratory; 495 unsigned long rt_nr_migratory;
496 unsigned long rt_nr_total;
500 int overloaded; 497 int overloaded;
501 struct plist_head pushable_tasks; 498 struct plist_head pushable_tasks;
502#endif 499#endif
@@ -584,6 +581,7 @@ struct rq {
584 struct load_weight load; 581 struct load_weight load;
585 unsigned long nr_load_updates; 582 unsigned long nr_load_updates;
586 u64 nr_switches; 583 u64 nr_switches;
584 u64 nr_migrations_in;
587 585
588 struct cfs_rq cfs; 586 struct cfs_rq cfs;
589 struct rt_rq rt; 587 struct rt_rq rt;
@@ -630,6 +628,10 @@ struct rq {
630 struct list_head migration_queue; 628 struct list_head migration_queue;
631#endif 629#endif
632 630
631 /* calc_load related fields */
632 unsigned long calc_load_update;
633 long calc_load_active;
634
633#ifdef CONFIG_SCHED_HRTICK 635#ifdef CONFIG_SCHED_HRTICK
634#ifdef CONFIG_SMP 636#ifdef CONFIG_SMP
635 int hrtick_csd_pending; 637 int hrtick_csd_pending;
@@ -692,7 +694,7 @@ static inline int cpu_of(struct rq *rq)
692#define task_rq(p) cpu_rq(task_cpu(p)) 694#define task_rq(p) cpu_rq(task_cpu(p))
693#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 695#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
694 696
695static inline void update_rq_clock(struct rq *rq) 697inline void update_rq_clock(struct rq *rq)
696{ 698{
697 rq->clock = sched_clock_cpu(cpu_of(rq)); 699 rq->clock = sched_clock_cpu(cpu_of(rq));
698} 700}
@@ -1154,7 +1156,7 @@ static __init void init_hrtick(void)
1154static void hrtick_start(struct rq *rq, u64 delay) 1156static void hrtick_start(struct rq *rq, u64 delay)
1155{ 1157{
1156 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 1158 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1157 HRTIMER_MODE_REL, 0); 1159 HRTIMER_MODE_REL_PINNED, 0);
1158} 1160}
1159 1161
1160static inline void init_hrtick(void) 1162static inline void init_hrtick(void)
@@ -1728,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1728} 1730}
1729#endif 1731#endif
1730 1732
1733static void calc_load_account_active(struct rq *this_rq);
1734
1731#include "sched_stats.h" 1735#include "sched_stats.h"
1732#include "sched_idletask.c" 1736#include "sched_idletask.c"
1733#include "sched_fair.c" 1737#include "sched_fair.c"
@@ -1958,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1958 1962
1959 clock_offset = old_rq->clock - new_rq->clock; 1963 clock_offset = old_rq->clock - new_rq->clock;
1960 1964
1961 trace_sched_migrate_task(p, task_cpu(p), new_cpu); 1965 trace_sched_migrate_task(p, new_cpu);
1962 1966
1963#ifdef CONFIG_SCHEDSTATS 1967#ifdef CONFIG_SCHEDSTATS
1964 if (p->se.wait_start) 1968 if (p->se.wait_start)
@@ -1967,12 +1971,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1967 p->se.sleep_start -= clock_offset; 1971 p->se.sleep_start -= clock_offset;
1968 if (p->se.block_start) 1972 if (p->se.block_start)
1969 p->se.block_start -= clock_offset; 1973 p->se.block_start -= clock_offset;
1974#endif
1970 if (old_cpu != new_cpu) { 1975 if (old_cpu != new_cpu) {
1971 schedstat_inc(p, se.nr_migrations); 1976 p->se.nr_migrations++;
1977 new_rq->nr_migrations_in++;
1978#ifdef CONFIG_SCHEDSTATS
1972 if (task_hot(p, old_rq->clock, NULL)) 1979 if (task_hot(p, old_rq->clock, NULL))
1973 schedstat_inc(p, se.nr_forced2_migrations); 1980 schedstat_inc(p, se.nr_forced2_migrations);
1974 }
1975#endif 1981#endif
1982 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1983 1, 1, NULL, 0);
1984 }
1976 p->se.vruntime -= old_cfsrq->min_vruntime - 1985 p->se.vruntime -= old_cfsrq->min_vruntime -
1977 new_cfsrq->min_vruntime; 1986 new_cfsrq->min_vruntime;
1978 1987
@@ -2015,6 +2024,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2015} 2024}
2016 2025
2017/* 2026/*
2027 * wait_task_context_switch - wait for a thread to complete at least one
2028 * context switch.
2029 *
2030 * @p must not be current.
2031 */
2032void wait_task_context_switch(struct task_struct *p)
2033{
2034 unsigned long nvcsw, nivcsw, flags;
2035 int running;
2036 struct rq *rq;
2037
2038 nvcsw = p->nvcsw;
2039 nivcsw = p->nivcsw;
2040 for (;;) {
2041 /*
2042 * The runqueue is assigned before the actual context
2043 * switch. We need to take the runqueue lock.
2044 *
2045 * We could check initially without the lock but it is
2046 * very likely that we need to take the lock in every
2047 * iteration.
2048 */
2049 rq = task_rq_lock(p, &flags);
2050 running = task_running(rq, p);
2051 task_rq_unlock(rq, &flags);
2052
2053 if (likely(!running))
2054 break;
2055 /*
2056 * The switch count is incremented before the actual
2057 * context switch. We thus wait for two switches to be
2058 * sure at least one completed.
2059 */
2060 if ((p->nvcsw - nvcsw) > 1)
2061 break;
2062 if ((p->nivcsw - nivcsw) > 1)
2063 break;
2064
2065 cpu_relax();
2066 }
2067}
2068
2069/*
2018 * wait_task_inactive - wait for a thread to unschedule. 2070 * wait_task_inactive - wait for a thread to unschedule.
2019 * 2071 *
2020 * If @match_state is nonzero, it's the @p->state value just checked and 2072 * If @match_state is nonzero, it's the @p->state value just checked and
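A brief, hypothetical illustration of the intended use of wait_task_context_switch(); only that function comes from this patch, the helper below is made up:

/*
 * Suppose some per-task state is only sampled by the scheduler when the
 * task is switched in.  After changing it from the outside, wait until
 * the change is guaranteed to have taken effect.
 */
static void update_and_sync(struct task_struct *p)
{
        mark_task_state_dirty(p);               /* made-up helper */

        if (p != current)                       /* @p must not be current */
                wait_task_context_switch(p);
}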
@@ -2142,6 +2194,7 @@ void kick_process(struct task_struct *p)
2142 smp_send_reschedule(cpu); 2194 smp_send_reschedule(cpu);
2143 preempt_enable(); 2195 preempt_enable();
2144} 2196}
2197EXPORT_SYMBOL_GPL(kick_process);
2145 2198
2146/* 2199/*
2147 * Return a low guess at the load of a migration-source cpu weighted 2200 * Return a low guess at the load of a migration-source cpu weighted
@@ -2324,6 +2377,27 @@ static int sched_balance_self(int cpu, int flag)
2324 2377
2325#endif /* CONFIG_SMP */ 2378#endif /* CONFIG_SMP */
2326 2379
2380/**
2381 * task_oncpu_function_call - call a function on the cpu on which a task runs
2382 * @p: the task to evaluate
2383 * @func: the function to be called
2384 * @info: the function call argument
2385 *
2386 * Calls the function @func when the task is currently running. If that
2387 * happens to be the current CPU, the function is simply called directly.
2388 */
2389void task_oncpu_function_call(struct task_struct *p,
2390 void (*func) (void *info), void *info)
2391{
2392 int cpu;
2393
2394 preempt_disable();
2395 cpu = task_cpu(p);
2396 if (task_curr(p))
2397 smp_call_function_single(cpu, func, info, 1);
2398 preempt_enable();
2399}
2400
2327/*** 2401/***
2328 * try_to_wake_up - wake up a thread 2402 * try_to_wake_up - wake up a thread
2329 * @p: the to-be-woken-up thread 2403 * @p: the to-be-woken-up thread
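As a concrete but hypothetical use of task_oncpu_function_call(), the sketch below samples which CPU a task is currently running on; struct remote_probe and both function names are invented. Note the caveat implied above: if the task is not on a CPU at that moment, the callback is not invoked at all:

#include <linux/sched.h>
#include <linux/smp.h>

struct remote_probe {
        struct task_struct *task;
        int cpu;                        /* filled in by the callback */
};

static void probe_on_task_cpu(void *info)
{
        struct remote_probe *rp = info;

        /* Runs synchronously on the CPU where rp->task is running. */
        rp->cpu = smp_processor_id();
}

static void probe_task(struct task_struct *p)
{
        struct remote_probe rp = { .task = p, .cpu = -1 };

        task_oncpu_function_call(p, probe_on_task_cpu, &rp);
        /* rp.cpu stays -1 if p was not running when we looked. */
}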
@@ -2458,6 +2532,17 @@ out:
2458 return success; 2532 return success;
2459} 2533}
2460 2534
2535/**
2536 * wake_up_process - Wake up a specific process
2537 * @p: The process to be woken up.
2538 *
2539 * Attempt to wake up the nominated process and move it to the set of runnable
2540 * processes. Returns 1 if the process was woken up, 0 if it was already
2541 * running.
2542 *
2543 * It may be assumed that this function implies a write memory barrier before
2544 * changing the task state if and only if any tasks are woken up.
2545 */
2461int wake_up_process(struct task_struct *p) 2546int wake_up_process(struct task_struct *p)
2462{ 2547{
2463 return try_to_wake_up(p, TASK_ALL, 0); 2548 return try_to_wake_up(p, TASK_ALL, 0);
@@ -2480,21 +2565,44 @@ static void __sched_fork(struct task_struct *p)
2480 p->se.exec_start = 0; 2565 p->se.exec_start = 0;
2481 p->se.sum_exec_runtime = 0; 2566 p->se.sum_exec_runtime = 0;
2482 p->se.prev_sum_exec_runtime = 0; 2567 p->se.prev_sum_exec_runtime = 0;
2568 p->se.nr_migrations = 0;
2483 p->se.last_wakeup = 0; 2569 p->se.last_wakeup = 0;
2484 p->se.avg_overlap = 0; 2570 p->se.avg_overlap = 0;
2485 p->se.start_runtime = 0; 2571 p->se.start_runtime = 0;
2486 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2487 2573
2488#ifdef CONFIG_SCHEDSTATS 2574#ifdef CONFIG_SCHEDSTATS
2489 p->se.wait_start = 0; 2575 p->se.wait_start = 0;
2490 p->se.sum_sleep_runtime = 0; 2576 p->se.wait_max = 0;
2491 p->se.sleep_start = 0; 2577 p->se.wait_count = 0;
2492 p->se.block_start = 0; 2578 p->se.wait_sum = 0;
2493 p->se.sleep_max = 0; 2579
2494 p->se.block_max = 0; 2580 p->se.sleep_start = 0;
2495 p->se.exec_max = 0; 2581 p->se.sleep_max = 0;
2496 p->se.slice_max = 0; 2582 p->se.sum_sleep_runtime = 0;
2497 p->se.wait_max = 0; 2583
2584 p->se.block_start = 0;
2585 p->se.block_max = 0;
2586 p->se.exec_max = 0;
2587 p->se.slice_max = 0;
2588
2589 p->se.nr_migrations_cold = 0;
2590 p->se.nr_failed_migrations_affine = 0;
2591 p->se.nr_failed_migrations_running = 0;
2592 p->se.nr_failed_migrations_hot = 0;
2593 p->se.nr_forced_migrations = 0;
2594 p->se.nr_forced2_migrations = 0;
2595
2596 p->se.nr_wakeups = 0;
2597 p->se.nr_wakeups_sync = 0;
2598 p->se.nr_wakeups_migrate = 0;
2599 p->se.nr_wakeups_local = 0;
2600 p->se.nr_wakeups_remote = 0;
2601 p->se.nr_wakeups_affine = 0;
2602 p->se.nr_wakeups_affine_attempts = 0;
2603 p->se.nr_wakeups_passive = 0;
2604 p->se.nr_wakeups_idle = 0;
2605
2498#endif 2606#endif
2499 2607
2500 INIT_LIST_HEAD(&p->rt.run_list); 2608 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2710,6 +2818,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2710 */ 2818 */
2711 prev_state = prev->state; 2819 prev_state = prev->state;
2712 finish_arch_switch(prev); 2820 finish_arch_switch(prev);
2821 perf_counter_task_sched_in(current, cpu_of(rq));
2713 finish_lock_switch(rq, prev); 2822 finish_lock_switch(rq, prev);
2714#ifdef CONFIG_SMP 2823#ifdef CONFIG_SMP
2715 if (post_schedule) 2824 if (post_schedule)
@@ -2766,7 +2875,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2766 * combine the page table reload and the switch backend into 2875 * combine the page table reload and the switch backend into
2767 * one hypercall. 2876 * one hypercall.
2768 */ 2877 */
2769 arch_enter_lazy_cpu_mode(); 2878 arch_start_context_switch(prev);
2770 2879
2771 if (unlikely(!mm)) { 2880 if (unlikely(!mm)) {
2772 next->active_mm = oldmm; 2881 next->active_mm = oldmm;
@@ -2856,19 +2965,81 @@ unsigned long nr_iowait(void)
2856 return sum; 2965 return sum;
2857} 2966}
2858 2967
2859unsigned long nr_active(void) 2968/* Variables and functions for calc_load */
2969static atomic_long_t calc_load_tasks;
2970static unsigned long calc_load_update;
2971unsigned long avenrun[3];
2972EXPORT_SYMBOL(avenrun);
2973
2974/**
2975 * get_avenrun - get the load average array
2976 * @loads: pointer to dest load array
2977 * @offset: offset to add
2978 * @shift: shift count to shift the result left
2979 *
2980 * These values are estimates at best, so no need for locking.
2981 */
2982void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2860{ 2983{
2861 unsigned long i, running = 0, uninterruptible = 0; 2984 loads[0] = (avenrun[0] + offset) << shift;
2985 loads[1] = (avenrun[1] + offset) << shift;
2986 loads[2] = (avenrun[2] + offset) << shift;
2987}
2862 2988
2863 for_each_online_cpu(i) { 2989static unsigned long
2864 running += cpu_rq(i)->nr_running; 2990calc_load(unsigned long load, unsigned long exp, unsigned long active)
2865 uninterruptible += cpu_rq(i)->nr_uninterruptible; 2991{
2866 } 2992 load *= exp;
2993 load += active * (FIXED_1 - exp);
2994 return load >> FSHIFT;
2995}
2996
2997/*
2998 * calc_global_load - update the avenrun load estimates 10 ticks after the
2999 * CPUs have updated calc_load_tasks.
3000 */
3001void calc_global_load(void)
3002{
3003 unsigned long upd = calc_load_update + 10;
3004 long active;
3005
3006 if (time_before(jiffies, upd))
3007 return;
3008
3009 active = atomic_long_read(&calc_load_tasks);
3010 active = active > 0 ? active * FIXED_1 : 0;
2867 3011
2868 if (unlikely((long)uninterruptible < 0)) 3012 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2869 uninterruptible = 0; 3013 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3014 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2870 3015
2871 return running + uninterruptible; 3016 calc_load_update += LOAD_FREQ;
3017}
3018
3019/*
3020 * Either called from update_cpu_load() or from a cpu going idle
3021 */
3022static void calc_load_account_active(struct rq *this_rq)
3023{
3024 long nr_active, delta;
3025
3026 nr_active = this_rq->nr_running;
3027 nr_active += (long) this_rq->nr_uninterruptible;
3028
3029 if (nr_active != this_rq->calc_load_active) {
3030 delta = nr_active - this_rq->calc_load_active;
3031 this_rq->calc_load_active = nr_active;
3032 atomic_long_add(delta, &calc_load_tasks);
3033 }
3034}
3035
3036/*
3037 * Externally visible per-cpu scheduler statistics:
3038 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3039 */
3040u64 cpu_nr_migrations(int cpu)
3041{
3042 return cpu_rq(cpu)->nr_migrations_in;
2872} 3043}
2873 3044
2874/* 3045/*
@@ -2899,6 +3070,11 @@ static void update_cpu_load(struct rq *this_rq)
2899 new_load += scale-1; 3070 new_load += scale-1;
2900 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3071 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2901 } 3072 }
3073
3074 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3075 this_rq->calc_load_update += LOAD_FREQ;
3076 calc_load_account_active(this_rq);
3077 }
2902} 3078}
2903 3079
2904#ifdef CONFIG_SMP 3080#ifdef CONFIG_SMP
@@ -4240,10 +4416,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4240static struct { 4416static struct {
4241 atomic_t load_balancer; 4417 atomic_t load_balancer;
4242 cpumask_var_t cpu_mask; 4418 cpumask_var_t cpu_mask;
4419 cpumask_var_t ilb_grp_nohz_mask;
4243} nohz ____cacheline_aligned = { 4420} nohz ____cacheline_aligned = {
4244 .load_balancer = ATOMIC_INIT(-1), 4421 .load_balancer = ATOMIC_INIT(-1),
4245}; 4422};
4246 4423
4424int get_nohz_load_balancer(void)
4425{
4426 return atomic_read(&nohz.load_balancer);
4427}
4428
4429#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4430/**
4431 * lowest_flag_domain - Return lowest sched_domain containing flag.
4432 * @cpu: The cpu whose lowest level of sched domain is to
4433 * be returned.
4434 * @flag: The flag to check for the lowest sched_domain
4435 * for the given cpu.
4436 *
4437 * Returns the lowest sched_domain of a cpu which contains the given flag.
4438 */
4439static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4440{
4441 struct sched_domain *sd;
4442
4443 for_each_domain(cpu, sd)
4444 if (sd && (sd->flags & flag))
4445 break;
4446
4447 return sd;
4448}
4449
4450/**
4451 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4452 * @cpu: The cpu whose domains we're iterating over.
4453 * @sd: variable holding the value of the power_savings_sd
4454 * for cpu.
4455 * @flag: The flag to filter the sched_domains to be iterated.
4456 *
4457 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4458 * set, starting from the lowest sched_domain to the highest.
4459 */
4460#define for_each_flag_domain(cpu, sd, flag) \
4461 for (sd = lowest_flag_domain(cpu, flag); \
4462 (sd && (sd->flags & flag)); sd = sd->parent)
4463
4464/**
4465 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4466 * @ilb_group: group to be checked for semi-idleness
4467 *
4468 * Returns: 1 if the group is semi-idle. 0 otherwise.
4469 *
4470 * We define a sched_group to be semi-idle if it has at least one idle CPU
4471 * and at least one non-idle CPU. This helper function checks if the given
4472 * sched_group is semi-idle or not.
4473 */
4474static inline int is_semi_idle_group(struct sched_group *ilb_group)
4475{
4476 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4477 sched_group_cpus(ilb_group));
4478
4479 /*
4480 * A sched_group is semi-idle when it has at least one busy cpu
4481 * and at least one idle cpu.
4482 */
4483 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4484 return 0;
4485
4486 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4487 return 0;
4488
4489 return 1;
4490}
4491/**
4492 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4493 * @cpu: The cpu which is nominating a new idle_load_balancer.
4494 *
4495 * Returns: the id of the idle load balancer if it exists,
4496 * else >= nr_cpu_ids.
4497 *
4498 * This algorithm picks the idle load balancer such that it belongs to a
4499 * semi-idle powersavings sched_domain. The idea is to try and avoid
4500 * completely idle packages/cores just for the purpose of idle load balancing
4501 * when there are other idle CPUs which are better suited for that job.
4502 */
4503static int find_new_ilb(int cpu)
4504{
4505 struct sched_domain *sd;
4506 struct sched_group *ilb_group;
4507
4508 /*
4509 * Have idle load balancer selection from semi-idle packages only
4510 * when power-aware load balancing is enabled
4511 */
4512 if (!(sched_smt_power_savings || sched_mc_power_savings))
4513 goto out_done;
4514
4515 /*
4516 * Optimize for the case when we have no idle CPUs or only one
4517 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4518 */
4519 if (cpumask_weight(nohz.cpu_mask) < 2)
4520 goto out_done;
4521
4522 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4523 ilb_group = sd->groups;
4524
4525 do {
4526 if (is_semi_idle_group(ilb_group))
4527 return cpumask_first(nohz.ilb_grp_nohz_mask);
4528
4529 ilb_group = ilb_group->next;
4530
4531 } while (ilb_group != sd->groups);
4532 }
4533
4534out_done:
4535 return cpumask_first(nohz.cpu_mask);
4536}
4537#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4538static inline int find_new_ilb(int call_cpu)
4539{
4540 return cpumask_first(nohz.cpu_mask);
4541}
4542#endif
4543
4247/* 4544/*
4248 * This routine will try to nominate the ilb (idle load balancing) 4545 * This routine will try to nominate the ilb (idle load balancing)
4249 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 4546 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4298,8 +4595,24 @@ int select_nohz_load_balancer(int stop_tick)
4298 /* make me the ilb owner */ 4595 /* make me the ilb owner */
4299 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 4596 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4300 return 1; 4597 return 1;
4301 } else if (atomic_read(&nohz.load_balancer) == cpu) 4598 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4599 int new_ilb;
4600
4601 if (!(sched_smt_power_savings ||
4602 sched_mc_power_savings))
4603 return 1;
4604 /*
4605 * Check to see if there is a more power-efficient
4606 * ilb.
4607 */
4608 new_ilb = find_new_ilb(cpu);
4609 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4610 atomic_set(&nohz.load_balancer, -1);
4611 resched_cpu(new_ilb);
4612 return 0;
4613 }
4302 return 1; 4614 return 1;
4615 }
4303 } else { 4616 } else {
4304 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 4617 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4305 return 0; 4618 return 0;
@@ -4468,15 +4781,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4468 } 4781 }
4469 4782
4470 if (atomic_read(&nohz.load_balancer) == -1) { 4783 if (atomic_read(&nohz.load_balancer) == -1) {
4471 /* 4784 int ilb = find_new_ilb(cpu);
4472 * simple selection for now: Nominate the
4473 * first cpu in the nohz list to be the next
4474 * ilb owner.
4475 *
4476 * TBD: Traverse the sched domains and nominate
4477 * the nearest cpu in the nohz.cpu_mask.
4478 */
4479 int ilb = cpumask_first(nohz.cpu_mask);
4480 4785
4481 if (ilb < nr_cpu_ids) 4786 if (ilb < nr_cpu_ids)
4482 resched_cpu(ilb); 4787 resched_cpu(ilb);
@@ -4840,6 +5145,8 @@ void scheduler_tick(void)
4840 curr->sched_class->task_tick(rq, curr, 0); 5145 curr->sched_class->task_tick(rq, curr, 0);
4841 spin_unlock(&rq->lock); 5146 spin_unlock(&rq->lock);
4842 5147
5148 perf_counter_task_tick(curr, cpu);
5149
4843#ifdef CONFIG_SMP 5150#ifdef CONFIG_SMP
4844 rq->idle_at_tick = idle_cpu(cpu); 5151 rq->idle_at_tick = idle_cpu(cpu);
4845 trigger_load_balance(rq, cpu); 5152 trigger_load_balance(rq, cpu);
@@ -5007,13 +5314,15 @@ pick_next_task(struct rq *rq)
5007/* 5314/*
5008 * schedule() is the main scheduler function. 5315 * schedule() is the main scheduler function.
5009 */ 5316 */
5010asmlinkage void __sched __schedule(void) 5317asmlinkage void __sched schedule(void)
5011{ 5318{
5012 struct task_struct *prev, *next; 5319 struct task_struct *prev, *next;
5013 unsigned long *switch_count; 5320 unsigned long *switch_count;
5014 struct rq *rq; 5321 struct rq *rq;
5015 int cpu; 5322 int cpu;
5016 5323
5324need_resched:
5325 preempt_disable();
5017 cpu = smp_processor_id(); 5326 cpu = smp_processor_id();
5018 rq = cpu_rq(cpu); 5327 rq = cpu_rq(cpu);
5019 rcu_qsctr_inc(cpu); 5328 rcu_qsctr_inc(cpu);
@@ -5053,6 +5362,7 @@ need_resched_nonpreemptible:
5053 5362
5054 if (likely(prev != next)) { 5363 if (likely(prev != next)) {
5055 sched_info_switch(prev, next); 5364 sched_info_switch(prev, next);
5365 perf_counter_task_sched_out(prev, next, cpu);
5056 5366
5057 rq->nr_switches++; 5367 rq->nr_switches++;
5058 rq->curr = next; 5368 rq->curr = next;
@@ -5070,15 +5380,9 @@ need_resched_nonpreemptible:
5070 5380
5071 if (unlikely(reacquire_kernel_lock(current) < 0)) 5381 if (unlikely(reacquire_kernel_lock(current) < 0))
5072 goto need_resched_nonpreemptible; 5382 goto need_resched_nonpreemptible;
5073}
5074 5383
5075asmlinkage void __sched schedule(void)
5076{
5077need_resched:
5078 preempt_disable();
5079 __schedule();
5080 preempt_enable_no_resched(); 5384 preempt_enable_no_resched();
5081 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 5385 if (need_resched())
5082 goto need_resched; 5386 goto need_resched;
5083} 5387}
5084EXPORT_SYMBOL(schedule); 5388EXPORT_SYMBOL(schedule);
@@ -5221,7 +5525,7 @@ EXPORT_SYMBOL(default_wake_function);
5221 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 5525 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
5222 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5526 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5223 */ 5527 */
5224void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5528static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5225 int nr_exclusive, int sync, void *key) 5529 int nr_exclusive, int sync, void *key)
5226{ 5530{
5227 wait_queue_t *curr, *next; 5531 wait_queue_t *curr, *next;
@@ -5241,6 +5545,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5241 * @mode: which threads 5545 * @mode: which threads
5242 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5546 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5243 * @key: is directly passed to the wakeup function 5547 * @key: is directly passed to the wakeup function
5548 *
5549 * It may be assumed that this function implies a write memory barrier before
5550 * changing the task state if and only if any tasks are woken up.
5244 */ 5551 */
5245void __wake_up(wait_queue_head_t *q, unsigned int mode, 5552void __wake_up(wait_queue_head_t *q, unsigned int mode,
5246 int nr_exclusive, void *key) 5553 int nr_exclusive, void *key)
@@ -5279,6 +5586,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5279 * with each other. This can prevent needless bouncing between CPUs. 5586 * with each other. This can prevent needless bouncing between CPUs.
5280 * 5587 *
5281 * On UP it can prevent extra preemption. 5588 * On UP it can prevent extra preemption.
5589 *
5590 * It may be assumed that this function implies a write memory barrier before
5591 * changing the task state if and only if any tasks are woken up.
5282 */ 5592 */
5283void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 5593void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5284 int nr_exclusive, void *key) 5594 int nr_exclusive, void *key)
@@ -5315,6 +5625,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5315 * awakened in the same order in which they were queued. 5625 * awakened in the same order in which they were queued.
5316 * 5626 *
5317 * See also complete_all(), wait_for_completion() and related routines. 5627 * See also complete_all(), wait_for_completion() and related routines.
5628 *
5629 * It may be assumed that this function implies a write memory barrier before
5630 * changing the task state if and only if any tasks are woken up.
5318 */ 5631 */
5319void complete(struct completion *x) 5632void complete(struct completion *x)
5320{ 5633{
@@ -5332,6 +5645,9 @@ EXPORT_SYMBOL(complete);
5332 * @x: holds the state of this particular completion 5645 * @x: holds the state of this particular completion
5333 * 5646 *
5334 * This will wake up all threads waiting on this particular completion event. 5647 * This will wake up all threads waiting on this particular completion event.
5648 *
5649 * It may be assumed that this function implies a write memory barrier before
5650 * changing the task state if and only if any tasks are woken up.
5335 */ 5651 */
5336void complete_all(struct completion *x) 5652void complete_all(struct completion *x)
5337{ 5653{
@@ -6248,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
6248 return 0; 6564 return 0;
6249} 6565}
6250 6566
6567static inline int should_resched(void)
6568{
6569 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6570}
6571
6251static void __cond_resched(void) 6572static void __cond_resched(void)
6252{ 6573{
6253#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6267,8 +6588,7 @@ static void __cond_resched(void)
6267 6588
6268int __sched _cond_resched(void) 6589int __sched _cond_resched(void)
6269{ 6590{
6270 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 6591 if (should_resched()) {
6271 system_state == SYSTEM_RUNNING) {
6272 __cond_resched(); 6592 __cond_resched();
6273 return 1; 6593 return 1;
6274 } 6594 }
@@ -6286,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
6286 */ 6606 */
6287int cond_resched_lock(spinlock_t *lock) 6607int cond_resched_lock(spinlock_t *lock)
6288{ 6608{
6289 int resched = need_resched() && system_state == SYSTEM_RUNNING; 6609 int resched = should_resched();
6290 int ret = 0; 6610 int ret = 0;
6291 6611
6292 if (spin_needbreak(lock) || resched) { 6612 if (spin_needbreak(lock) || resched) {
6293 spin_unlock(lock); 6613 spin_unlock(lock);
6294 if (resched && need_resched()) 6614 if (resched)
6295 __cond_resched(); 6615 __cond_resched();
6296 else 6616 else
6297 cpu_relax(); 6617 cpu_relax();
@@ -6306,7 +6626,7 @@ int __sched cond_resched_softirq(void)
6306{ 6626{
6307 BUG_ON(!in_softirq()); 6627 BUG_ON(!in_softirq());
6308 6628
6309 if (need_resched() && system_state == SYSTEM_RUNNING) { 6629 if (should_resched()) {
6310 local_bh_enable(); 6630 local_bh_enable();
6311 __cond_resched(); 6631 __cond_resched();
6312 local_bh_disable(); 6632 local_bh_disable();
@@ -6490,8 +6810,9 @@ void sched_show_task(struct task_struct *p)
6490#ifdef CONFIG_DEBUG_STACK_USAGE 6810#ifdef CONFIG_DEBUG_STACK_USAGE
6491 free = stack_not_used(p); 6811 free = stack_not_used(p);
6492#endif 6812#endif
6493 printk(KERN_CONT "%5lu %5d %6d\n", free, 6813 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6494 task_pid_nr(p), task_pid_nr(p->real_parent)); 6814 task_pid_nr(p), task_pid_nr(p->real_parent),
6815 (unsigned long)task_thread_info(p)->flags);
6495 6816
6496 show_stack(p, NULL); 6817 show_stack(p, NULL);
6497} 6818}
@@ -6752,7 +7073,7 @@ static int migration_thread(void *data)
6752 7073
6753 if (cpu_is_offline(cpu)) { 7074 if (cpu_is_offline(cpu)) {
6754 spin_unlock_irq(&rq->lock); 7075 spin_unlock_irq(&rq->lock);
6755 goto wait_to_die; 7076 break;
6756 } 7077 }
6757 7078
6758 if (rq->active_balance) { 7079 if (rq->active_balance) {
@@ -6778,16 +7099,7 @@ static int migration_thread(void *data)
6778 complete(&req->done); 7099 complete(&req->done);
6779 } 7100 }
6780 __set_current_state(TASK_RUNNING); 7101 __set_current_state(TASK_RUNNING);
6781 return 0;
6782 7102
6783wait_to_die:
6784 /* Wait for kthread_stop */
6785 set_current_state(TASK_INTERRUPTIBLE);
6786 while (!kthread_should_stop()) {
6787 schedule();
6788 set_current_state(TASK_INTERRUPTIBLE);
6789 }
6790 __set_current_state(TASK_RUNNING);
6791 return 0; 7103 return 0;
6792} 7104}
6793 7105
@@ -6970,6 +7282,15 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6970 7282
6971 } 7283 }
6972} 7284}
7285
7286/*
7287 * remove the tasks which were accounted by rq from calc_load_tasks.
7288 */
7289static void calc_global_load_remove(struct rq *rq)
7290{
7291 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7292 rq->calc_load_active = 0;
7293}
6973#endif /* CONFIG_HOTPLUG_CPU */ 7294#endif /* CONFIG_HOTPLUG_CPU */
6974 7295
6975#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 7296#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7193,7 +7514,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7193 rq = task_rq_lock(p, &flags); 7514 rq = task_rq_lock(p, &flags);
7194 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7515 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7195 task_rq_unlock(rq, &flags); 7516 task_rq_unlock(rq, &flags);
7517 get_task_struct(p);
7196 cpu_rq(cpu)->migration_thread = p; 7518 cpu_rq(cpu)->migration_thread = p;
7519 rq->calc_load_update = calc_load_update;
7197 break; 7520 break;
7198 7521
7199 case CPU_ONLINE: 7522 case CPU_ONLINE:
@@ -7221,6 +7544,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7221 kthread_bind(cpu_rq(cpu)->migration_thread, 7544 kthread_bind(cpu_rq(cpu)->migration_thread,
7222 cpumask_any(cpu_online_mask)); 7545 cpumask_any(cpu_online_mask));
7223 kthread_stop(cpu_rq(cpu)->migration_thread); 7546 kthread_stop(cpu_rq(cpu)->migration_thread);
7547 put_task_struct(cpu_rq(cpu)->migration_thread);
7224 cpu_rq(cpu)->migration_thread = NULL; 7548 cpu_rq(cpu)->migration_thread = NULL;
7225 break; 7549 break;
7226 7550
@@ -7230,6 +7554,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7230 migrate_live_tasks(cpu); 7554 migrate_live_tasks(cpu);
7231 rq = cpu_rq(cpu); 7555 rq = cpu_rq(cpu);
7232 kthread_stop(rq->migration_thread); 7556 kthread_stop(rq->migration_thread);
7557 put_task_struct(rq->migration_thread);
7233 rq->migration_thread = NULL; 7558 rq->migration_thread = NULL;
7234 /* Idle task back to normal (off runqueue, low prio) */ 7559 /* Idle task back to normal (off runqueue, low prio) */
7235 spin_lock_irq(&rq->lock); 7560 spin_lock_irq(&rq->lock);
@@ -7243,7 +7568,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7243 cpuset_unlock(); 7568 cpuset_unlock();
7244 migrate_nr_uninterruptible(rq); 7569 migrate_nr_uninterruptible(rq);
7245 BUG_ON(rq->nr_running != 0); 7570 BUG_ON(rq->nr_running != 0);
7246 7571 calc_global_load_remove(rq);
7247 /* 7572 /*
7248 * No need to migrate the tasks: it was best-effort if 7573 * No need to migrate the tasks: it was best-effort if
7249 * they didn't take sched_hotcpu_mutex. Just wake up 7574 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7279,8 +7604,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7279 return NOTIFY_OK; 7604 return NOTIFY_OK;
7280} 7605}
7281 7606
7282/* Register at highest priority so that task migration (migrate_all_tasks) 7607/*
7283 * happens before everything else. 7608 * Register at high priority so that task migration (migrate_all_tasks)
7609 * happens before everything else. This has to be lower priority than
7610 * the notifier in the perf_counter subsystem, though.
7284 */ 7611 */
7285static struct notifier_block __cpuinitdata migration_notifier = { 7612static struct notifier_block __cpuinitdata migration_notifier = {
7286 .notifier_call = migration_call, 7613 .notifier_call = migration_call,
@@ -7523,26 +7850,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7523 free_rootdomain(old_rd); 7850 free_rootdomain(old_rd);
7524} 7851}
7525 7852
7526static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7853static int init_rootdomain(struct root_domain *rd, bool bootmem)
7527{ 7854{
7855 gfp_t gfp = GFP_KERNEL;
7856
7528 memset(rd, 0, sizeof(*rd)); 7857 memset(rd, 0, sizeof(*rd));
7529 7858
7530 if (bootmem) { 7859 if (bootmem)
7531 alloc_bootmem_cpumask_var(&def_root_domain.span); 7860 gfp = GFP_NOWAIT;
7532 alloc_bootmem_cpumask_var(&def_root_domain.online);
7533 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
7534 cpupri_init(&rd->cpupri, true);
7535 return 0;
7536 }
7537 7861
7538 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 7862 if (!alloc_cpumask_var(&rd->span, gfp))
7539 goto out; 7863 goto out;
7540 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 7864 if (!alloc_cpumask_var(&rd->online, gfp))
7541 goto free_span; 7865 goto free_span;
7542 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 7866 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
7543 goto free_online; 7867 goto free_online;
7544 7868
7545 if (cpupri_init(&rd->cpupri, false) != 0) 7869 if (cpupri_init(&rd->cpupri, bootmem) != 0)
7546 goto free_rto_mask; 7870 goto free_rto_mask;
7547 return 0; 7871 return 0;
7548 7872
@@ -7753,8 +8077,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7753 8077
7754/* 8078/*
7755 * The cpus mask in sched_group and sched_domain hangs off the end. 8079 * The cpus mask in sched_group and sched_domain hangs off the end.
7756 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space 8080 *
7757 * for nr_cpu_ids < CONFIG_NR_CPUS. 8081 * ( See the the comments in include/linux/sched.h:struct sched_group
8082 * and struct sched_domain. )
7758 */ 8083 */
7759struct static_sched_group { 8084struct static_sched_group {
7760 struct sched_group sg; 8085 struct sched_group sg;
@@ -7875,7 +8200,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7875 struct sched_domain *sd; 8200 struct sched_domain *sd;
7876 8201
7877 sd = &per_cpu(phys_domains, j).sd; 8202 sd = &per_cpu(phys_domains, j).sd;
7878 if (j != cpumask_first(sched_group_cpus(sd->groups))) { 8203 if (j != group_first_cpu(sd->groups)) {
7879 /* 8204 /*
7880 * Only add "power" once for each 8205 * Only add "power" once for each
7881 * physical package. 8206 * physical package.
@@ -7953,7 +8278,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7953 8278
7954 WARN_ON(!sd || !sd->groups); 8279 WARN_ON(!sd || !sd->groups);
7955 8280
7956 if (cpu != cpumask_first(sched_group_cpus(sd->groups))) 8281 if (cpu != group_first_cpu(sd->groups))
7957 return; 8282 return;
7958 8283
7959 child = sd->child; 8284 child = sd->child;
@@ -8731,6 +9056,8 @@ void __init sched_init_smp(void)
8731} 9056}
8732#endif /* CONFIG_SMP */ 9057#endif /* CONFIG_SMP */
8733 9058
9059const_debug unsigned int sysctl_timer_migration = 1;
9060
8734int in_sched_functions(unsigned long addr) 9061int in_sched_functions(unsigned long addr)
8735{ 9062{
8736 return in_lock_functions(addr) || 9063 return in_lock_functions(addr) ||
@@ -8770,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8770#ifdef CONFIG_SMP 9097#ifdef CONFIG_SMP
8771 rt_rq->rt_nr_migratory = 0; 9098 rt_rq->rt_nr_migratory = 0;
8772 rt_rq->overloaded = 0; 9099 rt_rq->overloaded = 0;
8773 plist_head_init(&rq->rt.pushable_tasks, &rq->lock); 9100 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
8774#endif 9101#endif
8775 9102
8776 rt_rq->rt_time = 0; 9103 rt_rq->rt_time = 0;
@@ -8865,7 +9192,7 @@ void __init sched_init(void)
8865 * we use alloc_bootmem(). 9192 * we use alloc_bootmem().
8866 */ 9193 */
8867 if (alloc_size) { 9194 if (alloc_size) {
8868 ptr = (unsigned long)alloc_bootmem(alloc_size); 9195 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8869 9196
8870#ifdef CONFIG_FAIR_GROUP_SCHED 9197#ifdef CONFIG_FAIR_GROUP_SCHED
8871 init_task_group.se = (struct sched_entity **)ptr; 9198 init_task_group.se = (struct sched_entity **)ptr;
@@ -8938,6 +9265,8 @@ void __init sched_init(void)
8938 rq = cpu_rq(i); 9265 rq = cpu_rq(i);
8939 spin_lock_init(&rq->lock); 9266 spin_lock_init(&rq->lock);
8940 rq->nr_running = 0; 9267 rq->nr_running = 0;
9268 rq->calc_load_active = 0;
9269 rq->calc_load_update = jiffies + LOAD_FREQ;
8941 init_cfs_rq(&rq->cfs, rq); 9270 init_cfs_rq(&rq->cfs, rq);
8942 init_rt_rq(&rq->rt, rq); 9271 init_rt_rq(&rq->rt, rq);
8943#ifdef CONFIG_FAIR_GROUP_SCHED 9272#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8958,7 +9287,7 @@ void __init sched_init(void)
8958 * 1024) and two child groups A0 and A1 (of weight 1024 each), 9287 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8959 * then A0's share of the cpu resource is: 9288 * then A0's share of the cpu resource is:
8960 * 9289 *
8961 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 9290 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8962 * 9291 *
8963 * We achieve this by letting init_task_group's tasks sit 9292 * We achieve this by letting init_task_group's tasks sit
8964 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 9293 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9045,20 +9374,26 @@ void __init sched_init(void)
9045 * when this runqueue becomes "idle". 9374 * when this runqueue becomes "idle".
9046 */ 9375 */
9047 init_idle(current, smp_processor_id()); 9376 init_idle(current, smp_processor_id());
9377
9378 calc_load_update = jiffies + LOAD_FREQ;
9379
9048 /* 9380 /*
9049 * During early bootup we pretend to be a normal task: 9381 * During early bootup we pretend to be a normal task:
9050 */ 9382 */
9051 current->sched_class = &fair_sched_class; 9383 current->sched_class = &fair_sched_class;
9052 9384
9053 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9385 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9054 alloc_bootmem_cpumask_var(&nohz_cpu_mask); 9386 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9055#ifdef CONFIG_SMP 9387#ifdef CONFIG_SMP
9056#ifdef CONFIG_NO_HZ 9388#ifdef CONFIG_NO_HZ
9057 alloc_bootmem_cpumask_var(&nohz.cpu_mask); 9389 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9390 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9058#endif 9391#endif
9059 alloc_bootmem_cpumask_var(&cpu_isolated_map); 9392 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9060#endif /* SMP */ 9393#endif /* SMP */
9061 9394
9395 perf_counter_init();
9396
9062 scheduler_running = 1; 9397 scheduler_running = 1;
9063} 9398}
9064 9399
@@ -9800,6 +10135,13 @@ static int sched_rt_global_constraints(void)
9800 if (sysctl_sched_rt_period <= 0) 10135 if (sysctl_sched_rt_period <= 0)
9801 return -EINVAL; 10136 return -EINVAL;
9802 10137
10138 /*
10139 * There are always some RT tasks in the root group
10140 * -- migration, kstopmachine etc.
10141 */
10142 if (sysctl_sched_rt_runtime == 0)
10143 return -EBUSY;
10144
9803 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10145 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9804 for_each_possible_cpu(i) { 10146 for_each_possible_cpu(i) {
9805 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10147 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 344712a5e3ed..d014efbf947a 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 if (lowest_mask) 84 if (lowest_mask) {
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
86
87 /*
88 * We have to ensure that we have at least one bit
89 * still set in the array, since the map could have
90 * been concurrently emptied between the first and
91 * second reads of vec->mask. If we hit this
92 * condition, simply act as though we never hit this
93 * priority level and continue on.
94 */
95 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
96 continue;
97 }
98
86 return 1; 99 return 1;
87 } 100 }
88 101
@@ -152,10 +165,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
152 * 165 *
153 * Returns: -ENOMEM if memory fails. 166 * Returns: -ENOMEM if memory fails.
154 */ 167 */
155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 168int cpupri_init(struct cpupri *cp, bool bootmem)
156{ 169{
170 gfp_t gfp = GFP_KERNEL;
157 int i; 171 int i;
158 172
173 if (bootmem)
174 gfp = GFP_NOWAIT;
175
159 memset(cp, 0, sizeof(*cp)); 176 memset(cp, 0, sizeof(*cp));
160 177
161 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 178 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -163,9 +180,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
163 180
164 spin_lock_init(&vec->lock); 181 spin_lock_init(&vec->lock);
165 vec->count = 0; 182 vec->count = 0;
166 if (bootmem) 183 if (!zalloc_cpumask_var(&vec->mask, gfp))
167 alloc_bootmem_cpumask_var(&vec->mask);
168 else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
169 goto cleanup; 184 goto cleanup;
170 } 185 }
171 186
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 467ca72f1657..70c7e0b79946 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162{ 162{
163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
164 spread, rq0_min_vruntime, spread0; 164 spread, rq0_min_vruntime, spread0;
165 struct rq *rq = &per_cpu(runqueues, cpu); 165 struct rq *rq = cpu_rq(cpu);
166 struct sched_entity *last; 166 struct sched_entity *last;
167 unsigned long flags; 167 unsigned long flags;
168 168
@@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
191 if (last) 191 if (last)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
@@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
248 248
249static void print_cpu(struct seq_file *m, int cpu) 249static void print_cpu(struct seq_file *m, int cpu)
250{ 250{
251 struct rq *rq = &per_cpu(runqueues, cpu); 251 struct rq *rq = cpu_rq(cpu);
252 252
253#ifdef CONFIG_X86 253#ifdef CONFIG_X86
254 { 254 {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f119..652e8bdef9aa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
266 return min_vruntime; 266 return min_vruntime;
267} 267}
268 268
269static inline int entity_before(struct sched_entity *a,
270 struct sched_entity *b)
271{
272 return (s64)(a->vruntime - b->vruntime) < 0;
273}
274
269static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 275static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
270{ 276{
271 return se->vruntime - cfs_rq->min_vruntime; 277 return se->vruntime - cfs_rq->min_vruntime;
@@ -430,12 +436,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
430 436
431 for_each_sched_entity(se) { 437 for_each_sched_entity(se) {
432 struct load_weight *load; 438 struct load_weight *load;
439 struct load_weight lw;
433 440
434 cfs_rq = cfs_rq_of(se); 441 cfs_rq = cfs_rq_of(se);
435 load = &cfs_rq->load; 442 load = &cfs_rq->load;
436 443
437 if (unlikely(!se->on_rq)) { 444 if (unlikely(!se->on_rq)) {
438 struct load_weight lw = cfs_rq->load; 445 lw = cfs_rq->load;
439 446
440 update_load_add(&lw, se->load.weight); 447 update_load_add(&lw, se->load.weight);
441 load = &lw; 448 load = &lw;
@@ -604,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
604static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 611static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
605{ 612{
606#ifdef CONFIG_SCHEDSTATS 613#ifdef CONFIG_SCHEDSTATS
614 struct task_struct *tsk = NULL;
615
616 if (entity_is_task(se))
617 tsk = task_of(se);
618
607 if (se->sleep_start) { 619 if (se->sleep_start) {
608 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 620 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
609 struct task_struct *tsk = task_of(se);
610 621
611 if ((s64)delta < 0) 622 if ((s64)delta < 0)
612 delta = 0; 623 delta = 0;
@@ -617,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
617 se->sleep_start = 0; 628 se->sleep_start = 0;
618 se->sum_sleep_runtime += delta; 629 se->sum_sleep_runtime += delta;
619 630
620 account_scheduler_latency(tsk, delta >> 10, 1); 631 if (tsk)
632 account_scheduler_latency(tsk, delta >> 10, 1);
621 } 633 }
622 if (se->block_start) { 634 if (se->block_start) {
623 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 635 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
624 struct task_struct *tsk = task_of(se);
625 636
626 if ((s64)delta < 0) 637 if ((s64)delta < 0)
627 delta = 0; 638 delta = 0;
@@ -632,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
632 se->block_start = 0; 643 se->block_start = 0;
633 se->sum_sleep_runtime += delta; 644 se->sum_sleep_runtime += delta;
634 645
635 /* 646 if (tsk) {
636 * Blocking time is in units of nanosecs, so shift by 20 to 647 /*
637 * get a milliseconds-range estimation of the amount of 648 * Blocking time is in units of nanosecs, so shift by
638 * time that the task spent sleeping: 649 * 20 to get a milliseconds-range estimation of the
639 */ 650 * amount of time that the task spent sleeping:
640 if (unlikely(prof_on == SLEEP_PROFILING)) { 651 */
641 652 if (unlikely(prof_on == SLEEP_PROFILING)) {
642 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 653 profile_hits(SLEEP_PROFILING,
643 delta >> 20); 654 (void *)get_wchan(tsk),
655 delta >> 20);
656 }
657 account_scheduler_latency(tsk, delta >> 10, 0);
644 } 658 }
645 account_scheduler_latency(tsk, delta >> 10, 0);
646 } 659 }
647#endif 660#endif
648} 661}
@@ -686,7 +699,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
686 * all of which have the same weight. 699 * all of which have the same weight.
687 */ 700 */
688 if (sched_feat(NORMALIZED_SLEEPER) && 701 if (sched_feat(NORMALIZED_SLEEPER) &&
689 task_of(se)->policy != SCHED_IDLE) 702 (!entity_is_task(se) ||
703 task_of(se)->policy != SCHED_IDLE))
690 thresh = calc_delta_fair(thresh, se); 704 thresh = calc_delta_fair(thresh, se);
691 705
692 vruntime -= thresh; 706 vruntime -= thresh;
@@ -1015,7 +1029,7 @@ static void yield_task_fair(struct rq *rq)
1015 /* 1029 /*
1016 * Already in the rightmost position? 1030 * Already in the rightmost position?
1017 */ 1031 */
1018 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) 1032 if (unlikely(!rightmost || entity_before(rightmost, se)))
1019 return; 1033 return;
1020 1034
1021 /* 1035 /*
@@ -1487,17 +1501,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1487 1501
1488 find_matching_se(&se, &pse); 1502 find_matching_se(&se, &pse);
1489 1503
1490 while (se) { 1504 BUG_ON(!pse);
1491 BUG_ON(!pse);
1492
1493 if (wakeup_preempt_entity(se, pse) == 1) {
1494 resched_task(curr);
1495 break;
1496 }
1497 1505
1498 se = parent_entity(se); 1506 if (wakeup_preempt_entity(se, pse) == 1)
1499 pse = parent_entity(pse); 1507 resched_task(curr);
1500 }
1501} 1508}
1502 1509
1503static struct task_struct *pick_next_task_fair(struct rq *rq) 1510static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1718,7 +1725,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1718 1725
1719 /* 'curr' will be NULL if the child belongs to a different group */ 1726 /* 'curr' will be NULL if the child belongs to a different group */
1720 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && 1727 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1721 curr && curr->vruntime < se->vruntime) { 1728 curr && entity_before(curr, se)) {
1722 /* 1729 /*
1723 * Upon rescheduling, sched_class::put_prev_task() will place 1730 * Upon rescheduling, sched_class::put_prev_task() will place
1724 * 'current' within the tree based on its new key value. 1731 * 'current' within the tree based on its new key value.
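The new entity_before() helper orders two vruntimes by the sign of their unsigned difference, which stays correct even if the 64-bit counters wrap around; the plain '<' comparisons it replaces at the yield and fork sites do not. A minimal standalone C sketch of the idiom (ordinary userspace code, not from the tree):

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "a is before b", mirroring entity_before(). */
static int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t pre_wrap  = UINT64_MAX - 5;	/* value just before the counter wraps */
	uint64_t post_wrap = 10;		/* value just after the wrap */

	printf("naive  <      : %d\n", pre_wrap < post_wrap);        /* 0: misses the ordering */
	printf("signed (a - b): %d\n", before(pre_wrap, post_wrap)); /* 1: pre_wrap correctly earlier */
	return 0;
}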
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
22static struct task_struct *pick_next_task_idle(struct rq *rq) 22static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 23{
24 schedstat_inc(rq, sched_goidle); 24 schedstat_inc(rq, sched_goidle);
25 25 /* adjust the active tasks as we might go into a long sleep */
26 calc_load_account_active(rq);
26 return rq->idle; 27 return rq->idle;
27} 28}
28 29
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9bf0d2a73045..3918e01994e0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -10,6 +10,8 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
10 10
11#ifdef CONFIG_RT_GROUP_SCHED 11#ifdef CONFIG_RT_GROUP_SCHED
12 12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{ 16{
15 return rt_rq->rq; 17 return rt_rq->rq;
@@ -22,6 +24,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
22 24
23#else /* CONFIG_RT_GROUP_SCHED */ 25#else /* CONFIG_RT_GROUP_SCHED */
24 26
27#define rt_entity_is_task(rt_se) (1)
28
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{ 30{
27 return container_of(rt_rq, struct rq, rt); 31 return container_of(rt_rq, struct rq, rt);
@@ -73,7 +77,7 @@ static inline void rt_clear_overload(struct rq *rq)
73 77
74static void update_rt_migration(struct rt_rq *rt_rq) 78static void update_rt_migration(struct rt_rq *rt_rq)
75{ 79{
76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { 80 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
77 if (!rt_rq->overloaded) { 81 if (!rt_rq->overloaded) {
78 rt_set_overload(rq_of_rt_rq(rt_rq)); 82 rt_set_overload(rq_of_rt_rq(rt_rq));
79 rt_rq->overloaded = 1; 83 rt_rq->overloaded = 1;
@@ -86,6 +90,12 @@ static void update_rt_migration(struct rt_rq *rt_rq)
86 90
87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 91static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{ 92{
93 if (!rt_entity_is_task(rt_se))
94 return;
95
96 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
97
98 rt_rq->rt_nr_total++;
89 if (rt_se->nr_cpus_allowed > 1) 99 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++; 100 rt_rq->rt_nr_migratory++;
91 101
@@ -94,6 +104,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
94 104
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 105static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{ 106{
107 if (!rt_entity_is_task(rt_se))
108 return;
109
110 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
111
112 rt_rq->rt_nr_total--;
97 if (rt_se->nr_cpus_allowed > 1) 113 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--; 114 rt_rq->rt_nr_migratory--;
99 115
diff --git a/kernel/signal.c b/kernel/signal.c
index d8034737db4c..64c5deeaca5d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,7 @@
27#include <linux/freezer.h> 27#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <trace/sched.h> 30#include <trace/events/sched.h>
31 31
32#include <asm/param.h> 32#include <asm/param.h>
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
@@ -41,8 +41,6 @@
41 41
42static struct kmem_cache *sigqueue_cachep; 42static struct kmem_cache *sigqueue_cachep;
43 43
44DEFINE_TRACE(sched_signal_send);
45
46static void __user *sig_handler(struct task_struct *t, int sig) 44static void __user *sig_handler(struct task_struct *t, int sig)
47{ 45{
48 return t->sighand->action[sig - 1].sa.sa_handler; 46 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -249,14 +247,19 @@ void flush_sigqueue(struct sigpending *queue)
249/* 247/*
250 * Flush all pending signals for a task. 248 * Flush all pending signals for a task.
251 */ 249 */
250void __flush_signals(struct task_struct *t)
251{
252 clear_tsk_thread_flag(t, TIF_SIGPENDING);
253 flush_sigqueue(&t->pending);
254 flush_sigqueue(&t->signal->shared_pending);
255}
256
252void flush_signals(struct task_struct *t) 257void flush_signals(struct task_struct *t)
253{ 258{
254 unsigned long flags; 259 unsigned long flags;
255 260
256 spin_lock_irqsave(&t->sighand->siglock, flags); 261 spin_lock_irqsave(&t->sighand->siglock, flags);
257 clear_tsk_thread_flag(t, TIF_SIGPENDING); 262 __flush_signals(t);
258 flush_sigqueue(&t->pending);
259 flush_sigqueue(&t->signal->shared_pending);
260 spin_unlock_irqrestore(&t->sighand->siglock, flags); 263 spin_unlock_irqrestore(&t->sighand->siglock, flags);
261} 264}
262 265
@@ -829,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
829{ 832{
830 struct sigpending *pending; 833 struct sigpending *pending;
831 struct sigqueue *q; 834 struct sigqueue *q;
835 int override_rlimit;
832 836
833 trace_sched_signal_send(sig, t); 837 trace_sched_signal_send(sig, t);
834 838
@@ -860,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
860 make sure at least one signal gets delivered and don't 864 make sure at least one signal gets delivered and don't
861 pass on the info struct. */ 865 pass on the info struct. */
862 866
863 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && 867 if (sig < SIGRTMIN)
864 (is_si_special(info) || 868 override_rlimit = (is_si_special(info) || info->si_code >= 0);
865 info->si_code >= 0))); 869 else
870 override_rlimit = 0;
871
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit);
866 if (q) { 874 if (q) {
867 list_add_tail(&q->list, &pending->list); 875 list_add_tail(&q->list, &pending->list);
868 switch ((unsigned long) info) { 876 switch ((unsigned long) info) {
@@ -1402,7 +1410,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1402 /* do_notify_parent_cldstop should have been called instead. */ 1410 /* do_notify_parent_cldstop should have been called instead. */
1403 BUG_ON(task_is_stopped_or_traced(tsk)); 1411 BUG_ON(task_is_stopped_or_traced(tsk));
1404 1412
1405 BUG_ON(!tsk->ptrace && 1413 BUG_ON(!task_ptrace(tsk) &&
1406 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1414 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1407 1415
1408 info.si_signo = sig; 1416 info.si_signo = sig;
@@ -1441,7 +1449,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1441 1449
1442 psig = tsk->parent->sighand; 1450 psig = tsk->parent->sighand;
1443 spin_lock_irqsave(&psig->siglock, flags); 1451 spin_lock_irqsave(&psig->siglock, flags);
1444 if (!tsk->ptrace && sig == SIGCHLD && 1452 if (!task_ptrace(tsk) && sig == SIGCHLD &&
1445 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1453 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1446 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1454 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1447 /* 1455 /*
@@ -1478,7 +1486,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1478 struct task_struct *parent; 1486 struct task_struct *parent;
1479 struct sighand_struct *sighand; 1487 struct sighand_struct *sighand;
1480 1488
1481 if (tsk->ptrace & PT_PTRACED) 1489 if (task_ptrace(tsk))
1482 parent = tsk->parent; 1490 parent = tsk->parent;
1483 else { 1491 else {
1484 tsk = tsk->group_leader; 1492 tsk = tsk->group_leader;
@@ -1491,7 +1499,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1491 * see comment in do_notify_parent() about the following 3 lines 1499
1492 */ 1500 */
1493 rcu_read_lock(); 1501 rcu_read_lock();
1494 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1502 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1495 info.si_uid = __task_cred(tsk)->uid; 1503 info.si_uid = __task_cred(tsk)->uid;
1496 rcu_read_unlock(); 1504 rcu_read_unlock();
1497 1505
@@ -1527,7 +1535,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1527 1535
1528static inline int may_ptrace_stop(void) 1536static inline int may_ptrace_stop(void)
1529{ 1537{
1530 if (!likely(current->ptrace & PT_PTRACED)) 1538 if (!likely(task_ptrace(current)))
1531 return 0; 1539 return 0;
1532 /* 1540 /*
1533 * Are we in the middle of do_coredump? 1541 * Are we in the middle of do_coredump?
@@ -1745,7 +1753,7 @@ static int do_signal_stop(int signr)
1745static int ptrace_signal(int signr, siginfo_t *info, 1753static int ptrace_signal(int signr, siginfo_t *info,
1746 struct pt_regs *regs, void *cookie) 1754 struct pt_regs *regs, void *cookie)
1747{ 1755{
1748 if (!(current->ptrace & PT_PTRACED)) 1756 if (!task_ptrace(current))
1749 return signr; 1757 return signr;
1750 1758
1751 ptrace_signal_deliver(regs, cookie); 1759 ptrace_signal_deliver(regs, cookie);
@@ -2278,24 +2286,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2278 return kill_something_info(sig, &info, pid); 2286 return kill_something_info(sig, &info, pid);
2279} 2287}
2280 2288
2281static int do_tkill(pid_t tgid, pid_t pid, int sig) 2289static int
2290do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2282{ 2291{
2283 int error;
2284 struct siginfo info;
2285 struct task_struct *p; 2292 struct task_struct *p;
2286 unsigned long flags; 2293 unsigned long flags;
2287 2294 int error = -ESRCH;
2288 error = -ESRCH;
2289 info.si_signo = sig;
2290 info.si_errno = 0;
2291 info.si_code = SI_TKILL;
2292 info.si_pid = task_tgid_vnr(current);
2293 info.si_uid = current_uid();
2294 2295
2295 rcu_read_lock(); 2296 rcu_read_lock();
2296 p = find_task_by_vpid(pid); 2297 p = find_task_by_vpid(pid);
2297 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { 2298 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2298 error = check_kill_permission(sig, &info, p); 2299 error = check_kill_permission(sig, info, p);
2299 /* 2300 /*
2300 * The null signal is a permissions and process existence 2301 * The null signal is a permissions and process existence
2301 * probe. No signal is actually delivered. 2302 * probe. No signal is actually delivered.
@@ -2305,7 +2306,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2305 * signal is private anyway. 2306 * signal is private anyway.
2306 */ 2307 */
2307 if (!error && sig && lock_task_sighand(p, &flags)) { 2308 if (!error && sig && lock_task_sighand(p, &flags)) {
2308 error = specific_send_sig_info(sig, &info, p); 2309 error = specific_send_sig_info(sig, info, p);
2309 unlock_task_sighand(p, &flags); 2310 unlock_task_sighand(p, &flags);
2310 } 2311 }
2311 } 2312 }
@@ -2314,6 +2315,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2314 return error; 2315 return error;
2315} 2316}
2316 2317
2318static int do_tkill(pid_t tgid, pid_t pid, int sig)
2319{
2320 struct siginfo info;
2321
2322 info.si_signo = sig;
2323 info.si_errno = 0;
2324 info.si_code = SI_TKILL;
2325 info.si_pid = task_tgid_vnr(current);
2326 info.si_uid = current_uid();
2327
2328 return do_send_specific(tgid, pid, sig, &info);
2329}
2330
2317/** 2331/**
2318 * sys_tgkill - send signal to one specific thread 2332 * sys_tgkill - send signal to one specific thread
2319 * @tgid: the thread group ID of the thread 2333 * @tgid: the thread group ID of the thread
@@ -2363,6 +2377,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2363 return kill_proc_info(sig, &info, pid); 2377 return kill_proc_info(sig, &info, pid);
2364} 2378}
2365 2379
2380long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2381{
2382 /* This is only valid for single tasks */
2383 if (pid <= 0 || tgid <= 0)
2384 return -EINVAL;
2385
2386 /* Not even root can pretend to send signals from the kernel.
2387 Nor can they impersonate a kill(), which adds source info. */
2388 if (info->si_code >= 0)
2389 return -EPERM;
2390 info->si_signo = sig;
2391
2392 return do_send_specific(tgid, pid, sig, info);
2393}
2394
2395SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
2396 siginfo_t __user *, uinfo)
2397{
2398 siginfo_t info;
2399
2400 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2401 return -EFAULT;
2402
2403 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
2404}
2405
2366int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 2406int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2367{ 2407{
2368 struct task_struct *t = current; 2408 struct task_struct *t = current;
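The new rt_tgsigqueueinfo() syscall queues a caller-supplied siginfo to one specific thread (tgid plus tid) and rejects si_code >= 0, so userspace cannot impersonate kernel-generated signals or a plain kill(). A hedged userspace sketch that goes through syscall(), since a libc wrapper may not exist yet; __NR_rt_tgsigqueueinfo and __NR_gettid are assumed to be provided by the installed kernel headers:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	siginfo_t info;

	memset(&info, 0, sizeof(info));
	info.si_signo = SIGUSR1;
	info.si_code = SI_QUEUE;            /* must be negative; SI_USER and friends are refused */
	info.si_value.sival_int = 42;       /* payload travels with the signal */

	signal(SIGUSR1, SIG_IGN);           /* keep the demo from terminating itself */

	if (syscall(__NR_rt_tgsigqueueinfo, (pid_t)getpid(),
		    (pid_t)syscall(__NR_gettid), SIGUSR1, &info) < 0)
		perror("rt_tgsigqueueinfo");
	return 0;
}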
@@ -2414,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2414 stack_t oss; 2454 stack_t oss;
2415 int error; 2455 int error;
2416 2456
2417 if (uoss) { 2457 oss.ss_sp = (void __user *) current->sas_ss_sp;
2418 oss.ss_sp = (void __user *) current->sas_ss_sp; 2458 oss.ss_size = current->sas_ss_size;
2419 oss.ss_size = current->sas_ss_size; 2459 oss.ss_flags = sas_ss_flags(sp);
2420 oss.ss_flags = sas_ss_flags(sp);
2421 }
2422 2460
2423 if (uss) { 2461 if (uss) {
2424 void __user *ss_sp; 2462 void __user *ss_sp;
@@ -2426,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2426 int ss_flags; 2464 int ss_flags;
2427 2465
2428 error = -EFAULT; 2466 error = -EFAULT;
2429 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) 2467 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
2430 || __get_user(ss_sp, &uss->ss_sp) 2468 goto out;
2431 || __get_user(ss_flags, &uss->ss_flags) 2469 error = __get_user(ss_sp, &uss->ss_sp) |
2432 || __get_user(ss_size, &uss->ss_size)) 2470 __get_user(ss_flags, &uss->ss_flags) |
2471 __get_user(ss_size, &uss->ss_size);
2472 if (error)
2433 goto out; 2473 goto out;
2434 2474
2435 error = -EPERM; 2475 error = -EPERM;
@@ -2461,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2461 current->sas_ss_size = ss_size; 2501 current->sas_ss_size = ss_size;
2462 } 2502 }
2463 2503
2504 error = 0;
2464 if (uoss) { 2505 if (uoss) {
2465 error = -EFAULT; 2506 error = -EFAULT;
2466 if (copy_to_user(uoss, &oss, sizeof(oss))) 2507 if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
2467 goto out; 2508 goto out;
2509 error = __put_user(oss.ss_sp, &uoss->ss_sp) |
2510 __put_user(oss.ss_size, &uoss->ss_size) |
2511 __put_user(oss.ss_flags, &uoss->ss_flags);
2468 } 2512 }
2469 2513
2470 error = 0;
2471out: 2514out:
2472 return error; 2515 return error;
2473} 2516}
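do_sigaltstack() now snapshots the old settings unconditionally and copies them out field by field with access_ok()/__put_user() instead of a single copy_to_user(); the userspace contract of sigaltstack(2) is unchanged. A small standalone sketch of that contract, installing a new stack and reading the previous one back in the same call:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	stack_t uss, uoss;

	uss.ss_sp = malloc(SIGSTKSZ);
	uss.ss_size = SIGSTKSZ;
	uss.ss_flags = 0;
	if (!uss.ss_sp) {
		perror("malloc");
		return 1;
	}

	/* Either pointer may be NULL if only one direction is wanted. */
	if (sigaltstack(&uss, &uoss) < 0) {
		perror("sigaltstack");
		return 1;
	}

	printf("previous stack: flags=%d size=%zu\n",
	       uoss.ss_flags, uoss.ss_size);
	return 0;
}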
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b28d19135f43..09d7519557d3 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -319,6 +319,15 @@ cant_get_ref:
319EXPORT_SYMBOL(slow_work_enqueue); 319EXPORT_SYMBOL(slow_work_enqueue);
320 320
321/* 321/*
322 * Schedule a cull of the thread pool at some time in the near future
323 */
324static void slow_work_schedule_cull(void)
325{
326 mod_timer(&slow_work_cull_timer,
327 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
328}
329
330/*
322 * Worker thread culling algorithm 331 * Worker thread culling algorithm
323 */ 332 */
324static bool slow_work_cull_thread(void) 333static bool slow_work_cull_thread(void)
@@ -335,8 +344,7 @@ static bool slow_work_cull_thread(void)
335 list_empty(&vslow_work_queue) && 344 list_empty(&vslow_work_queue) &&
336 atomic_read(&slow_work_thread_count) > 345 atomic_read(&slow_work_thread_count) >
337 slow_work_min_threads) { 346 slow_work_min_threads) {
338 mod_timer(&slow_work_cull_timer, 347 slow_work_schedule_cull();
339 jiffies + SLOW_WORK_CULL_TIMEOUT);
340 do_cull = true; 348 do_cull = true;
341 } 349 }
342 } 350 }
@@ -372,8 +380,8 @@ static int slow_work_thread(void *_data)
372 vsmax *= atomic_read(&slow_work_thread_count); 380 vsmax *= atomic_read(&slow_work_thread_count);
373 vsmax /= 100; 381 vsmax /= 100;
374 382
375 prepare_to_wait(&slow_work_thread_wq, &wait, 383 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
376 TASK_INTERRUPTIBLE); 384 TASK_INTERRUPTIBLE);
377 if (!freezing(current) && 385 if (!freezing(current) &&
378 !slow_work_threads_should_exit && 386 !slow_work_threads_should_exit &&
379 !slow_work_available(vsmax) && 387 !slow_work_available(vsmax) &&
@@ -393,8 +401,7 @@ static int slow_work_thread(void *_data)
393 list_empty(&vslow_work_queue) && 401 list_empty(&vslow_work_queue) &&
394 atomic_read(&slow_work_thread_count) > 402 atomic_read(&slow_work_thread_count) >
395 slow_work_min_threads) 403 slow_work_min_threads)
396 mod_timer(&slow_work_cull_timer, 404 slow_work_schedule_cull();
397 jiffies + SLOW_WORK_CULL_TIMEOUT);
398 continue; 405 continue;
399 } 406 }
400 407
@@ -458,7 +465,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
458 if (atomic_dec_and_test(&slow_work_thread_count)) 465 if (atomic_dec_and_test(&slow_work_thread_count))
459 BUG(); /* we're running on a slow work thread... */ 466 BUG(); /* we're running on a slow work thread... */
460 mod_timer(&slow_work_oom_timer, 467 mod_timer(&slow_work_oom_timer,
461 jiffies + SLOW_WORK_OOM_TIMEOUT); 468 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
462 } else { 469 } else {
463 /* ratelimit the starting of new threads */ 470 /* ratelimit the starting of new threads */
464 mod_timer(&slow_work_oom_timer, jiffies + 1); 471 mod_timer(&slow_work_oom_timer, jiffies + 1);
@@ -502,8 +509,7 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
502 if (n < 0 && !slow_work_may_not_start_new_thread) 509 if (n < 0 && !slow_work_may_not_start_new_thread)
503 slow_work_enqueue(&slow_work_new_thread); 510 slow_work_enqueue(&slow_work_new_thread);
504 else if (n > 0) 511 else if (n > 0)
505 mod_timer(&slow_work_cull_timer, 512 slow_work_schedule_cull();
506 jiffies + SLOW_WORK_CULL_TIMEOUT);
507 } 513 }
508 mutex_unlock(&slow_work_user_lock); 514 mutex_unlock(&slow_work_user_lock);
509 } 515 }
@@ -529,8 +535,7 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
529 atomic_read(&slow_work_thread_count); 535 atomic_read(&slow_work_thread_count);
530 536
531 if (n < 0) 537 if (n < 0)
532 mod_timer(&slow_work_cull_timer, 538 slow_work_schedule_cull();
533 jiffies + SLOW_WORK_CULL_TIMEOUT);
534 } 539 }
535 mutex_unlock(&slow_work_user_lock); 540 mutex_unlock(&slow_work_user_lock);
536 } 541 }
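The cull and OOM timers are now armed through round_jiffies(), so deadlines requested by different callers land on the same whole-second boundary and coalesce into fewer wakeups. A rough standalone sketch of the rounding idea only; it is not the kernel's round_jiffies(), which additionally applies a small per-CPU offset and refuses to round a deadline into the past. HZ=100 is assumed purely for illustration:

#include <stdio.h>

#define HZ 100	/* assumed tick rate, illustration only */

/* Round a tick-count deadline to the nearest whole second so that
 * nearby timeouts expire together. */
static unsigned long round_to_second(unsigned long j)
{
	unsigned long rem = j % HZ;

	return rem < HZ / 2 ? j - rem : j + (HZ - rem);
}

int main(void)
{
	unsigned long now = 12345;
	unsigned long deadline = now + 5 * HZ + 37;	/* about 5.4s from now */

	printf("raw deadline:     %lu\n", deadline);
	printf("rounded deadline: %lu\n", round_to_second(deadline));
	return 0;
}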
diff --git a/kernel/smp.c b/kernel/smp.c
index ad63d8501207..94188b8ecc33 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
59 59
60#ifdef CONFIG_CPU_HOTPLUG 60#ifdef CONFIG_HOTPLUG_CPU
61 case CPU_UP_CANCELED: 61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN: 62 case CPU_UP_CANCELED_FROZEN:
63 63
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b525dd348511..eb5e131a0485 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,7 +24,9 @@
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/smp.h> 25#include <linux/smp.h>
26#include <linux/tick.h> 26#include <linux/tick.h>
27#include <trace/irq.h> 27
28#define CREATE_TRACE_POINTS
29#include <trace/events/irq.h>
28 30
29#include <asm/irq.h> 31#include <asm/irq.h>
30/* 32/*
@@ -186,9 +188,6 @@ EXPORT_SYMBOL(local_bh_enable_ip);
186 */ 188 */
187#define MAX_SOFTIRQ_RESTART 10 189#define MAX_SOFTIRQ_RESTART 10
188 190
189DEFINE_TRACE(softirq_entry);
190DEFINE_TRACE(softirq_exit);
191
192asmlinkage void __do_softirq(void) 191asmlinkage void __do_softirq(void)
193{ 192{
194 struct softirq_action *h; 193 struct softirq_action *h;
@@ -214,6 +213,7 @@ restart:
214 do { 213 do {
215 if (pending & 1) { 214 if (pending & 1) {
216 int prev_count = preempt_count(); 215 int prev_count = preempt_count();
216 kstat_incr_softirqs_this_cpu(h - softirq_vec);
217 217
218 trace_softirq_entry(h, softirq_vec); 218 trace_softirq_entry(h, softirq_vec);
219 h->action(h); 219 h->action(h);
@@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
345 softirq_vec[nr].action = action; 345 softirq_vec[nr].action = action;
346} 346}
347 347
348/* Tasklets */ 348/*
349 * Tasklets
350 */
349struct tasklet_head 351struct tasklet_head
350{ 352{
351 struct tasklet_struct *head; 353 struct tasklet_struct *head;
@@ -383,6 +385,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
383 385
384EXPORT_SYMBOL(__tasklet_hi_schedule); 386EXPORT_SYMBOL(__tasklet_hi_schedule);
385 387
388void __tasklet_hi_schedule_first(struct tasklet_struct *t)
389{
390 BUG_ON(!irqs_disabled());
391
392 t->next = __get_cpu_var(tasklet_hi_vec).head;
393 __get_cpu_var(tasklet_hi_vec).head = t;
394 __raise_softirq_irqoff(HI_SOFTIRQ);
395}
396
397EXPORT_SYMBOL(__tasklet_hi_schedule_first);
398
386static void tasklet_action(struct softirq_action *a) 399static void tasklet_action(struct softirq_action *a)
387{ 400{
388 struct tasklet_struct *list; 401 struct tasklet_struct *list;
@@ -482,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)
482 495
483EXPORT_SYMBOL(tasklet_kill); 496EXPORT_SYMBOL(tasklet_kill);
484 497
498/*
499 * tasklet_hrtimer
500 */
501
502/*
503 * The trampoline is called when the hrtimer expires. If this is
504 * called from the hrtimer interrupt then we schedule the tasklet as
505 * the timer callback function expects to run in softirq context. If
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{
511 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer);
513
514 if (hrtimer_is_hres_active(timer)) {
515 tasklet_hi_schedule(&ttimer->tasklet);
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519}
520
521/*
522 * Helper function which calls the hrtimer callback from
523 * tasklet/softirq context
524 */
525static void __tasklet_hrtimer_trampoline(unsigned long data)
526{
527 struct tasklet_hrtimer *ttimer = (void *)data;
528 enum hrtimer_restart restart;
529
530 restart = ttimer->function(&ttimer->timer);
531 if (restart != HRTIMER_NORESTART)
532 hrtimer_restart(&ttimer->timer);
533}
534
535/**
536 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
537 * @ttimer: tasklet_hrtimer which is initialized
 538 * @function: hrtimer callback function which gets called from softirq context
539 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
540 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
541 */
542void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
543 enum hrtimer_restart (*function)(struct hrtimer *),
544 clockid_t which_clock, enum hrtimer_mode mode)
545{
546 hrtimer_init(&ttimer->timer, which_clock, mode);
547 ttimer->timer.function = __hrtimer_tasklet_trampoline;
548 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
549 (unsigned long)ttimer);
550 ttimer->function = function;
551}
552EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
553
554/*
555 * Remote softirq bits
556 */
557
485DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 558DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
486EXPORT_PER_CPU_SYMBOL(softirq_work_list); 559EXPORT_PER_CPU_SYMBOL(softirq_work_list);
487 560
@@ -828,7 +901,7 @@ int __init __weak arch_early_irq_init(void)
828 return 0; 901 return 0;
829} 902}
830 903
831int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) 904int __weak arch_init_chip_data(struct irq_desc *desc, int node)
832{ 905{
833 return 0; 906 return 0;
834} 907}
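The tasklet_hrtimer combo lets an hrtimer callback that expects softirq context be driven safely by high resolution timers: in hres mode the trampoline defers to a HI tasklet, otherwise the function runs directly. An illustrative module-style sketch of a user of the API, not taken from the tree; tasklet_hrtimer_start() and tasklet_hrtimer_cancel() are assumed to be the inline helpers added alongside this code in <linux/interrupt.h>:

#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct tasklet_hrtimer poll_timer;

/* Runs in tasklet (softirq) context even when the hrtimer fires from
 * hard interrupt context, courtesy of the trampolines above. */
static enum hrtimer_restart poll_timer_fn(struct hrtimer *timer)
{
	pr_info("tasklet_hrtimer fired\n");
	return HRTIMER_NORESTART;	/* one-shot for this sketch */
}

static int __init poll_init(void)
{
	tasklet_hrtimer_init(&poll_timer, poll_timer_fn,
			     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	/* Arm the embedded hrtimer 10ms from now (assumed companion helper). */
	tasklet_hrtimer_start(&poll_timer, ktime_set(0, 10 * NSEC_PER_MSEC),
			      HRTIMER_MODE_REL);
	return 0;
}

static void __exit poll_exit(void)
{
	tasklet_hrtimer_cancel(&poll_timer);	/* assumed companion helper */
}

module_init(poll_init);
module_exit(poll_exit);
MODULE_LICENSE("GPL");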
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf31498..b3f1097c76fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h>
17#include <linux/resource.h> 18#include <linux/resource.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1112,289 +1113,6 @@ out:
1112 return err; 1113 return err;
1113} 1114}
1114 1115
1115/*
1116 * Supplementary group IDs
1117 */
1118
1119/* init to 2 - one for init_task, one to ensure it is never freed */
1120struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
1121
1122struct group_info *groups_alloc(int gidsetsize)
1123{
1124 struct group_info *group_info;
1125 int nblocks;
1126 int i;
1127
1128 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
1129 /* Make sure we always allocate at least one indirect block pointer */
1130 nblocks = nblocks ? : 1;
1131 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
1132 if (!group_info)
1133 return NULL;
1134 group_info->ngroups = gidsetsize;
1135 group_info->nblocks = nblocks;
1136 atomic_set(&group_info->usage, 1);
1137
1138 if (gidsetsize <= NGROUPS_SMALL)
1139 group_info->blocks[0] = group_info->small_block;
1140 else {
1141 for (i = 0; i < nblocks; i++) {
1142 gid_t *b;
1143 b = (void *)__get_free_page(GFP_USER);
1144 if (!b)
1145 goto out_undo_partial_alloc;
1146 group_info->blocks[i] = b;
1147 }
1148 }
1149 return group_info;
1150
1151out_undo_partial_alloc:
1152 while (--i >= 0) {
1153 free_page((unsigned long)group_info->blocks[i]);
1154 }
1155 kfree(group_info);
1156 return NULL;
1157}
1158
1159EXPORT_SYMBOL(groups_alloc);
1160
1161void groups_free(struct group_info *group_info)
1162{
1163 if (group_info->blocks[0] != group_info->small_block) {
1164 int i;
1165 for (i = 0; i < group_info->nblocks; i++)
1166 free_page((unsigned long)group_info->blocks[i]);
1167 }
1168 kfree(group_info);
1169}
1170
1171EXPORT_SYMBOL(groups_free);
1172
1173/* export the group_info to a user-space array */
1174static int groups_to_user(gid_t __user *grouplist,
1175 const struct group_info *group_info)
1176{
1177 int i;
1178 unsigned int count = group_info->ngroups;
1179
1180 for (i = 0; i < group_info->nblocks; i++) {
1181 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1182 unsigned int len = cp_count * sizeof(*grouplist);
1183
1184 if (copy_to_user(grouplist, group_info->blocks[i], len))
1185 return -EFAULT;
1186
1187 grouplist += NGROUPS_PER_BLOCK;
1188 count -= cp_count;
1189 }
1190 return 0;
1191}
1192
1193/* fill a group_info from a user-space array - it must be allocated already */
1194static int groups_from_user(struct group_info *group_info,
1195 gid_t __user *grouplist)
1196{
1197 int i;
1198 unsigned int count = group_info->ngroups;
1199
1200 for (i = 0; i < group_info->nblocks; i++) {
1201 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1202 unsigned int len = cp_count * sizeof(*grouplist);
1203
1204 if (copy_from_user(group_info->blocks[i], grouplist, len))
1205 return -EFAULT;
1206
1207 grouplist += NGROUPS_PER_BLOCK;
1208 count -= cp_count;
1209 }
1210 return 0;
1211}
1212
1213/* a simple Shell sort */
1214static void groups_sort(struct group_info *group_info)
1215{
1216 int base, max, stride;
1217 int gidsetsize = group_info->ngroups;
1218
1219 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
1220 ; /* nothing */
1221 stride /= 3;
1222
1223 while (stride) {
1224 max = gidsetsize - stride;
1225 for (base = 0; base < max; base++) {
1226 int left = base;
1227 int right = left + stride;
1228 gid_t tmp = GROUP_AT(group_info, right);
1229
1230 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
1231 GROUP_AT(group_info, right) =
1232 GROUP_AT(group_info, left);
1233 right = left;
1234 left -= stride;
1235 }
1236 GROUP_AT(group_info, right) = tmp;
1237 }
1238 stride /= 3;
1239 }
1240}
1241
1242/* a simple bsearch */
1243int groups_search(const struct group_info *group_info, gid_t grp)
1244{
1245 unsigned int left, right;
1246
1247 if (!group_info)
1248 return 0;
1249
1250 left = 0;
1251 right = group_info->ngroups;
1252 while (left < right) {
1253 unsigned int mid = (left+right)/2;
1254 int cmp = grp - GROUP_AT(group_info, mid);
1255 if (cmp > 0)
1256 left = mid + 1;
1257 else if (cmp < 0)
1258 right = mid;
1259 else
1260 return 1;
1261 }
1262 return 0;
1263}
1264
1265/**
1266 * set_groups - Change a group subscription in a set of credentials
1267 * @new: The newly prepared set of credentials to alter
1268 * @group_info: The group list to install
1269 *
1270 * Validate a group subscription and, if valid, insert it into a set
1271 * of credentials.
1272 */
1273int set_groups(struct cred *new, struct group_info *group_info)
1274{
1275 int retval;
1276
1277 retval = security_task_setgroups(group_info);
1278 if (retval)
1279 return retval;
1280
1281 put_group_info(new->group_info);
1282 groups_sort(group_info);
1283 get_group_info(group_info);
1284 new->group_info = group_info;
1285 return 0;
1286}
1287
1288EXPORT_SYMBOL(set_groups);
1289
1290/**
1291 * set_current_groups - Change current's group subscription
1292 * @group_info: The group list to impose
1293 *
1294 * Validate a group subscription and, if valid, impose it upon current's task
1295 * security record.
1296 */
1297int set_current_groups(struct group_info *group_info)
1298{
1299 struct cred *new;
1300 int ret;
1301
1302 new = prepare_creds();
1303 if (!new)
1304 return -ENOMEM;
1305
1306 ret = set_groups(new, group_info);
1307 if (ret < 0) {
1308 abort_creds(new);
1309 return ret;
1310 }
1311
1312 return commit_creds(new);
1313}
1314
1315EXPORT_SYMBOL(set_current_groups);
1316
1317SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
1318{
1319 const struct cred *cred = current_cred();
1320 int i;
1321
1322 if (gidsetsize < 0)
1323 return -EINVAL;
1324
1325 /* no need to grab task_lock here; it cannot change */
1326 i = cred->group_info->ngroups;
1327 if (gidsetsize) {
1328 if (i > gidsetsize) {
1329 i = -EINVAL;
1330 goto out;
1331 }
1332 if (groups_to_user(grouplist, cred->group_info)) {
1333 i = -EFAULT;
1334 goto out;
1335 }
1336 }
1337out:
1338 return i;
1339}
1340
1341/*
1342 * SMP: Our groups are copy-on-write. We can set them safely
1343 * without another task interfering.
1344 */
1345
1346SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
1347{
1348 struct group_info *group_info;
1349 int retval;
1350
1351 if (!capable(CAP_SETGID))
1352 return -EPERM;
1353 if ((unsigned)gidsetsize > NGROUPS_MAX)
1354 return -EINVAL;
1355
1356 group_info = groups_alloc(gidsetsize);
1357 if (!group_info)
1358 return -ENOMEM;
1359 retval = groups_from_user(group_info, grouplist);
1360 if (retval) {
1361 put_group_info(group_info);
1362 return retval;
1363 }
1364
1365 retval = set_current_groups(group_info);
1366 put_group_info(group_info);
1367
1368 return retval;
1369}
1370
1371/*
1372 * Check whether we're fsgid/egid or in the supplemental group..
1373 */
1374int in_group_p(gid_t grp)
1375{
1376 const struct cred *cred = current_cred();
1377 int retval = 1;
1378
1379 if (grp != cred->fsgid)
1380 retval = groups_search(cred->group_info, grp);
1381 return retval;
1382}
1383
1384EXPORT_SYMBOL(in_group_p);
1385
1386int in_egroup_p(gid_t grp)
1387{
1388 const struct cred *cred = current_cred();
1389 int retval = 1;
1390
1391 if (grp != cred->egid)
1392 retval = groups_search(cred->group_info, grp);
1393 return retval;
1394}
1395
1396EXPORT_SYMBOL(in_egroup_p);
1397
1398DECLARE_RWSEM(uts_sem); 1116DECLARE_RWSEM(uts_sem);
1399 1117
1400SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1118SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
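The supplementary-group implementation is dropped from sys.c here, but the getgroups()/setgroups() syscalls defined in the removed block keep their userspace contract: a zero gidsetsize only reports the count, and a second call fills the caller's array. A minimal standalone sketch:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	int n, i;
	gid_t *list;

	n = getgroups(0, NULL);		/* size query: count only */
	if (n < 0) {
		perror("getgroups");
		return 1;
	}

	list = calloc(n ? n : 1, sizeof(*list));
	if (!list)
		return 1;

	if (getgroups(n, list) < 0) {
		perror("getgroups");
		return 1;
	}

	for (i = 0; i < n; i++)
		printf("supplementary gid: %u\n", (unsigned)list[i]);
	free(list);
	return 0;
}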
@@ -1793,6 +1511,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1793 case PR_SET_TSC: 1511 case PR_SET_TSC:
1794 error = SET_TSC_CTL(arg2); 1512 error = SET_TSC_CTL(arg2);
1795 break; 1513 break;
1514 case PR_TASK_PERF_COUNTERS_DISABLE:
1515 error = perf_counter_task_disable();
1516 break;
1517 case PR_TASK_PERF_COUNTERS_ENABLE:
1518 error = perf_counter_task_enable();
1519 break;
1796 case PR_GET_TIMERSLACK: 1520 case PR_GET_TIMERSLACK:
1797 error = current->timer_slack_ns; 1521 error = current->timer_slack_ns;
1798 break; 1522 break;
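The new prctl options let a task suspend and resume counting on all of its own perf counters around a region it does not want profiled. A hedged userspace sketch; the PR_TASK_PERF_COUNTERS_* constants are assumed to come from this kernel's exported <linux/prctl.h> via <sys/prctl.h>:

#include <stdio.h>
#include <sys/prctl.h>	/* assumed to pull in the new PR_TASK_PERF_COUNTERS_* values */

int main(void)
{
	if (prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0) < 0)
		perror("prctl(PR_TASK_PERF_COUNTERS_DISABLE)");

	/* ... code that should not be counted ... */

	if (prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0) < 0)
		perror("prctl(PR_TASK_PERF_COUNTERS_ENABLE)");

	return 0;
}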
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..68320f6b07b5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
175cond_syscall(compat_sys_timerfd_gettime); 175cond_syscall(compat_sys_timerfd_gettime);
176cond_syscall(sys_eventfd); 176cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 177cond_syscall(sys_eventfd2);
178
179/* performance counters: */
180cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2970d56fb76..58be76017fd0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/fs.h> 32#include <linux/fs.h>
32#include <linux/init.h> 33#include <linux/init.h>
@@ -48,7 +49,9 @@
48#include <linux/acpi.h> 49#include <linux/acpi.h>
49#include <linux/reboot.h> 50#include <linux/reboot.h>
50#include <linux/ftrace.h> 51#include <linux/ftrace.h>
52#include <linux/security.h>
51#include <linux/slow-work.h> 53#include <linux/slow-work.h>
54#include <linux/perf_counter.h>
52 55
53#include <asm/uaccess.h> 56#include <asm/uaccess.h>
54#include <asm/processor.h> 57#include <asm/processor.h>
@@ -114,6 +117,7 @@ static int ngroups_max = NGROUPS_MAX;
114 117
115#ifdef CONFIG_MODULES 118#ifdef CONFIG_MODULES
116extern char modprobe_path[]; 119extern char modprobe_path[];
120extern int modules_disabled;
117#endif 121#endif
118#ifdef CONFIG_CHR_DEV_SG 122#ifdef CONFIG_CHR_DEV_SG
119extern int sg_big_buff; 123extern int sg_big_buff;
@@ -326,6 +330,17 @@ static struct ctl_table kern_table[] = {
326 .mode = 0644, 330 .mode = 0644,
327 .proc_handler = &proc_dointvec, 331 .proc_handler = &proc_dointvec,
328 }, 332 },
333 {
334 .ctl_name = CTL_UNNUMBERED,
335 .procname = "timer_migration",
336 .data = &sysctl_timer_migration,
337 .maxlen = sizeof(unsigned int),
338 .mode = 0644,
339 .proc_handler = &proc_dointvec_minmax,
340 .strategy = &sysctl_intvec,
341 .extra1 = &zero,
342 .extra2 = &one,
343 },
329#endif 344#endif
330 { 345 {
331 .ctl_name = CTL_UNNUMBERED, 346 .ctl_name = CTL_UNNUMBERED,
@@ -534,6 +549,17 @@ static struct ctl_table kern_table[] = {
534 .proc_handler = &proc_dostring, 549 .proc_handler = &proc_dostring,
535 .strategy = &sysctl_string, 550 .strategy = &sysctl_string,
536 }, 551 },
552 {
553 .ctl_name = CTL_UNNUMBERED,
554 .procname = "modules_disabled",
555 .data = &modules_disabled,
556 .maxlen = sizeof(int),
557 .mode = 0644,
558 /* only handle a transition from default "0" to "1" */
559 .proc_handler = &proc_dointvec_minmax,
560 .extra1 = &one,
561 .extra2 = &one,
562 },
537#endif 563#endif
538#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 564#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
539 { 565 {
@@ -722,6 +748,14 @@ static struct ctl_table kern_table[] = {
722 .proc_handler = &proc_dointvec, 748 .proc_handler = &proc_dointvec,
723 }, 749 },
724 { 750 {
751 .ctl_name = CTL_UNNUMBERED,
752 .procname = "panic_on_io_nmi",
753 .data = &panic_on_io_nmi,
754 .maxlen = sizeof(int),
755 .mode = 0644,
756 .proc_handler = &proc_dointvec,
757 },
758 {
725 .ctl_name = KERN_BOOTLOADER_TYPE, 759 .ctl_name = KERN_BOOTLOADER_TYPE,
726 .procname = "bootloader_type", 760 .procname = "bootloader_type",
727 .data = &bootloader_type, 761 .data = &bootloader_type,
@@ -731,6 +765,14 @@ static struct ctl_table kern_table[] = {
731 }, 765 },
732 { 766 {
733 .ctl_name = CTL_UNNUMBERED, 767 .ctl_name = CTL_UNNUMBERED,
768 .procname = "bootloader_version",
769 .data = &bootloader_version,
770 .maxlen = sizeof (int),
771 .mode = 0444,
772 .proc_handler = &proc_dointvec,
773 },
774 {
775 .ctl_name = CTL_UNNUMBERED,
734 .procname = "kstack_depth_to_print", 776 .procname = "kstack_depth_to_print",
735 .data = &kstack_depth_to_print, 777 .data = &kstack_depth_to_print,
736 .maxlen = sizeof(int), 778 .maxlen = sizeof(int),
@@ -912,6 +954,43 @@ static struct ctl_table kern_table[] = {
912 .child = slow_work_sysctls, 954 .child = slow_work_sysctls,
913 }, 955 },
914#endif 956#endif
957#ifdef CONFIG_PERF_COUNTERS
958 {
959 .ctl_name = CTL_UNNUMBERED,
960 .procname = "perf_counter_paranoid",
961 .data = &sysctl_perf_counter_paranoid,
962 .maxlen = sizeof(sysctl_perf_counter_paranoid),
963 .mode = 0644,
964 .proc_handler = &proc_dointvec,
965 },
966 {
967 .ctl_name = CTL_UNNUMBERED,
968 .procname = "perf_counter_mlock_kb",
969 .data = &sysctl_perf_counter_mlock,
970 .maxlen = sizeof(sysctl_perf_counter_mlock),
971 .mode = 0644,
972 .proc_handler = &proc_dointvec,
973 },
974 {
975 .ctl_name = CTL_UNNUMBERED,
976 .procname = "perf_counter_max_sample_rate",
977 .data = &sysctl_perf_counter_sample_rate,
978 .maxlen = sizeof(sysctl_perf_counter_sample_rate),
979 .mode = 0644,
980 .proc_handler = &proc_dointvec,
981 },
982#endif
983#ifdef CONFIG_KMEMCHECK
984 {
985 .ctl_name = CTL_UNNUMBERED,
986 .procname = "kmemcheck",
987 .data = &kmemcheck_enabled,
988 .maxlen = sizeof(int),
989 .mode = 0644,
990 .proc_handler = &proc_dointvec,
991 },
992#endif
993
915/* 994/*
916 * NOTE: do not add new entries to this table unless you have read 995 * NOTE: do not add new entries to this table unless you have read
917 * Documentation/sysctl/ctl_unnumbered.txt 996 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1225,16 +1304,14 @@ static struct ctl_table vm_table[] = {
1225 .strategy = &sysctl_jiffies, 1304 .strategy = &sysctl_jiffies,
1226 }, 1305 },
1227#endif 1306#endif
1228#ifdef CONFIG_SECURITY
1229 { 1307 {
1230 .ctl_name = CTL_UNNUMBERED, 1308 .ctl_name = CTL_UNNUMBERED,
1231 .procname = "mmap_min_addr", 1309 .procname = "mmap_min_addr",
1232 .data = &mmap_min_addr, 1310 .data = &dac_mmap_min_addr,
1233 .maxlen = sizeof(unsigned long), 1311 .maxlen = sizeof(unsigned long),
1234 .mode = 0644, 1312 .mode = 0644,
1235 .proc_handler = &proc_doulongvec_minmax, 1313 .proc_handler = &mmap_min_addr_handler,
1236 }, 1314 },
1237#endif
1238#ifdef CONFIG_NUMA 1315#ifdef CONFIG_NUMA
1239 { 1316 {
1240 .ctl_name = CTL_UNNUMBERED, 1317 .ctl_name = CTL_UNNUMBERED,
@@ -1272,7 +1349,6 @@ static struct ctl_table vm_table[] = {
1272 .extra2 = &one, 1349 .extra2 = &one,
1273 }, 1350 },
1274#endif 1351#endif
1275#ifdef CONFIG_UNEVICTABLE_LRU
1276 { 1352 {
1277 .ctl_name = CTL_UNNUMBERED, 1353 .ctl_name = CTL_UNNUMBERED,
1278 .procname = "scan_unevictable_pages", 1354 .procname = "scan_unevictable_pages",
@@ -1281,7 +1357,6 @@ static struct ctl_table vm_table[] = {
1281 .mode = 0644, 1357 .mode = 0644,
1282 .proc_handler = &scan_unevictable_handler, 1358 .proc_handler = &scan_unevictable_handler,
1283 }, 1359 },
1284#endif
1285/* 1360/*
1286 * NOTE: do not add new entries to this table unless you have read 1361 * NOTE: do not add new entries to this table unless you have read
1287 * Documentation/sysctl/ctl_unnumbered.txt 1362 * Documentation/sysctl/ctl_unnumbered.txt
@@ -2220,7 +2295,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2220 void *data) 2295 void *data)
2221{ 2296{
2222#define TMPBUFLEN 21 2297#define TMPBUFLEN 21
2223 int *i, vleft, first=1, neg, val; 2298 int *i, vleft, first = 1, neg;
2224 unsigned long lval; 2299 unsigned long lval;
2225 size_t left, len; 2300 size_t left, len;
2226 2301
@@ -2273,8 +2348,6 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2273 len = p-buf; 2348 len = p-buf;
2274 if ((len < left) && *p && !isspace(*p)) 2349 if ((len < left) && *p && !isspace(*p))
2275 break; 2350 break;
2276 if (neg)
2277 val = -val;
2278 s += len; 2351 s += len;
2279 left -= len; 2352 left -= len;
2280 2353
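The added kern_table entries are plain integer knobs registered with CTL_UNNUMBERED, so they are reachable only through /proc/sys and not the binary sysctl(2) interface. A minimal sketch reading one of them, kernel.timer_migration:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/timer_migration", "r");
	int val;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("timer_migration = %d\n", val);
	fclose(f);
	return 0;
}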
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d13be216a790..620b58abdc32 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
21 22
22/* The registered clock event devices */ 23/* The registered clock event devices */
23static LIST_HEAD(clockevent_devices); 24static LIST_HEAD(clockevent_devices);
@@ -54,6 +55,7 @@ unsigned long clockevent_delta2ns(unsigned long latch,
54 55
55 return (unsigned long) clc; 56 return (unsigned long) clc;
56} 57}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns);
57 59
58/** 60/**
59 * clockevents_set_mode - set the operating mode of a clock event device 61 * clockevents_set_mode - set the operating mode of a clock event device
@@ -135,11 +137,12 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
135 */ 137 */
136int clockevents_register_notifier(struct notifier_block *nb) 138int clockevents_register_notifier(struct notifier_block *nb)
137{ 139{
140 unsigned long flags;
138 int ret; 141 int ret;
139 142
140 spin_lock(&clockevents_lock); 143 spin_lock_irqsave(&clockevents_lock, flags);
141 ret = raw_notifier_chain_register(&clockevents_chain, nb); 144 ret = raw_notifier_chain_register(&clockevents_chain, nb);
142 spin_unlock(&clockevents_lock); 145 spin_unlock_irqrestore(&clockevents_lock, flags);
143 146
144 return ret; 147 return ret;
145} 148}
@@ -176,17 +179,20 @@ static void clockevents_notify_released(void)
176 */ 179 */
177void clockevents_register_device(struct clock_event_device *dev) 180void clockevents_register_device(struct clock_event_device *dev)
178{ 181{
182 unsigned long flags;
183
179 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
180 BUG_ON(!dev->cpumask); 185 BUG_ON(!dev->cpumask);
181 186
182 spin_lock(&clockevents_lock); 187 spin_lock_irqsave(&clockevents_lock, flags);
183 188
184 list_add(&dev->list, &clockevent_devices); 189 list_add(&dev->list, &clockevent_devices);
185 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
186 clockevents_notify_released(); 191 clockevents_notify_released();
187 192
188 spin_unlock(&clockevents_lock); 193 spin_unlock_irqrestore(&clockevents_lock, flags);
189} 194}
195EXPORT_SYMBOL_GPL(clockevents_register_device);
190 196
191/* 197/*
192 * Noop handler when we shut down an event device 198 * Noop handler when we shut down an event device
@@ -232,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old,
232void clockevents_notify(unsigned long reason, void *arg) 238void clockevents_notify(unsigned long reason, void *arg)
233{ 239{
234 struct list_head *node, *tmp; 240 struct list_head *node, *tmp;
241 unsigned long flags;
235 242
236 spin_lock(&clockevents_lock); 243 spin_lock_irqsave(&clockevents_lock, flags);
237 clockevents_do_notify(reason, arg); 244 clockevents_do_notify(reason, arg);
238 245
239 switch (reason) { 246 switch (reason) {
@@ -248,7 +255,7 @@ void clockevents_notify(unsigned long reason, void *arg)
248 default: 255 default:
249 break; 256 break;
250 } 257 }
251 spin_unlock(&clockevents_lock); 258 spin_unlock_irqrestore(&clockevents_lock, flags);
252} 259}
253EXPORT_SYMBOL_GPL(clockevents_notify); 260EXPORT_SYMBOL_GPL(clockevents_notify);
254#endif 261#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ecfd7b5187e0..7466cb811251 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c)
402 unsigned long flags; 402 unsigned long flags;
403 int ret; 403 int ret;
404 404
405 /* save mult_orig on registration */
406 c->mult_orig = c->mult;
407
408 spin_lock_irqsave(&clocksource_lock, flags); 405 spin_lock_irqsave(&clocksource_lock, flags);
409 ret = clocksource_enqueue(c); 406 ret = clocksource_enqueue(c);
410 if (!ret) 407 if (!ret)
@@ -512,6 +509,18 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
512 } 509 }
513 } 510 }
514 511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
515 /* Reselect, when the override name has changed */ 524 /* Reselect, when the override name has changed */
516 if (ovr != clocksource_override) { 525 if (ovr != clocksource_override) {
517 clocksource_override = ovr; 526 clocksource_override = ovr;
@@ -540,7 +549,13 @@ sysfs_show_available_clocksources(struct sys_device *dev,
540 549
541 spin_lock_irq(&clocksource_lock); 550 spin_lock_irq(&clocksource_lock);
542 list_for_each_entry(src, &clocksource_list, list) { 551 list_for_each_entry(src, &clocksource_list, list) {
543 count += snprintf(buf + count, 552 /*
553 * Don't show non-HRES clocksource if the tick code is
554 * in one shot mode (highres=on or nohz=on)
555 */
556 if (!tick_oneshot_mode_active() ||
557 (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
558 count += snprintf(buf + count,
544 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
545 "%s ", src->name); 560 "%s ", src->name);
546 } 561 }
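sysfs_override_clocksource() and sysfs_show_available_clocksources() back the clocksource sysfs attributes; with these checks a clocksource that is not highres-capable is refused and hidden while the tick layer is in oneshot mode. A small sketch listing what userspace can still select, assuming the conventional sysfs location for the clocksource sysdev:

#include <stdio.h>

int main(void)
{
	/* Path assumed: /sys/devices/system/clocksource/clocksource0/ */
	FILE *f = fopen("/sys/devices/system/clocksource/clocksource0/"
			"available_clocksource", "r");
	char buf[256];

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("selectable clocksources: %s", buf);
	fclose(f);
	return 0;
}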
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 118a3b3b3f9a..c2ec25087a35 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -27,7 +27,7 @@
27 * timer stops in C3 state. 27 * timer stops in C3 state.
28 */ 28 */
29 29
30struct tick_device tick_broadcast_device; 30static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
@@ -205,11 +205,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
205 * Powerstate information: The system enters/leaves a state, where 205 * Powerstate information: The system enters/leaves a state, where
206 * affected devices might stop 206 * affected devices might stop
207 */ 207 */
208static void tick_do_broadcast_on_off(void *why) 208static void tick_do_broadcast_on_off(unsigned long *reason)
209{ 209{
210 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
211 struct tick_device *td; 211 struct tick_device *td;
212 unsigned long flags, *reason = why; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -276,8 +276,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for " 276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
277 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
278 else 278 else
279 smp_call_function_single(*oncpu, tick_do_broadcast_on_off, 279 tick_do_broadcast_on_off(&reason);
280 &reason, 1);
281} 280}
282 281
283/* 282/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e8de678e767..a96c0e2b89cf 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -128,6 +128,23 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
128 return 0; 128 return 0;
129} 129}
130 130
131/**
 132 * tick_oneshot_mode_active - check whether the system is in oneshot mode
 133 *
 134 * Returns 1 when either nohz or highres is enabled, otherwise 0.
135 */
136int tick_oneshot_mode_active(void)
137{
138 unsigned long flags;
139 int ret;
140
141 local_irq_save(flags);
142 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT;
143 local_irq_restore(flags);
144
145 return ret;
146}
147
131#ifdef CONFIG_HIGH_RES_TIMERS 148#ifdef CONFIG_HIGH_RES_TIMERS
132/** 149/**
133 * tick_init_highres - switch to high resolution mode 150 * tick_init_highres - switch to high resolution mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d3f1ef4d5cbe..e0f59a21c061 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle)
222 222
223 cpu = smp_processor_id(); 223 cpu = smp_processor_id();
224 ts = &per_cpu(tick_cpu_sched, cpu); 224 ts = &per_cpu(tick_cpu_sched, cpu);
225
226 /*
227 * Call to tick_nohz_start_idle stops the last_update_time from being
228 * updated. Thus, it must not be called in the event we are called from
229 * irq_exit() with the prior state different than idle.
230 */
231 if (!inidle && !ts->inidle)
232 goto end;
233
225 now = tick_nohz_start_idle(ts); 234 now = tick_nohz_start_idle(ts);
226 235
227 /* 236 /*
@@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
240 goto end; 249 goto end;
241 250
242 if (!inidle && !ts->inidle)
243 goto end;
244
245 ts->inidle = 1; 251 ts->inidle = 1;
246 252
247 if (need_resched()) 253 if (need_resched())
@@ -349,7 +355,7 @@ void tick_nohz_stop_sched_tick(int inidle)
349 355
350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 356 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
351 hrtimer_start(&ts->sched_timer, expires, 357 hrtimer_start(&ts->sched_timer, expires,
352 HRTIMER_MODE_ABS); 358 HRTIMER_MODE_ABS_PINNED);
353 /* Check, if the timer was already in the past */ 359 /* Check, if the timer was already in the past */
354 if (hrtimer_active(&ts->sched_timer)) 360 if (hrtimer_active(&ts->sched_timer))
355 goto out; 361 goto out;
@@ -395,7 +401,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
395 401
396 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 402 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
397 hrtimer_start_expires(&ts->sched_timer, 403 hrtimer_start_expires(&ts->sched_timer,
398 HRTIMER_MODE_ABS); 404 HRTIMER_MODE_ABS_PINNED);
399 /* Check, if the timer was already in the past */ 405 /* Check, if the timer was already in the past */
400 if (hrtimer_active(&ts->sched_timer)) 406 if (hrtimer_active(&ts->sched_timer))
401 break; 407 break;
@@ -698,7 +704,8 @@ void tick_setup_sched_timer(void)
698 704
699 for (;;) { 705 for (;;) {
700 hrtimer_forward(&ts->sched_timer, now, tick_period); 706 hrtimer_forward(&ts->sched_timer, now, tick_period);
701 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS); 707 hrtimer_start_expires(&ts->sched_timer,
708 HRTIMER_MODE_ABS_PINNED);
702 /* Check, if the timer was already in the past */ 709 /* Check, if the timer was already in the past */
703 if (hrtimer_active(&ts->sched_timer)) 710 if (hrtimer_active(&ts->sched_timer))
704 break; 711 break;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..e8c77d9c633a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
22 22
23/* 23/*
24 * This read-write spinlock protects us from races in SMP while 24 * This read-write spinlock protects us from races in SMP while
25 * playing with xtime and avenrun. 25 * playing with xtime.
26 */ 26 */
27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); 27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
28 28
@@ -77,6 +77,10 @@ static void clocksource_forward_now(void)
77 clock->cycle_last = cycle_now; 77 clock->cycle_last = cycle_now;
78 78
79 nsec = cyc2ns(clock, cycle_delta); 79 nsec = cyc2ns(clock, cycle_delta);
80
81 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset();
83
80 timespec_add_ns(&xtime, nsec); 84 timespec_add_ns(&xtime, nsec);
81 85
82 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
@@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts)
111 /* convert to nanoseconds: */ 115 /* convert to nanoseconds: */
112 nsecs = cyc2ns(clock, cycle_delta); 116 nsecs = cyc2ns(clock, cycle_delta);
113 117
118 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset();
120
114 } while (read_seqretry(&xtime_lock, seq)); 121 } while (read_seqretry(&xtime_lock, seq));
115 122
116 timespec_add_ns(ts, nsecs); 123 timespec_add_ns(ts, nsecs);
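The two timekeeping hunks above add an arch_gettimeoffset() contribution to both clocksource_forward_now() and getnstimeofday(). A minimal sketch of the contract this assumes, reconstructed only from how the hook is used here; the u32 return type and the no-op default are assumptions, not the actual header.

/* Assumed contract: return the nanoseconds that have elapsed since the
 * last timer interrupt on architectures that still keep time that way;
 * everyone else provides a do-nothing version so the code above adds 0. */
static inline u32 arch_gettimeoffset(void)
{
	return 0;
}
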
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a999b92a1277..fddd69d16e03 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -286,7 +286,7 @@ static int __init init_timer_list_procfs(void)
286{ 286{
287 struct proc_dir_entry *pe; 287 struct proc_dir_entry *pe;
288 288
289 pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); 289 pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
290 if (!pe) 290 if (!pe)
291 return -ENOMEM; 291 return -ENOMEM;
292 return 0; 292 return 0;
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
96/* 96/*
97 * Collection status, active/inactive: 97 * Collection status, active/inactive:
98 */ 98 */
99static int __read_mostly active; 99int __read_mostly timer_stats_active;
100 100
101/* 101/*
102 * Beginning/end timestamps of measurement: 102 * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
257 if (!active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
260 entry = tstat_lookup(&input, comm); 260 entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
290 /* 290 /*
291 * If still active then calculate up to now: 291 * If still active then calculate up to now:
292 */ 292 */
293 if (active) 293 if (timer_stats_active)
294 time_stop = ktime_get(); 294 time_stop = ktime_get();
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
368 mutex_lock(&show_mutex); 368 mutex_lock(&show_mutex);
369 switch (ctl[0]) { 369 switch (ctl[0]) {
370 case '0': 370 case '0':
371 if (active) { 371 if (timer_stats_active) {
372 active = 0; 372 timer_stats_active = 0;
373 time_stop = ktime_get(); 373 time_stop = ktime_get();
374 sync_access(); 374 sync_access();
375 } 375 }
376 break; 376 break;
377 case '1': 377 case '1':
378 if (!active) { 378 if (!timer_stats_active) {
379 reset_entries(); 379 reset_entries();
380 time_start = ktime_get(); 380 time_start = ktime_get();
381 smp_mb(); 381 smp_mb();
382 active = 1; 382 timer_stats_active = 1;
383 } 383 }
384 break; 384 break;
385 default: 385 default:
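Renaming the file-local 'active' flag to the global timer_stats_active lets other timer code test it directly before doing any work. A rough sketch of the fast-path check this enables; the hook name and body are hypothetical, only the flag comes from the hunk above.

#include <linux/kernel.h>

extern int timer_stats_active;	/* made global and renamed above */

static inline void example_timer_stats_hook(void *timer, void *start_site)
{
	/* cheap early-out: no locks, no hash lookup while collection is off */
	if (likely(!timer_stats_active))
		return;

	/* ... fall through to the slow path, as timer_stats_update_stats() does ... */
}
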
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..a7f07d5a6241 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,8 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h>
41#include <linux/sched.h>
40 42
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
42#include <asm/unistd.h> 44#include <asm/unistd.h>
@@ -378,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
378{ 380{
379 unsigned int flag = 0; 381 unsigned int flag = 0;
380 382
383 if (likely(!timer->start_site))
384 return;
381 if (unlikely(tbase_get_deferrable(timer->base))) 385 if (unlikely(tbase_get_deferrable(timer->base)))
382 flag |= TIMER_STATS_FLAG_DEFERRABLE; 386 flag |= TIMER_STATS_FLAG_DEFERRABLE;
383 387
@@ -604,13 +608,12 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
604} 608}
605 609
606static inline int 610static inline int
607__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) 611__mod_timer(struct timer_list *timer, unsigned long expires,
612 bool pending_only, int pinned)
608{ 613{
609 struct tvec_base *base, *new_base; 614 struct tvec_base *base, *new_base;
610 unsigned long flags; 615 unsigned long flags;
 611          int ret;                                         616          int ret = 0, cpu;
612
613 ret = 0;
614 617
615 timer_stats_timer_set_start_info(timer); 618 timer_stats_timer_set_start_info(timer);
616 BUG_ON(!timer->function); 619 BUG_ON(!timer->function);
@@ -629,6 +632,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
629 632
630 new_base = __get_cpu_var(tvec_bases); 633 new_base = __get_cpu_var(tvec_bases);
631 634
635 cpu = smp_processor_id();
636
637#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
638 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
639 int preferred_cpu = get_nohz_load_balancer();
640
641 if (preferred_cpu >= 0)
642 cpu = preferred_cpu;
643 }
644#endif
645 new_base = per_cpu(tvec_bases, cpu);
646
632 if (base != new_base) { 647 if (base != new_base) {
633 /* 648 /*
634 * We are trying to schedule the timer on the local CPU. 649 * We are trying to schedule the timer on the local CPU.
@@ -668,7 +683,7 @@ out_unlock:
668 */ 683 */
669int mod_timer_pending(struct timer_list *timer, unsigned long expires) 684int mod_timer_pending(struct timer_list *timer, unsigned long expires)
670{ 685{
671 return __mod_timer(timer, expires, true); 686 return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
672} 687}
673EXPORT_SYMBOL(mod_timer_pending); 688EXPORT_SYMBOL(mod_timer_pending);
674 689
@@ -699,14 +714,36 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
699 * networking code - if the timer is re-modified 714 * networking code - if the timer is re-modified
700 * to be the same thing then just return: 715 * to be the same thing then just return:
701 */ 716 */
702 if (timer->expires == expires && timer_pending(timer)) 717 if (timer_pending(timer) && timer->expires == expires)
703 return 1; 718 return 1;
704 719
705 return __mod_timer(timer, expires, false); 720 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
706} 721}
707EXPORT_SYMBOL(mod_timer); 722EXPORT_SYMBOL(mod_timer);
708 723
709/** 724/**
725 * mod_timer_pinned - modify a timer's timeout
726 * @timer: the timer to be modified
727 * @expires: new timeout in jiffies
728 *
729 * mod_timer_pinned() is a way to update the expire field of an
730 * active timer (if the timer is inactive it will be activated)
731 * and not allow the timer to be migrated to a different CPU.
732 *
733 * mod_timer_pinned(timer, expires) is equivalent to:
734 *
735 * del_timer(timer); timer->expires = expires; add_timer(timer);
736 */
737int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
738{
739 if (timer->expires == expires && timer_pending(timer))
740 return 1;
741
742 return __mod_timer(timer, expires, false, TIMER_PINNED);
743}
744EXPORT_SYMBOL(mod_timer_pinned);
745
746/**
710 * add_timer - start a timer 747 * add_timer - start a timer
711 * @timer: the timer to be added 748 * @timer: the timer to be added
712 * 749 *
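A short usage sketch for the mod_timer_pinned() helper introduced above; the timer, callback and one-second timeout are illustrative only.

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list example_timer;

static void example_timeout(unsigned long data)
{
	/* work that must stay on the CPU that armed the timer */
}

static void example_arm(void)
{
	setup_timer(&example_timer, example_timeout, 0);
	/* TIMER_PINNED: __mod_timer() above skips the migration logic */
	mod_timer_pinned(&example_timer, jiffies + HZ);
}
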
@@ -756,6 +793,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
756 wake_up_idle_cpu(cpu); 793 wake_up_idle_cpu(cpu);
757 spin_unlock_irqrestore(&base->lock, flags); 794 spin_unlock_irqrestore(&base->lock, flags);
758} 795}
796EXPORT_SYMBOL_GPL(add_timer_on);
759 797
760/** 798/**
761 * del_timer - deactive a timer. 799 * del_timer - deactive a timer.
@@ -1015,6 +1053,9 @@ cascade:
1015 index = slot = timer_jiffies & TVN_MASK; 1053 index = slot = timer_jiffies & TVN_MASK;
1016 do { 1054 do {
1017 list_for_each_entry(nte, varp->vec + slot, entry) { 1055 list_for_each_entry(nte, varp->vec + slot, entry) {
1056 if (tbase_get_deferrable(nte->base))
1057 continue;
1058
1018 found = 1; 1059 found = 1;
1019 if (time_before(nte->expires, expires)) 1060 if (time_before(nte->expires, expires))
1020 expires = nte->expires; 1061 expires = nte->expires;
@@ -1123,53 +1164,14 @@ void update_process_times(int user_tick)
1123} 1164}
1124 1165
1125/* 1166/*
1126 * Nr of active tasks - counted in fixed-point numbers
1127 */
1128static unsigned long count_active_tasks(void)
1129{
1130 return nr_active() * FIXED_1;
1131}
1132
1133/*
1134 * Hmm.. Changed this, as the GNU make sources (load.c) seems to
1135 * imply that avenrun[] is the standard name for this kind of thing.
1136 * Nothing else seems to be standardized: the fractional size etc
1137 * all seem to differ on different machines.
1138 *
1139 * Requires xtime_lock to access.
1140 */
1141unsigned long avenrun[3];
1142
1143EXPORT_SYMBOL(avenrun);
1144
1145/*
1146 * calc_load - given tick count, update the avenrun load estimates.
1147 * This is called while holding a write_lock on xtime_lock.
1148 */
1149static inline void calc_load(unsigned long ticks)
1150{
1151 unsigned long active_tasks; /* fixed-point */
1152 static int count = LOAD_FREQ;
1153
1154 count -= ticks;
1155 if (unlikely(count < 0)) {
1156 active_tasks = count_active_tasks();
1157 do {
1158 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
1159 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
1160 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
1161 count += LOAD_FREQ;
1162 } while (count < 0);
1163 }
1164}
1165
1166/*
1167 * This function runs timers and the timer-tq in bottom half context. 1167 * This function runs timers and the timer-tq in bottom half context.
1168 */ 1168 */
1169static void run_timer_softirq(struct softirq_action *h) 1169static void run_timer_softirq(struct softirq_action *h)
1170{ 1170{
1171 struct tvec_base *base = __get_cpu_var(tvec_bases); 1171 struct tvec_base *base = __get_cpu_var(tvec_bases);
1172 1172
1173 perf_counter_do_pending();
1174
1173 hrtimer_run_pending(); 1175 hrtimer_run_pending();
1174 1176
1175 if (time_after_eq(jiffies, base->timer_jiffies)) 1177 if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1187,16 +1189,6 @@ void run_local_timers(void)
1187} 1189}
1188 1190
1189/* 1191/*
1190 * Called by the timer interrupt. xtime_lock must already be taken
1191 * by the timer IRQ!
1192 */
1193static inline void update_times(unsigned long ticks)
1194{
1195 update_wall_time();
1196 calc_load(ticks);
1197}
1198
1199/*
1200 * The 64-bit jiffies value is not atomic - you MUST NOT read it 1192 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1201 * without sampling the sequence number in xtime_lock. 1193 * without sampling the sequence number in xtime_lock.
1202 * jiffies is defined in the linker script... 1194 * jiffies is defined in the linker script...
@@ -1205,7 +1197,8 @@ static inline void update_times(unsigned long ticks)
1205void do_timer(unsigned long ticks) 1197void do_timer(unsigned long ticks)
1206{ 1198{
1207 jiffies_64 += ticks; 1199 jiffies_64 += ticks;
1208 update_times(ticks); 1200 update_wall_time();
1201 calc_global_load();
1209} 1202}
1210 1203
1211#ifdef __ARCH_WANT_SYS_ALARM 1204#ifdef __ARCH_WANT_SYS_ALARM
@@ -1353,7 +1346,7 @@ signed long __sched schedule_timeout(signed long timeout)
1353 expire = timeout + jiffies; 1346 expire = timeout + jiffies;
1354 1347
1355 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); 1348 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1356 __mod_timer(&timer, expire, false); 1349 __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
1357 schedule(); 1350 schedule();
1358 del_singleshot_timer_sync(&timer); 1351 del_singleshot_timer_sync(&timer);
1359 1352
@@ -1406,37 +1399,17 @@ int do_sysinfo(struct sysinfo *info)
1406{ 1399{
1407 unsigned long mem_total, sav_total; 1400 unsigned long mem_total, sav_total;
1408 unsigned int mem_unit, bitcount; 1401 unsigned int mem_unit, bitcount;
1409 unsigned long seq; 1402 struct timespec tp;
1410 1403
1411 memset(info, 0, sizeof(struct sysinfo)); 1404 memset(info, 0, sizeof(struct sysinfo));
1412 1405
1413 do { 1406 ktime_get_ts(&tp);
1414 struct timespec tp; 1407 monotonic_to_bootbased(&tp);
1415 seq = read_seqbegin(&xtime_lock); 1408 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1416
1417 /*
1418 * This is annoying. The below is the same thing
1419 * posix_get_clock_monotonic() does, but it wants to
1420 * take the lock which we want to cover the loads stuff
1421 * too.
1422 */
1423
1424 getnstimeofday(&tp);
1425 tp.tv_sec += wall_to_monotonic.tv_sec;
1426 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1427 monotonic_to_bootbased(&tp);
1428 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1429 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1430 tp.tv_sec++;
1431 }
1432 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1433 1409
1434 info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); 1410 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
1435 info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1436 info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1437 1411
1438 info->procs = nr_threads; 1412 info->procs = nr_threads;
1439 } while (read_seqretry(&xtime_lock, seq));
1440 1413
1441 si_meminfo(info); 1414 si_meminfo(info);
1442 si_swapinfo(info); 1415 si_swapinfo(info);
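With calc_load() gone from this file, do_sysinfo() reads the load averages through get_avenrun() and do_timer() drives the global computation via calc_global_load(). A minimal sketch of another get_avenrun() consumer using the same fixed-point values as the hunk above; the printing helper itself is hypothetical.

#include <linux/kernel.h>
#include <linux/sched.h>

static void example_print_loadavg(void)
{
	unsigned long loads[3];

	/* offset 0, shift 0: raw FIXED_1-scaled values
	 * (do_sysinfo() above shifts by SI_LOAD_SHIFT - FSHIFT instead) */
	get_avenrun(loads, 0, 0);

	printk(KERN_INFO "loadavg (fixed-point): %lu %lu %lu\n",
	       loads[0], loads[1], loads[2]);
}
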
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 417d1985e299..019f380fd764 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER
18config HAVE_FUNCTION_GRAPH_TRACER 18config HAVE_FUNCTION_GRAPH_TRACER
19 bool 19 bool
20 20
21config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool
23 help
24 An arch may pass in a unique value (frame pointer) to both the
25 entering and exiting of a function. On exit, the value is compared
26 and if it does not match, then it will panic the kernel.
27
21config HAVE_FUNCTION_TRACE_MCOUNT_TEST 28config HAVE_FUNCTION_TRACE_MCOUNT_TEST
22 bool 29 bool
23 help 30 help
@@ -48,6 +55,21 @@ config FTRACE_NMI_ENTER
48 depends on HAVE_FTRACE_NMI_ENTER 55 depends on HAVE_FTRACE_NMI_ENTER
49 default y 56 default y
50 57
58config EVENT_TRACING
59 select CONTEXT_SWITCH_TRACER
60 bool
61
62config CONTEXT_SWITCH_TRACER
63 select MARKERS
64 bool
65
66# All tracer options should select GENERIC_TRACER. For those options that are
67# enabled by all tracers (context switch and event tracer) they select TRACING.
68# This allows those options to appear when no other tracer is selected. But the
 69# options do not appear when something else selects them. We need the two options
 70# GENERIC_TRACER and TRACING to avoid circular dependencies while accomplishing the
 71# hiding of the automatic options.
72
51config TRACING 73config TRACING
52 bool 74 bool
53 select DEBUG_FS 75 select DEBUG_FS
@@ -56,6 +78,11 @@ config TRACING
56 select TRACEPOINTS 78 select TRACEPOINTS
57 select NOP_TRACER 79 select NOP_TRACER
58 select BINARY_PRINTF 80 select BINARY_PRINTF
81 select EVENT_TRACING
82
83config GENERIC_TRACER
84 bool
85 select TRACING
59 86
60# 87#
61# Minimum requirements an architecture has to meet for us to 88# Minimum requirements an architecture has to meet for us to
@@ -73,14 +100,20 @@ config TRACING_SUPPORT
73 100
74if TRACING_SUPPORT 101if TRACING_SUPPORT
75 102
76menu "Tracers" 103menuconfig FTRACE
104 bool "Tracers"
105 default y if DEBUG_KERNEL
106 help
107 Enable the kernel tracing infrastructure.
108
109if FTRACE
77 110
78config FUNCTION_TRACER 111config FUNCTION_TRACER
79 bool "Kernel Function Tracer" 112 bool "Kernel Function Tracer"
80 depends on HAVE_FUNCTION_TRACER 113 depends on HAVE_FUNCTION_TRACER
81 select FRAME_POINTER 114 select FRAME_POINTER
82 select KALLSYMS 115 select KALLSYMS
83 select TRACING 116 select GENERIC_TRACER
84 select CONTEXT_SWITCH_TRACER 117 select CONTEXT_SWITCH_TRACER
85 help 118 help
86 Enable the kernel to trace every kernel function. This is done 119 Enable the kernel to trace every kernel function. This is done
@@ -95,6 +128,7 @@ config FUNCTION_GRAPH_TRACER
95 bool "Kernel Function Graph Tracer" 128 bool "Kernel Function Graph Tracer"
96 depends on HAVE_FUNCTION_GRAPH_TRACER 129 depends on HAVE_FUNCTION_GRAPH_TRACER
97 depends on FUNCTION_TRACER 130 depends on FUNCTION_TRACER
131 depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
98 default y 132 default y
99 help 133 help
100 Enable the kernel to trace a function at both its return 134 Enable the kernel to trace a function at both its return
@@ -104,13 +138,14 @@ config FUNCTION_GRAPH_TRACER
104 the return value. This is done by setting the current return 138 the return value. This is done by setting the current return
105 address on the current task structure into a stack of calls. 139 address on the current task structure into a stack of calls.
106 140
141
107config IRQSOFF_TRACER 142config IRQSOFF_TRACER
108 bool "Interrupts-off Latency Tracer" 143 bool "Interrupts-off Latency Tracer"
109 default n 144 default n
110 depends on TRACE_IRQFLAGS_SUPPORT 145 depends on TRACE_IRQFLAGS_SUPPORT
111 depends on GENERIC_TIME 146 depends on GENERIC_TIME
112 select TRACE_IRQFLAGS 147 select TRACE_IRQFLAGS
113 select TRACING 148 select GENERIC_TRACER
114 select TRACER_MAX_TRACE 149 select TRACER_MAX_TRACE
115 help 150 help
116 This option measures the time spent in irqs-off critical 151 This option measures the time spent in irqs-off critical
@@ -120,7 +155,7 @@ config IRQSOFF_TRACER
120 disabled by default and can be runtime (re-)started 155 disabled by default and can be runtime (re-)started
121 via: 156 via:
122 157
123 echo 0 > /debugfs/tracing/tracing_max_latency 158 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
124 159
125 (Note that kernel size and overhead increases with this option 160 (Note that kernel size and overhead increases with this option
126 enabled. This option and the preempt-off timing option can be 161 enabled. This option and the preempt-off timing option can be
@@ -131,7 +166,7 @@ config PREEMPT_TRACER
131 default n 166 default n
132 depends on GENERIC_TIME 167 depends on GENERIC_TIME
133 depends on PREEMPT 168 depends on PREEMPT
134 select TRACING 169 select GENERIC_TRACER
135 select TRACER_MAX_TRACE 170 select TRACER_MAX_TRACE
136 help 171 help
137 This option measures the time spent in preemption off critical 172 This option measures the time spent in preemption off critical
@@ -141,7 +176,7 @@ config PREEMPT_TRACER
141 disabled by default and can be runtime (re-)started 176 disabled by default and can be runtime (re-)started
142 via: 177 via:
143 178
144 echo 0 > /debugfs/tracing/tracing_max_latency 179 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
145 180
146 (Note that kernel size and overhead increases with this option 181 (Note that kernel size and overhead increases with this option
147 enabled. This option and the irqs-off timing option can be 182 enabled. This option and the irqs-off timing option can be
@@ -150,7 +185,7 @@ config PREEMPT_TRACER
150config SYSPROF_TRACER 185config SYSPROF_TRACER
151 bool "Sysprof Tracer" 186 bool "Sysprof Tracer"
152 depends on X86 187 depends on X86
153 select TRACING 188 select GENERIC_TRACER
154 select CONTEXT_SWITCH_TRACER 189 select CONTEXT_SWITCH_TRACER
155 help 190 help
156 This tracer provides the trace needed by the 'Sysprof' userspace 191 This tracer provides the trace needed by the 'Sysprof' userspace
@@ -158,83 +193,103 @@ config SYSPROF_TRACER
158 193
159config SCHED_TRACER 194config SCHED_TRACER
160 bool "Scheduling Latency Tracer" 195 bool "Scheduling Latency Tracer"
161 select TRACING 196 select GENERIC_TRACER
162 select CONTEXT_SWITCH_TRACER 197 select CONTEXT_SWITCH_TRACER
163 select TRACER_MAX_TRACE 198 select TRACER_MAX_TRACE
164 help 199 help
165 This tracer tracks the latency of the highest priority task 200 This tracer tracks the latency of the highest priority task
166 to be scheduled in, starting from the point it has woken up. 201 to be scheduled in, starting from the point it has woken up.
167 202
168config CONTEXT_SWITCH_TRACER 203config ENABLE_DEFAULT_TRACERS
169 bool "Trace process context switches" 204 bool "Trace process context switches and events"
170 select TRACING 205 depends on !GENERIC_TRACER
171 select MARKERS
172 help
173 This tracer gets called from the context switch and records
174 all switching of tasks.
175
176config EVENT_TRACER
177 bool "Trace various events in the kernel"
178 select TRACING 206 select TRACING
179 help 207 help
180 This tracer hooks to various trace points in the kernel 208 This tracer hooks to various trace points in the kernel
181 allowing the user to pick and choose which trace point they 209 allowing the user to pick and choose which trace point they
182 want to trace. 210 want to trace. It also includes the sched_switch tracer plugin.
183 211
184config FTRACE_SYSCALLS 212config FTRACE_SYSCALLS
185 bool "Trace syscalls" 213 bool "Trace syscalls"
186 depends on HAVE_FTRACE_SYSCALLS 214 depends on HAVE_FTRACE_SYSCALLS
187 select TRACING 215 select GENERIC_TRACER
188 select KALLSYMS 216 select KALLSYMS
189 help 217 help
190 Basic tracer to catch the syscall entry and exit events. 218 Basic tracer to catch the syscall entry and exit events.
191 219
192config BOOT_TRACER 220config BOOT_TRACER
193 bool "Trace boot initcalls" 221 bool "Trace boot initcalls"
194 select TRACING 222 select GENERIC_TRACER
195 select CONTEXT_SWITCH_TRACER 223 select CONTEXT_SWITCH_TRACER
196 help 224 help
197 This tracer helps developers to optimize boot times: it records 225 This tracer helps developers to optimize boot times: it records
198 the timings of the initcalls and traces key events and the identity 226 the timings of the initcalls and traces key events and the identity
199 of tasks that can cause boot delays, such as context-switches. 227 of tasks that can cause boot delays, such as context-switches.
200 228
201 Its aim is to be parsed by the /scripts/bootgraph.pl tool to 229 Its aim is to be parsed by the scripts/bootgraph.pl tool to
202 produce pretty graphics about boot inefficiencies, giving a visual 230 produce pretty graphics about boot inefficiencies, giving a visual
203 representation of the delays during initcalls - but the raw 231 representation of the delays during initcalls - but the raw
204 /debug/tracing/trace text output is readable too. 232 /debug/tracing/trace text output is readable too.
205 233
206 You must pass in ftrace=initcall to the kernel command line 234 You must pass in initcall_debug and ftrace=initcall to the kernel
207 to enable this on bootup. 235 command line to enable this on bootup.
208 236
209config TRACE_BRANCH_PROFILING 237config TRACE_BRANCH_PROFILING
238 bool
239 select GENERIC_TRACER
240
241choice
242 prompt "Branch Profiling"
243 default BRANCH_PROFILE_NONE
244 help
 245          Branch profiling is a software profiler. It will add hooks
246 into the C conditionals to test which path a branch takes.
247
248 The likely/unlikely profiler only looks at the conditions that
249 are annotated with a likely or unlikely macro.
250
251 The "all branch" profiler will profile every if statement in the
252 kernel. This profiler will also enable the likely/unlikely
253 profiler as well.
254
 255          Either of the above profilers adds a bit of overhead to the system.
 256          If unsure, choose "No branch profiling".
257
258config BRANCH_PROFILE_NONE
259 bool "No branch profiling"
260 help
261 No branch profiling. Branch profiling adds a bit of overhead.
262 Only enable it if you want to analyse the branching behavior.
263 Otherwise keep it disabled.
264
265config PROFILE_ANNOTATED_BRANCHES
210 bool "Trace likely/unlikely profiler" 266 bool "Trace likely/unlikely profiler"
211 select TRACING 267 select TRACE_BRANCH_PROFILING
212 help 268 help
 213          This tracer profiles all the likely and unlikely macros   269          This tracer profiles all the likely and unlikely macros
214 in the kernel. It will display the results in: 270 in the kernel. It will display the results in:
215 271
216 /debugfs/tracing/profile_annotated_branch 272 /sys/kernel/debug/tracing/profile_annotated_branch
217 273
218 Note: this will add a significant overhead, only turn this 274 Note: this will add a significant overhead, only turn this
219 on if you need to profile the system's use of these macros. 275 on if you need to profile the system's use of these macros.
220 276
221 Say N if unsure.
222
223config PROFILE_ALL_BRANCHES 277config PROFILE_ALL_BRANCHES
224 bool "Profile all if conditionals" 278 bool "Profile all if conditionals"
225 depends on TRACE_BRANCH_PROFILING 279 select TRACE_BRANCH_PROFILING
226 help 280 help
227 This tracer profiles all branch conditions. Every if () 281 This tracer profiles all branch conditions. Every if ()
228 taken in the kernel is recorded whether it hit or miss. 282 taken in the kernel is recorded whether it hit or miss.
229 The results will be displayed in: 283 The results will be displayed in:
230 284
231 /debugfs/tracing/profile_branch 285 /sys/kernel/debug/tracing/profile_branch
286
287 This option also enables the likely/unlikely profiler.
232 288
233 This configuration, when enabled, will impose a great overhead 289 This configuration, when enabled, will impose a great overhead
234 on the system. This should only be enabled when the system 290 on the system. This should only be enabled when the system
235 is to be analyzed 291 is to be analyzed
236 292endchoice
237 Say N if unsure.
238 293
239config TRACING_BRANCHES 294config TRACING_BRANCHES
240 bool 295 bool
@@ -261,7 +316,7 @@ config BRANCH_TRACER
261config POWER_TRACER 316config POWER_TRACER
262 bool "Trace power consumption behavior" 317 bool "Trace power consumption behavior"
263 depends on X86 318 depends on X86
264 select TRACING 319 select GENERIC_TRACER
265 help 320 help
266 This tracer helps developers to analyze and optimize the kernels 321 This tracer helps developers to analyze and optimize the kernels
267 power management decisions, specifically the C-state and P-state 322 power management decisions, specifically the C-state and P-state
@@ -276,7 +331,7 @@ config STACK_TRACER
276 select KALLSYMS 331 select KALLSYMS
277 help 332 help
278 This special tracer records the maximum stack footprint of the 333 This special tracer records the maximum stack footprint of the
279 kernel and displays it in debugfs/tracing/stack_trace. 334 kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
280 335
281 This tracer works by hooking into every function call that the 336 This tracer works by hooking into every function call that the
282 kernel executes, and keeping a maximum stack depth value and 337 kernel executes, and keeping a maximum stack depth value and
@@ -295,14 +350,14 @@ config STACK_TRACER
295config HW_BRANCH_TRACER 350config HW_BRANCH_TRACER
296 depends on HAVE_HW_BRANCH_TRACER 351 depends on HAVE_HW_BRANCH_TRACER
297 bool "Trace hw branches" 352 bool "Trace hw branches"
298 select TRACING 353 select GENERIC_TRACER
299 help 354 help
300 This tracer records all branches on the system in a circular 355 This tracer records all branches on the system in a circular
301 buffer giving access to the last N branches for each cpu. 356 buffer giving access to the last N branches for each cpu.
302 357
303config KMEMTRACE 358config KMEMTRACE
304 bool "Trace SLAB allocations" 359 bool "Trace SLAB allocations"
305 select TRACING 360 select GENERIC_TRACER
306 help 361 help
307 kmemtrace provides tracing for slab allocator functions, such as 362 kmemtrace provides tracing for slab allocator functions, such as
308 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 363 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
@@ -322,7 +377,7 @@ config KMEMTRACE
322 377
323config WORKQUEUE_TRACER 378config WORKQUEUE_TRACER
324 bool "Trace workqueues" 379 bool "Trace workqueues"
325 select TRACING 380 select GENERIC_TRACER
326 help 381 help
327 The workqueue tracer provides some statistical informations 382 The workqueue tracer provides some statistical informations
328 about each cpu workqueue thread such as the number of the 383 about each cpu workqueue thread such as the number of the
@@ -338,7 +393,7 @@ config BLK_DEV_IO_TRACE
338 select RELAY 393 select RELAY
339 select DEBUG_FS 394 select DEBUG_FS
340 select TRACEPOINTS 395 select TRACEPOINTS
341 select TRACING 396 select GENERIC_TRACER
342 select STACKTRACE 397 select STACKTRACE
343 help 398 help
344 Say Y here if you want to be able to trace the block layer actions 399 Say Y here if you want to be able to trace the block layer actions
@@ -375,6 +430,20 @@ config DYNAMIC_FTRACE
375 were made. If so, it runs stop_machine (stops all CPUS) 430 were made. If so, it runs stop_machine (stops all CPUS)
376 and modifies the code to jump over the call to ftrace. 431 and modifies the code to jump over the call to ftrace.
377 432
433config FUNCTION_PROFILER
434 bool "Kernel function profiler"
435 depends on FUNCTION_TRACER
436 default n
437 help
438 This option enables the kernel function profiler. A file is created
439 in debugfs called function_profile_enabled which defaults to zero.
 440          When a 1 is echoed into this file, profiling begins, and when a
 441          zero is entered, profiling stops. A file in the trace_stats
 442          directory called functions shows the list of functions that
 443          have been hit and their counters.
444
445 If in doubt, say N
446
378config FTRACE_MCOUNT_RECORD 447config FTRACE_MCOUNT_RECORD
379 def_bool y 448 def_bool y
380 depends on DYNAMIC_FTRACE 449 depends on DYNAMIC_FTRACE
@@ -385,7 +454,7 @@ config FTRACE_SELFTEST
385 454
386config FTRACE_STARTUP_TEST 455config FTRACE_STARTUP_TEST
387 bool "Perform a startup test on ftrace" 456 bool "Perform a startup test on ftrace"
388 depends on TRACING 457 depends on GENERIC_TRACER
389 select FTRACE_SELFTEST 458 select FTRACE_SELFTEST
390 help 459 help
391 This option performs a series of startup tests on ftrace. On bootup 460 This option performs a series of startup tests on ftrace. On bootup
@@ -396,7 +465,7 @@ config FTRACE_STARTUP_TEST
396config MMIOTRACE 465config MMIOTRACE
397 bool "Memory mapped IO tracing" 466 bool "Memory mapped IO tracing"
398 depends on HAVE_MMIOTRACE_SUPPORT && PCI 467 depends on HAVE_MMIOTRACE_SUPPORT && PCI
399 select TRACING 468 select GENERIC_TRACER
400 help 469 help
401 Mmiotrace traces Memory Mapped I/O access and is meant for 470 Mmiotrace traces Memory Mapped I/O access and is meant for
402 debugging and reverse engineering. It is called from the ioremap 471 debugging and reverse engineering. It is called from the ioremap
@@ -416,7 +485,23 @@ config MMIOTRACE_TEST
416 485
417 Say N, unless you absolutely know what you are doing. 486 Say N, unless you absolutely know what you are doing.
418 487
419endmenu 488config RING_BUFFER_BENCHMARK
489 tristate "Ring buffer benchmark stress tester"
490 depends on RING_BUFFER
491 help
 492          This option creates a test to stress the ring buffer and benchmark it.
 493          It creates its own ring buffer such that it will not interfere with
494 any other users of the ring buffer (such as ftrace). It then creates
495 a producer and consumer that will run for 10 seconds and sleep for
496 10 seconds. Each interval it will print out the number of events
497 it recorded and give a rough estimate of how long each iteration took.
498
499 It does not disable interrupts or raise its priority, so it may be
500 affected by processes that are running.
501
502 If unsure, say N
503
504endif # FTRACE
420 505
421endif # TRACING_SUPPORT 506endif # TRACING_SUPPORT
422 507
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2630f5121ec1..844164dca90a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,11 +15,17 @@ ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif 16endif
17 17
18#
19# Make the trace clocks available generally: it's infrastructure
20# relied on by ptrace for example:
21#
22obj-y += trace_clock.o
23
18obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 24obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 25obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
26obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
20 27
21obj-$(CONFIG_TRACING) += trace.o 28obj-$(CONFIG_TRACING) += trace.o
22obj-$(CONFIG_TRACING) += trace_clock.o
23obj-$(CONFIG_TRACING) += trace_output.o 29obj-$(CONFIG_TRACING) += trace_output.o
24obj-$(CONFIG_TRACING) += trace_stat.o 30obj-$(CONFIG_TRACING) += trace_stat.o
25obj-$(CONFIG_TRACING) += trace_printk.o 31obj-$(CONFIG_TRACING) += trace_printk.o
@@ -39,12 +45,14 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
39obj-$(CONFIG_POWER_TRACER) += trace_power.o 45obj-$(CONFIG_POWER_TRACER) += trace_power.o
40obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
41obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
42obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
43obj-$(CONFIG_EVENT_TRACER) += trace_events.o 49ifeq ($(CONFIG_BLOCK),y)
44obj-$(CONFIG_EVENT_TRACER) += events.o 50obj-$(CONFIG_EVENT_TRACING) += blktrace.o
45obj-$(CONFIG_EVENT_TRACER) += trace_export.o 51endif
52obj-$(CONFIG_EVENT_TRACING) += trace_events.o
53obj-$(CONFIG_EVENT_TRACING) += trace_export.o
46obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 54obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
47obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 55obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
48obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o 56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
49 57
50libftrace-y := ftrace.o 58libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 921ef5d1f0ba..7a34cb563fec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -22,11 +22,16 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/smp_lock.h>
25#include <linux/time.h> 26#include <linux/time.h>
26#include <trace/block.h>
27#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28
29#include <trace/events/block.h>
30
28#include "trace_output.h" 31#include "trace_output.h"
29 32
33#ifdef CONFIG_BLK_DEV_IO_TRACE
34
30static unsigned int blktrace_seq __read_mostly = 1; 35static unsigned int blktrace_seq __read_mostly = 1;
31 36
32static struct trace_array *blk_tr; 37static struct trace_array *blk_tr;
@@ -147,7 +152,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
147{ 152{
148 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) 153 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
149 return 1; 154 return 1;
150 if (sector < bt->start_lba || sector > bt->end_lba) 155 if (sector && (sector < bt->start_lba || sector > bt->end_lba))
151 return 1; 156 return 1;
152 if (bt->pid && pid != bt->pid) 157 if (bt->pid && pid != bt->pid)
153 return 1; 158 return 1;
@@ -192,7 +197,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
192 what |= MASK_TC_BIT(rw, DISCARD); 197 what |= MASK_TC_BIT(rw, DISCARD);
193 198
194 pid = tsk->pid; 199 pid = tsk->pid;
195 if (unlikely(act_log_check(bt, what, sector, pid))) 200 if (act_log_check(bt, what, sector, pid))
196 return; 201 return;
197 cpu = raw_smp_processor_id(); 202 cpu = raw_smp_processor_id();
198 203
@@ -263,6 +268,7 @@ static void blk_trace_free(struct blk_trace *bt)
263 debugfs_remove(bt->msg_file); 268 debugfs_remove(bt->msg_file);
264 debugfs_remove(bt->dropped_file); 269 debugfs_remove(bt->dropped_file);
265 relay_close(bt->rchan); 270 relay_close(bt->rchan);
271 debugfs_remove(bt->dir);
266 free_percpu(bt->sequence); 272 free_percpu(bt->sequence);
267 free_percpu(bt->msg_data); 273 free_percpu(bt->msg_data);
268 kfree(bt); 274 kfree(bt);
@@ -372,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
372 378
373static int blk_remove_buf_file_callback(struct dentry *dentry) 379static int blk_remove_buf_file_callback(struct dentry *dentry)
374{ 380{
375 struct dentry *parent = dentry->d_parent;
376 debugfs_remove(dentry); 381 debugfs_remove(dentry);
377 382
378 /*
379 * this will fail for all but the last file, but that is ok. what we
380 * care about is the top level buts->name directory going away, when
381 * the last trace file is gone. Then we don't have to rmdir() that
382 * manually on trace stop, so it nicely solves the issue with
383 * force killing of running traces.
384 */
385
386 debugfs_remove(parent);
387 return 0; 383 return 0;
388} 384}
389 385
@@ -403,11 +399,29 @@ static struct rchan_callbacks blk_relay_callbacks = {
403 .remove_buf_file = blk_remove_buf_file_callback, 399 .remove_buf_file = blk_remove_buf_file_callback,
404}; 400};
405 401
402static void blk_trace_setup_lba(struct blk_trace *bt,
403 struct block_device *bdev)
404{
405 struct hd_struct *part = NULL;
406
407 if (bdev)
408 part = bdev->bd_part;
409
410 if (part) {
411 bt->start_lba = part->start_sect;
412 bt->end_lba = part->start_sect + part->nr_sects;
413 } else {
414 bt->start_lba = 0;
415 bt->end_lba = -1ULL;
416 }
417}
418
406/* 419/*
407 * Setup everything required to start tracing 420 * Setup everything required to start tracing
408 */ 421 */
409int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 422int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
410 struct blk_user_trace_setup *buts) 423 struct block_device *bdev,
424 struct blk_user_trace_setup *buts)
411{ 425{
412 struct blk_trace *old_bt, *bt = NULL; 426 struct blk_trace *old_bt, *bt = NULL;
413 struct dentry *dir = NULL; 427 struct dentry *dir = NULL;
@@ -480,10 +494,13 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
480 if (!bt->act_mask) 494 if (!bt->act_mask)
481 bt->act_mask = (u16) -1; 495 bt->act_mask = (u16) -1;
482 496
483 bt->start_lba = buts->start_lba; 497 blk_trace_setup_lba(bt, bdev);
484 bt->end_lba = buts->end_lba; 498
485 if (!bt->end_lba) 499 /* overwrite with user settings */
486 bt->end_lba = -1ULL; 500 if (buts->start_lba)
501 bt->start_lba = buts->start_lba;
502 if (buts->end_lba)
503 bt->end_lba = buts->end_lba;
487 504
488 bt->pid = buts->pid; 505 bt->pid = buts->pid;
489 bt->trace_state = Blktrace_setup; 506 bt->trace_state = Blktrace_setup;
@@ -505,6 +522,7 @@ err:
505} 522}
506 523
507int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 524int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
525 struct block_device *bdev,
508 char __user *arg) 526 char __user *arg)
509{ 527{
510 struct blk_user_trace_setup buts; 528 struct blk_user_trace_setup buts;
@@ -514,7 +532,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
514 if (ret) 532 if (ret)
515 return -EFAULT; 533 return -EFAULT;
516 534
517 ret = do_blk_trace_setup(q, name, dev, &buts); 535 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
518 if (ret) 536 if (ret)
519 return ret; 537 return ret;
520 538
@@ -582,7 +600,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
582 switch (cmd) { 600 switch (cmd) {
583 case BLKTRACESETUP: 601 case BLKTRACESETUP:
584 bdevname(bdev, b); 602 bdevname(bdev, b);
585 ret = blk_trace_setup(q, b, bdev->bd_dev, arg); 603 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
586 break; 604 break;
587 case BLKTRACESTART: 605 case BLKTRACESTART:
588 start = 1; 606 start = 1;
@@ -642,12 +660,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
642 660
643 if (blk_pc_request(rq)) { 661 if (blk_pc_request(rq)) {
644 what |= BLK_TC_ACT(BLK_TC_PC); 662 what |= BLK_TC_ACT(BLK_TC_PC);
645 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, 663 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
646 rq->cmd_len, rq->cmd); 664 what, rq->errors, rq->cmd_len, rq->cmd);
647 } else { 665 } else {
648 what |= BLK_TC_ACT(BLK_TC_FS); 666 what |= BLK_TC_ACT(BLK_TC_FS);
649 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 667 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
650 rw, what, rq->errors, 0, NULL); 668 what, rq->errors, 0, NULL);
651 } 669 }
652} 670}
653 671
@@ -809,7 +827,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
809 * @bio: the source bio 827 * @bio: the source bio
810 * @dev: target device 828 * @dev: target device
811 * @from: source sector 829 * @from: source sector
812 * @to: target sector
813 * 830 *
814 * Description: 831 * Description:
815 * Device mapper or raid target sometimes need to split a bio because 832 * Device mapper or raid target sometimes need to split a bio because
@@ -817,7 +834,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
817 * 834 *
818 **/ 835 **/
819static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 836static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
820 dev_t dev, sector_t from, sector_t to) 837 dev_t dev, sector_t from)
821{ 838{
822 struct blk_trace *bt = q->blk_trace; 839 struct blk_trace *bt = q->blk_trace;
823 struct blk_io_trace_remap r; 840 struct blk_io_trace_remap r;
@@ -825,12 +842,13 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
825 if (likely(!bt)) 842 if (likely(!bt))
826 return; 843 return;
827 844
828 r.device = cpu_to_be32(dev); 845 r.device_from = cpu_to_be32(dev);
829 r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); 846 r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
830 r.sector = cpu_to_be64(to); 847 r.sector_from = cpu_to_be64(from);
831 848
832 __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, 849 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
833 !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); 850 BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
851 sizeof(r), &r);
834} 852}
835 853
836/** 854/**
@@ -854,11 +872,11 @@ void blk_add_driver_data(struct request_queue *q,
854 return; 872 return;
855 873
856 if (blk_pc_request(rq)) 874 if (blk_pc_request(rq))
857 __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, 875 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
858 rq->errors, len, data); 876 BLK_TA_DRV_DATA, rq->errors, len, data);
859 else 877 else
860 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 878 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
861 0, BLK_TA_DRV_DATA, rq->errors, len, data); 879 BLK_TA_DRV_DATA, rq->errors, len, data);
862} 880}
863EXPORT_SYMBOL_GPL(blk_add_driver_data); 881EXPORT_SYMBOL_GPL(blk_add_driver_data);
864 882
@@ -971,6 +989,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)
971 return te_blk_io_trace(ent) + 1; 989 return te_blk_io_trace(ent) + 1;
972} 990}
973 991
992static inline u32 t_action(const struct trace_entry *ent)
993{
994 return te_blk_io_trace(ent)->action;
995}
996
997static inline u32 t_bytes(const struct trace_entry *ent)
998{
999 return te_blk_io_trace(ent)->bytes;
1000}
1001
974static inline u32 t_sec(const struct trace_entry *ent) 1002static inline u32 t_sec(const struct trace_entry *ent)
975{ 1003{
976 return te_blk_io_trace(ent)->bytes >> 9; 1004 return te_blk_io_trace(ent)->bytes >> 9;
@@ -996,11 +1024,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
996 struct blk_io_trace_remap *r) 1024 struct blk_io_trace_remap *r)
997{ 1025{
998 const struct blk_io_trace_remap *__r = pdu_start(ent); 1026 const struct blk_io_trace_remap *__r = pdu_start(ent);
999 __u64 sector = __r->sector; 1027 __u64 sector_from = __r->sector_from;
1000 1028
1001 r->device = be32_to_cpu(__r->device);
1002 r->device_from = be32_to_cpu(__r->device_from); 1029 r->device_from = be32_to_cpu(__r->device_from);
1003 r->sector = be64_to_cpu(sector); 1030 r->device_to = be32_to_cpu(__r->device_to);
1031 r->sector_from = be64_to_cpu(sector_from);
1004} 1032}
1005 1033
1006typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); 1034typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1031,36 +1059,98 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
1031 MAJOR(t->device), MINOR(t->device), act, rwbs); 1059 MAJOR(t->device), MINOR(t->device), act, rwbs);
1032} 1060}
1033 1061
1062static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1063{
1064 const unsigned char *pdu_buf;
1065 int pdu_len;
1066 int i, end, ret;
1067
1068 pdu_buf = pdu_start(ent);
1069 pdu_len = te_blk_io_trace(ent)->pdu_len;
1070
1071 if (!pdu_len)
1072 return 1;
1073
1074 /* find the last zero that needs to be printed */
1075 for (end = pdu_len - 1; end >= 0; end--)
1076 if (pdu_buf[end])
1077 break;
1078 end++;
1079
1080 if (!trace_seq_putc(s, '('))
1081 return 0;
1082
1083 for (i = 0; i < pdu_len; i++) {
1084
1085 ret = trace_seq_printf(s, "%s%02x",
1086 i == 0 ? "" : " ", pdu_buf[i]);
1087 if (!ret)
1088 return ret;
1089
1090 /*
1091 * stop when the rest is just zeroes and indicate so
1092 * with a ".." appended
1093 */
1094 if (i == end && end != pdu_len - 1)
1095 return trace_seq_puts(s, " ..) ");
1096 }
1097
1098 return trace_seq_puts(s, ") ");
1099}
1100
1034static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) 1101static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1035{ 1102{
1036 char cmd[TASK_COMM_LEN]; 1103 char cmd[TASK_COMM_LEN];
1037 1104
1038 trace_find_cmdline(ent->pid, cmd); 1105 trace_find_cmdline(ent->pid, cmd);
1039 1106
1040 if (t_sec(ent)) 1107 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1041 return trace_seq_printf(s, "%llu + %u [%s]\n", 1108 int ret;
1042 t_sector(ent), t_sec(ent), cmd); 1109
1043 return trace_seq_printf(s, "[%s]\n", cmd); 1110 ret = trace_seq_printf(s, "%u ", t_bytes(ent));
1111 if (!ret)
1112 return 0;
1113 ret = blk_log_dump_pdu(s, ent);
1114 if (!ret)
1115 return 0;
1116 return trace_seq_printf(s, "[%s]\n", cmd);
1117 } else {
1118 if (t_sec(ent))
1119 return trace_seq_printf(s, "%llu + %u [%s]\n",
1120 t_sector(ent), t_sec(ent), cmd);
1121 return trace_seq_printf(s, "[%s]\n", cmd);
1122 }
1044} 1123}
1045 1124
1046static int blk_log_with_error(struct trace_seq *s, 1125static int blk_log_with_error(struct trace_seq *s,
1047 const struct trace_entry *ent) 1126 const struct trace_entry *ent)
1048{ 1127{
1049 if (t_sec(ent)) 1128 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1050 return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), 1129 int ret;
1051 t_sec(ent), t_error(ent)); 1130
1052 return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); 1131 ret = blk_log_dump_pdu(s, ent);
1132 if (ret)
1133 return trace_seq_printf(s, "[%d]\n", t_error(ent));
1134 return 0;
1135 } else {
1136 if (t_sec(ent))
1137 return trace_seq_printf(s, "%llu + %u [%d]\n",
1138 t_sector(ent),
1139 t_sec(ent), t_error(ent));
1140 return trace_seq_printf(s, "%llu [%d]\n",
1141 t_sector(ent), t_error(ent));
1142 }
1053} 1143}
1054 1144
1055static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) 1145static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1056{ 1146{
1057 struct blk_io_trace_remap r = { .device = 0, }; 1147 struct blk_io_trace_remap r = { .device_from = 0, };
1058 1148
1059 get_pdu_remap(ent, &r); 1149 get_pdu_remap(ent, &r);
1060 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", 1150 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1061 t_sector(ent), 1151 t_sector(ent), t_sec(ent),
1062 t_sec(ent), MAJOR(r.device), MINOR(r.device), 1152 MAJOR(r.device_from), MINOR(r.device_from),
1063 (unsigned long long)r.sector); 1153 (unsigned long long)r.sector_from);
1064} 1154}
1065 1155
1066static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) 1156static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
@@ -1117,7 +1207,6 @@ static void blk_tracer_print_header(struct seq_file *m)
1117static void blk_tracer_start(struct trace_array *tr) 1207static void blk_tracer_start(struct trace_array *tr)
1118{ 1208{
1119 blk_tracer_enabled = true; 1209 blk_tracer_enabled = true;
1120 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1121} 1210}
1122 1211
1123static int blk_tracer_init(struct trace_array *tr) 1212static int blk_tracer_init(struct trace_array *tr)
@@ -1130,7 +1219,6 @@ static int blk_tracer_init(struct trace_array *tr)
1130static void blk_tracer_stop(struct trace_array *tr) 1219static void blk_tracer_stop(struct trace_array *tr)
1131{ 1220{
1132 blk_tracer_enabled = false; 1221 blk_tracer_enabled = false;
1133 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1134} 1222}
1135 1223
1136static void blk_tracer_reset(struct trace_array *tr) 1224static void blk_tracer_reset(struct trace_array *tr)
@@ -1182,7 +1270,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1182 } 1270 }
1183 1271
1184 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) 1272 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1185 ret = trace_seq_printf(s, "Bad pc action %x\n", what); 1273 ret = trace_seq_printf(s, "Unknown action %x\n", what);
1186 else { 1274 else {
1187 ret = log_action(iter, what2act[what].act[long_act]); 1275 ret = log_action(iter, what2act[what].act[long_act]);
1188 if (ret) 1276 if (ret)
@@ -1195,9 +1283,6 @@ out:
1195static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1283static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1196 int flags) 1284 int flags)
1197{ 1285{
1198 if (!trace_print_context(iter))
1199 return TRACE_TYPE_PARTIAL_LINE;
1200
1201 return print_one_line(iter, false); 1286 return print_one_line(iter, false);
1202} 1287}
1203 1288
@@ -1232,6 +1317,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1232 return print_one_line(iter, true); 1317 return print_one_line(iter, true);
1233} 1318}
1234 1319
1320static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
1321{
1322 /* don't output context-info for blk_classic output */
1323 if (bit == TRACE_BLK_OPT_CLASSIC) {
1324 if (set)
1325 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1326 else
1327 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1328 }
1329 return 0;
1330}
1331
1235static struct tracer blk_tracer __read_mostly = { 1332static struct tracer blk_tracer __read_mostly = {
1236 .name = "blk", 1333 .name = "blk",
1237 .init = blk_tracer_init, 1334 .init = blk_tracer_init,
@@ -1241,6 +1338,7 @@ static struct tracer blk_tracer __read_mostly = {
1241 .print_header = blk_tracer_print_header, 1338 .print_header = blk_tracer_print_header,
1242 .print_line = blk_tracer_print_line, 1339 .print_line = blk_tracer_print_line,
1243 .flags = &blk_tracer_flags, 1340 .flags = &blk_tracer_flags,
1341 .set_flag = blk_tracer_set_flag,
1244}; 1342};
1245 1343
1246static struct trace_event trace_blk_event = { 1344static struct trace_event trace_blk_event = {
@@ -1285,7 +1383,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
1285/* 1383/*
1286 * Setup everything required to start tracing 1384 * Setup everything required to start tracing
1287 */ 1385 */
1288static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) 1386static int blk_trace_setup_queue(struct request_queue *q,
1387 struct block_device *bdev)
1289{ 1388{
1290 struct blk_trace *old_bt, *bt = NULL; 1389 struct blk_trace *old_bt, *bt = NULL;
1291 int ret = -ENOMEM; 1390 int ret = -ENOMEM;
@@ -1298,9 +1397,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
1298 if (!bt->msg_data) 1397 if (!bt->msg_data)
1299 goto free_bt; 1398 goto free_bt;
1300 1399
1301 bt->dev = dev; 1400 bt->dev = bdev->bd_dev;
1302 bt->act_mask = (u16)-1; 1401 bt->act_mask = (u16)-1;
1303 bt->end_lba = -1ULL; 1402
1403 blk_trace_setup_lba(bt, bdev);
1304 1404
1305 old_bt = xchg(&q->blk_trace, bt); 1405 old_bt = xchg(&q->blk_trace, bt);
1306 if (old_bt != NULL) { 1406 if (old_bt != NULL) {
@@ -1517,7 +1617,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1517 1617
1518 if (attr == &dev_attr_enable) { 1618 if (attr == &dev_attr_enable) {
1519 if (value) 1619 if (value)
1520 ret = blk_trace_setup_queue(q, bdev->bd_dev); 1620 ret = blk_trace_setup_queue(q, bdev);
1521 else 1621 else
1522 ret = blk_trace_remove_queue(q); 1622 ret = blk_trace_remove_queue(q);
1523 goto out_unlock_bdev; 1623 goto out_unlock_bdev;
@@ -1525,7 +1625,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1525 1625
1526 ret = 0; 1626 ret = 0;
1527 if (q->blk_trace == NULL) 1627 if (q->blk_trace == NULL)
1528 ret = blk_trace_setup_queue(q, bdev->bd_dev); 1628 ret = blk_trace_setup_queue(q, bdev);
1529 1629
1530 if (ret == 0) { 1630 if (ret == 0) {
1531 if (attr == &dev_attr_act_mask) 1631 if (attr == &dev_attr_act_mask)
@@ -1548,3 +1648,77 @@ out:
1548 return ret ? ret : count; 1648 return ret ? ret : count;
1549} 1649}
1550 1650
1651int blk_trace_init_sysfs(struct device *dev)
1652{
1653 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
1654}
1655
1656#endif /* CONFIG_BLK_DEV_IO_TRACE */
1657
1658#ifdef CONFIG_EVENT_TRACING
1659
1660void blk_dump_cmd(char *buf, struct request *rq)
1661{
1662 int i, end;
1663 int len = rq->cmd_len;
1664 unsigned char *cmd = rq->cmd;
1665
1666 if (!blk_pc_request(rq)) {
1667 buf[0] = '\0';
1668 return;
1669 }
1670
1671 for (end = len - 1; end >= 0; end--)
1672 if (cmd[end])
1673 break;
1674 end++;
1675
1676 for (i = 0; i < len; i++) {
1677 buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
1678 if (i == end && end != len - 1) {
1679 sprintf(buf, " ..");
1680 break;
1681 }
1682 }
1683}
1684
1685void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1686{
1687 int i = 0;
1688
1689 if (rw & WRITE)
1690 rwbs[i++] = 'W';
1691 else if (rw & 1 << BIO_RW_DISCARD)
1692 rwbs[i++] = 'D';
1693 else if (bytes)
1694 rwbs[i++] = 'R';
1695 else
1696 rwbs[i++] = 'N';
1697
1698 if (rw & 1 << BIO_RW_AHEAD)
1699 rwbs[i++] = 'A';
1700 if (rw & 1 << BIO_RW_BARRIER)
1701 rwbs[i++] = 'B';
1702 if (rw & 1 << BIO_RW_SYNCIO)
1703 rwbs[i++] = 'S';
1704 if (rw & 1 << BIO_RW_META)
1705 rwbs[i++] = 'M';
1706
1707 rwbs[i] = '\0';
1708}
1709
1710void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1711{
1712 int rw = rq->cmd_flags & 0x03;
1713 int bytes;
1714
1715 if (blk_discard_rq(rq))
1716 rw |= (1 << BIO_RW_DISCARD);
1717
1718 bytes = blk_rq_bytes(rq);
1719
1720 blk_fill_rwbs(rwbs, rw, bytes);
1721}
1722
1723#endif /* CONFIG_EVENT_TRACING */
1724
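The blk_fill_rwbs()/blk_fill_rwbs_rq() helpers added above render the direction/flag string used by the block trace events. A minimal sketch of a caller; the surrounding function is illustrative, and the six-byte buffer is sized from the worst case the helper above can emit (direction, A, B, S, M, NUL).

#include <linux/bio.h>
#include <linux/kernel.h>

static void example_show_bio_flags(struct bio *bio)
{
	char rwbs[6];

	blk_fill_rwbs(rwbs, bio->bi_rw, bio->bi_size);
	pr_debug("bio at sector %llu: %s\n",
		 (unsigned long long)bio->bi_sector, rwbs);
}
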
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
deleted file mode 100644
index 246f2aa6dc46..000000000000
--- a/kernel/trace/events.c
+++ /dev/null
@@ -1,14 +0,0 @@
1/*
2 * This is the place to register all trace points as events.
3 */
4
5#include <linux/stringify.h>
6
7#include <trace/trace_events.h>
8
9#include "trace_output.h"
10
11#include "trace_events_stage_1.h"
12#include "trace_events_stage_2.h"
13#include "trace_events_stage_3.h"
14
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f1ed080406c3..25edd5cc5935 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,11 +29,13 @@
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/hash.h> 30#include <linux/hash.h>
31 31
32#include <trace/sched.h> 32#include <trace/events/sched.h>
33 33
34#include <asm/ftrace.h> 34#include <asm/ftrace.h>
35#include <asm/setup.h>
35 36
36#include "trace.h" 37#include "trace_output.h"
38#include "trace_stat.h"
37 39
38#define FTRACE_WARN_ON(cond) \ 40#define FTRACE_WARN_ON(cond) \
39 do { \ 41 do { \
@@ -68,7 +70,7 @@ static DEFINE_MUTEX(ftrace_lock);
68 70
69static struct ftrace_ops ftrace_list_end __read_mostly = 71static struct ftrace_ops ftrace_list_end __read_mostly =
70{ 72{
71 .func = ftrace_stub, 73 .func = ftrace_stub,
72}; 74};
73 75
74static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 76static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -240,6 +242,581 @@ static void ftrace_update_pid_func(void)
240#endif 242#endif
241} 243}
242 244
245#ifdef CONFIG_FUNCTION_PROFILER
246struct ftrace_profile {
247 struct hlist_node node;
248 unsigned long ip;
249 unsigned long counter;
250#ifdef CONFIG_FUNCTION_GRAPH_TRACER
251 unsigned long long time;
252#endif
253};
254
255struct ftrace_profile_page {
256 struct ftrace_profile_page *next;
257 unsigned long index;
258 struct ftrace_profile records[];
259};
260
261struct ftrace_profile_stat {
262 atomic_t disabled;
263 struct hlist_head *hash;
264 struct ftrace_profile_page *pages;
265 struct ftrace_profile_page *start;
266 struct tracer_stat stat;
267};
268
269#define PROFILE_RECORDS_SIZE \
270 (PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
271
272#define PROFILES_PER_PAGE \
273 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
274
275static int ftrace_profile_bits __read_mostly;
276static int ftrace_profile_enabled __read_mostly;
277
278/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
279static DEFINE_MUTEX(ftrace_profile_lock);
280
281static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
282
283#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
284
285static void *
286function_stat_next(void *v, int idx)
287{
288 struct ftrace_profile *rec = v;
289 struct ftrace_profile_page *pg;
290
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292
293 again:
294 if (idx != 0)
295 rec++;
296
297 if ((void *)rec >= (void *)&pg->records[pg->index]) {
298 pg = pg->next;
299 if (!pg)
300 return NULL;
301 rec = &pg->records[0];
302 if (!rec->counter)
303 goto again;
304 }
305
306 return rec;
307}
308
309static void *function_stat_start(struct tracer_stat *trace)
310{
311 struct ftrace_profile_stat *stat =
312 container_of(trace, struct ftrace_profile_stat, stat);
313
314 if (!stat || !stat->start)
315 return NULL;
316
317 return function_stat_next(&stat->start->records[0], 0);
318}
319
320#ifdef CONFIG_FUNCTION_GRAPH_TRACER
321/* function graph compares on total time */
322static int function_stat_cmp(void *p1, void *p2)
323{
324 struct ftrace_profile *a = p1;
325 struct ftrace_profile *b = p2;
326
327 if (a->time < b->time)
328 return -1;
329 if (a->time > b->time)
330 return 1;
331 else
332 return 0;
333}
334#else
 335/* without function graph, compare against hit counts */
336static int function_stat_cmp(void *p1, void *p2)
337{
338 struct ftrace_profile *a = p1;
339 struct ftrace_profile *b = p2;
340
341 if (a->counter < b->counter)
342 return -1;
343 if (a->counter > b->counter)
344 return 1;
345 else
346 return 0;
347}
348#endif
349
350static int function_stat_headers(struct seq_file *m)
351{
352#ifdef CONFIG_FUNCTION_GRAPH_TRACER
353 seq_printf(m, " Function "
354 "Hit Time Avg\n"
355 " -------- "
356 "--- ---- ---\n");
357#else
358 seq_printf(m, " Function Hit\n"
359 " -------- ---\n");
360#endif
361 return 0;
362}
363
364static int function_stat_show(struct seq_file *m, void *v)
365{
366 struct ftrace_profile *rec = v;
367 char str[KSYM_SYMBOL_LEN];
368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
369 static DEFINE_MUTEX(mutex);
370 static struct trace_seq s;
371 unsigned long long avg;
372#endif
373
374 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
375 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
376
377#ifdef CONFIG_FUNCTION_GRAPH_TRACER
378 seq_printf(m, " ");
379 avg = rec->time;
380 do_div(avg, rec->counter);
381
382 mutex_lock(&mutex);
383 trace_seq_init(&s);
384 trace_print_graph_duration(rec->time, &s);
385 trace_seq_puts(&s, " ");
386 trace_print_graph_duration(avg, &s);
387 trace_print_seq(m, &s);
388 mutex_unlock(&mutex);
389#endif
390 seq_putc(m, '\n');
391
392 return 0;
393}
394
395static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
396{
397 struct ftrace_profile_page *pg;
398
399 pg = stat->pages = stat->start;
400
401 while (pg) {
402 memset(pg->records, 0, PROFILE_RECORDS_SIZE);
403 pg->index = 0;
404 pg = pg->next;
405 }
406
407 memset(stat->hash, 0,
408 FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
409}
410
411int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
412{
413 struct ftrace_profile_page *pg;
414 int functions;
415 int pages;
416 int i;
417
418 /* If we already allocated, do nothing */
419 if (stat->pages)
420 return 0;
421
422 stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
423 if (!stat->pages)
424 return -ENOMEM;
425
426#ifdef CONFIG_DYNAMIC_FTRACE
427 functions = ftrace_update_tot_cnt;
428#else
429 /*
430 * We do not know the number of functions that exist because
 431 * dynamic tracing is what counts them. From past experience
432 * we have around 20K functions. That should be more than enough.
433 * It is highly unlikely we will execute every function in
434 * the kernel.
435 */
436 functions = 20000;
437#endif
438
439 pg = stat->start = stat->pages;
440
441 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
442
443 for (i = 0; i < pages; i++) {
444 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
445 if (!pg->next)
446 goto out_free;
447 pg = pg->next;
448 }
449
450 return 0;
451
452 out_free:
453 pg = stat->start;
454 while (pg) {
455 unsigned long tmp = (unsigned long)pg;
456
457 pg = pg->next;
458 free_page(tmp);
459 }
460
461 free_page((unsigned long)stat->pages);
462 stat->pages = NULL;
463 stat->start = NULL;
464
465 return -ENOMEM;
466}
467
468static int ftrace_profile_init_cpu(int cpu)
469{
470 struct ftrace_profile_stat *stat;
471 int size;
472
473 stat = &per_cpu(ftrace_profile_stats, cpu);
474
475 if (stat->hash) {
476 /* If the profile is already created, simply reset it */
477 ftrace_profile_reset(stat);
478 return 0;
479 }
480
481 /*
482 * We are profiling all functions, but usually only a few thousand
483 * functions are hit. We'll make a hash of 1024 items.
484 */
485 size = FTRACE_PROFILE_HASH_SIZE;
486
487 stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
488
489 if (!stat->hash)
490 return -ENOMEM;
491
492 if (!ftrace_profile_bits) {
493 size--;
494
495 for (; size; size >>= 1)
496 ftrace_profile_bits++;
497 }
498
499 /* Preallocate the function profiling pages */
500 if (ftrace_profile_pages_init(stat) < 0) {
501 kfree(stat->hash);
502 stat->hash = NULL;
503 return -ENOMEM;
504 }
505
506 return 0;
507}
508
509static int ftrace_profile_init(void)
510{
511 int cpu;
512 int ret = 0;
513
514 for_each_online_cpu(cpu) {
515 ret = ftrace_profile_init_cpu(cpu);
516 if (ret)
517 break;
518 }
519
520 return ret;
521}
522
523/* interrupts must be disabled */
524static struct ftrace_profile *
525ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
526{
527 struct ftrace_profile *rec;
528 struct hlist_head *hhd;
529 struct hlist_node *n;
530 unsigned long key;
531
532 key = hash_long(ip, ftrace_profile_bits);
533 hhd = &stat->hash[key];
534
535 if (hlist_empty(hhd))
536 return NULL;
537
538 hlist_for_each_entry_rcu(rec, n, hhd, node) {
539 if (rec->ip == ip)
540 return rec;
541 }
542
543 return NULL;
544}
545
546static void ftrace_add_profile(struct ftrace_profile_stat *stat,
547 struct ftrace_profile *rec)
548{
549 unsigned long key;
550
551 key = hash_long(rec->ip, ftrace_profile_bits);
552 hlist_add_head_rcu(&rec->node, &stat->hash[key]);
553}
554
555/*
 556 * The memory is already allocated; this simply finds a new record to use.
557 */
558static struct ftrace_profile *
559ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
560{
561 struct ftrace_profile *rec = NULL;
562
563 /* prevent recursion (from NMIs) */
564 if (atomic_inc_return(&stat->disabled) != 1)
565 goto out;
566
567 /*
568 * Try to find the function again since an NMI
569 * could have added it
570 */
571 rec = ftrace_find_profiled_func(stat, ip);
572 if (rec)
573 goto out;
574
575 if (stat->pages->index == PROFILES_PER_PAGE) {
576 if (!stat->pages->next)
577 goto out;
578 stat->pages = stat->pages->next;
579 }
580
581 rec = &stat->pages->records[stat->pages->index++];
582 rec->ip = ip;
583 ftrace_add_profile(stat, rec);
584
585 out:
586 atomic_dec(&stat->disabled);
587
588 return rec;
589}
590
591static void
592function_profile_call(unsigned long ip, unsigned long parent_ip)
593{
594 struct ftrace_profile_stat *stat;
595 struct ftrace_profile *rec;
596 unsigned long flags;
597
598 if (!ftrace_profile_enabled)
599 return;
600
601 local_irq_save(flags);
602
603 stat = &__get_cpu_var(ftrace_profile_stats);
604 if (!stat->hash || !ftrace_profile_enabled)
605 goto out;
606
607 rec = ftrace_find_profiled_func(stat, ip);
608 if (!rec) {
609 rec = ftrace_profile_alloc(stat, ip);
610 if (!rec)
611 goto out;
612 }
613
614 rec->counter++;
615 out:
616 local_irq_restore(flags);
617}
618
619#ifdef CONFIG_FUNCTION_GRAPH_TRACER
620static int profile_graph_entry(struct ftrace_graph_ent *trace)
621{
622 function_profile_call(trace->func, 0);
623 return 1;
624}
625
626static void profile_graph_return(struct ftrace_graph_ret *trace)
627{
628 struct ftrace_profile_stat *stat;
629 unsigned long long calltime;
630 struct ftrace_profile *rec;
631 unsigned long flags;
632
633 local_irq_save(flags);
634 stat = &__get_cpu_var(ftrace_profile_stats);
635 if (!stat->hash || !ftrace_profile_enabled)
636 goto out;
637
638 calltime = trace->rettime - trace->calltime;
639
640 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
641 int index;
642
643 index = trace->depth;
644
645 /* Append this call time to the parent time to subtract */
646 if (index)
647 current->ret_stack[index - 1].subtime += calltime;
648
649 if (current->ret_stack[index].subtime < calltime)
650 calltime -= current->ret_stack[index].subtime;
651 else
652 calltime = 0;
653 }
654
655 rec = ftrace_find_profiled_func(stat, trace->func);
656 if (rec)
657 rec->time += calltime;
658
659 out:
660 local_irq_restore(flags);
661}
662
663static int register_ftrace_profiler(void)
664{
665 return register_ftrace_graph(&profile_graph_return,
666 &profile_graph_entry);
667}
668
669static void unregister_ftrace_profiler(void)
670{
671 unregister_ftrace_graph();
672}
673#else
674static struct ftrace_ops ftrace_profile_ops __read_mostly =
675{
676 .func = function_profile_call,
677};
678
679static int register_ftrace_profiler(void)
680{
681 return register_ftrace_function(&ftrace_profile_ops);
682}
683
684static void unregister_ftrace_profiler(void)
685{
686 unregister_ftrace_function(&ftrace_profile_ops);
687}
688#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
689
690static ssize_t
691ftrace_profile_write(struct file *filp, const char __user *ubuf,
692 size_t cnt, loff_t *ppos)
693{
694 unsigned long val;
695 char buf[64]; /* big enough to hold a number */
696 int ret;
697
698 if (cnt >= sizeof(buf))
699 return -EINVAL;
700
701 if (copy_from_user(&buf, ubuf, cnt))
702 return -EFAULT;
703
704 buf[cnt] = 0;
705
706 ret = strict_strtoul(buf, 10, &val);
707 if (ret < 0)
708 return ret;
709
710 val = !!val;
711
712 mutex_lock(&ftrace_profile_lock);
713 if (ftrace_profile_enabled ^ val) {
714 if (val) {
715 ret = ftrace_profile_init();
716 if (ret < 0) {
717 cnt = ret;
718 goto out;
719 }
720
721 ret = register_ftrace_profiler();
722 if (ret < 0) {
723 cnt = ret;
724 goto out;
725 }
726 ftrace_profile_enabled = 1;
727 } else {
728 ftrace_profile_enabled = 0;
729 /*
730 * unregister_ftrace_profiler calls stop_machine
 731 * so this acts like a synchronize_sched.
732 */
733 unregister_ftrace_profiler();
734 }
735 }
736 out:
737 mutex_unlock(&ftrace_profile_lock);
738
739 filp->f_pos += cnt;
740
741 return cnt;
742}
743
744static ssize_t
745ftrace_profile_read(struct file *filp, char __user *ubuf,
746 size_t cnt, loff_t *ppos)
747{
748 char buf[64]; /* big enough to hold a number */
749 int r;
750
751 r = sprintf(buf, "%u\n", ftrace_profile_enabled);
752 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
753}
754
755static const struct file_operations ftrace_profile_fops = {
756 .open = tracing_open_generic,
757 .read = ftrace_profile_read,
758 .write = ftrace_profile_write,
759};
760
761/* used to initialize the real stat files */
762static struct tracer_stat function_stats __initdata = {
763 .name = "functions",
764 .stat_start = function_stat_start,
765 .stat_next = function_stat_next,
766 .stat_cmp = function_stat_cmp,
767 .stat_headers = function_stat_headers,
768 .stat_show = function_stat_show
769};
770
771static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
772{
773 struct ftrace_profile_stat *stat;
774 struct dentry *entry;
775 char *name;
776 int ret;
777 int cpu;
778
779 for_each_possible_cpu(cpu) {
780 stat = &per_cpu(ftrace_profile_stats, cpu);
781
782 /* allocate enough for function name + cpu number */
783 name = kmalloc(32, GFP_KERNEL);
784 if (!name) {
785 /*
 786 * The files created are permanent; if something goes wrong
 787 * we still do not free the memory.
788 */
789 WARN(1,
790 "Could not allocate stat file for cpu %d\n",
791 cpu);
792 return;
793 }
794 stat->stat = function_stats;
795 snprintf(name, 32, "function%d", cpu);
796 stat->stat.name = name;
797 ret = register_stat_tracer(&stat->stat);
798 if (ret) {
799 WARN(1,
800 "Could not register function stat for cpu %d\n",
801 cpu);
802 kfree(name);
803 return;
804 }
805 }
806
807 entry = debugfs_create_file("function_profile_enabled", 0644,
808 d_tracer, NULL, &ftrace_profile_fops);
809 if (!entry)
810 pr_warning("Could not create debugfs "
811 "'function_profile_enabled' entry\n");
812}
813
814#else /* CONFIG_FUNCTION_PROFILER */
815static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
816{
817}
818#endif /* CONFIG_FUNCTION_PROFILER */
819
243/* set when tracing only a pid */ 820/* set when tracing only a pid */
244struct pid *ftrace_pid_trace; 821struct pid *ftrace_pid_trace;
245static struct pid * const ftrace_swapper_pid = &init_struct_pid; 822static struct pid * const ftrace_swapper_pid = &init_struct_pid;
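The profiler introduced above sizes its storage with two small computations: PROFILES_PER_PAGE is how many struct ftrace_profile records fit in one page after the page header, ftrace_profile_pages_init() preallocates DIV_ROUND_UP(functions, PROFILES_PER_PAGE) pages for the assumed 20000 functions, and ftrace_profile_init_cpu() derives ftrace_profile_bits as log2 of the 1024-entry hash. The standalone sketch below is not part of this patch and replays that arithmetic; the 4096-byte page size and the record layout are assumptions for illustration, since the real sizes depend on the architecture and on CONFIG_FUNCTION_GRAPH_TRACER.

/* profiler_sizing.c - replay of the sizing math in ftrace_profile_pages_init()
 * and ftrace_profile_init_cpu(). PAGE_SIZE and the record layout below are
 * assumed values for illustration only. */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL		/* assumption: common 4 KiB page size */

struct ftrace_profile {			/* approximate layout of the record */
	void *node_prev, *node_next;	/* stand-in for struct hlist_node */
	unsigned long ip;
	unsigned long counter;
	unsigned long long time;	/* present with FUNCTION_GRAPH_TRACER */
};

struct ftrace_profile_page {
	struct ftrace_profile_page *next;
	unsigned long index;
	struct ftrace_profile records[];
};

#define PROFILE_RECORDS_SIZE \
	(PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
#define PROFILES_PER_PAGE \
	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long functions = 20000;	/* the non-dynamic-ftrace guess */
	unsigned long pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
	int size = 1024, bits = 0;		/* FTRACE_PROFILE_HASH_SIZE */

	/* same loop as ftrace_profile_init_cpu(): bits ends up as log2(size) */
	for (size--; size; size >>= 1)
		bits++;

	printf("%lu records per page, %lu pages for %lu functions, %d hash bits\n",
	       (unsigned long)PROFILES_PER_PAGE, pages, functions, bits);
	return 0;
}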
@@ -261,7 +838,6 @@ struct ftrace_func_probe {
261 struct rcu_head rcu; 838 struct rcu_head rcu;
262}; 839};
263 840
264
265enum { 841enum {
266 FTRACE_ENABLE_CALLS = (1 << 0), 842 FTRACE_ENABLE_CALLS = (1 << 0),
267 FTRACE_DISABLE_CALLS = (1 << 1), 843 FTRACE_DISABLE_CALLS = (1 << 1),
@@ -346,30 +922,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)
346 rec->flags |= FTRACE_FL_FREE; 922 rec->flags |= FTRACE_FL_FREE;
347} 923}
348 924
349void ftrace_release(void *start, unsigned long size)
350{
351 struct dyn_ftrace *rec;
352 struct ftrace_page *pg;
353 unsigned long s = (unsigned long)start;
354 unsigned long e = s + size;
355
356 if (ftrace_disabled || !start)
357 return;
358
359 mutex_lock(&ftrace_lock);
360 do_for_each_ftrace_rec(pg, rec) {
361 if ((rec->ip >= s) && (rec->ip < e)) {
362 /*
363 * rec->ip is changed in ftrace_free_rec()
364 * It should not between s and e if record was freed.
365 */
366 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
367 ftrace_free_rec(rec);
368 }
369 } while_for_each_ftrace_rec();
370 mutex_unlock(&ftrace_lock);
371}
372
373static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 925static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
374{ 926{
375 struct dyn_ftrace *rec; 927 struct dyn_ftrace *rec;
@@ -673,6 +1225,13 @@ static void ftrace_shutdown(int command)
673 return; 1225 return;
674 1226
675 ftrace_start_up--; 1227 ftrace_start_up--;
1228 /*
 1229 * Just warn in case of an imbalance; no need to kill ftrace, it's not
 1230 * critical, but the ftrace_call callers may never be nopped again after
 1231 * further ftrace uses.
1232 */
1233 WARN_ON_ONCE(ftrace_start_up < 0);
1234
676 if (!ftrace_start_up) 1235 if (!ftrace_start_up)
677 command |= FTRACE_DISABLE_CALLS; 1236 command |= FTRACE_DISABLE_CALLS;
678 1237
@@ -859,10 +1418,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
859{ 1418{
860 struct ftrace_iterator *iter = m->private; 1419 struct ftrace_iterator *iter = m->private;
861 void *p = NULL; 1420 void *p = NULL;
1421 loff_t l;
1422
1423 if (!(iter->flags & FTRACE_ITER_HASH))
1424 *pos = 0;
862 1425
863 iter->flags |= FTRACE_ITER_HASH; 1426 iter->flags |= FTRACE_ITER_HASH;
864 1427
865 return t_hash_next(m, p, pos); 1428 iter->hidx = 0;
1429 for (l = 0; l <= *pos; ) {
1430 p = t_hash_next(m, p, &l);
1431 if (!p)
1432 break;
1433 }
1434 return p;
866} 1435}
867 1436
868static int t_hash_show(struct seq_file *m, void *v) 1437static int t_hash_show(struct seq_file *m, void *v)
@@ -909,8 +1478,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
909 iter->pg = iter->pg->next; 1478 iter->pg = iter->pg->next;
910 iter->idx = 0; 1479 iter->idx = 0;
911 goto retry; 1480 goto retry;
912 } else {
913 iter->idx = -1;
914 } 1481 }
915 } else { 1482 } else {
916 rec = &iter->pg->records[iter->idx++]; 1483 rec = &iter->pg->records[iter->idx++];
@@ -939,6 +1506,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
939{ 1506{
940 struct ftrace_iterator *iter = m->private; 1507 struct ftrace_iterator *iter = m->private;
941 void *p = NULL; 1508 void *p = NULL;
1509 loff_t l;
942 1510
943 mutex_lock(&ftrace_lock); 1511 mutex_lock(&ftrace_lock);
944 /* 1512 /*
@@ -950,23 +1518,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
950 if (*pos > 0) 1518 if (*pos > 0)
951 return t_hash_start(m, pos); 1519 return t_hash_start(m, pos);
952 iter->flags |= FTRACE_ITER_PRINTALL; 1520 iter->flags |= FTRACE_ITER_PRINTALL;
953 (*pos)++;
954 return iter; 1521 return iter;
955 } 1522 }
956 1523
957 if (iter->flags & FTRACE_ITER_HASH) 1524 if (iter->flags & FTRACE_ITER_HASH)
958 return t_hash_start(m, pos); 1525 return t_hash_start(m, pos);
959 1526
960 if (*pos > 0) { 1527 iter->pg = ftrace_pages_start;
961 if (iter->idx < 0) 1528 iter->idx = 0;
962 return p; 1529 for (l = 0; l <= *pos; ) {
963 (*pos)--; 1530 p = t_next(m, p, &l);
964 iter->idx--; 1531 if (!p)
1532 break;
965 } 1533 }
966 1534
967 p = t_next(m, p, pos); 1535 if (!p && iter->flags & FTRACE_ITER_FILTER)
968
969 if (!p)
970 return t_hash_start(m, pos); 1536 return t_hash_start(m, pos);
971 1537
972 return p; 1538 return p;
@@ -1096,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1096 1662
1097 mutex_lock(&ftrace_regex_lock); 1663 mutex_lock(&ftrace_regex_lock);
1098 if ((file->f_mode & FMODE_WRITE) && 1664 if ((file->f_mode & FMODE_WRITE) &&
1099 !(file->f_flags & O_APPEND)) 1665 (file->f_flags & O_TRUNC))
1100 ftrace_filter_reset(enable); 1666 ftrace_filter_reset(enable);
1101 1667
1102 if (file->f_mode & FMODE_READ) { 1668 if (file->f_mode & FMODE_READ) {
@@ -1408,7 +1974,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1408 1974
1409static struct ftrace_ops trace_probe_ops __read_mostly = 1975static struct ftrace_ops trace_probe_ops __read_mostly =
1410{ 1976{
1411 .func = function_trace_probe_call, 1977 .func = function_trace_probe_call,
1412}; 1978};
1413 1979
1414static int ftrace_probe_registered; 1980static int ftrace_probe_registered;
@@ -1712,7 +2278,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
1712 read++; 2278 read++;
1713 cnt--; 2279 cnt--;
1714 2280
1715 if (!(iter->flags & ~FTRACE_ITER_CONT)) { 2281 /*
 2282 * If the parser hasn't finished with the last write,
2283 * continue reading the user input without skipping spaces.
2284 */
2285 if (!(iter->flags & FTRACE_ITER_CONT)) {
1716 /* skip white space */ 2286 /* skip white space */
1717 while (cnt && isspace(ch)) { 2287 while (cnt && isspace(ch)) {
1718 ret = get_user(ch, ubuf++); 2288 ret = get_user(ch, ubuf++);
@@ -1722,8 +2292,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
1722 cnt--; 2292 cnt--;
1723 } 2293 }
1724 2294
2295 /* only spaces were written */
1725 if (isspace(ch)) { 2296 if (isspace(ch)) {
1726 file->f_pos += read; 2297 *ppos += read;
1727 ret = read; 2298 ret = read;
1728 goto out; 2299 goto out;
1729 } 2300 }
@@ -1753,12 +2324,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
1753 if (ret) 2324 if (ret)
1754 goto out; 2325 goto out;
1755 iter->buffer_idx = 0; 2326 iter->buffer_idx = 0;
1756 } else 2327 } else {
1757 iter->flags |= FTRACE_ITER_CONT; 2328 iter->flags |= FTRACE_ITER_CONT;
2329 iter->buffer[iter->buffer_idx++] = ch;
2330 }
1758 2331
1759 2332 *ppos += read;
1760 file->f_pos += read;
1761
1762 ret = read; 2333 ret = read;
1763 out: 2334 out:
1764 mutex_unlock(&ftrace_regex_lock); 2335 mutex_unlock(&ftrace_regex_lock);
@@ -1823,6 +2394,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
1823 ftrace_set_regex(buf, len, reset, 0); 2394 ftrace_set_regex(buf, len, reset, 0);
1824} 2395}
1825 2396
2397/*
2398 * command line interface to allow users to set filters on boot up.
2399 */
2400#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
2401static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
2402static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
2403
2404static int __init set_ftrace_notrace(char *str)
2405{
2406 strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
2407 return 1;
2408}
2409__setup("ftrace_notrace=", set_ftrace_notrace);
2410
2411static int __init set_ftrace_filter(char *str)
2412{
2413 strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
2414 return 1;
2415}
2416__setup("ftrace_filter=", set_ftrace_filter);
2417
2418static void __init set_ftrace_early_filter(char *buf, int enable)
2419{
2420 char *func;
2421
2422 while (buf) {
2423 func = strsep(&buf, ",");
2424 ftrace_set_regex(func, strlen(func), 0, enable);
2425 }
2426}
2427
2428static void __init set_ftrace_early_filters(void)
2429{
2430 if (ftrace_filter_buf[0])
2431 set_ftrace_early_filter(ftrace_filter_buf, 1);
2432 if (ftrace_notrace_buf[0])
2433 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2434}
2435
1826static int 2436static int
1827ftrace_regex_release(struct inode *inode, struct file *file, int enable) 2437ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1828{ 2438{
@@ -1903,32 +2513,31 @@ int ftrace_graph_count;
1903unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2513unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
1904 2514
1905static void * 2515static void *
1906g_next(struct seq_file *m, void *v, loff_t *pos) 2516__g_next(struct seq_file *m, loff_t *pos)
1907{ 2517{
1908 unsigned long *array = m->private; 2518 unsigned long *array = m->private;
1909 int index = *pos;
1910
1911 (*pos)++;
1912 2519
1913 if (index >= ftrace_graph_count) 2520 if (*pos >= ftrace_graph_count)
1914 return NULL; 2521 return NULL;
2522 return &array[*pos];
2523}
1915 2524
1916 return &array[index]; 2525static void *
2526g_next(struct seq_file *m, void *v, loff_t *pos)
2527{
2528 (*pos)++;
2529 return __g_next(m, pos);
1917} 2530}
1918 2531
1919static void *g_start(struct seq_file *m, loff_t *pos) 2532static void *g_start(struct seq_file *m, loff_t *pos)
1920{ 2533{
1921 void *p = NULL;
1922
1923 mutex_lock(&graph_lock); 2534 mutex_lock(&graph_lock);
1924 2535
1925 /* Nothing, tell g_show to print all functions are enabled */ 2536 /* Nothing, tell g_show to print all functions are enabled */
1926 if (!ftrace_graph_count && !*pos) 2537 if (!ftrace_graph_count && !*pos)
1927 return (void *)1; 2538 return (void *)1;
1928 2539
1929 p = g_next(m, p, pos); 2540 return __g_next(m, pos);
1930
1931 return p;
1932} 2541}
1933 2542
1934static void g_stop(struct seq_file *m, void *p) 2543static void g_stop(struct seq_file *m, void *p)
@@ -1973,7 +2582,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
1973 2582
1974 mutex_lock(&graph_lock); 2583 mutex_lock(&graph_lock);
1975 if ((file->f_mode & FMODE_WRITE) && 2584 if ((file->f_mode & FMODE_WRITE) &&
1976 !(file->f_flags & O_APPEND)) { 2585 (file->f_flags & O_TRUNC)) {
1977 ftrace_graph_count = 0; 2586 ftrace_graph_count = 0;
1978 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2587 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
1979 } 2588 }
@@ -1992,6 +2601,14 @@ ftrace_graph_open(struct inode *inode, struct file *file)
1992} 2601}
1993 2602
1994static int 2603static int
2604ftrace_graph_release(struct inode *inode, struct file *file)
2605{
2606 if (file->f_mode & FMODE_READ)
2607 seq_release(inode, file);
2608 return 0;
2609}
2610
2611static int
1995ftrace_set_func(unsigned long *array, int *idx, char *buffer) 2612ftrace_set_func(unsigned long *array, int *idx, char *buffer)
1996{ 2613{
1997 struct dyn_ftrace *rec; 2614 struct dyn_ftrace *rec;
@@ -2120,46 +2737,32 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2120} 2737}
2121 2738
2122static const struct file_operations ftrace_graph_fops = { 2739static const struct file_operations ftrace_graph_fops = {
2123 .open = ftrace_graph_open, 2740 .open = ftrace_graph_open,
2124 .read = seq_read, 2741 .read = seq_read,
2125 .write = ftrace_graph_write, 2742 .write = ftrace_graph_write,
2743 .release = ftrace_graph_release,
2126}; 2744};
2127#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2745#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2128 2746
2129static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 2747static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2130{ 2748{
2131 struct dentry *entry;
2132 2749
2133 entry = debugfs_create_file("available_filter_functions", 0444, 2750 trace_create_file("available_filter_functions", 0444,
2134 d_tracer, NULL, &ftrace_avail_fops); 2751 d_tracer, NULL, &ftrace_avail_fops);
2135 if (!entry)
2136 pr_warning("Could not create debugfs "
2137 "'available_filter_functions' entry\n");
2138 2752
2139 entry = debugfs_create_file("failures", 0444, 2753 trace_create_file("failures", 0444,
2140 d_tracer, NULL, &ftrace_failures_fops); 2754 d_tracer, NULL, &ftrace_failures_fops);
2141 if (!entry)
2142 pr_warning("Could not create debugfs 'failures' entry\n");
2143 2755
2144 entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, 2756 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2145 NULL, &ftrace_filter_fops); 2757 NULL, &ftrace_filter_fops);
2146 if (!entry)
2147 pr_warning("Could not create debugfs "
2148 "'set_ftrace_filter' entry\n");
2149 2758
2150 entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, 2759 trace_create_file("set_ftrace_notrace", 0644, d_tracer,
2151 NULL, &ftrace_notrace_fops); 2760 NULL, &ftrace_notrace_fops);
2152 if (!entry)
2153 pr_warning("Could not create debugfs "
2154 "'set_ftrace_notrace' entry\n");
2155 2761
2156#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2762#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2157 entry = debugfs_create_file("set_graph_function", 0444, d_tracer, 2763 trace_create_file("set_graph_function", 0444, d_tracer,
2158 NULL, 2764 NULL,
2159 &ftrace_graph_fops); 2765 &ftrace_graph_fops);
2160 if (!entry)
2161 pr_warning("Could not create debugfs "
2162 "'set_graph_function' entry\n");
2163#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2766#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2164 2767
2165 return 0; 2768 return 0;
@@ -2197,14 +2800,72 @@ static int ftrace_convert_nops(struct module *mod,
2197 return 0; 2800 return 0;
2198} 2801}
2199 2802
2200void ftrace_init_module(struct module *mod, 2803#ifdef CONFIG_MODULES
2201 unsigned long *start, unsigned long *end) 2804void ftrace_release(void *start, void *end)
2805{
2806 struct dyn_ftrace *rec;
2807 struct ftrace_page *pg;
2808 unsigned long s = (unsigned long)start;
2809 unsigned long e = (unsigned long)end;
2810
2811 if (ftrace_disabled || !start || start == end)
2812 return;
2813
2814 mutex_lock(&ftrace_lock);
2815 do_for_each_ftrace_rec(pg, rec) {
2816 if ((rec->ip >= s) && (rec->ip < e)) {
2817 /*
2818 * rec->ip is changed in ftrace_free_rec()
 2819 * It should not be between s and e if the record was freed.
2820 */
2821 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
2822 ftrace_free_rec(rec);
2823 }
2824 } while_for_each_ftrace_rec();
2825 mutex_unlock(&ftrace_lock);
2826}
2827
2828static void ftrace_init_module(struct module *mod,
2829 unsigned long *start, unsigned long *end)
2202{ 2830{
2203 if (ftrace_disabled || start == end) 2831 if (ftrace_disabled || start == end)
2204 return; 2832 return;
2205 ftrace_convert_nops(mod, start, end); 2833 ftrace_convert_nops(mod, start, end);
2206} 2834}
2207 2835
2836static int ftrace_module_notify(struct notifier_block *self,
2837 unsigned long val, void *data)
2838{
2839 struct module *mod = data;
2840
2841 switch (val) {
2842 case MODULE_STATE_COMING:
2843 ftrace_init_module(mod, mod->ftrace_callsites,
2844 mod->ftrace_callsites +
2845 mod->num_ftrace_callsites);
2846 break;
2847 case MODULE_STATE_GOING:
2848 ftrace_release(mod->ftrace_callsites,
2849 mod->ftrace_callsites +
2850 mod->num_ftrace_callsites);
2851 break;
2852 }
2853
2854 return 0;
2855}
2856#else
2857static int ftrace_module_notify(struct notifier_block *self,
2858 unsigned long val, void *data)
2859{
2860 return 0;
2861}
2862#endif /* CONFIG_MODULES */
2863
2864struct notifier_block ftrace_module_nb = {
2865 .notifier_call = ftrace_module_notify,
2866 .priority = 0,
2867};
2868
2208extern unsigned long __start_mcount_loc[]; 2869extern unsigned long __start_mcount_loc[];
2209extern unsigned long __stop_mcount_loc[]; 2870extern unsigned long __stop_mcount_loc[];
2210 2871
@@ -2236,6 +2897,12 @@ void __init ftrace_init(void)
2236 __start_mcount_loc, 2897 __start_mcount_loc,
2237 __stop_mcount_loc); 2898 __stop_mcount_loc);
2238 2899
2900 ret = register_module_notifier(&ftrace_module_nb);
2901 if (ret)
2902 pr_warning("Failed to register trace ftrace module notifier\n");
2903
2904 set_ftrace_early_filters();
2905
2239 return; 2906 return;
2240 failed: 2907 failed:
2241 ftrace_disabled = 1; 2908 ftrace_disabled = 1;
@@ -2417,7 +3084,6 @@ static const struct file_operations ftrace_pid_fops = {
2417static __init int ftrace_init_debugfs(void) 3084static __init int ftrace_init_debugfs(void)
2418{ 3085{
2419 struct dentry *d_tracer; 3086 struct dentry *d_tracer;
2420 struct dentry *entry;
2421 3087
2422 d_tracer = tracing_init_dentry(); 3088 d_tracer = tracing_init_dentry();
2423 if (!d_tracer) 3089 if (!d_tracer)
@@ -2425,11 +3091,11 @@ static __init int ftrace_init_debugfs(void)
2425 3091
2426 ftrace_init_dyn_debugfs(d_tracer); 3092 ftrace_init_dyn_debugfs(d_tracer);
2427 3093
2428 entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, 3094 trace_create_file("set_ftrace_pid", 0644, d_tracer,
2429 NULL, &ftrace_pid_fops); 3095 NULL, &ftrace_pid_fops);
2430 if (!entry) 3096
2431 pr_warning("Could not create debugfs " 3097 ftrace_profile_debugfs(d_tracer);
2432 "'set_ftrace_pid' entry\n"); 3098
2433 return 0; 3099 return 0;
2434} 3100}
2435fs_initcall(ftrace_init_debugfs); 3101fs_initcall(ftrace_init_debugfs);
@@ -2507,10 +3173,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
2507 3173
2508 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3174 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
2509 3175
2510 if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) 3176 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
2511 goto out; 3177 goto out;
2512 3178
2513 last_ftrace_enabled = ftrace_enabled; 3179 last_ftrace_enabled = !!ftrace_enabled;
2514 3180
2515 if (ftrace_enabled) { 3181 if (ftrace_enabled) {
2516 3182
@@ -2538,7 +3204,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
2538 3204
2539#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3205#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2540 3206
2541static atomic_t ftrace_graph_active; 3207static int ftrace_graph_active;
2542static struct notifier_block ftrace_suspend_notifier; 3208static struct notifier_block ftrace_suspend_notifier;
2543 3209
2544int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 3210int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -2580,12 +3246,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
2580 } 3246 }
2581 3247
2582 if (t->ret_stack == NULL) { 3248 if (t->ret_stack == NULL) {
2583 t->curr_ret_stack = -1;
2584 /* Make sure IRQs see the -1 first: */
2585 barrier();
2586 t->ret_stack = ret_stack_list[start++];
2587 atomic_set(&t->tracing_graph_pause, 0); 3249 atomic_set(&t->tracing_graph_pause, 0);
2588 atomic_set(&t->trace_overrun, 0); 3250 atomic_set(&t->trace_overrun, 0);
3251 t->curr_ret_stack = -1;
3252 /* Make sure the tasks see the -1 first: */
3253 smp_wmb();
3254 t->ret_stack = ret_stack_list[start++];
2589 } 3255 }
2590 } while_each_thread(g, t); 3256 } while_each_thread(g, t);
2591 3257
@@ -2643,8 +3309,10 @@ static int start_graph_tracing(void)
2643 return -ENOMEM; 3309 return -ENOMEM;
2644 3310
2645 /* The cpu_boot init_task->ret_stack will never be freed */ 3311 /* The cpu_boot init_task->ret_stack will never be freed */
2646 for_each_online_cpu(cpu) 3312 for_each_online_cpu(cpu) {
2647 ftrace_graph_init_task(idle_task(cpu)); 3313 if (!idle_task(cpu)->ret_stack)
3314 ftrace_graph_init_task(idle_task(cpu));
3315 }
2648 3316
2649 do { 3317 do {
2650 ret = alloc_retstack_tasklist(ret_stack_list); 3318 ret = alloc_retstack_tasklist(ret_stack_list);
@@ -2690,7 +3358,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2690 mutex_lock(&ftrace_lock); 3358 mutex_lock(&ftrace_lock);
2691 3359
2692 /* we currently allow only one tracer registered at a time */ 3360 /* we currently allow only one tracer registered at a time */
2693 if (atomic_read(&ftrace_graph_active)) { 3361 if (ftrace_graph_active) {
2694 ret = -EBUSY; 3362 ret = -EBUSY;
2695 goto out; 3363 goto out;
2696 } 3364 }
@@ -2698,10 +3366,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2698 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; 3366 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
2699 register_pm_notifier(&ftrace_suspend_notifier); 3367 register_pm_notifier(&ftrace_suspend_notifier);
2700 3368
2701 atomic_inc(&ftrace_graph_active); 3369 ftrace_graph_active++;
2702 ret = start_graph_tracing(); 3370 ret = start_graph_tracing();
2703 if (ret) { 3371 if (ret) {
2704 atomic_dec(&ftrace_graph_active); 3372 ftrace_graph_active--;
2705 goto out; 3373 goto out;
2706 } 3374 }
2707 3375
@@ -2719,10 +3387,10 @@ void unregister_ftrace_graph(void)
2719{ 3387{
2720 mutex_lock(&ftrace_lock); 3388 mutex_lock(&ftrace_lock);
2721 3389
2722 if (!unlikely(atomic_read(&ftrace_graph_active))) 3390 if (unlikely(!ftrace_graph_active))
2723 goto out; 3391 goto out;
2724 3392
2725 atomic_dec(&ftrace_graph_active); 3393 ftrace_graph_active--;
2726 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); 3394 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
2727 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3395 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2728 ftrace_graph_entry = ftrace_graph_entry_stub; 3396 ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -2736,18 +3404,25 @@ void unregister_ftrace_graph(void)
2736/* Allocate a return stack for newly created task */ 3404/* Allocate a return stack for newly created task */
2737void ftrace_graph_init_task(struct task_struct *t) 3405void ftrace_graph_init_task(struct task_struct *t)
2738{ 3406{
2739 if (atomic_read(&ftrace_graph_active)) { 3407 /* Make sure we do not use the parent ret_stack */
2740 t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH 3408 t->ret_stack = NULL;
3409
3410 if (ftrace_graph_active) {
3411 struct ftrace_ret_stack *ret_stack;
3412
3413 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
2741 * sizeof(struct ftrace_ret_stack), 3414 * sizeof(struct ftrace_ret_stack),
2742 GFP_KERNEL); 3415 GFP_KERNEL);
2743 if (!t->ret_stack) 3416 if (!ret_stack)
2744 return; 3417 return;
2745 t->curr_ret_stack = -1; 3418 t->curr_ret_stack = -1;
2746 atomic_set(&t->tracing_graph_pause, 0); 3419 atomic_set(&t->tracing_graph_pause, 0);
2747 atomic_set(&t->trace_overrun, 0); 3420 atomic_set(&t->trace_overrun, 0);
2748 t->ftrace_timestamp = 0; 3421 t->ftrace_timestamp = 0;
 2749 } else 3422 /* make curr_ret_stack visible before we add the ret_stack */
2750 t->ret_stack = NULL; 3423 smp_wmb();
3424 t->ret_stack = ret_stack;
3425 }
2751} 3426}
2752 3427
2753void ftrace_graph_exit_task(struct task_struct *t) 3428void ftrace_graph_exit_task(struct task_struct *t)
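The reworked ftrace_graph_init_task() and alloc_retstack_tasklist() above follow an initialize-then-publish pattern: curr_ret_stack and the counters are set first, smp_wmb() orders those stores, and only then is t->ret_stack made non-NULL, so any tracer that observes the pointer also observes the initialized index. The userspace sketch below is not part of this patch and uses C11 release/acquire atomics as an analogy for the same ordering requirement; the kernel itself relies on smp_wmb() and the read side's own ordering, not on C11 atomics.

/* publish_sketch.c - init-then-publish ordering, analogous to the
 * smp_wmb() before "t->ret_stack = ret_stack" in the patch above. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ret_stack {
	int curr_ret_stack;	/* must be seen as -1 once the pointer is visible */
};

static _Atomic(struct ret_stack *) published;

static void *writer(void *arg)
{
	struct ret_stack *rs = malloc(sizeof(*rs));

	(void)arg;
	if (!rs)
		exit(1);
	rs->curr_ret_stack = -1;			/* initialize first */
	atomic_store_explicit(&published, rs,
			      memory_order_release);	/* then publish */
	return NULL;
}

static void *reader(void *arg)
{
	struct ret_stack *rs;

	(void)arg;
	/* spin until the pointer becomes visible */
	while (!(rs = atomic_load_explicit(&published, memory_order_acquire)))
		;
	/* acquire pairs with release: the -1 store is guaranteed to be seen */
	printf("curr_ret_stack = %d\n", rs->curr_ret_stack);
	return NULL;
}

int main(void)
{
	pthread_t r, w;

	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, writer, NULL);
	pthread_join(r, NULL);
	pthread_join(w, NULL);
	return 0;
}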
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 5011f4d91e37..1edaa9516e81 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -12,7 +12,7 @@
12#include <linux/dcache.h> 12#include <linux/dcache.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14 14
15#include <trace/kmemtrace.h> 15#include <linux/kmemtrace.h>
16 16
17#include "trace_output.h" 17#include "trace_output.h"
18#include "trace.h" 18#include "trace.h"
@@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
42 gfp_t gfp_flags, 42 gfp_t gfp_flags,
43 int node) 43 int node)
44{ 44{
45 struct ftrace_event_call *call = &event_kmem_alloc;
45 struct trace_array *tr = kmemtrace_array; 46 struct trace_array *tr = kmemtrace_array;
46 struct kmemtrace_alloc_entry *entry; 47 struct kmemtrace_alloc_entry *entry;
47 struct ring_buffer_event *event; 48 struct ring_buffer_event *event;
@@ -62,7 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
62 entry->gfp_flags = gfp_flags; 63 entry->gfp_flags = gfp_flags;
63 entry->node = node; 64 entry->node = node;
64 65
65 ring_buffer_unlock_commit(tr->buffer, event); 66 if (!filter_check_discard(call, entry, tr->buffer, event))
67 ring_buffer_unlock_commit(tr->buffer, event);
66 68
67 trace_wake_up(); 69 trace_wake_up();
68} 70}
@@ -71,6 +73,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
71 unsigned long call_site, 73 unsigned long call_site,
72 const void *ptr) 74 const void *ptr)
73{ 75{
76 struct ftrace_event_call *call = &event_kmem_free;
74 struct trace_array *tr = kmemtrace_array; 77 struct trace_array *tr = kmemtrace_array;
75 struct kmemtrace_free_entry *entry; 78 struct kmemtrace_free_entry *entry;
76 struct ring_buffer_event *event; 79 struct ring_buffer_event *event;
@@ -86,7 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
86 entry->call_site = call_site; 89 entry->call_site = call_site;
87 entry->ptr = ptr; 90 entry->ptr = ptr;
88 91
89 ring_buffer_unlock_commit(tr->buffer, event); 92 if (!filter_check_discard(call, entry, tr->buffer, event))
93 ring_buffer_unlock_commit(tr->buffer, event);
90 94
91 trace_wake_up(); 95 trace_wake_up();
92} 96}
@@ -182,7 +186,7 @@ static int kmem_trace_init(struct trace_array *tr)
182 int cpu; 186 int cpu;
183 kmemtrace_array = tr; 187 kmemtrace_array = tr;
184 188
185 for_each_cpu_mask(cpu, cpu_possible_map) 189 for_each_cpu(cpu, cpu_possible_mask)
186 tracing_reset(tr, cpu); 190 tracing_reset(tr, cpu);
187 191
188 kmemtrace_start_probes(); 192 kmemtrace_start_probes();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 960cbf44c844..a330513d96ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
10#include <linux/debugfs.h> 10#include <linux/debugfs.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <linux/kmemcheck.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/percpu.h> 15#include <linux/percpu.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
@@ -22,6 +23,28 @@
22#include "trace.h" 23#include "trace.h"
23 24
24/* 25/*
 26 * The ring buffer header is special. We must keep it up to date manually.
27 */
28int ring_buffer_print_entry_header(struct trace_seq *s)
29{
30 int ret;
31
32 ret = trace_seq_printf(s, "# compressed entry header\n");
33 ret = trace_seq_printf(s, "\ttype_len : 5 bits\n");
34 ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n");
35 ret = trace_seq_printf(s, "\tarray : 32 bits\n");
36 ret = trace_seq_printf(s, "\n");
37 ret = trace_seq_printf(s, "\tpadding : type == %d\n",
38 RINGBUF_TYPE_PADDING);
39 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
40 RINGBUF_TYPE_TIME_EXTEND);
41 ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
42 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
43
44 return ret;
45}
46
47/*
25 * The ring buffer is made up of a list of pages. A separate list of pages is 48 * The ring buffer is made up of a list of pages. A separate list of pages is
26 * allocated for each CPU. A writer may only write to a buffer that is 49 * allocated for each CPU. A writer may only write to a buffer that is
27 * associated with the CPU it is currently executing on. A reader may read 50 * associated with the CPU it is currently executing on. A reader may read
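ring_buffer_print_entry_header() above documents the compressed event header layout: 5 bits of type_len, 27 bits of time_delta, and a 32-bit array word that carries the length (or the data) of larger events. The sketch below is not part of this patch; it lays that split out as a plain C bitfield purely for illustration, since the kernel's actual struct ring_buffer_event is declared in the ring buffer header file and its exact definition is not reproduced in this diff.

/* entry_header_sketch.c - illustration of the "compressed entry header"
 * described by ring_buffer_print_entry_header(): 5 bits of type_len,
 * 27 bits of time_delta, then 32-bit array words. Not the kernel's
 * actual struct ring_buffer_event. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct rb_event_sketch {
	uint32_t type_len   : 5;	/* padding/time types, or payload length / 4 */
	uint32_t time_delta : 27;	/* timestamp delta, 27 bits */
	uint32_t array[1];		/* full length (or data) for larger events */
};

int main(void)
{
	struct rb_event_sketch ev = { .type_len = 3, .time_delta = 100 };

	/* analogue of RB_EVNT_HDR_SIZE = offsetof(..., array): 4 bytes here */
	printf("header: %zu bytes, inline payload hint: %u bytes\n",
	       offsetof(struct rb_event_sketch, array),
	       (unsigned)ev.type_len * 4);	/* RB_ALIGNMENT is 4 */
	return 0;
}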
@@ -182,7 +205,11 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
182 205
183#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
184#define RB_ALIGNMENT 4U 207#define RB_ALIGNMENT 4U
185#define RB_MAX_SMALL_DATA 28 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
210
211/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
212#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
186 213
187enum { 214enum {
188 RB_LEN_TIME_EXTEND = 8, 215 RB_LEN_TIME_EXTEND = 8,
@@ -191,48 +218,28 @@ enum {
191 218
192static inline int rb_null_event(struct ring_buffer_event *event) 219static inline int rb_null_event(struct ring_buffer_event *event)
193{ 220{
194 return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; 221 return event->type_len == RINGBUF_TYPE_PADDING
222 && event->time_delta == 0;
195} 223}
196 224
197static inline int rb_discarded_event(struct ring_buffer_event *event) 225static inline int rb_discarded_event(struct ring_buffer_event *event)
198{ 226{
199 return event->type == RINGBUF_TYPE_PADDING && event->time_delta; 227 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
200} 228}
201 229
202static void rb_event_set_padding(struct ring_buffer_event *event) 230static void rb_event_set_padding(struct ring_buffer_event *event)
203{ 231{
204 event->type = RINGBUF_TYPE_PADDING; 232 event->type_len = RINGBUF_TYPE_PADDING;
205 event->time_delta = 0; 233 event->time_delta = 0;
206} 234}
207 235
208/**
209 * ring_buffer_event_discard - discard an event in the ring buffer
210 * @buffer: the ring buffer
211 * @event: the event to discard
212 *
213 * Sometimes a event that is in the ring buffer needs to be ignored.
214 * This function lets the user discard an event in the ring buffer
215 * and then that event will not be read later.
216 *
217 * Note, it is up to the user to be careful with this, and protect
218 * against races. If the user discards an event that has been consumed
219 * it is possible that it could corrupt the ring buffer.
220 */
221void ring_buffer_event_discard(struct ring_buffer_event *event)
222{
223 event->type = RINGBUF_TYPE_PADDING;
224 /* time delta must be non zero */
225 if (!event->time_delta)
226 event->time_delta = 1;
227}
228
229static unsigned 236static unsigned
230rb_event_data_length(struct ring_buffer_event *event) 237rb_event_data_length(struct ring_buffer_event *event)
231{ 238{
232 unsigned length; 239 unsigned length;
233 240
234 if (event->len) 241 if (event->type_len)
235 length = event->len * RB_ALIGNMENT; 242 length = event->type_len * RB_ALIGNMENT;
236 else 243 else
237 length = event->array[0]; 244 length = event->array[0];
238 return length + RB_EVNT_HDR_SIZE; 245 return length + RB_EVNT_HDR_SIZE;
@@ -242,12 +249,12 @@ rb_event_data_length(struct ring_buffer_event *event)
242static unsigned 249static unsigned
243rb_event_length(struct ring_buffer_event *event) 250rb_event_length(struct ring_buffer_event *event)
244{ 251{
245 switch (event->type) { 252 switch (event->type_len) {
246 case RINGBUF_TYPE_PADDING: 253 case RINGBUF_TYPE_PADDING:
247 if (rb_null_event(event)) 254 if (rb_null_event(event))
248 /* undefined */ 255 /* undefined */
249 return -1; 256 return -1;
250 return rb_event_data_length(event); 257 return event->array[0] + RB_EVNT_HDR_SIZE;
251 258
252 case RINGBUF_TYPE_TIME_EXTEND: 259 case RINGBUF_TYPE_TIME_EXTEND:
253 return RB_LEN_TIME_EXTEND; 260 return RB_LEN_TIME_EXTEND;
@@ -271,7 +278,7 @@ rb_event_length(struct ring_buffer_event *event)
271unsigned ring_buffer_event_length(struct ring_buffer_event *event) 278unsigned ring_buffer_event_length(struct ring_buffer_event *event)
272{ 279{
273 unsigned length = rb_event_length(event); 280 unsigned length = rb_event_length(event);
274 if (event->type != RINGBUF_TYPE_DATA) 281 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
275 return length; 282 return length;
276 length -= RB_EVNT_HDR_SIZE; 283 length -= RB_EVNT_HDR_SIZE;
277 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) 284 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -284,9 +291,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
284static void * 291static void *
285rb_event_data(struct ring_buffer_event *event) 292rb_event_data(struct ring_buffer_event *event)
286{ 293{
287 BUG_ON(event->type != RINGBUF_TYPE_DATA); 294 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
288 /* If length is in len field, then array[0] has the data */ 295 /* If length is in len field, then array[0] has the data */
289 if (event->len) 296 if (event->type_len)
290 return (void *)&event->array[0]; 297 return (void *)&event->array[0];
291 /* Otherwise length is in array[0] and array[1] has the data */ 298 /* Otherwise length is in array[0] and array[1] has the data */
292 return (void *)&event->array[1]; 299 return (void *)&event->array[1];
@@ -316,9 +323,10 @@ struct buffer_data_page {
316}; 323};
317 324
318struct buffer_page { 325struct buffer_page {
326 struct list_head list; /* list of buffer pages */
319 local_t write; /* index for next write */ 327 local_t write; /* index for next write */
320 unsigned read; /* index for next read */ 328 unsigned read; /* index for next read */
321 struct list_head list; /* list of free pages */ 329 local_t entries; /* entries on this page */
322 struct buffer_data_page *page; /* Actual data page */ 330 struct buffer_data_page *page; /* Actual data page */
323}; 331};
324 332
@@ -361,6 +369,34 @@ static inline int test_time_stamp(u64 delta)
361 369
362#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) 370#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
363 371
 372/* Max payload is BUF_PAGE_SIZE - header (8 bytes) */
373#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
374
375/* Max number of timestamps that can fit on a page */
376#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
377
378int ring_buffer_print_page_header(struct trace_seq *s)
379{
380 struct buffer_data_page field;
381 int ret;
382
383 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
384 "offset:0;\tsize:%u;\n",
385 (unsigned int)sizeof(field.time_stamp));
386
387 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
388 "offset:%u;\tsize:%u;\n",
389 (unsigned int)offsetof(typeof(field), commit),
390 (unsigned int)sizeof(field.commit));
391
392 ret = trace_seq_printf(s, "\tfield: char data;\t"
393 "offset:%u;\tsize:%u;\n",
394 (unsigned int)offsetof(typeof(field), data),
395 (unsigned int)BUF_PAGE_SIZE);
396
397 return ret;
398}
399
364/* 400/*
365 * head_page == tail_page && head == tail then buffer is empty. 401 * head_page == tail_page && head == tail then buffer is empty.
366 */ 402 */
@@ -375,8 +411,13 @@ struct ring_buffer_per_cpu {
375 struct buffer_page *tail_page; /* write to tail */ 411 struct buffer_page *tail_page; /* write to tail */
376 struct buffer_page *commit_page; /* committed pages */ 412 struct buffer_page *commit_page; /* committed pages */
377 struct buffer_page *reader_page; 413 struct buffer_page *reader_page;
414 unsigned long nmi_dropped;
415 unsigned long commit_overrun;
378 unsigned long overrun; 416 unsigned long overrun;
379 unsigned long entries; 417 unsigned long read;
418 local_t entries;
419 local_t committing;
420 local_t commits;
380 u64 write_stamp; 421 u64 write_stamp;
381 u64 read_stamp; 422 u64 read_stamp;
382 atomic_t record_disabled; 423 atomic_t record_disabled;
@@ -389,6 +430,8 @@ struct ring_buffer {
389 atomic_t record_disabled; 430 atomic_t record_disabled;
390 cpumask_var_t cpumask; 431 cpumask_var_t cpumask;
391 432
433 struct lock_class_key *reader_lock_key;
434
392 struct mutex mutex; 435 struct mutex mutex;
393 436
394 struct ring_buffer_per_cpu **buffers; 437 struct ring_buffer_per_cpu **buffers;
@@ -420,13 +463,18 @@ struct ring_buffer_iter {
420/* Up this if you want to test the TIME_EXTENTS and normalization */ 463/* Up this if you want to test the TIME_EXTENTS and normalization */
421#define DEBUG_SHIFT 0 464#define DEBUG_SHIFT 0
422 465
466static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
467{
468 /* shift to debug/test normalization and TIME_EXTENTS */
469 return buffer->clock() << DEBUG_SHIFT;
470}
471
423u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) 472u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
424{ 473{
425 u64 time; 474 u64 time;
426 475
427 preempt_disable_notrace(); 476 preempt_disable_notrace();
428 /* shift to debug/test normalization and TIME_EXTENTS */ 477 time = rb_time_stamp(buffer, cpu);
429 time = buffer->clock() << DEBUG_SHIFT;
430 preempt_enable_no_resched_notrace(); 478 preempt_enable_no_resched_notrace();
431 479
432 return time; 480 return time;
@@ -523,6 +571,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
523 cpu_buffer->cpu = cpu; 571 cpu_buffer->cpu = cpu;
524 cpu_buffer->buffer = buffer; 572 cpu_buffer->buffer = buffer;
525 spin_lock_init(&cpu_buffer->reader_lock); 573 spin_lock_init(&cpu_buffer->reader_lock);
574 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
526 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 575 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
527 INIT_LIST_HEAD(&cpu_buffer->pages); 576 INIT_LIST_HEAD(&cpu_buffer->pages);
528 577
@@ -572,12 +621,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
572 kfree(cpu_buffer); 621 kfree(cpu_buffer);
573} 622}
574 623
575/*
576 * Causes compile errors if the struct buffer_page gets bigger
577 * than the struct page.
578 */
579extern int ring_buffer_page_too_big(void);
580
581#ifdef CONFIG_HOTPLUG_CPU 624#ifdef CONFIG_HOTPLUG_CPU
582static int rb_cpu_notify(struct notifier_block *self, 625static int rb_cpu_notify(struct notifier_block *self,
583 unsigned long action, void *hcpu); 626 unsigned long action, void *hcpu);
@@ -593,17 +636,13 @@ static int rb_cpu_notify(struct notifier_block *self,
593 * when the buffer wraps. If this flag is not set, the buffer will 636 * when the buffer wraps. If this flag is not set, the buffer will
594 * drop data when the tail hits the head. 637 * drop data when the tail hits the head.
595 */ 638 */
596struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) 639struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
640 struct lock_class_key *key)
597{ 641{
598 struct ring_buffer *buffer; 642 struct ring_buffer *buffer;
599 int bsize; 643 int bsize;
600 int cpu; 644 int cpu;
601 645
602 /* Paranoid! Optimizes out when all is well */
603 if (sizeof(struct buffer_page) > sizeof(struct page))
604 ring_buffer_page_too_big();
605
606
607 /* keep it in its own cache line */ 646 /* keep it in its own cache line */
608 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 647 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
609 GFP_KERNEL); 648 GFP_KERNEL);
@@ -616,10 +655,11 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
616 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 655 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
617 buffer->flags = flags; 656 buffer->flags = flags;
618 buffer->clock = trace_clock_local; 657 buffer->clock = trace_clock_local;
658 buffer->reader_lock_key = key;
619 659
620 /* need at least two pages */ 660 /* need at least two pages */
621 if (buffer->pages == 1) 661 if (buffer->pages < 2)
622 buffer->pages++; 662 buffer->pages = 2;
623 663
624 /* 664 /*
625 * In case of non-hotplug cpu, if the ring-buffer is allocated 665 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -673,7 +713,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
673 kfree(buffer); 713 kfree(buffer);
674 return NULL; 714 return NULL;
675} 715}
676EXPORT_SYMBOL_GPL(ring_buffer_alloc); 716EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
677 717
678/** 718/**
679 * ring_buffer_free - free a ring buffer. 719 * ring_buffer_free - free a ring buffer.
@@ -695,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer)
695 735
696 put_online_cpus(); 736 put_online_cpus();
697 737
738 kfree(buffer->buffers);
698 free_cpumask_var(buffer->cpumask); 739 free_cpumask_var(buffer->cpumask);
699 740
700 kfree(buffer); 741 kfree(buffer);
@@ -947,31 +988,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
947 return rb_page_commit(cpu_buffer->head_page); 988 return rb_page_commit(cpu_buffer->head_page);
948} 989}
949 990
950/*
951 * When the tail hits the head and the buffer is in overwrite mode,
952 * the head jumps to the next page and all content on the previous
953 * page is discarded. But before doing so, we update the overrun
954 * variable of the buffer.
955 */
956static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
957{
958 struct ring_buffer_event *event;
959 unsigned long head;
960
961 for (head = 0; head < rb_head_size(cpu_buffer);
962 head += rb_event_length(event)) {
963
964 event = __rb_page_index(cpu_buffer->head_page, head);
965 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
966 return;
967 /* Only count data entries */
968 if (event->type != RINGBUF_TYPE_DATA)
969 continue;
970 cpu_buffer->overrun++;
971 cpu_buffer->entries--;
972 }
973}
974
975static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, 991static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
976 struct buffer_page **bpage) 992 struct buffer_page **bpage)
977{ 993{
@@ -988,12 +1004,12 @@ rb_event_index(struct ring_buffer_event *event)
988{ 1004{
989 unsigned long addr = (unsigned long)event; 1005 unsigned long addr = (unsigned long)event;
990 1006
991 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 1007 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
992} 1008}
993 1009
994static int 1010static inline int
995rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 1011rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
996 struct ring_buffer_event *event) 1012 struct ring_buffer_event *event)
997{ 1013{
998 unsigned long addr = (unsigned long)event; 1014 unsigned long addr = (unsigned long)event;
999 unsigned long index; 1015 unsigned long index;
@@ -1006,31 +1022,6 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1006} 1022}
1007 1023
1008static void 1024static void
1009rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
1010 struct ring_buffer_event *event)
1011{
1012 unsigned long addr = (unsigned long)event;
1013 unsigned long index;
1014
1015 index = rb_event_index(event);
1016 addr &= PAGE_MASK;
1017
1018 while (cpu_buffer->commit_page->page != (void *)addr) {
1019 if (RB_WARN_ON(cpu_buffer,
1020 cpu_buffer->commit_page == cpu_buffer->tail_page))
1021 return;
1022 cpu_buffer->commit_page->page->commit =
1023 cpu_buffer->commit_page->write;
1024 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1025 cpu_buffer->write_stamp =
1026 cpu_buffer->commit_page->page->time_stamp;
1027 }
1028
1029 /* Now set the commit to the event's index */
1030 local_set(&cpu_buffer->commit_page->page->commit, index);
1031}
1032
1033static void
1034rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1025rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1035{ 1026{
1036 /* 1027 /*
@@ -1110,28 +1101,21 @@ static void
1110rb_update_event(struct ring_buffer_event *event, 1101rb_update_event(struct ring_buffer_event *event,
1111 unsigned type, unsigned length) 1102 unsigned type, unsigned length)
1112{ 1103{
1113 event->type = type; 1104 event->type_len = type;
1114 1105
1115 switch (type) { 1106 switch (type) {
1116 1107
1117 case RINGBUF_TYPE_PADDING: 1108 case RINGBUF_TYPE_PADDING:
1118 break;
1119
1120 case RINGBUF_TYPE_TIME_EXTEND: 1109 case RINGBUF_TYPE_TIME_EXTEND:
1121 event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
1122 break;
1123
1124 case RINGBUF_TYPE_TIME_STAMP: 1110 case RINGBUF_TYPE_TIME_STAMP:
1125 event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
1126 break; 1111 break;
1127 1112
1128 case RINGBUF_TYPE_DATA: 1113 case 0:
1129 length -= RB_EVNT_HDR_SIZE; 1114 length -= RB_EVNT_HDR_SIZE;
1130 if (length > RB_MAX_SMALL_DATA) { 1115 if (length > RB_MAX_SMALL_DATA)
1131 event->len = 0;
1132 event->array[0] = length; 1116 event->array[0] = length;
1133 } else 1117 else
1134 event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1118 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1135 break; 1119 break;
1136 default: 1120 default:
1137 BUG(); 1121 BUG();
@@ -1155,158 +1139,241 @@ static unsigned rb_calculate_event_length(unsigned length)
1155 return length; 1139 return length;
1156} 1140}
1157 1141
1158static struct ring_buffer_event * 1142static inline void
1159__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1143rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1160 unsigned type, unsigned long length, u64 *ts) 1144 struct buffer_page *tail_page,
1145 unsigned long tail, unsigned long length)
1161{ 1146{
1162 struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
1163 unsigned long tail, write;
1164 struct ring_buffer *buffer = cpu_buffer->buffer;
1165 struct ring_buffer_event *event; 1147 struct ring_buffer_event *event;
1166 unsigned long flags;
1167 bool lock_taken = false;
1168 1148
1169 commit_page = cpu_buffer->commit_page; 1149 /*
1170 /* we just need to protect against interrupts */ 1150 * Only the event that crossed the page boundary
1171 barrier(); 1151 * must fill the old tail_page with padding.
1172 tail_page = cpu_buffer->tail_page; 1152 */
1173 write = local_add_return(length, &tail_page->write); 1153 if (tail >= BUF_PAGE_SIZE) {
1174 tail = write - length; 1154 local_sub(length, &tail_page->write);
1155 return;
1156 }
1175 1157
 1176	/* See if we shot past the end of this buffer page */	 1158	event = __rb_page_index(tail_page, tail);
1177 if (write > BUF_PAGE_SIZE) { 1159 kmemcheck_annotate_bitfield(event, bitfield);
1178 struct buffer_page *next_page = tail_page;
1179 1160
1180 local_irq_save(flags); 1161 /*
1181 /* 1162 * If this event is bigger than the minimum size, then
1182 * Since the write to the buffer is still not 1163 * we need to be careful that we don't subtract the
1183 * fully lockless, we must be careful with NMIs. 1164 * write counter enough to allow another writer to slip
1184 * The locks in the writers are taken when a write 1165 * in on this page.
1185 * crosses to a new page. The locks protect against 1166 * We put in a discarded commit instead, to make sure
1186 * races with the readers (this will soon be fixed 1167 * that this space is not used again.
1187 * with a lockless solution). 1168 *
1188 * 1169 * If we are less than the minimum size, we don't need to
1189 * Because we can not protect against NMIs, and we 1170 * worry about it.
1190 * want to keep traces reentrant, we need to manage 1171 */
1191 * what happens when we are in an NMI. 1172 if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
1192 * 1173 /* No room for any events */
1193 * NMIs can happen after we take the lock.
1194 * If we are in an NMI, only take the lock
1195 * if it is not already taken. Otherwise
1196 * simply fail.
1197 */
1198 if (unlikely(in_nmi())) {
1199 if (!__raw_spin_trylock(&cpu_buffer->lock))
1200 goto out_reset;
1201 } else
1202 __raw_spin_lock(&cpu_buffer->lock);
1203 1174
1204 lock_taken = true; 1175 /* Mark the rest of the page with padding */
1176 rb_event_set_padding(event);
1205 1177
1206 rb_inc_page(cpu_buffer, &next_page); 1178 /* Set the write back to the previous setting */
1179 local_sub(length, &tail_page->write);
1180 return;
1181 }
1207 1182
1208 head_page = cpu_buffer->head_page; 1183 /* Put in a discarded event */
1209 reader_page = cpu_buffer->reader_page; 1184 event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
1185 event->type_len = RINGBUF_TYPE_PADDING;
1186 /* time delta must be non zero */
1187 event->time_delta = 1;
1188 /* Account for this as an entry */
1189 local_inc(&tail_page->entries);
1190 local_inc(&cpu_buffer->entries);
1210 1191
1211 /* we grabbed the lock before incrementing */ 1192 /* Set write to end of buffer */
1212 if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) 1193 length = (tail + length) - BUF_PAGE_SIZE;
1213 goto out_reset; 1194 local_sub(length, &tail_page->write);
1195}
1214 1196
1215 /* 1197static struct ring_buffer_event *
1216 * If for some reason, we had an interrupt storm that made 1198rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1217 * it all the way around the buffer, bail, and warn 1199 unsigned long length, unsigned long tail,
1218 * about it. 1200 struct buffer_page *commit_page,
1219 */ 1201 struct buffer_page *tail_page, u64 *ts)
1220 if (unlikely(next_page == commit_page)) { 1202{
1221 WARN_ON_ONCE(1); 1203 struct buffer_page *next_page, *head_page, *reader_page;
1204 struct ring_buffer *buffer = cpu_buffer->buffer;
1205 bool lock_taken = false;
1206 unsigned long flags;
1207
1208 next_page = tail_page;
1209
1210 local_irq_save(flags);
1211 /*
1212 * Since the write to the buffer is still not
1213 * fully lockless, we must be careful with NMIs.
1214 * The locks in the writers are taken when a write
1215 * crosses to a new page. The locks protect against
1216 * races with the readers (this will soon be fixed
1217 * with a lockless solution).
1218 *
1219 * Because we can not protect against NMIs, and we
1220 * want to keep traces reentrant, we need to manage
1221 * what happens when we are in an NMI.
1222 *
1223 * NMIs can happen after we take the lock.
1224 * If we are in an NMI, only take the lock
1225 * if it is not already taken. Otherwise
1226 * simply fail.
1227 */
1228 if (unlikely(in_nmi())) {
1229 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1230 cpu_buffer->nmi_dropped++;
1222 goto out_reset; 1231 goto out_reset;
1223 } 1232 }
1233 } else
1234 __raw_spin_lock(&cpu_buffer->lock);
1224 1235
1225 if (next_page == head_page) { 1236 lock_taken = true;
1226 if (!(buffer->flags & RB_FL_OVERWRITE))
1227 goto out_reset;
1228 1237
1229 /* tail_page has not moved yet? */ 1238 rb_inc_page(cpu_buffer, &next_page);
1230 if (tail_page == cpu_buffer->tail_page) {
1231 /* count overflows */
1232 rb_update_overflow(cpu_buffer);
1233 1239
1234 rb_inc_page(cpu_buffer, &head_page); 1240 head_page = cpu_buffer->head_page;
1235 cpu_buffer->head_page = head_page; 1241 reader_page = cpu_buffer->reader_page;
1236 cpu_buffer->head_page->read = 0;
1237 }
1238 }
1239 1242
1240 /* 1243 /* we grabbed the lock before incrementing */
1241 * If the tail page is still the same as what we think 1244 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1242 * it is, then it is up to us to update the tail 1245 goto out_reset;
1243 * pointer. 1246
1244 */ 1247 /*
1248 * If for some reason, we had an interrupt storm that made
1249 * it all the way around the buffer, bail, and warn
1250 * about it.
1251 */
1252 if (unlikely(next_page == commit_page)) {
1253 cpu_buffer->commit_overrun++;
1254 goto out_reset;
1255 }
1256
1257 if (next_page == head_page) {
1258 if (!(buffer->flags & RB_FL_OVERWRITE))
1259 goto out_reset;
1260
1261 /* tail_page has not moved yet? */
1245 if (tail_page == cpu_buffer->tail_page) { 1262 if (tail_page == cpu_buffer->tail_page) {
1246 local_set(&next_page->write, 0); 1263 /* count overflows */
1247 local_set(&next_page->page->commit, 0); 1264 cpu_buffer->overrun +=
1248 cpu_buffer->tail_page = next_page; 1265 local_read(&head_page->entries);
1249 1266
1250 /* reread the time stamp */ 1267 rb_inc_page(cpu_buffer, &head_page);
1251 *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); 1268 cpu_buffer->head_page = head_page;
1252 cpu_buffer->tail_page->page->time_stamp = *ts; 1269 cpu_buffer->head_page->read = 0;
1253 } 1270 }
1271 }
1254 1272
1255 /* 1273 /*
1256 * The actual tail page has moved forward. 1274 * If the tail page is still the same as what we think
1257 */ 1275 * it is, then it is up to us to update the tail
1258 if (tail < BUF_PAGE_SIZE) { 1276 * pointer.
1259 /* Mark the rest of the page with padding */ 1277 */
1260 event = __rb_page_index(tail_page, tail); 1278 if (tail_page == cpu_buffer->tail_page) {
1261 rb_event_set_padding(event); 1279 local_set(&next_page->write, 0);
1262 } 1280 local_set(&next_page->entries, 0);
1281 local_set(&next_page->page->commit, 0);
1282 cpu_buffer->tail_page = next_page;
1283
1284 /* reread the time stamp */
1285 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1286 cpu_buffer->tail_page->page->time_stamp = *ts;
1287 }
1263 1288
1264 if (tail <= BUF_PAGE_SIZE) 1289 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1265 /* Set the write back to the previous setting */
1266 local_set(&tail_page->write, tail);
1267 1290
1268 /* 1291 __raw_spin_unlock(&cpu_buffer->lock);
1269 * If this was a commit entry that failed, 1292 local_irq_restore(flags);
1270 * increment that too 1293
1271 */ 1294 /* fail and let the caller try again */
1272 if (tail_page == cpu_buffer->commit_page && 1295 return ERR_PTR(-EAGAIN);
1273 tail == rb_commit_index(cpu_buffer)) { 1296
1274 rb_set_commit_to_write(cpu_buffer); 1297 out_reset:
1275 } 1298 /* reset write */
1299 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1276 1300
1301 if (likely(lock_taken))
1277 __raw_spin_unlock(&cpu_buffer->lock); 1302 __raw_spin_unlock(&cpu_buffer->lock);
1278 local_irq_restore(flags); 1303 local_irq_restore(flags);
1304 return NULL;
1305}
1279 1306
1280 /* fail and let the caller try again */ 1307static struct ring_buffer_event *
1281 return ERR_PTR(-EAGAIN); 1308__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1282 } 1309 unsigned type, unsigned long length, u64 *ts)
1310{
1311 struct buffer_page *tail_page, *commit_page;
1312 struct ring_buffer_event *event;
1313 unsigned long tail, write;
1283 1314
1284 /* We reserved something on the buffer */ 1315 commit_page = cpu_buffer->commit_page;
1316 /* we just need to protect against interrupts */
1317 barrier();
1318 tail_page = cpu_buffer->tail_page;
1319 write = local_add_return(length, &tail_page->write);
1320 tail = write - length;
1285 1321
1286 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) 1322 /* See if we shot pass the end of this buffer page */
1287 return NULL; 1323 if (write > BUF_PAGE_SIZE)
1324 return rb_move_tail(cpu_buffer, length, tail,
1325 commit_page, tail_page, ts);
1326
1327 /* We reserved something on the buffer */
1288 1328
1289 event = __rb_page_index(tail_page, tail); 1329 event = __rb_page_index(tail_page, tail);
1330 kmemcheck_annotate_bitfield(event, bitfield);
1290 rb_update_event(event, type, length); 1331 rb_update_event(event, type, length);
1291 1332
1333 /* The passed in type is zero for DATA */
1334 if (likely(!type))
1335 local_inc(&tail_page->entries);
1336
1292 /* 1337 /*
1293 * If this is a commit and the tail is zero, then update 1338 * If this is the first commit on the page, then update
1294 * this page's time stamp. 1339 * its timestamp.
1295 */ 1340 */
1296 if (!tail && rb_is_commit(cpu_buffer, event)) 1341 if (!tail)
1297 cpu_buffer->commit_page->page->time_stamp = *ts; 1342 tail_page->page->time_stamp = *ts;
1298 1343
1299 return event; 1344 return event;
1345}
1300 1346
1301 out_reset: 1347static inline int
1302 /* reset write */ 1348rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1303 if (tail <= BUF_PAGE_SIZE) 1349 struct ring_buffer_event *event)
1304 local_set(&tail_page->write, tail); 1350{
1351 unsigned long new_index, old_index;
1352 struct buffer_page *bpage;
1353 unsigned long index;
1354 unsigned long addr;
1305 1355
1306 if (likely(lock_taken)) 1356 new_index = rb_event_index(event);
1307 __raw_spin_unlock(&cpu_buffer->lock); 1357 old_index = new_index + rb_event_length(event);
1308 local_irq_restore(flags); 1358 addr = (unsigned long)event;
1309 return NULL; 1359 addr &= PAGE_MASK;
1360
1361 bpage = cpu_buffer->tail_page;
1362
1363 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1364 /*
1365 * This is on the tail page. It is possible that
1366 * a write could come in and move the tail page
1367 * and write to the next page. That is fine
1368 * because we just shorten what is on this page.
1369 */
1370 index = local_cmpxchg(&bpage->write, old_index, new_index);
1371 if (index == old_index)
1372 return 1;
1373 }
1374
1375 /* could not discard */
1376 return 0;
1310} 1377}
1311 1378
1312static int 1379static int
@@ -1341,26 +1408,33 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1341 return -EAGAIN; 1408 return -EAGAIN;
1342 1409
 1343	/* Only a committed time event can update the write stamp */	 1410	/* Only a committed time event can update the write stamp */
1344 if (rb_is_commit(cpu_buffer, event)) { 1411 if (rb_event_is_commit(cpu_buffer, event)) {
1345 /* 1412 /*
1346 * If this is the first on the page, then we need to 1413 * If this is the first on the page, then it was
1347 * update the page itself, and just put in a zero. 1414 * updated with the page itself. Try to discard it
1415 * and if we can't just make it zero.
1348 */ 1416 */
1349 if (rb_event_index(event)) { 1417 if (rb_event_index(event)) {
1350 event->time_delta = *delta & TS_MASK; 1418 event->time_delta = *delta & TS_MASK;
1351 event->array[0] = *delta >> TS_SHIFT; 1419 event->array[0] = *delta >> TS_SHIFT;
1352 } else { 1420 } else {
1353 cpu_buffer->commit_page->page->time_stamp = *ts; 1421 /* try to discard, since we do not need this */
1354 event->time_delta = 0; 1422 if (!rb_try_to_discard(cpu_buffer, event)) {
1355 event->array[0] = 0; 1423 /* nope, just zero it */
1424 event->time_delta = 0;
1425 event->array[0] = 0;
1426 }
1356 } 1427 }
1357 cpu_buffer->write_stamp = *ts; 1428 cpu_buffer->write_stamp = *ts;
1358 /* let the caller know this was the commit */ 1429 /* let the caller know this was the commit */
1359 ret = 1; 1430 ret = 1;
1360 } else { 1431 } else {
1361 /* Darn, this is just wasted space */ 1432 /* Try to discard the event */
1362 event->time_delta = 0; 1433 if (!rb_try_to_discard(cpu_buffer, event)) {
1363 event->array[0] = 0; 1434 /* Darn, this is just wasted space */
1435 event->time_delta = 0;
1436 event->array[0] = 0;
1437 }
1364 ret = 0; 1438 ret = 0;
1365 } 1439 }
1366 1440
@@ -1369,15 +1443,56 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1369 return ret; 1443 return ret;
1370} 1444}
1371 1445
1446static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
1447{
1448 local_inc(&cpu_buffer->committing);
1449 local_inc(&cpu_buffer->commits);
1450}
1451
1452static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
1453{
1454 unsigned long commits;
1455
1456 if (RB_WARN_ON(cpu_buffer,
1457 !local_read(&cpu_buffer->committing)))
1458 return;
1459
1460 again:
1461 commits = local_read(&cpu_buffer->commits);
1462 /* synchronize with interrupts */
1463 barrier();
1464 if (local_read(&cpu_buffer->committing) == 1)
1465 rb_set_commit_to_write(cpu_buffer);
1466
1467 local_dec(&cpu_buffer->committing);
1468
1469 /* synchronize with interrupts */
1470 barrier();
1471
1472 /*
1473 * Need to account for interrupts coming in between the
1474 * updating of the commit page and the clearing of the
1475 * committing counter.
1476 */
1477 if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
1478 !local_read(&cpu_buffer->committing)) {
1479 local_inc(&cpu_buffer->committing);
1480 goto again;
1481 }
1482}
1483
1372static struct ring_buffer_event * 1484static struct ring_buffer_event *
1373rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 1485rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1374 unsigned type, unsigned long length) 1486 unsigned long length)
1375{ 1487{
1376 struct ring_buffer_event *event; 1488 struct ring_buffer_event *event;
1377 u64 ts, delta; 1489 u64 ts, delta = 0;
1378 int commit = 0; 1490 int commit = 0;
1379 int nr_loops = 0; 1491 int nr_loops = 0;
1380 1492
1493 rb_start_commit(cpu_buffer);
1494
1495 length = rb_calculate_event_length(length);
1381 again: 1496 again:
1382 /* 1497 /*
1383 * We allow for interrupts to reenter here and do a trace. 1498 * We allow for interrupts to reenter here and do a trace.
@@ -1389,9 +1504,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1389 * Bail! 1504 * Bail!
1390 */ 1505 */
1391 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 1506 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1392 return NULL; 1507 goto out_fail;
1393 1508
1394 ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 1509 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1395 1510
1396 /* 1511 /*
1397 * Only the first commit can update the timestamp. 1512 * Only the first commit can update the timestamp.
@@ -1401,63 +1516,93 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1401 * also be made. But only the entry that did the actual 1516 * also be made. But only the entry that did the actual
1402 * commit will be something other than zero. 1517 * commit will be something other than zero.
1403 */ 1518 */
1404 if (cpu_buffer->tail_page == cpu_buffer->commit_page && 1519 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
1405 rb_page_write(cpu_buffer->tail_page) == 1520 rb_page_write(cpu_buffer->tail_page) ==
1406 rb_commit_index(cpu_buffer)) { 1521 rb_commit_index(cpu_buffer))) {
1522 u64 diff;
1407 1523
1408 delta = ts - cpu_buffer->write_stamp; 1524 diff = ts - cpu_buffer->write_stamp;
1409 1525
1410 /* make sure this delta is calculated here */ 1526 /* make sure this diff is calculated here */
1411 barrier(); 1527 barrier();
1412 1528
1413 /* Did the write stamp get updated already? */ 1529 /* Did the write stamp get updated already? */
1414 if (unlikely(ts < cpu_buffer->write_stamp)) 1530 if (unlikely(ts < cpu_buffer->write_stamp))
1415 delta = 0; 1531 goto get_event;
1416 1532
1417 if (test_time_stamp(delta)) { 1533 delta = diff;
1534 if (unlikely(test_time_stamp(delta))) {
1418 1535
1419 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 1536 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1420
1421 if (commit == -EBUSY) 1537 if (commit == -EBUSY)
1422 return NULL; 1538 goto out_fail;
1423 1539
1424 if (commit == -EAGAIN) 1540 if (commit == -EAGAIN)
1425 goto again; 1541 goto again;
1426 1542
1427 RB_WARN_ON(cpu_buffer, commit < 0); 1543 RB_WARN_ON(cpu_buffer, commit < 0);
1428 } 1544 }
1429 } else 1545 }
1430 /* Non commits have zero deltas */
1431 delta = 0;
1432 1546
1433 event = __rb_reserve_next(cpu_buffer, type, length, &ts); 1547 get_event:
1434 if (PTR_ERR(event) == -EAGAIN) 1548 event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
1549 if (unlikely(PTR_ERR(event) == -EAGAIN))
1435 goto again; 1550 goto again;
1436 1551
1437 if (!event) { 1552 if (!event)
1438 if (unlikely(commit)) 1553 goto out_fail;
1439 /*
1440 * Ouch! We needed a timestamp and it was commited. But
1441 * we didn't get our event reserved.
1442 */
1443 rb_set_commit_to_write(cpu_buffer);
1444 return NULL;
1445 }
1446 1554
1447 /* 1555 if (!rb_event_is_commit(cpu_buffer, event))
1448 * If the timestamp was commited, make the commit our entry
1449 * now so that we will update it when needed.
1450 */
1451 if (commit)
1452 rb_set_commit_event(cpu_buffer, event);
1453 else if (!rb_is_commit(cpu_buffer, event))
1454 delta = 0; 1556 delta = 0;
1455 1557
1456 event->time_delta = delta; 1558 event->time_delta = delta;
1457 1559
1458 return event; 1560 return event;
1561
1562 out_fail:
1563 rb_end_commit(cpu_buffer);
1564 return NULL;
1565}
1566
1567#ifdef CONFIG_TRACING
1568
1569#define TRACE_RECURSIVE_DEPTH 16
1570
1571static int trace_recursive_lock(void)
1572{
1573 current->trace_recursion++;
1574
1575 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
1576 return 0;
1577
1578 /* Disable all tracing before we do anything else */
1579 tracing_off_permanent();
1580
1581 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
1582 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
1583 current->trace_recursion,
1584 hardirq_count() >> HARDIRQ_SHIFT,
1585 softirq_count() >> SOFTIRQ_SHIFT,
1586 in_nmi());
1587
1588 WARN_ON_ONCE(1);
1589 return -1;
1590}
1591
1592static void trace_recursive_unlock(void)
1593{
1594 WARN_ON_ONCE(!current->trace_recursion);
1595
1596 current->trace_recursion--;
1459} 1597}
1460 1598
1599#else
1600
1601#define trace_recursive_lock() (0)
1602#define trace_recursive_unlock() do { } while (0)
1603
1604#endif
1605
1461static DEFINE_PER_CPU(int, rb_need_resched); 1606static DEFINE_PER_CPU(int, rb_need_resched);
1462 1607
1463/** 1608/**
@@ -1491,6 +1636,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1491 /* If we are tracing schedule, we don't want to recurse */ 1636 /* If we are tracing schedule, we don't want to recurse */
1492 resched = ftrace_preempt_disable(); 1637 resched = ftrace_preempt_disable();
1493 1638
1639 if (trace_recursive_lock())
1640 goto out_nocheck;
1641
1494 cpu = raw_smp_processor_id(); 1642 cpu = raw_smp_processor_id();
1495 1643
1496 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 1644 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1501,11 +1649,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1501 if (atomic_read(&cpu_buffer->record_disabled)) 1649 if (atomic_read(&cpu_buffer->record_disabled))
1502 goto out; 1650 goto out;
1503 1651
1504 length = rb_calculate_event_length(length); 1652 if (length > BUF_MAX_DATA_SIZE)
1505 if (length > BUF_PAGE_SIZE)
1506 goto out; 1653 goto out;
1507 1654
1508 event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); 1655 event = rb_reserve_next_event(cpu_buffer, length);
1509 if (!event) 1656 if (!event)
1510 goto out; 1657 goto out;
1511 1658
@@ -1520,6 +1667,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1520 return event; 1667 return event;
1521 1668
1522 out: 1669 out:
1670 trace_recursive_unlock();
1671
1672 out_nocheck:
1523 ftrace_preempt_enable(resched); 1673 ftrace_preempt_enable(resched);
1524 return NULL; 1674 return NULL;
1525} 1675}
@@ -1528,15 +1678,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1528static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 1678static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1529 struct ring_buffer_event *event) 1679 struct ring_buffer_event *event)
1530{ 1680{
1531 cpu_buffer->entries++; 1681 local_inc(&cpu_buffer->entries);
1532 1682
1533 /* Only process further if we own the commit */ 1683 /*
1534 if (!rb_is_commit(cpu_buffer, event)) 1684 * The event first in the commit queue updates the
1535 return; 1685 * time stamp.
1536 1686 */
1537 cpu_buffer->write_stamp += event->time_delta; 1687 if (rb_event_is_commit(cpu_buffer, event))
1688 cpu_buffer->write_stamp += event->time_delta;
1538 1689
1539 rb_set_commit_to_write(cpu_buffer); 1690 rb_end_commit(cpu_buffer);
1540} 1691}
1541 1692
1542/** 1693/**
@@ -1558,6 +1709,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1558 1709
1559 rb_commit(cpu_buffer, event); 1710 rb_commit(cpu_buffer, event);
1560 1711
1712 trace_recursive_unlock();
1713
1561 /* 1714 /*
1562 * Only the last preempt count needs to restore preemption. 1715 * Only the last preempt count needs to restore preemption.
1563 */ 1716 */
@@ -1570,6 +1723,93 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1570} 1723}
1571EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); 1724EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
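/*
 * Illustrative sketch (not part of this patch): the reserve/commit
 * sequence a writer is expected to follow with the API above.  The
 * struct my_entry payload and the write_sample() wrapper are hypothetical;
 * the ring buffer calls are the ones defined in this file (compare
 * ring_buffer_producer() in ring_buffer_benchmark.c further down).
 */
#include <linux/ring_buffer.h>

struct my_entry {
	int		cpu;
	unsigned long	value;
};

static int write_sample(struct ring_buffer *buffer, unsigned long value)
{
	struct ring_buffer_event *event;
	struct my_entry *entry;

	/* May return NULL if the buffer is full or recording is disabled */
	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return -EBUSY;

	entry = ring_buffer_event_data(event);
	entry->cpu = smp_processor_id();
	entry->value = value;

	/* Pairs with the reserve above; preemption is re-enabled here */
	return ring_buffer_unlock_commit(buffer, event);
}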
1572 1725
1726static inline void rb_event_discard(struct ring_buffer_event *event)
1727{
1728 /* array[0] holds the actual length for the discarded event */
1729 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
1730 event->type_len = RINGBUF_TYPE_PADDING;
1731 /* time delta must be non zero */
1732 if (!event->time_delta)
1733 event->time_delta = 1;
1734}
1735
1736/**
1737 * ring_buffer_event_discard - discard any event in the ring buffer
1738 * @event: the event to discard
1739 *
 1740	 * Sometimes an event that is in the ring buffer needs to be ignored.
1741 * This function lets the user discard an event in the ring buffer
1742 * and then that event will not be read later.
1743 *
1744 * Note, it is up to the user to be careful with this, and protect
 1745	 * against races. If the user discards an event that has already been consumed,
 1746	 * it is possible that it could corrupt the ring buffer.
1747 */
1748void ring_buffer_event_discard(struct ring_buffer_event *event)
1749{
1750 rb_event_discard(event);
1751}
1752EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
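/*
 * Illustrative sketch (not part of this patch): using
 * ring_buffer_event_discard() on an event we reserved but decided not to
 * report.  The event is turned into padding but is still committed, so the
 * reserve/commit pairing stays balanced.  It reuses the hypothetical
 * struct my_entry from the sketch above; filter_out() and fill_entry()
 * are hypothetical helpers as well.
 */
static void write_maybe_filtered(struct ring_buffer *buffer)
{
	struct ring_buffer_event *event;
	struct my_entry *entry;

	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	fill_entry(entry);

	if (filter_out(entry))
		ring_buffer_event_discard(event);	/* becomes padding */

	ring_buffer_unlock_commit(buffer, event);
}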
1753
1754/**
1755 * ring_buffer_commit_discard - discard an event that has not been committed
1756 * @buffer: the ring buffer
1757 * @event: non committed event to discard
1758 *
1759 * This is similar to ring_buffer_event_discard but must only be
1760 * performed on an event that has not been committed yet. The difference
1761 * is that this will also try to free the event from the ring buffer
1762 * if another event has not been added behind it.
1763 *
1764 * If another event has been added behind it, it will set the event
1765 * up as discarded, and perform the commit.
1766 *
1767 * If this function is called, do not call ring_buffer_unlock_commit on
1768 * the event.
1769 */
1770void ring_buffer_discard_commit(struct ring_buffer *buffer,
1771 struct ring_buffer_event *event)
1772{
1773 struct ring_buffer_per_cpu *cpu_buffer;
1774 int cpu;
1775
1776 /* The event is discarded regardless */
1777 rb_event_discard(event);
1778
1779 cpu = smp_processor_id();
1780 cpu_buffer = buffer->buffers[cpu];
1781
1782 /*
1783 * This must only be called if the event has not been
1784 * committed yet. Thus we can assume that preemption
1785 * is still disabled.
1786 */
1787 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1788
1789 if (rb_try_to_discard(cpu_buffer, event))
1790 goto out;
1791
1792 /*
1793 * The commit is still visible by the reader, so we
1794 * must increment entries.
1795 */
1796 local_inc(&cpu_buffer->entries);
1797 out:
1798 rb_end_commit(cpu_buffer);
1799
1800 trace_recursive_unlock();
1801
1802 /*
1803 * Only the last preempt count needs to restore preemption.
1804 */
1805 if (preempt_count() == 1)
1806 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
1807 else
1808 preempt_enable_no_resched_notrace();
1809
1810}
1811EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
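/*
 * Illustrative sketch (not part of this patch): the same flow as the
 * previous sketch, but dropping the event with ring_buffer_discard_commit()
 * instead of committing it.  As the comment above notes,
 * ring_buffer_unlock_commit() must then not be called on that event.
 * filter_out() and fill_entry() are again hypothetical helpers.
 */
static void write_or_drop(struct ring_buffer *buffer)
{
	struct ring_buffer_event *event;
	struct my_entry *entry;

	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	fill_entry(entry);

	if (filter_out(entry))
		/* frees the space if nothing was written behind it */
		ring_buffer_discard_commit(buffer, event);
	else
		ring_buffer_unlock_commit(buffer, event);
}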
1812
1573/** 1813/**
1574 * ring_buffer_write - write data to the buffer without reserving 1814 * ring_buffer_write - write data to the buffer without reserving
1575 * @buffer: The ring buffer to write to. 1815 * @buffer: The ring buffer to write to.
@@ -1589,7 +1829,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
1589{ 1829{
1590 struct ring_buffer_per_cpu *cpu_buffer; 1830 struct ring_buffer_per_cpu *cpu_buffer;
1591 struct ring_buffer_event *event; 1831 struct ring_buffer_event *event;
1592 unsigned long event_length;
1593 void *body; 1832 void *body;
1594 int ret = -EBUSY; 1833 int ret = -EBUSY;
1595 int cpu, resched; 1834 int cpu, resched;
@@ -1612,9 +1851,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
1612 if (atomic_read(&cpu_buffer->record_disabled)) 1851 if (atomic_read(&cpu_buffer->record_disabled))
1613 goto out; 1852 goto out;
1614 1853
1615 event_length = rb_calculate_event_length(length); 1854 if (length > BUF_MAX_DATA_SIZE)
1616 event = rb_reserve_next_event(cpu_buffer, 1855 goto out;
1617 RINGBUF_TYPE_DATA, event_length); 1856
1857 event = rb_reserve_next_event(cpu_buffer, length);
1618 if (!event) 1858 if (!event)
1619 goto out; 1859 goto out;
1620 1860
@@ -1728,7 +1968,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1728 return 0; 1968 return 0;
1729 1969
1730 cpu_buffer = buffer->buffers[cpu]; 1970 cpu_buffer = buffer->buffers[cpu];
1731 ret = cpu_buffer->entries; 1971 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
1972 - cpu_buffer->read;
1732 1973
1733 return ret; 1974 return ret;
1734} 1975}
@@ -1755,6 +1996,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1755EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 1996EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1756 1997
1757/** 1998/**
1999 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
2000 * @buffer: The ring buffer
 2001 * @cpu: The per CPU buffer to get the number of dropped NMI writes from
2002 */
2003unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
2004{
2005 struct ring_buffer_per_cpu *cpu_buffer;
2006 unsigned long ret;
2007
2008 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2009 return 0;
2010
2011 cpu_buffer = buffer->buffers[cpu];
2012 ret = cpu_buffer->nmi_dropped;
2013
2014 return ret;
2015}
2016EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
2017
2018/**
2019 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
2020 * @buffer: The ring buffer
 2021 * @cpu: The per CPU buffer to get the number of commit overruns from
2022 */
2023unsigned long
2024ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
2025{
2026 struct ring_buffer_per_cpu *cpu_buffer;
2027 unsigned long ret;
2028
2029 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2030 return 0;
2031
2032 cpu_buffer = buffer->buffers[cpu];
2033 ret = cpu_buffer->commit_overrun;
2034
2035 return ret;
2036}
2037EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
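/*
 * Illustrative sketch (not part of this patch): sampling the new per-cpu
 * counters exported above next to the existing entry/overrun counters.
 * report_cpu_stats() and the pr_info() format are hypothetical; the
 * accessors are the ones defined in this file.
 */
static void report_cpu_stats(struct ring_buffer *buffer, int cpu)
{
	pr_info("cpu%d: entries=%lu overrun=%lu commit_overrun=%lu nmi_dropped=%lu\n",
		cpu,
		ring_buffer_entries_cpu(buffer, cpu),
		ring_buffer_overrun_cpu(buffer, cpu),
		ring_buffer_commit_overrun_cpu(buffer, cpu),
		ring_buffer_nmi_dropped_cpu(buffer, cpu));
}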
2038
2039/**
1758 * ring_buffer_entries - get the number of entries in a buffer 2040 * ring_buffer_entries - get the number of entries in a buffer
1759 * @buffer: The ring buffer 2041 * @buffer: The ring buffer
1760 * 2042 *
@@ -1770,7 +2052,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
1770 /* if you care about this being correct, lock the buffer */ 2052 /* if you care about this being correct, lock the buffer */
1771 for_each_buffer_cpu(buffer, cpu) { 2053 for_each_buffer_cpu(buffer, cpu) {
1772 cpu_buffer = buffer->buffers[cpu]; 2054 cpu_buffer = buffer->buffers[cpu];
1773 entries += cpu_buffer->entries; 2055 entries += (local_read(&cpu_buffer->entries) -
2056 cpu_buffer->overrun) - cpu_buffer->read;
1774 } 2057 }
1775 2058
1776 return entries; 2059 return entries;
@@ -1862,7 +2145,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1862{ 2145{
1863 u64 delta; 2146 u64 delta;
1864 2147
1865 switch (event->type) { 2148 switch (event->type_len) {
1866 case RINGBUF_TYPE_PADDING: 2149 case RINGBUF_TYPE_PADDING:
1867 return; 2150 return;
1868 2151
@@ -1893,7 +2176,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
1893{ 2176{
1894 u64 delta; 2177 u64 delta;
1895 2178
1896 switch (event->type) { 2179 switch (event->type_len) {
1897 case RINGBUF_TYPE_PADDING: 2180 case RINGBUF_TYPE_PADDING:
1898 return; 2181 return;
1899 2182
@@ -1966,6 +2249,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1966 cpu_buffer->reader_page->list.prev = reader->list.prev; 2249 cpu_buffer->reader_page->list.prev = reader->list.prev;
1967 2250
1968 local_set(&cpu_buffer->reader_page->write, 0); 2251 local_set(&cpu_buffer->reader_page->write, 0);
2252 local_set(&cpu_buffer->reader_page->entries, 0);
1969 local_set(&cpu_buffer->reader_page->page->commit, 0); 2253 local_set(&cpu_buffer->reader_page->page->commit, 0);
1970 2254
1971 /* Make the reader page now replace the head */ 2255 /* Make the reader page now replace the head */
@@ -2008,8 +2292,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2008 2292
2009 event = rb_reader_event(cpu_buffer); 2293 event = rb_reader_event(cpu_buffer);
2010 2294
2011 if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) 2295 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
2012 cpu_buffer->entries--; 2296 || rb_discarded_event(event))
2297 cpu_buffer->read++;
2013 2298
2014 rb_update_read_stamp(cpu_buffer, event); 2299 rb_update_read_stamp(cpu_buffer, event);
2015 2300
@@ -2031,8 +2316,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2031 * Check if we are at the end of the buffer. 2316 * Check if we are at the end of the buffer.
2032 */ 2317 */
2033 if (iter->head >= rb_page_size(iter->head_page)) { 2318 if (iter->head >= rb_page_size(iter->head_page)) {
2034 if (RB_WARN_ON(buffer, 2319 /* discarded commits can make the page empty */
2035 iter->head_page == cpu_buffer->commit_page)) 2320 if (iter->head_page == cpu_buffer->commit_page)
2036 return; 2321 return;
2037 rb_inc_iter(iter); 2322 rb_inc_iter(iter);
2038 return; 2323 return;
@@ -2075,12 +2360,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2075 /* 2360 /*
2076 * We repeat when a timestamp is encountered. It is possible 2361 * We repeat when a timestamp is encountered. It is possible
2077 * to get multiple timestamps from an interrupt entering just 2362 * to get multiple timestamps from an interrupt entering just
2078 * as one timestamp is about to be written. The max times 2363 * as one timestamp is about to be written, or from discarded
2079 * that this can happen is the number of nested interrupts we 2364 * commits. The most that we can have is the number on a single page.
2080 * can have. Nesting 10 deep of interrupts is clearly
2081 * an anomaly.
2082 */ 2365 */
2083 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) 2366 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
2084 return NULL; 2367 return NULL;
2085 2368
2086 reader = rb_get_reader_page(cpu_buffer); 2369 reader = rb_get_reader_page(cpu_buffer);
@@ -2089,7 +2372,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2089 2372
2090 event = rb_reader_event(cpu_buffer); 2373 event = rb_reader_event(cpu_buffer);
2091 2374
2092 switch (event->type) { 2375 switch (event->type_len) {
2093 case RINGBUF_TYPE_PADDING: 2376 case RINGBUF_TYPE_PADDING:
2094 if (rb_null_event(event)) 2377 if (rb_null_event(event))
2095 RB_WARN_ON(cpu_buffer, 1); 2378 RB_WARN_ON(cpu_buffer, 1);
@@ -2101,7 +2384,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2101 * the box. Return the padding, and we will release 2384 * the box. Return the padding, and we will release
2102 * the current locks, and try again. 2385 * the current locks, and try again.
2103 */ 2386 */
2104 rb_advance_reader(cpu_buffer);
2105 return event; 2387 return event;
2106 2388
2107 case RINGBUF_TYPE_TIME_EXTEND: 2389 case RINGBUF_TYPE_TIME_EXTEND:
@@ -2146,14 +2428,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2146 2428
2147 again: 2429 again:
2148 /* 2430 /*
2149 * We repeat when a timestamp is encountered. It is possible 2431 * We repeat when a timestamp is encountered.
2150 * to get multiple timestamps from an interrupt entering just 2432 * We can get multiple timestamps by nested interrupts or also
2151 * as one timestamp is about to be written. The max times 2433 * if filtering is on (discarding commits). Since discarding
2152 * that this can happen is the number of nested interrupts we 2434 * commits can be frequent we can get a lot of timestamps.
2153 * can have. Nesting 10 deep of interrupts is clearly 2435 * But we limit them by not adding timestamps if they begin
2154 * an anomaly. 2436 * at the start of a page.
2155 */ 2437 */
2156 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) 2438 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
2157 return NULL; 2439 return NULL;
2158 2440
2159 if (rb_per_cpu_empty(cpu_buffer)) 2441 if (rb_per_cpu_empty(cpu_buffer))
@@ -2161,7 +2443,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2161 2443
2162 event = rb_iter_head_event(iter); 2444 event = rb_iter_head_event(iter);
2163 2445
2164 switch (event->type) { 2446 switch (event->type_len) {
2165 case RINGBUF_TYPE_PADDING: 2447 case RINGBUF_TYPE_PADDING:
2166 if (rb_null_event(event)) { 2448 if (rb_null_event(event)) {
2167 rb_inc_iter(iter); 2449 rb_inc_iter(iter);
@@ -2196,6 +2478,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2196} 2478}
2197EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); 2479EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
2198 2480
2481static inline int rb_ok_to_lock(void)
2482{
2483 /*
2484 * If an NMI die dumps out the content of the ring buffer
2485 * do not grab locks. We also permanently disable the ring
 2486	 * buffer. A one-time deal is all you get from reading
2487 * the ring buffer from an NMI.
2488 */
2489 if (likely(!in_nmi()))
2490 return 1;
2491
2492 tracing_off_permanent();
2493 return 0;
2494}
2495
2199/** 2496/**
2200 * ring_buffer_peek - peek at the next event to be read 2497 * ring_buffer_peek - peek at the next event to be read
2201 * @buffer: The ring buffer to read 2498 * @buffer: The ring buffer to read
@@ -2211,16 +2508,24 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2211 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2508 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2212 struct ring_buffer_event *event; 2509 struct ring_buffer_event *event;
2213 unsigned long flags; 2510 unsigned long flags;
2511 int dolock;
2214 2512
2215 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2513 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2216 return NULL; 2514 return NULL;
2217 2515
2516 dolock = rb_ok_to_lock();
2218 again: 2517 again:
2219 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2518 local_irq_save(flags);
2519 if (dolock)
2520 spin_lock(&cpu_buffer->reader_lock);
2220 event = rb_buffer_peek(buffer, cpu, ts); 2521 event = rb_buffer_peek(buffer, cpu, ts);
2221 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2522 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2523 rb_advance_reader(cpu_buffer);
2524 if (dolock)
2525 spin_unlock(&cpu_buffer->reader_lock);
2526 local_irq_restore(flags);
2222 2527
2223 if (event && event->type == RINGBUF_TYPE_PADDING) { 2528 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2224 cpu_relax(); 2529 cpu_relax();
2225 goto again; 2530 goto again;
2226 } 2531 }
@@ -2248,7 +2553,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2248 event = rb_iter_peek(iter, ts); 2553 event = rb_iter_peek(iter, ts);
2249 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2554 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2250 2555
2251 if (event && event->type == RINGBUF_TYPE_PADDING) { 2556 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2252 cpu_relax(); 2557 cpu_relax();
2253 goto again; 2558 goto again;
2254 } 2559 }
@@ -2270,6 +2575,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2270 struct ring_buffer_per_cpu *cpu_buffer; 2575 struct ring_buffer_per_cpu *cpu_buffer;
2271 struct ring_buffer_event *event = NULL; 2576 struct ring_buffer_event *event = NULL;
2272 unsigned long flags; 2577 unsigned long flags;
2578 int dolock;
2579
2580 dolock = rb_ok_to_lock();
2273 2581
2274 again: 2582 again:
2275 /* might be called in atomic */ 2583 /* might be called in atomic */
@@ -2279,21 +2587,22 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2279 goto out; 2587 goto out;
2280 2588
2281 cpu_buffer = buffer->buffers[cpu]; 2589 cpu_buffer = buffer->buffers[cpu];
2282 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2590 local_irq_save(flags);
2591 if (dolock)
2592 spin_lock(&cpu_buffer->reader_lock);
2283 2593
2284 event = rb_buffer_peek(buffer, cpu, ts); 2594 event = rb_buffer_peek(buffer, cpu, ts);
2285 if (!event) 2595 if (event)
2286 goto out_unlock; 2596 rb_advance_reader(cpu_buffer);
2287
2288 rb_advance_reader(cpu_buffer);
2289 2597
2290 out_unlock: 2598 if (dolock)
2291 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2599 spin_unlock(&cpu_buffer->reader_lock);
2600 local_irq_restore(flags);
2292 2601
2293 out: 2602 out:
2294 preempt_enable(); 2603 preempt_enable();
2295 2604
2296 if (event && event->type == RINGBUF_TYPE_PADDING) { 2605 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2297 cpu_relax(); 2606 cpu_relax();
2298 goto again; 2607 goto again;
2299 } 2608 }
@@ -2386,7 +2695,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2386 out: 2695 out:
2387 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2696 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2388 2697
2389 if (event && event->type == RINGBUF_TYPE_PADDING) { 2698 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2390 cpu_relax(); 2699 cpu_relax();
2391 goto again; 2700 goto again;
2392 } 2701 }
@@ -2411,6 +2720,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2411 cpu_buffer->head_page 2720 cpu_buffer->head_page
2412 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 2721 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
2413 local_set(&cpu_buffer->head_page->write, 0); 2722 local_set(&cpu_buffer->head_page->write, 0);
2723 local_set(&cpu_buffer->head_page->entries, 0);
2414 local_set(&cpu_buffer->head_page->page->commit, 0); 2724 local_set(&cpu_buffer->head_page->page->commit, 0);
2415 2725
2416 cpu_buffer->head_page->read = 0; 2726 cpu_buffer->head_page->read = 0;
@@ -2420,11 +2730,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2420 2730
2421 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 2731 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
2422 local_set(&cpu_buffer->reader_page->write, 0); 2732 local_set(&cpu_buffer->reader_page->write, 0);
2733 local_set(&cpu_buffer->reader_page->entries, 0);
2423 local_set(&cpu_buffer->reader_page->page->commit, 0); 2734 local_set(&cpu_buffer->reader_page->page->commit, 0);
2424 cpu_buffer->reader_page->read = 0; 2735 cpu_buffer->reader_page->read = 0;
2425 2736
2737 cpu_buffer->nmi_dropped = 0;
2738 cpu_buffer->commit_overrun = 0;
2426 cpu_buffer->overrun = 0; 2739 cpu_buffer->overrun = 0;
2427 cpu_buffer->entries = 0; 2740 cpu_buffer->read = 0;
2741 local_set(&cpu_buffer->entries, 0);
2742 local_set(&cpu_buffer->committing, 0);
2743 local_set(&cpu_buffer->commits, 0);
2428 2744
2429 cpu_buffer->write_stamp = 0; 2745 cpu_buffer->write_stamp = 0;
2430 cpu_buffer->read_stamp = 0; 2746 cpu_buffer->read_stamp = 0;
@@ -2443,6 +2759,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2443 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2759 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2444 return; 2760 return;
2445 2761
2762 atomic_inc(&cpu_buffer->record_disabled);
2763
2446 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2764 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2447 2765
2448 __raw_spin_lock(&cpu_buffer->lock); 2766 __raw_spin_lock(&cpu_buffer->lock);
@@ -2452,6 +2770,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2452 __raw_spin_unlock(&cpu_buffer->lock); 2770 __raw_spin_unlock(&cpu_buffer->lock);
2453 2771
2454 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2772 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2773
2774 atomic_dec(&cpu_buffer->record_disabled);
2455} 2775}
2456EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); 2776EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
2457 2777
@@ -2475,12 +2795,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
2475int ring_buffer_empty(struct ring_buffer *buffer) 2795int ring_buffer_empty(struct ring_buffer *buffer)
2476{ 2796{
2477 struct ring_buffer_per_cpu *cpu_buffer; 2797 struct ring_buffer_per_cpu *cpu_buffer;
2798 unsigned long flags;
2799 int dolock;
2478 int cpu; 2800 int cpu;
2801 int ret;
2802
2803 dolock = rb_ok_to_lock();
2479 2804
2480 /* yes this is racy, but if you don't like the race, lock the buffer */ 2805 /* yes this is racy, but if you don't like the race, lock the buffer */
2481 for_each_buffer_cpu(buffer, cpu) { 2806 for_each_buffer_cpu(buffer, cpu) {
2482 cpu_buffer = buffer->buffers[cpu]; 2807 cpu_buffer = buffer->buffers[cpu];
2483 if (!rb_per_cpu_empty(cpu_buffer)) 2808 local_irq_save(flags);
2809 if (dolock)
2810 spin_lock(&cpu_buffer->reader_lock);
2811 ret = rb_per_cpu_empty(cpu_buffer);
2812 if (dolock)
2813 spin_unlock(&cpu_buffer->reader_lock);
2814 local_irq_restore(flags);
2815
2816 if (!ret)
2484 return 0; 2817 return 0;
2485 } 2818 }
2486 2819
@@ -2496,14 +2829,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2496int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 2829int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2497{ 2830{
2498 struct ring_buffer_per_cpu *cpu_buffer; 2831 struct ring_buffer_per_cpu *cpu_buffer;
2832 unsigned long flags;
2833 int dolock;
2499 int ret; 2834 int ret;
2500 2835
2501 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2836 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2502 return 1; 2837 return 1;
2503 2838
2839 dolock = rb_ok_to_lock();
2840
2504 cpu_buffer = buffer->buffers[cpu]; 2841 cpu_buffer = buffer->buffers[cpu];
2842 local_irq_save(flags);
2843 if (dolock)
2844 spin_lock(&cpu_buffer->reader_lock);
2505 ret = rb_per_cpu_empty(cpu_buffer); 2845 ret = rb_per_cpu_empty(cpu_buffer);
2506 2846 if (dolock)
2847 spin_unlock(&cpu_buffer->reader_lock);
2848 local_irq_restore(flags);
2507 2849
2508 return ret; 2850 return ret;
2509} 2851}
@@ -2578,28 +2920,6 @@ out:
2578} 2920}
2579EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 2921EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
2580 2922
2581static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2582 struct buffer_data_page *bpage,
2583 unsigned int offset)
2584{
2585 struct ring_buffer_event *event;
2586 unsigned long head;
2587
2588 __raw_spin_lock(&cpu_buffer->lock);
2589 for (head = offset; head < local_read(&bpage->commit);
2590 head += rb_event_length(event)) {
2591
2592 event = __rb_data_page_index(bpage, head);
2593 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
2594 return;
2595 /* Only count data entries */
2596 if (event->type != RINGBUF_TYPE_DATA)
2597 continue;
2598 cpu_buffer->entries--;
2599 }
2600 __raw_spin_unlock(&cpu_buffer->lock);
2601}
2602
2603/** 2923/**
2604 * ring_buffer_alloc_read_page - allocate a page to read from buffer 2924 * ring_buffer_alloc_read_page - allocate a page to read from buffer
2605 * @buffer: the buffer to allocate for. 2925 * @buffer: the buffer to allocate for.
@@ -2630,6 +2950,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2630 2950
2631 return bpage; 2951 return bpage;
2632} 2952}
2953EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
2633 2954
2634/** 2955/**
2635 * ring_buffer_free_read_page - free an allocated read page 2956 * ring_buffer_free_read_page - free an allocated read page
@@ -2642,6 +2963,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2642{ 2963{
2643 free_page((unsigned long)data); 2964 free_page((unsigned long)data);
2644} 2965}
2966EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
2645 2967
2646/** 2968/**
2647 * ring_buffer_read_page - extract a page from the ring buffer 2969 * ring_buffer_read_page - extract a page from the ring buffer
@@ -2768,16 +3090,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
2768 /* we copied everything to the beginning */ 3090 /* we copied everything to the beginning */
2769 read = 0; 3091 read = 0;
2770 } else { 3092 } else {
3093 /* update the entry counter */
3094 cpu_buffer->read += local_read(&reader->entries);
3095
2771 /* swap the pages */ 3096 /* swap the pages */
2772 rb_init_page(bpage); 3097 rb_init_page(bpage);
2773 bpage = reader->page; 3098 bpage = reader->page;
2774 reader->page = *data_page; 3099 reader->page = *data_page;
2775 local_set(&reader->write, 0); 3100 local_set(&reader->write, 0);
3101 local_set(&reader->entries, 0);
2776 reader->read = 0; 3102 reader->read = 0;
2777 *data_page = bpage; 3103 *data_page = bpage;
2778
2779 /* update the entry counter */
2780 rb_remove_entries(cpu_buffer, bpage, read);
2781 } 3104 }
2782 ret = read; 3105 ret = read;
2783 3106
@@ -2787,7 +3110,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
2787 out: 3110 out:
2788 return ret; 3111 return ret;
2789} 3112}
3113EXPORT_SYMBOL_GPL(ring_buffer_read_page);
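/*
 * Illustrative sketch (not part of this patch): the allocate/read/free
 * cycle for pulling a whole page out of the buffer, mirroring read_page()
 * in ring_buffer_benchmark.c further down.  drain_one_page() is
 * hypothetical; walking the events inside the returned page is omitted.
 */
static int drain_one_page(struct ring_buffer *buffer, int cpu)
{
	void *bpage;
	int ret;

	bpage = ring_buffer_alloc_read_page(buffer);
	if (!bpage)
		return -ENOMEM;

	/* Same call pattern as the benchmark; a negative return means no data */
	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);

	ring_buffer_free_read_page(buffer, bpage);
	return ret;
}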
2790 3114
3115#ifdef CONFIG_TRACING
2791static ssize_t 3116static ssize_t
2792rb_simple_read(struct file *filp, char __user *ubuf, 3117rb_simple_read(struct file *filp, char __user *ubuf,
2793 size_t cnt, loff_t *ppos) 3118 size_t cnt, loff_t *ppos)
@@ -2845,19 +3170,17 @@ static const struct file_operations rb_simple_fops = {
2845static __init int rb_init_debugfs(void) 3170static __init int rb_init_debugfs(void)
2846{ 3171{
2847 struct dentry *d_tracer; 3172 struct dentry *d_tracer;
2848 struct dentry *entry;
2849 3173
2850 d_tracer = tracing_init_dentry(); 3174 d_tracer = tracing_init_dentry();
2851 3175
2852 entry = debugfs_create_file("tracing_on", 0644, d_tracer, 3176 trace_create_file("tracing_on", 0644, d_tracer,
2853 &ring_buffer_flags, &rb_simple_fops); 3177 &ring_buffer_flags, &rb_simple_fops);
2854 if (!entry)
2855 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2856 3178
2857 return 0; 3179 return 0;
2858} 3180}
2859 3181
2860fs_initcall(rb_init_debugfs); 3182fs_initcall(rb_init_debugfs);
3183#endif
2861 3184
2862#ifdef CONFIG_HOTPLUG_CPU 3185#ifdef CONFIG_HOTPLUG_CPU
2863static int rb_cpu_notify(struct notifier_block *self, 3186static int rb_cpu_notify(struct notifier_block *self,
@@ -2870,7 +3193,7 @@ static int rb_cpu_notify(struct notifier_block *self,
2870 switch (action) { 3193 switch (action) {
2871 case CPU_UP_PREPARE: 3194 case CPU_UP_PREPARE:
2872 case CPU_UP_PREPARE_FROZEN: 3195 case CPU_UP_PREPARE_FROZEN:
2873 if (cpu_isset(cpu, *buffer->cpumask)) 3196 if (cpumask_test_cpu(cpu, buffer->cpumask))
2874 return NOTIFY_OK; 3197 return NOTIFY_OK;
2875 3198
2876 buffer->buffers[cpu] = 3199 buffer->buffers[cpu] =
@@ -2881,7 +3204,7 @@ static int rb_cpu_notify(struct notifier_block *self,
2881 return NOTIFY_OK; 3204 return NOTIFY_OK;
2882 } 3205 }
2883 smp_wmb(); 3206 smp_wmb();
2884 cpu_set(cpu, *buffer->cpumask); 3207 cpumask_set_cpu(cpu, buffer->cpumask);
2885 break; 3208 break;
2886 case CPU_DOWN_PREPARE: 3209 case CPU_DOWN_PREPARE:
2887 case CPU_DOWN_PREPARE_FROZEN: 3210 case CPU_DOWN_PREPARE_FROZEN:
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
new file mode 100644
index 000000000000..573d3cc762c3
--- /dev/null
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -0,0 +1,419 @@
1/*
2 * ring buffer tester and benchmark
3 *
4 * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/ring_buffer.h>
7#include <linux/completion.h>
8#include <linux/kthread.h>
9#include <linux/module.h>
10#include <linux/time.h>
11
12struct rb_page {
13 u64 ts;
14 local_t commit;
15 char data[4080];
16};
17
18/* run time and sleep time in seconds */
19#define RUN_TIME 10
20#define SLEEP_TIME 10
21
22/* number of events for writer to wake up the reader */
23static int wakeup_interval = 100;
24
25static int reader_finish;
26static struct completion read_start;
27static struct completion read_done;
28
29static struct ring_buffer *buffer;
30static struct task_struct *producer;
31static struct task_struct *consumer;
32static unsigned long read;
33
34static int disable_reader;
35module_param(disable_reader, uint, 0644);
36MODULE_PARM_DESC(disable_reader, "only run producer");
37
38static int read_events;
39
40static int kill_test;
41
42#define KILL_TEST() \
43 do { \
44 if (!kill_test) { \
45 kill_test = 1; \
46 WARN_ON(1); \
47 } \
48 } while (0)
49
50enum event_status {
51 EVENT_FOUND,
52 EVENT_DROPPED,
53};
54
55static enum event_status read_event(int cpu)
56{
57 struct ring_buffer_event *event;
58 int *entry;
59 u64 ts;
60
61 event = ring_buffer_consume(buffer, cpu, &ts);
62 if (!event)
63 return EVENT_DROPPED;
64
65 entry = ring_buffer_event_data(event);
66 if (*entry != cpu) {
67 KILL_TEST();
68 return EVENT_DROPPED;
69 }
70
71 read++;
72 return EVENT_FOUND;
73}
74
75static enum event_status read_page(int cpu)
76{
77 struct ring_buffer_event *event;
78 struct rb_page *rpage;
79 unsigned long commit;
80 void *bpage;
81 int *entry;
82 int ret;
83 int inc;
84 int i;
85
86 bpage = ring_buffer_alloc_read_page(buffer);
87 if (!bpage)
88 return EVENT_DROPPED;
89
90 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
91 if (ret >= 0) {
92 rpage = bpage;
93 commit = local_read(&rpage->commit);
94 for (i = 0; i < commit && !kill_test; i += inc) {
95
96 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
97 KILL_TEST();
98 break;
99 }
100
101 inc = -1;
102 event = (void *)&rpage->data[i];
103 switch (event->type_len) {
104 case RINGBUF_TYPE_PADDING:
105 /* failed writes may be discarded events */
106 if (!event->time_delta)
107 KILL_TEST();
108 inc = event->array[0] + 4;
109 break;
110 case RINGBUF_TYPE_TIME_EXTEND:
111 inc = 8;
112 break;
113 case 0:
114 entry = ring_buffer_event_data(event);
115 if (*entry != cpu) {
116 KILL_TEST();
117 break;
118 }
119 read++;
120 if (!event->array[0]) {
121 KILL_TEST();
122 break;
123 }
124 inc = event->array[0] + 4;
125 break;
126 default:
127 entry = ring_buffer_event_data(event);
128 if (*entry != cpu) {
129 KILL_TEST();
130 break;
131 }
132 read++;
133 inc = ((event->type_len + 1) * 4);
134 }
135 if (kill_test)
136 break;
137
138 if (inc <= 0) {
139 KILL_TEST();
140 break;
141 }
142 }
143 }
144 ring_buffer_free_read_page(buffer, bpage);
145
146 if (ret < 0)
147 return EVENT_DROPPED;
148 return EVENT_FOUND;
149}
150
151static void ring_buffer_consumer(void)
152{
153 /* toggle between reading pages and events */
154 read_events ^= 1;
155
156 read = 0;
157 while (!reader_finish && !kill_test) {
158 int found;
159
160 do {
161 int cpu;
162
163 found = 0;
164 for_each_online_cpu(cpu) {
165 enum event_status stat;
166
167 if (read_events)
168 stat = read_event(cpu);
169 else
170 stat = read_page(cpu);
171
172 if (kill_test)
173 break;
174 if (stat == EVENT_FOUND)
175 found = 1;
176 }
177 } while (found && !kill_test);
178
179 set_current_state(TASK_INTERRUPTIBLE);
180 if (reader_finish)
181 break;
182
183 schedule();
184 __set_current_state(TASK_RUNNING);
185 }
186 reader_finish = 0;
187 complete(&read_done);
188}
189
190static void ring_buffer_producer(void)
191{
192 struct timeval start_tv;
193 struct timeval end_tv;
194 unsigned long long time;
195 unsigned long long entries;
196 unsigned long long overruns;
197 unsigned long missed = 0;
198 unsigned long hit = 0;
199 unsigned long avg;
200 int cnt = 0;
201
202 /*
203 * Hammer the buffer for 10 secs (this may
204 * make the system stall)
205 */
206 trace_printk("Starting ring buffer hammer\n");
207 do_gettimeofday(&start_tv);
208 do {
209 struct ring_buffer_event *event;
210 int *entry;
211
212 event = ring_buffer_lock_reserve(buffer, 10);
213 if (!event) {
214 missed++;
215 } else {
216 hit++;
217 entry = ring_buffer_event_data(event);
218 *entry = smp_processor_id();
219 ring_buffer_unlock_commit(buffer, event);
220 }
221 do_gettimeofday(&end_tv);
222
223 cnt++;
224 if (consumer && !(cnt % wakeup_interval))
225 wake_up_process(consumer);
226
227#ifndef CONFIG_PREEMPT
228 /*
 228	 * If we are a non-preempt kernel, the 10 second run will
230 * stop everything while it runs. Instead, we will call
231 * cond_resched and also add any time that was lost by a
 232	 * reschedule.
233 *
234 * Do a cond resched at the same frequency we would wake up
235 * the reader.
236 */
237 if (cnt % wakeup_interval)
238 cond_resched();
239#endif
240
241 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
242 trace_printk("End ring buffer hammer\n");
243
244 if (consumer) {
245 /* Init both completions here to avoid races */
246 init_completion(&read_start);
247 init_completion(&read_done);
248 /* the completions must be visible before the finish var */
249 smp_wmb();
250 reader_finish = 1;
251 /* finish var visible before waking up the consumer */
252 smp_wmb();
253 wake_up_process(consumer);
254 wait_for_completion(&read_done);
255 }
256
257 time = end_tv.tv_sec - start_tv.tv_sec;
258 time *= USEC_PER_SEC;
259 time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
260
261 entries = ring_buffer_entries(buffer);
262 overruns = ring_buffer_overruns(buffer);
263
264 if (kill_test)
265 trace_printk("ERROR!\n");
266 trace_printk("Time: %lld (usecs)\n", time);
267 trace_printk("Overruns: %lld\n", overruns);
268 if (disable_reader)
269 trace_printk("Read: (reader disabled)\n");
270 else
271 trace_printk("Read: %ld (by %s)\n", read,
272 read_events ? "events" : "pages");
273 trace_printk("Entries: %lld\n", entries);
274 trace_printk("Total: %lld\n", entries + overruns + read);
275 trace_printk("Missed: %ld\n", missed);
276 trace_printk("Hit: %ld\n", hit);
277
278 /* Convert time from usecs to millisecs */
279 do_div(time, USEC_PER_MSEC);
280 if (time)
281 hit /= (long)time;
282 else
283 trace_printk("TIME IS ZERO??\n");
284
285 trace_printk("Entries per millisec: %ld\n", hit);
286
287 if (hit) {
288 /* Calculate the average time in nanosecs */
289 avg = NSEC_PER_MSEC / hit;
290 trace_printk("%ld ns per entry\n", avg);
291 }
292
293 if (missed) {
294 if (time)
295 missed /= (long)time;
296
297 trace_printk("Total iterations per millisec: %ld\n",
298 hit + missed);
299
300 /* it is possible that hit + missed will overflow and be zero */
301 if (!(hit + missed)) {
302 trace_printk("hit + missed overflowed and totalled zero!\n");
303 hit--; /* make it non zero */
304 }
305
306		/* Calculate the average time in nanosecs */
307 avg = NSEC_PER_MSEC / (hit + missed);
308 trace_printk("%ld ns per entry\n", avg);
309 }
310}
311
312static void wait_to_die(void)
313{
314 set_current_state(TASK_INTERRUPTIBLE);
315 while (!kthread_should_stop()) {
316 schedule();
317 set_current_state(TASK_INTERRUPTIBLE);
318 }
319 __set_current_state(TASK_RUNNING);
320}
321
322static int ring_buffer_consumer_thread(void *arg)
323{
324 while (!kthread_should_stop() && !kill_test) {
325 complete(&read_start);
326
327 ring_buffer_consumer();
328
329 set_current_state(TASK_INTERRUPTIBLE);
330 if (kthread_should_stop() || kill_test)
331 break;
332
333 schedule();
334 __set_current_state(TASK_RUNNING);
335 }
336 __set_current_state(TASK_RUNNING);
337
338 if (kill_test)
339 wait_to_die();
340
341 return 0;
342}
343
344static int ring_buffer_producer_thread(void *arg)
345{
346 init_completion(&read_start);
347
348 while (!kthread_should_stop() && !kill_test) {
349 ring_buffer_reset(buffer);
350
351 if (consumer) {
352 smp_wmb();
353 wake_up_process(consumer);
354 wait_for_completion(&read_start);
355 }
356
357 ring_buffer_producer();
358
359 trace_printk("Sleeping for 10 secs\n");
360 set_current_state(TASK_INTERRUPTIBLE);
361 schedule_timeout(HZ * SLEEP_TIME);
362 __set_current_state(TASK_RUNNING);
363 }
364
365 if (kill_test)
366 wait_to_die();
367
368 return 0;
369}
370
371static int __init ring_buffer_benchmark_init(void)
372{
373 int ret;
374
375	/* make a one meg buffer in overwrite mode */
376 buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
377 if (!buffer)
378 return -ENOMEM;
379
380 if (!disable_reader) {
381 consumer = kthread_create(ring_buffer_consumer_thread,
382 NULL, "rb_consumer");
383 ret = PTR_ERR(consumer);
384 if (IS_ERR(consumer))
385 goto out_fail;
386 }
387
388 producer = kthread_run(ring_buffer_producer_thread,
389 NULL, "rb_producer");
390 ret = PTR_ERR(producer);
391
392 if (IS_ERR(producer))
393 goto out_kill;
394
395 return 0;
396
397 out_kill:
398 if (consumer)
399 kthread_stop(consumer);
400
401 out_fail:
402 ring_buffer_free(buffer);
403 return ret;
404}
405
406static void __exit ring_buffer_benchmark_exit(void)
407{
408 kthread_stop(producer);
409 if (consumer)
410 kthread_stop(consumer);
411 ring_buffer_free(buffer);
412}
413
414module_init(ring_buffer_benchmark_init);
415module_exit(ring_buffer_benchmark_exit);
416
417MODULE_AUTHOR("Steven Rostedt");
418MODULE_DESCRIPTION("ring_buffer_benchmark");
419MODULE_LICENSE("GPL");
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index cda81ec58d9f..8c358395d338 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
20#include <linux/notifier.h> 21#include <linux/notifier.h>
21#include <linux/irqflags.h> 22#include <linux/irqflags.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
@@ -171,6 +172,13 @@ static struct trace_array global_trace;
171 172
172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 173static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
173 174
175int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
176 struct ring_buffer_event *event)
177{
178 return filter_check_discard(call, rec, global_trace.buffer, event);
179}
180EXPORT_SYMBOL_GPL(filter_current_check_discard);
181
174cycle_t ftrace_now(int cpu) 182cycle_t ftrace_now(int cpu)
175{ 183{
176 u64 ts; 184 u64 ts;
@@ -255,7 +263,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
255 263
256/* trace_flags holds trace_options default values */ 264/* trace_flags holds trace_options default values */
257unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 265unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
258 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; 266 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
267 TRACE_ITER_GRAPH_TIME;
259 268
260/** 269/**
261 * trace_wake_up - wake up tasks waiting for trace input 270 * trace_wake_up - wake up tasks waiting for trace input
@@ -276,13 +285,12 @@ void trace_wake_up(void)
276static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
277{ 286{
278 unsigned long buf_size; 287 unsigned long buf_size;
279 int ret;
280 288
281 if (!str) 289 if (!str)
282 return 0; 290 return 0;
283 ret = strict_strtoul(str, 0, &buf_size); 291 buf_size = memparse(str, &str);
284 /* nr_entries can not be zero */ 292 /* nr_entries can not be zero */
285 if (ret < 0 || buf_size == 0) 293 if (buf_size == 0)
286 return 0; 294 return 0;
287 trace_buf_size = buf_size; 295 trace_buf_size = buf_size;
288 return 1; 296 return 1;
@@ -317,6 +325,7 @@ static const char *trace_options[] = {
317 "latency-format", 325 "latency-format",
318 "global-clock", 326 "global-clock",
319 "sleep-time", 327 "sleep-time",
328 "graph-time",
320 NULL 329 NULL
321}; 330};
322 331
@@ -335,7 +344,7 @@ static raw_spinlock_t ftrace_max_lock =
335/* 344/*
336 * Copy the new maximum trace into the separate maximum-trace 345 * Copy the new maximum trace into the separate maximum-trace
337 * structure. (this way the maximum trace is permanently saved, 346 * structure. (this way the maximum trace is permanently saved,
338 * for later retrieval via /debugfs/tracing/latency_trace) 347 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
339 */ 348 */
340static void 349static void
341__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -402,17 +411,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
402 return cnt; 411 return cnt;
403} 412}
404 413
405static void
406trace_print_seq(struct seq_file *m, struct trace_seq *s)
407{
408 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
409
410 s->buffer[len] = 0;
411 seq_puts(m, s->buffer);
412
413 trace_seq_init(s);
414}
415
416/** 414/**
417 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 415 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
418 * @tr: tracer 416 * @tr: tracer
@@ -641,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)
641 tracing_reset(tr, cpu); 639 tracing_reset(tr, cpu);
642} 640}
643 641
642void tracing_reset_current(int cpu)
643{
644 tracing_reset(&global_trace, cpu);
645}
646
647void tracing_reset_current_online_cpus(void)
648{
649 tracing_reset_online_cpus(&global_trace);
650}
651
644#define SAVED_CMDLINES 128 652#define SAVED_CMDLINES 128
645#define NO_CMDLINE_MAP UINT_MAX 653#define NO_CMDLINE_MAP UINT_MAX
646static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; 654static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -800,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])
800 return; 808 return;
801 } 809 }
802 810
811 preempt_disable();
803 __raw_spin_lock(&trace_cmdline_lock); 812 __raw_spin_lock(&trace_cmdline_lock);
804 map = map_pid_to_cmdline[pid]; 813 map = map_pid_to_cmdline[pid];
805 if (map != NO_CMDLINE_MAP) 814 if (map != NO_CMDLINE_MAP)
@@ -808,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])
808 strcpy(comm, "<...>"); 817 strcpy(comm, "<...>");
809 818
810 __raw_spin_unlock(&trace_cmdline_lock); 819 __raw_spin_unlock(&trace_cmdline_lock);
820 preempt_enable();
811} 821}
812 822
813void tracing_record_cmdline(struct task_struct *tsk) 823void tracing_record_cmdline(struct task_struct *tsk)
@@ -838,9 +848,10 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
838 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 848 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
839 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 849 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
840} 850}
851EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
841 852
842struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 853struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
843 unsigned char type, 854 int type,
844 unsigned long len, 855 unsigned long len,
845 unsigned long flags, int pc) 856 unsigned long flags, int pc)
846{ 857{
@@ -883,30 +894,40 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
883} 894}
884 895
885struct ring_buffer_event * 896struct ring_buffer_event *
886trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, 897trace_current_buffer_lock_reserve(int type, unsigned long len,
887 unsigned long flags, int pc) 898 unsigned long flags, int pc)
888{ 899{
889 return trace_buffer_lock_reserve(&global_trace, 900 return trace_buffer_lock_reserve(&global_trace,
890 type, len, flags, pc); 901 type, len, flags, pc);
891} 902}
903EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
892 904
893void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 905void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
894 unsigned long flags, int pc) 906 unsigned long flags, int pc)
895{ 907{
896 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); 908 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
897} 909}
910EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
898 911
899void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 912void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
900 unsigned long flags, int pc) 913 unsigned long flags, int pc)
901{ 914{
902 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); 915 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
916}
917EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
918
919void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
920{
921 ring_buffer_discard_commit(global_trace.buffer, event);
903} 922}
923EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
904 924
905void 925void
906trace_function(struct trace_array *tr, 926trace_function(struct trace_array *tr,
907 unsigned long ip, unsigned long parent_ip, unsigned long flags, 927 unsigned long ip, unsigned long parent_ip, unsigned long flags,
908 int pc) 928 int pc)
909{ 929{
930 struct ftrace_event_call *call = &event_function;
910 struct ring_buffer_event *event; 931 struct ring_buffer_event *event;
911 struct ftrace_entry *entry; 932 struct ftrace_entry *entry;
912 933
@@ -921,7 +942,9 @@ trace_function(struct trace_array *tr,
921 entry = ring_buffer_event_data(event); 942 entry = ring_buffer_event_data(event);
922 entry->ip = ip; 943 entry->ip = ip;
923 entry->parent_ip = parent_ip; 944 entry->parent_ip = parent_ip;
924 ring_buffer_unlock_commit(tr->buffer, event); 945
946 if (!filter_check_discard(call, entry, tr->buffer, event))
947 ring_buffer_unlock_commit(tr->buffer, event);
925} 948}
926 949
927#ifdef CONFIG_FUNCTION_GRAPH_TRACER 950#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -930,6 +953,7 @@ static int __trace_graph_entry(struct trace_array *tr,
930 unsigned long flags, 953 unsigned long flags,
931 int pc) 954 int pc)
932{ 955{
956 struct ftrace_event_call *call = &event_funcgraph_entry;
933 struct ring_buffer_event *event; 957 struct ring_buffer_event *event;
934 struct ftrace_graph_ent_entry *entry; 958 struct ftrace_graph_ent_entry *entry;
935 959
@@ -942,7 +966,8 @@ static int __trace_graph_entry(struct trace_array *tr,
942 return 0; 966 return 0;
943 entry = ring_buffer_event_data(event); 967 entry = ring_buffer_event_data(event);
944 entry->graph_ent = *trace; 968 entry->graph_ent = *trace;
945 ring_buffer_unlock_commit(global_trace.buffer, event); 969 if (!filter_current_check_discard(call, entry, event))
970 ring_buffer_unlock_commit(global_trace.buffer, event);
946 971
947 return 1; 972 return 1;
948} 973}
@@ -952,6 +977,7 @@ static void __trace_graph_return(struct trace_array *tr,
952 unsigned long flags, 977 unsigned long flags,
953 int pc) 978 int pc)
954{ 979{
980 struct ftrace_event_call *call = &event_funcgraph_exit;
955 struct ring_buffer_event *event; 981 struct ring_buffer_event *event;
956 struct ftrace_graph_ret_entry *entry; 982 struct ftrace_graph_ret_entry *entry;
957 983
@@ -964,7 +990,8 @@ static void __trace_graph_return(struct trace_array *tr,
964 return; 990 return;
965 entry = ring_buffer_event_data(event); 991 entry = ring_buffer_event_data(event);
966 entry->ret = *trace; 992 entry->ret = *trace;
967 ring_buffer_unlock_commit(global_trace.buffer, event); 993 if (!filter_current_check_discard(call, entry, event))
994 ring_buffer_unlock_commit(global_trace.buffer, event);
968} 995}
969#endif 996#endif
970 997
@@ -982,6 +1009,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
982 int skip, int pc) 1009 int skip, int pc)
983{ 1010{
984#ifdef CONFIG_STACKTRACE 1011#ifdef CONFIG_STACKTRACE
1012 struct ftrace_event_call *call = &event_kernel_stack;
985 struct ring_buffer_event *event; 1013 struct ring_buffer_event *event;
986 struct stack_entry *entry; 1014 struct stack_entry *entry;
987 struct stack_trace trace; 1015 struct stack_trace trace;
@@ -999,7 +1027,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
999 trace.entries = entry->caller; 1027 trace.entries = entry->caller;
1000 1028
1001 save_stack_trace(&trace); 1029 save_stack_trace(&trace);
1002 ring_buffer_unlock_commit(tr->buffer, event); 1030 if (!filter_check_discard(call, entry, tr->buffer, event))
1031 ring_buffer_unlock_commit(tr->buffer, event);
1003#endif 1032#endif
1004} 1033}
1005 1034
@@ -1024,6 +1053,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1024 unsigned long flags, int pc) 1053 unsigned long flags, int pc)
1025{ 1054{
1026#ifdef CONFIG_STACKTRACE 1055#ifdef CONFIG_STACKTRACE
1056 struct ftrace_event_call *call = &event_user_stack;
1027 struct ring_buffer_event *event; 1057 struct ring_buffer_event *event;
1028 struct userstack_entry *entry; 1058 struct userstack_entry *entry;
1029 struct stack_trace trace; 1059 struct stack_trace trace;
@@ -1045,7 +1075,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1045 trace.entries = entry->caller; 1075 trace.entries = entry->caller;
1046 1076
1047 save_stack_trace_user(&trace); 1077 save_stack_trace_user(&trace);
1048 ring_buffer_unlock_commit(tr->buffer, event); 1078 if (!filter_check_discard(call, entry, tr->buffer, event))
1079 ring_buffer_unlock_commit(tr->buffer, event);
1049#endif 1080#endif
1050} 1081}
1051 1082
@@ -1089,6 +1120,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
1089 struct task_struct *next, 1120 struct task_struct *next,
1090 unsigned long flags, int pc) 1121 unsigned long flags, int pc)
1091{ 1122{
1123 struct ftrace_event_call *call = &event_context_switch;
1092 struct ring_buffer_event *event; 1124 struct ring_buffer_event *event;
1093 struct ctx_switch_entry *entry; 1125 struct ctx_switch_entry *entry;
1094 1126
@@ -1104,7 +1136,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
1104 entry->next_prio = next->prio; 1136 entry->next_prio = next->prio;
1105 entry->next_state = next->state; 1137 entry->next_state = next->state;
1106 entry->next_cpu = task_cpu(next); 1138 entry->next_cpu = task_cpu(next);
1107 trace_buffer_unlock_commit(tr, event, flags, pc); 1139
1140 if (!filter_check_discard(call, entry, tr->buffer, event))
1141 trace_buffer_unlock_commit(tr, event, flags, pc);
1108} 1142}
1109 1143
1110void 1144void
@@ -1113,6 +1147,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
1113 struct task_struct *curr, 1147 struct task_struct *curr,
1114 unsigned long flags, int pc) 1148 unsigned long flags, int pc)
1115{ 1149{
1150 struct ftrace_event_call *call = &event_wakeup;
1116 struct ring_buffer_event *event; 1151 struct ring_buffer_event *event;
1117 struct ctx_switch_entry *entry; 1152 struct ctx_switch_entry *entry;
1118 1153
@@ -1129,7 +1164,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
1129 entry->next_state = wakee->state; 1164 entry->next_state = wakee->state;
1130 entry->next_cpu = task_cpu(wakee); 1165 entry->next_cpu = task_cpu(wakee);
1131 1166
1132 ring_buffer_unlock_commit(tr->buffer, event); 1167 if (!filter_check_discard(call, entry, tr->buffer, event))
1168 ring_buffer_unlock_commit(tr->buffer, event);
1133 ftrace_trace_stack(tr, flags, 6, pc); 1169 ftrace_trace_stack(tr, flags, 6, pc);
1134 ftrace_trace_userstack(tr, flags, pc); 1170 ftrace_trace_userstack(tr, flags, pc);
1135} 1171}
@@ -1230,11 +1266,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1230 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1266 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
1231 static u32 trace_buf[TRACE_BUF_SIZE]; 1267 static u32 trace_buf[TRACE_BUF_SIZE];
1232 1268
1269 struct ftrace_event_call *call = &event_bprint;
1233 struct ring_buffer_event *event; 1270 struct ring_buffer_event *event;
1234 struct trace_array *tr = &global_trace; 1271 struct trace_array *tr = &global_trace;
1235 struct trace_array_cpu *data; 1272 struct trace_array_cpu *data;
1236 struct bprint_entry *entry; 1273 struct bprint_entry *entry;
1237 unsigned long flags; 1274 unsigned long flags;
1275 int disable;
1238 int resched; 1276 int resched;
1239 int cpu, len = 0, size, pc; 1277 int cpu, len = 0, size, pc;
1240 1278
@@ -1249,7 +1287,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1249 cpu = raw_smp_processor_id(); 1287 cpu = raw_smp_processor_id();
1250 data = tr->data[cpu]; 1288 data = tr->data[cpu];
1251 1289
1252 if (unlikely(atomic_read(&data->disabled))) 1290 disable = atomic_inc_return(&data->disabled);
1291 if (unlikely(disable != 1))
1253 goto out; 1292 goto out;
1254 1293
1255 /* Lockdep uses trace_printk for lock tracing */ 1294 /* Lockdep uses trace_printk for lock tracing */
@@ -1269,13 +1308,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1269 entry->fmt = fmt; 1308 entry->fmt = fmt;
1270 1309
1271 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1310 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1272 ring_buffer_unlock_commit(tr->buffer, event); 1311 if (!filter_check_discard(call, entry, tr->buffer, event))
1312 ring_buffer_unlock_commit(tr->buffer, event);
1273 1313
1274out_unlock: 1314out_unlock:
1275 __raw_spin_unlock(&trace_buf_lock); 1315 __raw_spin_unlock(&trace_buf_lock);
1276 local_irq_restore(flags); 1316 local_irq_restore(flags);
1277 1317
1278out: 1318out:
1319 atomic_dec_return(&data->disabled);
1279 ftrace_preempt_enable(resched); 1320 ftrace_preempt_enable(resched);
1280 unpause_graph_tracing(); 1321 unpause_graph_tracing();
1281 1322
@@ -1288,12 +1329,14 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1288 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1329 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1289 static char trace_buf[TRACE_BUF_SIZE]; 1330 static char trace_buf[TRACE_BUF_SIZE];
1290 1331
1332 struct ftrace_event_call *call = &event_print;
1291 struct ring_buffer_event *event; 1333 struct ring_buffer_event *event;
1292 struct trace_array *tr = &global_trace; 1334 struct trace_array *tr = &global_trace;
1293 struct trace_array_cpu *data; 1335 struct trace_array_cpu *data;
1294 int cpu, len = 0, size, pc; 1336 int cpu, len = 0, size, pc;
1295 struct print_entry *entry; 1337 struct print_entry *entry;
1296 unsigned long irq_flags; 1338 unsigned long irq_flags;
1339 int disable;
1297 1340
1298 if (tracing_disabled || tracing_selftest_running) 1341 if (tracing_disabled || tracing_selftest_running)
1299 return 0; 1342 return 0;
@@ -1303,7 +1346,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1303 cpu = raw_smp_processor_id(); 1346 cpu = raw_smp_processor_id();
1304 data = tr->data[cpu]; 1347 data = tr->data[cpu];
1305 1348
1306 if (unlikely(atomic_read(&data->disabled))) 1349 disable = atomic_inc_return(&data->disabled);
1350 if (unlikely(disable != 1))
1307 goto out; 1351 goto out;
1308 1352
1309 pause_graph_tracing(); 1353 pause_graph_tracing();
@@ -1323,13 +1367,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1323 1367
1324 memcpy(&entry->buf, trace_buf, len); 1368 memcpy(&entry->buf, trace_buf, len);
1325 entry->buf[len] = 0; 1369 entry->buf[len] = 0;
1326 ring_buffer_unlock_commit(tr->buffer, event); 1370 if (!filter_check_discard(call, entry, tr->buffer, event))
1371 ring_buffer_unlock_commit(tr->buffer, event);
1327 1372
1328 out_unlock: 1373 out_unlock:
1329 __raw_spin_unlock(&trace_buf_lock); 1374 __raw_spin_unlock(&trace_buf_lock);
1330 raw_local_irq_restore(irq_flags); 1375 raw_local_irq_restore(irq_flags);
1331 unpause_graph_tracing(); 1376 unpause_graph_tracing();
1332 out: 1377 out:
1378 atomic_dec_return(&data->disabled);
1333 preempt_enable_notrace(); 1379 preempt_enable_notrace();
1334 1380
1335 return len; 1381 return len;
@@ -1526,12 +1572,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1526 p = s_next(m, p, &l); 1572 p = s_next(m, p, &l);
1527 } 1573 }
1528 1574
1575 trace_event_read_lock();
1529 return p; 1576 return p;
1530} 1577}
1531 1578
1532static void s_stop(struct seq_file *m, void *p) 1579static void s_stop(struct seq_file *m, void *p)
1533{ 1580{
1534 atomic_dec(&trace_record_cmdline_disabled); 1581 atomic_dec(&trace_record_cmdline_disabled);
1582 trace_event_read_unlock();
1535} 1583}
1536 1584
1537static void print_lat_help_header(struct seq_file *m) 1585static void print_lat_help_header(struct seq_file *m)
@@ -1774,6 +1822,7 @@ static int trace_empty(struct trace_iterator *iter)
1774 return 1; 1822 return 1;
1775} 1823}
1776 1824
1825/* Called with trace_event_read_lock() held. */
1777static enum print_line_t print_trace_line(struct trace_iterator *iter) 1826static enum print_line_t print_trace_line(struct trace_iterator *iter)
1778{ 1827{
1779 enum print_line_t ret; 1828 enum print_line_t ret;
@@ -1983,7 +2032,7 @@ static int tracing_open(struct inode *inode, struct file *file)
1983 2032
1984 /* If this file was open for write, then erase contents */ 2033 /* If this file was open for write, then erase contents */
1985 if ((file->f_mode & FMODE_WRITE) && 2034 if ((file->f_mode & FMODE_WRITE) &&
1986 !(file->f_flags & O_APPEND)) { 2035 (file->f_flags & O_TRUNC)) {
1987 long cpu = (long) inode->i_private; 2036 long cpu = (long) inode->i_private;
1988 2037
1989 if (cpu == TRACE_PIPE_ALL_CPU) 2038 if (cpu == TRACE_PIPE_ALL_CPU)
@@ -2005,25 +2054,23 @@ static int tracing_open(struct inode *inode, struct file *file)
2005static void * 2054static void *
2006t_next(struct seq_file *m, void *v, loff_t *pos) 2055t_next(struct seq_file *m, void *v, loff_t *pos)
2007{ 2056{
2008 struct tracer *t = m->private; 2057 struct tracer *t = v;
2009 2058
2010 (*pos)++; 2059 (*pos)++;
2011 2060
2012 if (t) 2061 if (t)
2013 t = t->next; 2062 t = t->next;
2014 2063
2015 m->private = t;
2016
2017 return t; 2064 return t;
2018} 2065}
2019 2066
2020static void *t_start(struct seq_file *m, loff_t *pos) 2067static void *t_start(struct seq_file *m, loff_t *pos)
2021{ 2068{
2022 struct tracer *t = m->private; 2069 struct tracer *t;
2023 loff_t l = 0; 2070 loff_t l = 0;
2024 2071
2025 mutex_lock(&trace_types_lock); 2072 mutex_lock(&trace_types_lock);
2026 for (; t && l < *pos; t = t_next(m, t, &l)) 2073 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
2027 ; 2074 ;
2028 2075
2029 return t; 2076 return t;
@@ -2059,18 +2106,10 @@ static struct seq_operations show_traces_seq_ops = {
2059 2106
2060static int show_traces_open(struct inode *inode, struct file *file) 2107static int show_traces_open(struct inode *inode, struct file *file)
2061{ 2108{
2062 int ret;
2063
2064 if (tracing_disabled) 2109 if (tracing_disabled)
2065 return -ENODEV; 2110 return -ENODEV;
2066 2111
2067 ret = seq_open(file, &show_traces_seq_ops); 2112 return seq_open(file, &show_traces_seq_ops);
2068 if (!ret) {
2069 struct seq_file *m = file->private_data;
2070 m->private = trace_types;
2071 }
2072
2073 return ret;
2074} 2113}
2075 2114
2076static ssize_t 2115static ssize_t
@@ -2143,11 +2182,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2143 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 2182 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2144 return -ENOMEM; 2183 return -ENOMEM;
2145 2184
2146 mutex_lock(&tracing_cpumask_update_lock);
2147 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2185 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2148 if (err) 2186 if (err)
2149 goto err_unlock; 2187 goto err_unlock;
2150 2188
2189 mutex_lock(&tracing_cpumask_update_lock);
2190
2151 local_irq_disable(); 2191 local_irq_disable();
2152 __raw_spin_lock(&ftrace_max_lock); 2192 __raw_spin_lock(&ftrace_max_lock);
2153 for_each_tracing_cpu(cpu) { 2193 for_each_tracing_cpu(cpu) {
@@ -2175,8 +2215,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2175 return count; 2215 return count;
2176 2216
2177err_unlock: 2217err_unlock:
2178 mutex_unlock(&tracing_cpumask_update_lock); 2218 free_cpumask_var(tracing_cpumask_new);
2179 free_cpumask_var(tracing_cpumask);
2180 2219
2181 return err; 2220 return err;
2182} 2221}
@@ -2366,21 +2405,20 @@ static const struct file_operations tracing_iter_fops = {
2366 2405
2367static const char readme_msg[] = 2406static const char readme_msg[] =
2368 "tracing mini-HOWTO:\n\n" 2407 "tracing mini-HOWTO:\n\n"
2369 "# mkdir /debug\n" 2408 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2370 "# mount -t debugfs nodev /debug\n\n" 2409 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2371 "# cat /debug/tracing/available_tracers\n"
2372 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" 2410 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2373 "# cat /debug/tracing/current_tracer\n" 2411 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2374 "nop\n" 2412 "nop\n"
2375 "# echo sched_switch > /debug/tracing/current_tracer\n" 2413 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
2376 "# cat /debug/tracing/current_tracer\n" 2414 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2377 "sched_switch\n" 2415 "sched_switch\n"
2378 "# cat /debug/tracing/trace_options\n" 2416 "# cat /sys/kernel/debug/tracing/trace_options\n"
2379 "noprint-parent nosym-offset nosym-addr noverbose\n" 2417 "noprint-parent nosym-offset nosym-addr noverbose\n"
2380 "# echo print-parent > /debug/tracing/trace_options\n" 2418 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2381 "# echo 1 > /debug/tracing/tracing_enabled\n" 2419 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
2382 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2420 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2383 "# echo 0 > /debug/tracing/tracing_enabled\n" 2421 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
2384; 2422;
2385 2423
2386static ssize_t 2424static ssize_t
@@ -2397,6 +2435,56 @@ static const struct file_operations tracing_readme_fops = {
2397}; 2435};
2398 2436
2399static ssize_t 2437static ssize_t
2438tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
2439 size_t cnt, loff_t *ppos)
2440{
2441 char *buf_comm;
2442 char *file_buf;
2443 char *buf;
2444 int len = 0;
2445 int pid;
2446 int i;
2447
2448 file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
2449 if (!file_buf)
2450 return -ENOMEM;
2451
2452 buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
2453 if (!buf_comm) {
2454 kfree(file_buf);
2455 return -ENOMEM;
2456 }
2457
2458 buf = file_buf;
2459
2460 for (i = 0; i < SAVED_CMDLINES; i++) {
2461 int r;
2462
2463 pid = map_cmdline_to_pid[i];
2464 if (pid == -1 || pid == NO_CMDLINE_MAP)
2465 continue;
2466
2467 trace_find_cmdline(pid, buf_comm);
2468 r = sprintf(buf, "%d %s\n", pid, buf_comm);
2469 buf += r;
2470 len += r;
2471 }
2472
2473 len = simple_read_from_buffer(ubuf, cnt, ppos,
2474 file_buf, len);
2475
2476 kfree(file_buf);
2477 kfree(buf_comm);
2478
2479 return len;
2480}
2481
2482static const struct file_operations tracing_saved_cmdlines_fops = {
2483 .open = tracing_open_generic,
2484 .read = tracing_saved_cmdlines_read,
2485};
2486
2487static ssize_t
2400tracing_ctrl_read(struct file *filp, char __user *ubuf, 2488tracing_ctrl_read(struct file *filp, char __user *ubuf,
2401 size_t cnt, loff_t *ppos) 2489 size_t cnt, loff_t *ppos)
2402{ 2490{
@@ -2728,6 +2816,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2728 /* trace pipe does not show start of buffer */ 2816 /* trace pipe does not show start of buffer */
2729 cpumask_setall(iter->started); 2817 cpumask_setall(iter->started);
2730 2818
2819 if (trace_flags & TRACE_ITER_LATENCY_FMT)
2820 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2821
2731 iter->cpu_file = cpu_file; 2822 iter->cpu_file = cpu_file;
2732 iter->tr = &global_trace; 2823 iter->tr = &global_trace;
2733 mutex_init(&iter->mutex); 2824 mutex_init(&iter->mutex);
@@ -2915,6 +3006,7 @@ waitagain:
2915 offsetof(struct trace_iterator, seq)); 3006 offsetof(struct trace_iterator, seq));
2916 iter->pos = -1; 3007 iter->pos = -1;
2917 3008
3009 trace_event_read_lock();
2918 while (find_next_entry_inc(iter) != NULL) { 3010 while (find_next_entry_inc(iter) != NULL) {
2919 enum print_line_t ret; 3011 enum print_line_t ret;
2920 int len = iter->seq.len; 3012 int len = iter->seq.len;
@@ -2931,6 +3023,7 @@ waitagain:
2931 if (iter->seq.len >= cnt) 3023 if (iter->seq.len >= cnt)
2932 break; 3024 break;
2933 } 3025 }
3026 trace_event_read_unlock();
2934 3027
2935 /* Now copy what we have to the user */ 3028 /* Now copy what we have to the user */
2936 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 3029 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -2993,7 +3086,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
2993 break; 3086 break;
2994 } 3087 }
2995 3088
2996 trace_consume(iter); 3089 if (ret != TRACE_TYPE_NO_CONSUME)
3090 trace_consume(iter);
2997 rem -= count; 3091 rem -= count;
2998 if (!find_next_entry_inc(iter)) { 3092 if (!find_next_entry_inc(iter)) {
2999 rem = 0; 3093 rem = 0;
@@ -3053,6 +3147,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3053 goto out_err; 3147 goto out_err;
3054 } 3148 }
3055 3149
3150 trace_event_read_lock();
3151
3056 /* Fill as many pages as possible. */ 3152 /* Fill as many pages as possible. */
3057 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3153 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
3058 pages[i] = alloc_page(GFP_KERNEL); 3154 pages[i] = alloc_page(GFP_KERNEL);
@@ -3075,6 +3171,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3075 trace_seq_init(&iter->seq); 3171 trace_seq_init(&iter->seq);
3076 } 3172 }
3077 3173
3174 trace_event_read_unlock();
3078 mutex_unlock(&iter->mutex); 3175 mutex_unlock(&iter->mutex);
3079 3176
3080 spd.nr_pages = i; 3177 spd.nr_pages = i;
@@ -3425,7 +3522,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3425 .spd_release = buffer_spd_release, 3522 .spd_release = buffer_spd_release,
3426 }; 3523 };
3427 struct buffer_ref *ref; 3524 struct buffer_ref *ref;
3428 int size, i; 3525 int entries, size, i;
3429 size_t ret; 3526 size_t ret;
3430 3527
3431 if (*ppos & (PAGE_SIZE - 1)) { 3528 if (*ppos & (PAGE_SIZE - 1)) {
@@ -3440,7 +3537,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3440 len &= PAGE_MASK; 3537 len &= PAGE_MASK;
3441 } 3538 }
3442 3539
3443 for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { 3540 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3541
3542 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
3444 struct page *page; 3543 struct page *page;
3445 int r; 3544 int r;
3446 3545
@@ -3457,7 +3556,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3457 } 3556 }
3458 3557
3459 r = ring_buffer_read_page(ref->buffer, &ref->page, 3558 r = ring_buffer_read_page(ref->buffer, &ref->page,
3460 len, info->cpu, 0); 3559 len, info->cpu, 1);
3461 if (r < 0) { 3560 if (r < 0) {
3462 ring_buffer_free_read_page(ref->buffer, 3561 ring_buffer_free_read_page(ref->buffer,
3463 ref->page); 3562 ref->page);
@@ -3481,6 +3580,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3481 spd.partial[i].private = (unsigned long)ref; 3580 spd.partial[i].private = (unsigned long)ref;
3482 spd.nr_pages++; 3581 spd.nr_pages++;
3483 *ppos += PAGE_SIZE; 3582 *ppos += PAGE_SIZE;
3583
3584 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3484 } 3585 }
3485 3586
3486 spd.nr_pages = i; 3587 spd.nr_pages = i;
@@ -3508,6 +3609,45 @@ static const struct file_operations tracing_buffers_fops = {
3508 .llseek = no_llseek, 3609 .llseek = no_llseek,
3509}; 3610};
3510 3611
3612static ssize_t
3613tracing_stats_read(struct file *filp, char __user *ubuf,
3614 size_t count, loff_t *ppos)
3615{
3616 unsigned long cpu = (unsigned long)filp->private_data;
3617 struct trace_array *tr = &global_trace;
3618 struct trace_seq *s;
3619 unsigned long cnt;
3620
3621 s = kmalloc(sizeof(*s), GFP_KERNEL);
3622 if (!s)
3623		return -ENOMEM;
3624
3625 trace_seq_init(s);
3626
3627 cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
3628 trace_seq_printf(s, "entries: %ld\n", cnt);
3629
3630 cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
3631 trace_seq_printf(s, "overrun: %ld\n", cnt);
3632
3633 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3634 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3635
3636 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3637 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3638
3639 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3640
3641 kfree(s);
3642
3643 return count;
3644}
3645
3646static const struct file_operations tracing_stats_fops = {
3647 .open = tracing_open_generic,
3648 .read = tracing_stats_read,
3649};
3650
3511#ifdef CONFIG_DYNAMIC_FTRACE 3651#ifdef CONFIG_DYNAMIC_FTRACE
3512 3652
3513int __weak ftrace_arch_read_dyn_info(char *buf, int size) 3653int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3597,7 +3737,7 @@ struct dentry *tracing_dentry_percpu(void)
3597static void tracing_init_debugfs_percpu(long cpu) 3737static void tracing_init_debugfs_percpu(long cpu)
3598{ 3738{
3599 struct dentry *d_percpu = tracing_dentry_percpu(); 3739 struct dentry *d_percpu = tracing_dentry_percpu();
3600 struct dentry *entry, *d_cpu; 3740 struct dentry *d_cpu;
3601 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 3741 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
3602 char cpu_dir[7]; 3742 char cpu_dir[7];
3603 3743
@@ -3612,21 +3752,18 @@ static void tracing_init_debugfs_percpu(long cpu)
3612 } 3752 }
3613 3753
3614 /* per cpu trace_pipe */ 3754 /* per cpu trace_pipe */
3615 entry = debugfs_create_file("trace_pipe", 0444, d_cpu, 3755 trace_create_file("trace_pipe", 0444, d_cpu,
3616 (void *) cpu, &tracing_pipe_fops); 3756 (void *) cpu, &tracing_pipe_fops);
3617 if (!entry)
3618 pr_warning("Could not create debugfs 'trace_pipe' entry\n");
3619 3757
3620 /* per cpu trace */ 3758 /* per cpu trace */
3621 entry = debugfs_create_file("trace", 0644, d_cpu, 3759 trace_create_file("trace", 0644, d_cpu,
3622 (void *) cpu, &tracing_fops); 3760 (void *) cpu, &tracing_fops);
3623 if (!entry)
3624 pr_warning("Could not create debugfs 'trace' entry\n");
3625 3761
3626 entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, 3762 trace_create_file("trace_pipe_raw", 0444, d_cpu,
3627 (void *) cpu, &tracing_buffers_fops); 3763 (void *) cpu, &tracing_buffers_fops);
3628 if (!entry) 3764
3629 pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); 3765 trace_create_file("stats", 0444, d_cpu,
3766 (void *) cpu, &tracing_stats_fops);
3630} 3767}
3631 3768
3632#ifdef CONFIG_FTRACE_SELFTEST 3769#ifdef CONFIG_FTRACE_SELFTEST
@@ -3759,17 +3896,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
3759 if (ret < 0) 3896 if (ret < 0)
3760 return ret; 3897 return ret;
3761 3898
3762 switch (val) { 3899 if (val != 0 && val != 1)
3763 case 0:
3764 trace_flags &= ~(1 << index);
3765 break;
3766 case 1:
3767 trace_flags |= 1 << index;
3768 break;
3769
3770 default:
3771 return -EINVAL; 3900 return -EINVAL;
3772 } 3901 set_tracer_flags(1 << index, val);
3773 3902
3774 *ppos += cnt; 3903 *ppos += cnt;
3775 3904
@@ -3782,6 +3911,22 @@ static const struct file_operations trace_options_core_fops = {
3782 .write = trace_options_core_write, 3911 .write = trace_options_core_write,
3783}; 3912};
3784 3913
3914struct dentry *trace_create_file(const char *name,
3915 mode_t mode,
3916 struct dentry *parent,
3917 void *data,
3918 const struct file_operations *fops)
3919{
3920 struct dentry *ret;
3921
3922 ret = debugfs_create_file(name, mode, parent, data, fops);
3923 if (!ret)
3924 pr_warning("Could not create debugfs '%s' entry\n", name);
3925
3926 return ret;
3927}
3928
3929
3785static struct dentry *trace_options_init_dentry(void) 3930static struct dentry *trace_options_init_dentry(void)
3786{ 3931{
3787 struct dentry *d_tracer; 3932 struct dentry *d_tracer;
@@ -3809,7 +3954,6 @@ create_trace_option_file(struct trace_option_dentry *topt,
3809 struct tracer_opt *opt) 3954 struct tracer_opt *opt)
3810{ 3955{
3811 struct dentry *t_options; 3956 struct dentry *t_options;
3812 struct dentry *entry;
3813 3957
3814 t_options = trace_options_init_dentry(); 3958 t_options = trace_options_init_dentry();
3815 if (!t_options) 3959 if (!t_options)
@@ -3818,11 +3962,9 @@ create_trace_option_file(struct trace_option_dentry *topt,
3818 topt->flags = flags; 3962 topt->flags = flags;
3819 topt->opt = opt; 3963 topt->opt = opt;
3820 3964
3821 entry = debugfs_create_file(opt->name, 0644, t_options, topt, 3965 topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
3822 &trace_options_fops); 3966 &trace_options_fops);
3823 3967
3824 topt->entry = entry;
3825
3826} 3968}
3827 3969
3828static struct trace_option_dentry * 3970static struct trace_option_dentry *
@@ -3877,123 +4019,84 @@ static struct dentry *
3877create_trace_option_core_file(const char *option, long index) 4019create_trace_option_core_file(const char *option, long index)
3878{ 4020{
3879 struct dentry *t_options; 4021 struct dentry *t_options;
3880 struct dentry *entry;
3881 4022
3882 t_options = trace_options_init_dentry(); 4023 t_options = trace_options_init_dentry();
3883 if (!t_options) 4024 if (!t_options)
3884 return NULL; 4025 return NULL;
3885 4026
3886 entry = debugfs_create_file(option, 0644, t_options, (void *)index, 4027 return trace_create_file(option, 0644, t_options, (void *)index,
3887 &trace_options_core_fops); 4028 &trace_options_core_fops);
3888
3889 return entry;
3890} 4029}
3891 4030
3892static __init void create_trace_options_dir(void) 4031static __init void create_trace_options_dir(void)
3893{ 4032{
3894 struct dentry *t_options; 4033 struct dentry *t_options;
3895 struct dentry *entry;
3896 int i; 4034 int i;
3897 4035
3898 t_options = trace_options_init_dentry(); 4036 t_options = trace_options_init_dentry();
3899 if (!t_options) 4037 if (!t_options)
3900 return; 4038 return;
3901 4039
3902 for (i = 0; trace_options[i]; i++) { 4040 for (i = 0; trace_options[i]; i++)
3903 entry = create_trace_option_core_file(trace_options[i], i); 4041 create_trace_option_core_file(trace_options[i], i);
3904 if (!entry)
3905 pr_warning("Could not create debugfs %s entry\n",
3906 trace_options[i]);
3907 }
3908} 4042}
3909 4043
3910static __init int tracer_init_debugfs(void) 4044static __init int tracer_init_debugfs(void)
3911{ 4045{
3912 struct dentry *d_tracer; 4046 struct dentry *d_tracer;
3913 struct dentry *entry;
3914 int cpu; 4047 int cpu;
3915 4048
3916 d_tracer = tracing_init_dentry(); 4049 d_tracer = tracing_init_dentry();
3917 4050
3918 entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, 4051 trace_create_file("tracing_enabled", 0644, d_tracer,
3919 &global_trace, &tracing_ctrl_fops); 4052 &global_trace, &tracing_ctrl_fops);
3920 if (!entry)
3921 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
3922 4053
3923 entry = debugfs_create_file("trace_options", 0644, d_tracer, 4054 trace_create_file("trace_options", 0644, d_tracer,
3924 NULL, &tracing_iter_fops); 4055 NULL, &tracing_iter_fops);
3925 if (!entry)
3926 pr_warning("Could not create debugfs 'trace_options' entry\n");
3927 4056
3928 create_trace_options_dir(); 4057 trace_create_file("tracing_cpumask", 0644, d_tracer,
4058 NULL, &tracing_cpumask_fops);
4059
4060 trace_create_file("trace", 0644, d_tracer,
4061 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
4062
4063 trace_create_file("available_tracers", 0444, d_tracer,
4064 &global_trace, &show_traces_fops);
4065
4066 trace_create_file("current_tracer", 0644, d_tracer,
4067 &global_trace, &set_tracer_fops);
3929 4068
3930 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 4069 trace_create_file("tracing_max_latency", 0644, d_tracer,
3931 NULL, &tracing_cpumask_fops); 4070 &tracing_max_latency, &tracing_max_lat_fops);
3932 if (!entry) 4071
3933 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); 4072 trace_create_file("tracing_thresh", 0644, d_tracer,
3934 4073 &tracing_thresh, &tracing_max_lat_fops);
3935 entry = debugfs_create_file("trace", 0644, d_tracer, 4074
3936 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); 4075 trace_create_file("README", 0444, d_tracer,
3937 if (!entry) 4076 NULL, &tracing_readme_fops);
3938 pr_warning("Could not create debugfs 'trace' entry\n"); 4077
3939 4078 trace_create_file("trace_pipe", 0444, d_tracer,
3940 entry = debugfs_create_file("available_tracers", 0444, d_tracer,
3941 &global_trace, &show_traces_fops);
3942 if (!entry)
3943 pr_warning("Could not create debugfs 'available_tracers' entry\n");
3944
3945 entry = debugfs_create_file("current_tracer", 0444, d_tracer,
3946 &global_trace, &set_tracer_fops);
3947 if (!entry)
3948 pr_warning("Could not create debugfs 'current_tracer' entry\n");
3949
3950 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
3951 &tracing_max_latency,
3952 &tracing_max_lat_fops);
3953 if (!entry)
3954 pr_warning("Could not create debugfs "
3955 "'tracing_max_latency' entry\n");
3956
3957 entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
3958 &tracing_thresh, &tracing_max_lat_fops);
3959 if (!entry)
3960 pr_warning("Could not create debugfs "
3961 "'tracing_thresh' entry\n");
3962 entry = debugfs_create_file("README", 0644, d_tracer,
3963 NULL, &tracing_readme_fops);
3964 if (!entry)
3965 pr_warning("Could not create debugfs 'README' entry\n");
3966
3967 entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
3968 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); 4079 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
3969 if (!entry) 4080
3970 pr_warning("Could not create debugfs " 4081 trace_create_file("buffer_size_kb", 0644, d_tracer,
3971 "'trace_pipe' entry\n"); 4082 &global_trace, &tracing_entries_fops);
3972 4083
3973 entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, 4084 trace_create_file("trace_marker", 0220, d_tracer,
3974 &global_trace, &tracing_entries_fops); 4085 NULL, &tracing_mark_fops);
3975 if (!entry) 4086
3976 pr_warning("Could not create debugfs " 4087 trace_create_file("saved_cmdlines", 0444, d_tracer,
3977 "'buffer_size_kb' entry\n"); 4088 NULL, &tracing_saved_cmdlines_fops);
3978
3979 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
3980 NULL, &tracing_mark_fops);
3981 if (!entry)
3982 pr_warning("Could not create debugfs "
3983 "'trace_marker' entry\n");
3984 4089
3985#ifdef CONFIG_DYNAMIC_FTRACE 4090#ifdef CONFIG_DYNAMIC_FTRACE
3986 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4091 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
3987 &ftrace_update_tot_cnt, 4092 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
3988 &tracing_dyn_info_fops);
3989 if (!entry)
3990 pr_warning("Could not create debugfs "
3991 "'dyn_ftrace_total_info' entry\n");
3992#endif 4093#endif
3993#ifdef CONFIG_SYSPROF_TRACER 4094#ifdef CONFIG_SYSPROF_TRACER
3994 init_tracer_sysprof_debugfs(d_tracer); 4095 init_tracer_sysprof_debugfs(d_tracer);
3995#endif 4096#endif
3996 4097
4098 create_trace_options_dir();
4099
3997 for_each_tracing_cpu(cpu) 4100 for_each_tracing_cpu(cpu)
3998 tracing_init_debugfs_percpu(cpu); 4101 tracing_init_debugfs_percpu(cpu);
3999 4102
@@ -4064,7 +4167,8 @@ trace_printk_seq(struct trace_seq *s)
4064 4167
4065static void __ftrace_dump(bool disable_tracing) 4168static void __ftrace_dump(bool disable_tracing)
4066{ 4169{
4067 static DEFINE_SPINLOCK(ftrace_dump_lock); 4170 static raw_spinlock_t ftrace_dump_lock =
4171 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
4068 /* use static because iter can be a bit big for the stack */ 4172 /* use static because iter can be a bit big for the stack */
4069 static struct trace_iterator iter; 4173 static struct trace_iterator iter;
4070 unsigned int old_userobj; 4174 unsigned int old_userobj;
@@ -4073,7 +4177,8 @@ static void __ftrace_dump(bool disable_tracing)
4073 int cnt = 0, cpu; 4177 int cnt = 0, cpu;
4074 4178
4075 /* only one dump */ 4179 /* only one dump */
4076 spin_lock_irqsave(&ftrace_dump_lock, flags); 4180 local_irq_save(flags);
4181 __raw_spin_lock(&ftrace_dump_lock);
4077 if (dump_ran) 4182 if (dump_ran)
4078 goto out; 4183 goto out;
4079 4184
@@ -4122,8 +4227,11 @@ static void __ftrace_dump(bool disable_tracing)
4122 iter.pos = -1; 4227 iter.pos = -1;
4123 4228
4124 if (find_next_entry_inc(&iter) != NULL) { 4229 if (find_next_entry_inc(&iter) != NULL) {
4125 print_trace_line(&iter); 4230 int ret;
4126 trace_consume(&iter); 4231
4232 ret = print_trace_line(&iter);
4233 if (ret != TRACE_TYPE_NO_CONSUME)
4234 trace_consume(&iter);
4127 } 4235 }
4128 4236
4129 trace_printk_seq(&iter.seq); 4237 trace_printk_seq(&iter.seq);
@@ -4145,7 +4253,8 @@ static void __ftrace_dump(bool disable_tracing)
4145 } 4253 }
4146 4254
4147 out: 4255 out:
4148 spin_unlock_irqrestore(&ftrace_dump_lock, flags); 4256 __raw_spin_unlock(&ftrace_dump_lock);
4257 local_irq_restore(flags);
4149} 4258}
4150 4259
4151/* By default: disable tracing after the dump */ 4260/* By default: disable tracing after the dump */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e685ac2b2ba1..8b9f4f6e9559 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,9 +9,12 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h> 11#include <trace/boot.h>
12#include <trace/kmemtrace.h> 12#include <linux/kmemtrace.h>
13#include <trace/power.h> 13#include <trace/power.h>
14 14
15#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h>
17
15enum trace_type { 18enum trace_type {
16 __TRACE_FIRST_TYPE = 0, 19 __TRACE_FIRST_TYPE = 0,
17 20
@@ -42,20 +45,6 @@ enum trace_type {
42}; 45};
43 46
44/* 47/*
45 * The trace entry - the most basic unit of tracing. This is what
46 * is printed in the end as a single line in the trace output, such as:
47 *
48 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
49 */
50struct trace_entry {
51 unsigned char type;
52 unsigned char flags;
53 unsigned char preempt_count;
54 int pid;
55 int tgid;
56};
57
58/*
59 * Function trace entry - function address and parent function address: 48 * Function trace entry - function address and parent function address:
60 */ 49 */
61struct ftrace_entry { 50struct ftrace_entry {
@@ -263,8 +252,6 @@ struct trace_array_cpu {
263 char comm[TASK_COMM_LEN]; 252 char comm[TASK_COMM_LEN];
264}; 253};
265 254
266struct trace_iterator;
267
268/* 255/*
269 * The trace array - an array of per-CPU trace arrays. This is the 256 * The trace array - an array of per-CPU trace arrays. This is the
270 * highest level data structure that individual tracers deal with. 257 * highest level data structure that individual tracers deal with.
@@ -339,15 +326,6 @@ extern void __ftrace_bad_type(void);
339 __ftrace_bad_type(); \ 326 __ftrace_bad_type(); \
340 } while (0) 327 } while (0)
341 328
342/* Return values for print_line callback */
343enum print_line_t {
344 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
345 TRACE_TYPE_HANDLED = 1,
346 TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */
347 TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */
348};
349
350
351/* 329/*
352 * An option specific to a tracer. This is a boolean value. 330 * An option specific to a tracer. This is a boolean value.
353 * The bit is the bit index that sets its value on the 331 * The bit is the bit index that sets its value on the
@@ -423,60 +401,30 @@ struct tracer {
423 struct tracer_stat *stats; 401 struct tracer_stat *stats;
424}; 402};
425 403
426struct trace_seq {
427 unsigned char buffer[PAGE_SIZE];
428 unsigned int len;
429 unsigned int readpos;
430};
431
432static inline void
433trace_seq_init(struct trace_seq *s)
434{
435 s->len = 0;
436 s->readpos = 0;
437}
438
439 404
440#define TRACE_PIPE_ALL_CPU -1 405#define TRACE_PIPE_ALL_CPU -1
441 406
442/*
443 * Trace iterator - used by printout routines who present trace
444 * results to users and which routines might sleep, etc:
445 */
446struct trace_iterator {
447 struct trace_array *tr;
448 struct tracer *trace;
449 void *private;
450 int cpu_file;
451 struct mutex mutex;
452 struct ring_buffer_iter *buffer_iter[NR_CPUS];
453
454 /* The below is zeroed out in pipe_read */
455 struct trace_seq seq;
456 struct trace_entry *ent;
457 int cpu;
458 u64 ts;
459
460 unsigned long iter_flags;
461 loff_t pos;
462 long idx;
463
464 cpumask_var_t started;
465};
466
467int tracer_init(struct tracer *t, struct trace_array *tr); 407int tracer_init(struct tracer *t, struct trace_array *tr);
468int tracing_is_enabled(void); 408int tracing_is_enabled(void);
469void trace_wake_up(void); 409void trace_wake_up(void);
470void tracing_reset(struct trace_array *tr, int cpu); 410void tracing_reset(struct trace_array *tr, int cpu);
471void tracing_reset_online_cpus(struct trace_array *tr); 411void tracing_reset_online_cpus(struct trace_array *tr);
412void tracing_reset_current(int cpu);
413void tracing_reset_current_online_cpus(void);
472int tracing_open_generic(struct inode *inode, struct file *filp); 414int tracing_open_generic(struct inode *inode, struct file *filp);
415struct dentry *trace_create_file(const char *name,
416 mode_t mode,
417 struct dentry *parent,
418 void *data,
419 const struct file_operations *fops);
420
473struct dentry *tracing_init_dentry(void); 421struct dentry *tracing_init_dentry(void);
474void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 422void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
475 423
476struct ring_buffer_event; 424struct ring_buffer_event;
477 425
478struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 426struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
479 unsigned char type, 427 int type,
480 unsigned long len, 428 unsigned long len,
481 unsigned long flags, 429 unsigned long flags,
482 int pc); 430 int pc);
@@ -484,24 +432,12 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
484 struct ring_buffer_event *event, 432 struct ring_buffer_event *event,
485 unsigned long flags, int pc); 433 unsigned long flags, int pc);
486 434
487struct ring_buffer_event *
488trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
489 unsigned long flags, int pc);
490void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
491 unsigned long flags, int pc);
492void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
493 unsigned long flags, int pc);
494
495struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 435struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
496 struct trace_array_cpu *data); 436 struct trace_array_cpu *data);
497 437
498struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 438struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
499 int *ent_cpu, u64 *ent_ts); 439 int *ent_cpu, u64 *ent_ts);
500 440
501void tracing_generic_entry_update(struct trace_entry *entry,
502 unsigned long flags,
503 int pc);
504
505void default_wait_pipe(struct trace_iterator *iter); 441void default_wait_pipe(struct trace_iterator *iter);
506void poll_wait_pipe(struct trace_iterator *iter); 442void poll_wait_pipe(struct trace_iterator *iter);
507 443
@@ -514,7 +450,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,
514 struct task_struct *prev, 450 struct task_struct *prev,
515 struct task_struct *next, 451 struct task_struct *next,
516 unsigned long flags, int pc); 452 unsigned long flags, int pc);
517void tracing_record_cmdline(struct task_struct *tsk);
518 453
519void tracing_sched_wakeup_trace(struct trace_array *tr, 454void tracing_sched_wakeup_trace(struct trace_array *tr,
520 struct task_struct *wakee, 455 struct task_struct *wakee,
@@ -599,6 +534,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
599 struct trace_array *tr); 534 struct trace_array *tr);
600extern int trace_selftest_startup_branch(struct tracer *trace, 535extern int trace_selftest_startup_branch(struct tracer *trace,
601 struct trace_array *tr); 536 struct trace_array *tr);
537extern int trace_selftest_startup_hw_branches(struct tracer *trace,
538 struct trace_array *tr);
602#endif /* CONFIG_FTRACE_STARTUP_TEST */ 539#endif /* CONFIG_FTRACE_STARTUP_TEST */
603 540
604extern void *head_page(struct trace_array_cpu *data); 541extern void *head_page(struct trace_array_cpu *data);
@@ -613,6 +550,8 @@ extern unsigned long trace_flags;
613/* Standard output formatting function used for function return traces */ 550/* Standard output formatting function used for function return traces */
614#ifdef CONFIG_FUNCTION_GRAPH_TRACER 551#ifdef CONFIG_FUNCTION_GRAPH_TRACER
615extern enum print_line_t print_graph_function(struct trace_iterator *iter); 552extern enum print_line_t print_graph_function(struct trace_iterator *iter);
553extern enum print_line_t
554trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
616 555
617#ifdef CONFIG_DYNAMIC_FTRACE 556#ifdef CONFIG_DYNAMIC_FTRACE
618/* TODO: make this variable */ 557/* TODO: make this variable */
@@ -644,7 +583,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
644 return 1; 583 return 1;
645} 584}
646#endif /* CONFIG_DYNAMIC_FTRACE */ 585#endif /* CONFIG_DYNAMIC_FTRACE */
647
648#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 586#else /* CONFIG_FUNCTION_GRAPH_TRACER */
649static inline enum print_line_t 587static inline enum print_line_t
650print_graph_function(struct trace_iterator *iter) 588print_graph_function(struct trace_iterator *iter)
@@ -655,6 +593,7 @@ print_graph_function(struct trace_iterator *iter)
655 593
656extern struct pid *ftrace_pid_trace; 594extern struct pid *ftrace_pid_trace;
657 595
596#ifdef CONFIG_FUNCTION_TRACER
658static inline int ftrace_trace_task(struct task_struct *task) 597static inline int ftrace_trace_task(struct task_struct *task)
659{ 598{
660 if (!ftrace_pid_trace) 599 if (!ftrace_pid_trace)
@@ -662,6 +601,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
662 601
663 return test_tsk_trace_trace(task); 602 return test_tsk_trace_trace(task);
664} 603}
604#else
605static inline int ftrace_trace_task(struct task_struct *task)
606{
607 return 1;
608}
609#endif
665 610
666/* 611/*
667 * trace_iterator_flags is an enumeration that defines bit 612 * trace_iterator_flags is an enumeration that defines bit
@@ -692,6 +637,7 @@ enum trace_iterator_flags {
692 TRACE_ITER_LATENCY_FMT = 0x40000, 637 TRACE_ITER_LATENCY_FMT = 0x40000,
693 TRACE_ITER_GLOBAL_CLK = 0x80000, 638 TRACE_ITER_GLOBAL_CLK = 0x80000,
694 TRACE_ITER_SLEEP_TIME = 0x100000, 639 TRACE_ITER_SLEEP_TIME = 0x100000,
640 TRACE_ITER_GRAPH_TIME = 0x200000,
695}; 641};
696 642
697/* 643/*
@@ -790,103 +736,113 @@ struct ftrace_event_field {
790 char *type; 736 char *type;
791 int offset; 737 int offset;
792 int size; 738 int size;
739 int is_signed;
793}; 740};
794 741
795struct ftrace_event_call { 742struct event_filter {
796 char *name; 743 int n_preds;
797 char *system;
798 struct dentry *dir;
799 int enabled;
800 int (*regfunc)(void);
801 void (*unregfunc)(void);
802 int id;
803 int (*raw_init)(void);
804 int (*show_format)(struct trace_seq *s);
805 int (*define_fields)(void);
806 struct list_head fields;
807 struct filter_pred **preds; 744 struct filter_pred **preds;
808 745 char *filter_string;
809#ifdef CONFIG_EVENT_PROFILE
810 atomic_t profile_count;
811 int (*profile_enable)(struct ftrace_event_call *);
812 void (*profile_disable)(struct ftrace_event_call *);
813#endif
814}; 746};
815 747
816struct event_subsystem { 748struct event_subsystem {
817 struct list_head list; 749 struct list_head list;
818 const char *name; 750 const char *name;
819 struct dentry *entry; 751 struct dentry *entry;
820 struct filter_pred **preds; 752 void *filter;
821}; 753};
822 754
823#define events_for_each(event) \
824 for (event = __start_ftrace_events; \
825 (unsigned long)event < (unsigned long)__stop_ftrace_events; \
826 event++)
827
828#define MAX_FILTER_PRED 8
829
830struct filter_pred; 755struct filter_pred;
831 756
832typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); 757typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
758 int val1, int val2);
833 759
834struct filter_pred { 760struct filter_pred {
835 filter_pred_fn_t fn; 761 filter_pred_fn_t fn;
836 u64 val; 762 u64 val;
837 char *str_val; 763 char str_val[MAX_FILTER_STR_VAL];
838 int str_len; 764 int str_len;
839 char *field_name; 765 char *field_name;
840 int offset; 766 int offset;
841 int not; 767 int not;
842 int or; 768 int op;
843 int compound; 769 int pop_n;
844 int clear;
845}; 770};
846 771
847int trace_define_field(struct ftrace_event_call *call, char *type, 772extern void print_event_filter(struct ftrace_event_call *call,
848 char *name, int offset, int size);
849extern void filter_free_pred(struct filter_pred *pred);
850extern void filter_print_preds(struct filter_pred **preds,
851 struct trace_seq *s); 773 struct trace_seq *s);
852extern int filter_parse(char **pbuf, struct filter_pred *pred); 774extern int apply_event_filter(struct ftrace_event_call *call,
853extern int filter_add_pred(struct ftrace_event_call *call, 775 char *filter_string);
854 struct filter_pred *pred); 776extern int apply_subsystem_event_filter(struct event_subsystem *system,
855extern void filter_free_preds(struct ftrace_event_call *call); 777 char *filter_string);
856extern int filter_match_preds(struct ftrace_event_call *call, void *rec); 778extern void print_subsystem_event_filter(struct event_subsystem *system,
857extern void filter_free_subsystem_preds(struct event_subsystem *system); 779 struct trace_seq *s);
858extern int filter_add_subsystem_pred(struct event_subsystem *system, 780
859 struct filter_pred *pred); 781static inline int
860 782filter_check_discard(struct ftrace_event_call *call, void *rec,
861void event_trace_printk(unsigned long ip, const char *fmt, ...); 783 struct ring_buffer *buffer,
862extern struct ftrace_event_call __start_ftrace_events[]; 784 struct ring_buffer_event *event)
863extern struct ftrace_event_call __stop_ftrace_events[]; 785{
864 786 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
865#define for_each_event(event) \ 787 ring_buffer_discard_commit(buffer, event);
866 for (event = __start_ftrace_events; \ 788 return 1;
867 (unsigned long)event < (unsigned long)__stop_ftrace_events; \ 789 }
868 event++) 790
791 return 0;
792}
793
794#define DEFINE_COMPARISON_PRED(type) \
795static int filter_pred_##type(struct filter_pred *pred, void *event, \
796 int val1, int val2) \
797{ \
798 type *addr = (type *)(event + pred->offset); \
799 type val = (type)pred->val; \
800 int match = 0; \
801 \
802 switch (pred->op) { \
803 case OP_LT: \
804 match = (*addr < val); \
805 break; \
806 case OP_LE: \
807 match = (*addr <= val); \
808 break; \
809 case OP_GT: \
810 match = (*addr > val); \
811 break; \
812 case OP_GE: \
813 match = (*addr >= val); \
814 break; \
815 default: \
816 break; \
817 } \
818 \
819 return match; \
820}
821
822#define DEFINE_EQUALITY_PRED(size) \
823static int filter_pred_##size(struct filter_pred *pred, void *event, \
824 int val1, int val2) \
825{ \
826 u##size *addr = (u##size *)(event + pred->offset); \
827 u##size val = (u##size)pred->val; \
828 int match; \
829 \
830 match = (val == *addr) ^ pred->not; \
831 \
832 return match; \
833}
834
835extern struct mutex event_mutex;
836extern struct list_head ftrace_events;
869 837
870extern const char *__start___trace_bprintk_fmt[]; 838extern const char *__start___trace_bprintk_fmt[];
871extern const char *__stop___trace_bprintk_fmt[]; 839extern const char *__stop___trace_bprintk_fmt[];
872 840
873/* 841#undef TRACE_EVENT_FORMAT
874 * The double __builtin_constant_p is because gcc will give us an error 842#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
875 * if we try to allocate the static variable to fmt if it is not a 843 extern struct ftrace_event_call event_##call;
876 * constant. Even with the outer if statement optimizing out. 844#undef TRACE_EVENT_FORMAT_NOFILTER
877 */ 845#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
878#define event_trace_printk(ip, fmt, args...) \ 846#include "trace_event_types.h"
879do { \
880 __trace_printk_check_format(fmt, ##args); \
881 tracing_record_cmdline(current); \
882 if (__builtin_constant_p(fmt)) { \
883 static const char *trace_printk_fmt \
884 __attribute__((section("__trace_printk_fmt"))) = \
885 __builtin_constant_p(fmt) ? fmt : NULL; \
886 \
887 __trace_bprintk(ip, trace_printk_fmt, ##args); \
888 } else \
889 __trace_printk(ip, fmt, ##args); \
890} while (0)
891 847
892#endif /* _LINUX_KERNEL_TRACE_H */ 848#endif /* _LINUX_KERNEL_TRACE_H */
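
The DEFINE_COMPARISON_PRED() and DEFINE_EQUALITY_PRED() helpers added above stamp out one comparison routine per field type, so the filter core can attach a type-appropriate callback to each predicate instead of switching on the field type for every record, and the new filter_check_discard() inline lets a tracer drop an already-reserved ring-buffer event when the active filter rejects it. Below is a minimal userspace sketch of the same macro pattern; the sample struct, field names and toy operator encoding are invented for illustration and are not part of the kernel code.

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    struct sample_event {               /* toy record standing in for a trace entry */
        int32_t  pid;
        uint64_t duration;
    };

    struct pred {                       /* cut-down predicate: field offset, value, operator */
        size_t   offset;
        uint64_t val;
        int      op;                    /* toy encoding: 0 '==', 1 '<', 2 '>' */
    };

    /* Generate one typed comparison routine per field type, as the kernel macro does. */
    #define DEFINE_CMP_PRED(type)                                   \
    static int pred_##type(struct pred *p, void *event)             \
    {                                                               \
        type *addr = (type *)((char *)event + p->offset);           \
        type val = (type)p->val;                                    \
                                                                    \
        switch (p->op) {                                            \
        case 0: return *addr == val;                                \
        case 1: return *addr <  val;                                \
        case 2: return *addr >  val;                                \
        }                                                           \
        return 0;                                                   \
    }

    DEFINE_CMP_PRED(int32_t)
    DEFINE_CMP_PRED(uint64_t)

    int main(void)
    {
        struct sample_event ev = { .pid = 42, .duration = 1500 };
        struct pred gt = { offsetof(struct sample_event, duration), 1000, 2 };
        struct pred eq = { offsetof(struct sample_event, pid), 42, 0 };

        /* "duration > 1000" and "pid == 42", each through its generated routine */
        printf("duration: %d, pid: %d\n",
               pred_uint64_t(&gt, &ev), pred_int32_t(&eq, &ev));
        return 0;
    }

Generating the routines once per type keeps the hot match path to a single indirect call per predicate.
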
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7a30fc4c3642..a29ef23ffb47 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/time.h>
12 13
13#include "trace.h" 14#include "trace.h"
14#include "trace_output.h" 15#include "trace_output.h"
@@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)
67 trace_assign_type(field, entry); 68 trace_assign_type(field, entry);
68 call = &field->boot_call; 69 call = &field->boot_call;
69 ts = iter->ts; 70 ts = iter->ts;
70 nsec_rem = do_div(ts, 1000000000); 71 nsec_rem = do_div(ts, NSEC_PER_SEC);
71 72
72 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", 73 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
73 (unsigned long)ts, nsec_rem, call->func, call->caller); 74 (unsigned long)ts, nsec_rem, call->func, call->caller);
@@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
92 trace_assign_type(field, entry); 93 trace_assign_type(field, entry);
93 init_ret = &field->boot_ret; 94 init_ret = &field->boot_ret;
94 ts = iter->ts; 95 ts = iter->ts;
95 nsec_rem = do_div(ts, 1000000000); 96 nsec_rem = do_div(ts, NSEC_PER_SEC);
96 97
97 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " 98 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
98 "returned %d after %llu msecs\n", 99 "returned %d after %llu msecs\n",
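
The trace_boot.c hunks only replace the open-coded 1000000000 with NSEC_PER_SEC; the arithmetic is unchanged: do_div() splits the nanosecond timestamp into whole seconds, left in ts, and the sub-second remainder printed through the %09ld conversion. A userspace equivalent of that split, purely for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
        uint64_t ts = 5123456789ULL;              /* 5.123456789 s expressed in ns */
        uint64_t nsec_rem = ts % NSEC_PER_SEC;    /* what do_div() hands back */

        ts /= NSEC_PER_SEC;                       /* do_div() leaves the quotient in ts */
        printf("[%5lu.%09lu]\n", (unsigned long)ts, (unsigned long)nsec_rem);
        return 0;
    }
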
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 8333715e4066..7a7a9fd249a9 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;
30static void 30static void
31probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) 31probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
32{ 32{
33 struct ftrace_event_call *call = &event_branch;
33 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
34 struct ring_buffer_event *event; 35 struct ring_buffer_event *event;
35 struct trace_branch *entry; 36 struct trace_branch *entry;
@@ -73,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
73 entry->line = f->line; 74 entry->line = f->line;
74 entry->correct = val == expect; 75 entry->correct = val == expect;
75 76
76 ring_buffer_unlock_commit(tr->buffer, event); 77 if (!filter_check_discard(call, entry, tr->buffer, event))
78 ring_buffer_unlock_commit(tr->buffer, event);
77 79
78 out: 80 out:
79 atomic_dec(&tr->data[cpu]->disabled); 81 atomic_dec(&tr->data[cpu]->disabled);
@@ -271,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
271 return 0; 273 return 0;
272} 274}
273 275
274static void *annotated_branch_stat_start(void) 276static void *annotated_branch_stat_start(struct tracer_stat *trace)
275{ 277{
276 return __start_annotated_branch_profile; 278 return __start_annotated_branch_profile;
277} 279}
@@ -346,7 +348,7 @@ static int all_branch_stat_headers(struct seq_file *m)
346 return 0; 348 return 0;
347} 349}
348 350
349static void *all_branch_stat_start(void) 351static void *all_branch_stat_start(struct tracer_stat *trace)
350{ 352{
351 return __start_branch_profile; 353 return __start_branch_profile;
352} 354}
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 22cba9970776..11ba5bb4ed0a 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -10,22 +10,30 @@
10int ftrace_profile_enable(int event_id) 10int ftrace_profile_enable(int event_id)
11{ 11{
12 struct ftrace_event_call *event; 12 struct ftrace_event_call *event;
13 int ret = -EINVAL;
13 14
14 for_each_event(event) { 15 mutex_lock(&event_mutex);
15 if (event->id == event_id) 16 list_for_each_entry(event, &ftrace_events, list) {
16 return event->profile_enable(event); 17 if (event->id == event_id && event->profile_enable) {
18 ret = event->profile_enable(event);
19 break;
20 }
17 } 21 }
22 mutex_unlock(&event_mutex);
18 23
19 return -EINVAL; 24 return ret;
20} 25}
21 26
22void ftrace_profile_disable(int event_id) 27void ftrace_profile_disable(int event_id)
23{ 28{
24 struct ftrace_event_call *event; 29 struct ftrace_event_call *event;
25 30
26 for_each_event(event) { 31 mutex_lock(&event_mutex);
27 if (event->id == event_id) 32 list_for_each_entry(event, &ftrace_events, list) {
28 return event->profile_disable(event); 33 if (event->id == event_id) {
34 event->profile_disable(event);
35 break;
36 }
29 } 37 }
38 mutex_unlock(&event_mutex);
30} 39}
31
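
ftrace_profile_enable() and ftrace_profile_disable() now walk the ftrace_events list under event_mutex instead of indexing the old static section, and enable() only succeeds when an event with the requested id also provides a profile_enable() hook. The same lock, walk, match-and-break shape as a standalone sketch; the hand-rolled singly linked list and the pthread mutex merely stand in for list_head and event_mutex and are not kernel interfaces.

    #include <stdio.h>
    #include <errno.h>
    #include <pthread.h>

    struct event {
        int id;
        int enabled;
        struct event *next;
    };

    static struct event *events;                    /* head of the registered-event list */
    static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* Enable the event with a matching id; -EINVAL when none is registered. */
    static int profile_enable(int event_id)
    {
        struct event *ev;
        int ret = -EINVAL;

        pthread_mutex_lock(&event_mutex);
        for (ev = events; ev; ev = ev->next) {
            if (ev->id == event_id) {
                ev->enabled = 1;
                ret = 0;
                break;
            }
        }
        pthread_mutex_unlock(&event_mutex);

        return ret;
    }

    int main(void)
    {
        struct event e = { .id = 7, .enabled = 0, .next = NULL };

        events = &e;
        printf("enable 7: %d, enable 9: %d\n", profile_enable(7), profile_enable(9));
        return 0;
    }
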
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index fd78bee71dd7..6db005e12487 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore, 26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT( 27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func) 28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
29 TRACE_FIELD(int, ret.depth, depth) 32 TRACE_FIELD(int, ret.depth, depth)
30 ), 33 ),
31 TP_RAW_FMT("<-- %lx (%d)") 34 TP_RAW_FMT("<-- %lx (%d)")
@@ -57,7 +60,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
57 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") 60 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
58); 61);
59 62
60TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, 63TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
61 TRACE_STRUCT( 64 TRACE_STRUCT(
62 TRACE_FIELD(unsigned long, arg1, arg1) 65 TRACE_FIELD(unsigned long, arg1, arg1)
63 TRACE_FIELD(unsigned long, arg2, arg2) 66 TRACE_FIELD(unsigned long, arg2, arg2)
@@ -122,8 +125,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
122TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, 125TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
123 TRACE_STRUCT( 126 TRACE_STRUCT(
124 TRACE_FIELD(unsigned int, line, line) 127 TRACE_FIELD(unsigned int, line, line)
125 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) 128 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
126 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) 129 TRACE_FUNC_SIZE+1, func)
130 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
131 TRACE_FUNC_SIZE+1, file)
127 TRACE_FIELD(char, correct, correct) 132 TRACE_FIELD(char, correct, correct)
128 ), 133 ),
129 TP_RAW_FMT("%u:%s:%s (%u)") 134 TP_RAW_FMT("%u:%s:%s (%u)")
@@ -139,8 +144,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
139 144
140TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, 145TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
141 TRACE_STRUCT( 146 TRACE_STRUCT(
142 TRACE_FIELD(ktime_t, state_data.stamp, stamp) 147 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
143 TRACE_FIELD(ktime_t, state_data.end, end) 148 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
144 TRACE_FIELD(int, state_data.type, type) 149 TRACE_FIELD(int, state_data.type, type)
145 TRACE_FIELD(int, state_data.state, state) 150 TRACE_FIELD(int, state_data.state, state)
146 ), 151 ),
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 576f4fa2af0d..e75276a49cf5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,19 +8,25 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/workqueue.h>
12#include <linux/spinlock.h>
13#include <linux/kthread.h>
11#include <linux/debugfs.h> 14#include <linux/debugfs.h>
12#include <linux/uaccess.h> 15#include <linux/uaccess.h>
13#include <linux/module.h> 16#include <linux/module.h>
14#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h>
15 19
16#include "trace_output.h" 20#include "trace_output.h"
17 21
18#define TRACE_SYSTEM "TRACE_SYSTEM" 22#define TRACE_SYSTEM "TRACE_SYSTEM"
19 23
20static DEFINE_MUTEX(event_mutex); 24DEFINE_MUTEX(event_mutex);
25
26LIST_HEAD(ftrace_events);
21 27
22int trace_define_field(struct ftrace_event_call *call, char *type, 28int trace_define_field(struct ftrace_event_call *call, char *type,
23 char *name, int offset, int size) 29 char *name, int offset, int size, int is_signed)
24{ 30{
25 struct ftrace_event_field *field; 31 struct ftrace_event_field *field;
26 32
@@ -38,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
38 44
39 field->offset = offset; 45 field->offset = offset;
40 field->size = size; 46 field->size = size;
47 field->is_signed = is_signed;
41 list_add(&field->link, &call->fields); 48 list_add(&field->link, &call->fields);
42 49
43 return 0; 50 return 0;
@@ -51,47 +58,94 @@ err:
51 58
52 return -ENOMEM; 59 return -ENOMEM;
53} 60}
61EXPORT_SYMBOL_GPL(trace_define_field);
54 62
55static void ftrace_clear_events(void) 63#ifdef CONFIG_MODULES
56{
57 struct ftrace_event_call *call = (void *)__start_ftrace_events;
58
59 64
60 while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { 65static void trace_destroy_fields(struct ftrace_event_call *call)
66{
67 struct ftrace_event_field *field, *next;
61 68
62 if (call->enabled) { 69 list_for_each_entry_safe(field, next, &call->fields, link) {
63 call->enabled = 0; 70 list_del(&field->link);
64 call->unregfunc(); 71 kfree(field->type);
65 } 72 kfree(field->name);
66 call++; 73 kfree(field);
67 } 74 }
68} 75}
69 76
77#endif /* CONFIG_MODULES */
78
70static void ftrace_event_enable_disable(struct ftrace_event_call *call, 79static void ftrace_event_enable_disable(struct ftrace_event_call *call,
71 int enable) 80 int enable)
72{ 81{
73
74 switch (enable) { 82 switch (enable) {
75 case 0: 83 case 0:
76 if (call->enabled) { 84 if (call->enabled) {
77 call->enabled = 0; 85 call->enabled = 0;
86 tracing_stop_cmdline_record();
78 call->unregfunc(); 87 call->unregfunc();
79 } 88 }
80 break; 89 break;
81 case 1: 90 case 1:
82 if (!call->enabled) { 91 if (!call->enabled) {
83 call->enabled = 1; 92 call->enabled = 1;
93 tracing_start_cmdline_record();
84 call->regfunc(); 94 call->regfunc();
85 } 95 }
86 break; 96 break;
87 } 97 }
88} 98}
89 99
100static void ftrace_clear_events(void)
101{
102 struct ftrace_event_call *call;
103
104 mutex_lock(&event_mutex);
105 list_for_each_entry(call, &ftrace_events, list) {
106 ftrace_event_enable_disable(call, 0);
107 }
108 mutex_unlock(&event_mutex);
109}
110
111/*
112 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
113 */
114static int __ftrace_set_clr_event(const char *match, const char *sub,
115 const char *event, int set)
116{
117 struct ftrace_event_call *call;
118 int ret = -EINVAL;
119
120 mutex_lock(&event_mutex);
121 list_for_each_entry(call, &ftrace_events, list) {
122
123 if (!call->name || !call->regfunc)
124 continue;
125
126 if (match &&
127 strcmp(match, call->name) != 0 &&
128 strcmp(match, call->system) != 0)
129 continue;
130
131 if (sub && strcmp(sub, call->system) != 0)
132 continue;
133
134 if (event && strcmp(event, call->name) != 0)
135 continue;
136
137 ftrace_event_enable_disable(call, set);
138
139 ret = 0;
140 }
141 mutex_unlock(&event_mutex);
142
143 return ret;
144}
145
90static int ftrace_set_clr_event(char *buf, int set) 146static int ftrace_set_clr_event(char *buf, int set)
91{ 147{
92 struct ftrace_event_call *call = __start_ftrace_events;
93 char *event = NULL, *sub = NULL, *match; 148 char *event = NULL, *sub = NULL, *match;
94 int ret = -EINVAL;
95 149
96 /* 150 /*
97 * The buf format can be <subsystem>:<event-name> 151 * The buf format can be <subsystem>:<event-name>
@@ -117,30 +171,24 @@ static int ftrace_set_clr_event(char *buf, int set)
117 event = NULL; 171 event = NULL;
118 } 172 }
119 173
120 mutex_lock(&event_mutex); 174 return __ftrace_set_clr_event(match, sub, event, set);
121 for_each_event(call) { 175}
122
123 if (!call->name || !call->regfunc)
124 continue;
125
126 if (match &&
127 strcmp(match, call->name) != 0 &&
128 strcmp(match, call->system) != 0)
129 continue;
130
131 if (sub && strcmp(sub, call->system) != 0)
132 continue;
133
134 if (event && strcmp(event, call->name) != 0)
135 continue;
136
137 ftrace_event_enable_disable(call, set);
138
139 ret = 0;
140 }
141 mutex_unlock(&event_mutex);
142 176
143 return ret; 177/**
178 * trace_set_clr_event - enable or disable an event
179 * @system: system name to match (NULL for any system)
180 * @event: event name to match (NULL for all events, within system)
181 * @set: 1 to enable, 0 to disable
182 *
183 * This is a way for other parts of the kernel to enable or disable
184 * event recording.
185 *
186 * Returns 0 on success, -EINVAL if the parameters do not match any
187 * registered events.
188 */
189int trace_set_clr_event(const char *system, const char *event, int set)
190{
191 return __ftrace_set_clr_event(NULL, system, event, set);
144} 192}
145 193
146/* 128 should be much more than enough */ 194/* 128 should be much more than enough */
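
ftrace_set_clr_event() is reduced to splitting the written buffer into the match, sub and event strings and delegating to the new __ftrace_set_clr_event(), where a bare match word is compared against both the event name and the subsystem name while sub and event constrain each one individually; the exported trace_set_clr_event() gives the rest of the kernel the same interface. A rough standalone sketch of one way to do that <subsystem>:<event-name> split; it is not the kernel's parser, and the sample name is just a placeholder.

    #include <stdio.h>
    #include <string.h>

    /*
     * Toy splitter for the "<subsystem>:<event-name>" convention:
     *   "sub:event" -> sub + event
     *   "sub:"      -> subsystem only
     *   ":event"    -> event only
     *   "word"      -> a bare match word
     */
    static void split_event_spec(char *buf, const char **match,
                                 const char **sub, const char **event)
    {
        char *colon = strchr(buf, ':');

        *match = *sub = *event = NULL;

        if (!colon) {
            *match = buf;                 /* bare word: match name or system */
            return;
        }

        *colon = '\0';
        if (*buf)
            *sub = buf;                   /* text before ':' names the subsystem */
        if (*(colon + 1))
            *event = colon + 1;           /* text after ':' names the event */
    }

    int main(void)
    {
        char spec[] = "sched:sched_switch";
        const char *match, *sub, *event;

        split_event_spec(spec, &match, &sub, &event);
        printf("match=%s sub=%s event=%s\n",
               match ? match : "-", sub ? sub : "-", event ? event : "-");
        return 0;
    }
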
@@ -224,15 +272,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
224static void * 272static void *
225t_next(struct seq_file *m, void *v, loff_t *pos) 273t_next(struct seq_file *m, void *v, loff_t *pos)
226{ 274{
227 struct ftrace_event_call *call = m->private; 275 struct list_head *list = m->private;
228 struct ftrace_event_call *next = call; 276 struct ftrace_event_call *call;
229 277
230 (*pos)++; 278 (*pos)++;
231 279
232 for (;;) { 280 for (;;) {
233 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) 281 if (list == &ftrace_events)
234 return NULL; 282 return NULL;
235 283
284 call = list_entry(list, struct ftrace_event_call, list);
285
236 /* 286 /*
237 * The ftrace subsystem is for showing formats only. 287 * The ftrace subsystem is for showing formats only.
238 * They can not be enabled or disabled via the event files. 288 * They can not be enabled or disabled via the event files.
@@ -240,46 +290,68 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
240 if (call->regfunc) 290 if (call->regfunc)
241 break; 291 break;
242 292
243 call++; 293 list = list->next;
244 next = call;
245 } 294 }
246 295
247 m->private = ++next; 296 m->private = list->next;
248 297
249 return call; 298 return call;
250} 299}
251 300
252static void *t_start(struct seq_file *m, loff_t *pos) 301static void *t_start(struct seq_file *m, loff_t *pos)
253{ 302{
254 return t_next(m, NULL, pos); 303 struct ftrace_event_call *call = NULL;
304 loff_t l;
305
306 mutex_lock(&event_mutex);
307
308 m->private = ftrace_events.next;
309 for (l = 0; l <= *pos; ) {
310 call = t_next(m, NULL, &l);
311 if (!call)
312 break;
313 }
314 return call;
255} 315}
256 316
257static void * 317static void *
258s_next(struct seq_file *m, void *v, loff_t *pos) 318s_next(struct seq_file *m, void *v, loff_t *pos)
259{ 319{
260 struct ftrace_event_call *call = m->private; 320 struct list_head *list = m->private;
261 struct ftrace_event_call *next; 321 struct ftrace_event_call *call;
262 322
263 (*pos)++; 323 (*pos)++;
264 324
265 retry: 325 retry:
266 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) 326 if (list == &ftrace_events)
267 return NULL; 327 return NULL;
268 328
329 call = list_entry(list, struct ftrace_event_call, list);
330
269 if (!call->enabled) { 331 if (!call->enabled) {
270 call++; 332 list = list->next;
271 goto retry; 333 goto retry;
272 } 334 }
273 335
274 next = call; 336 m->private = list->next;
275 m->private = ++next;
276 337
277 return call; 338 return call;
278} 339}
279 340
280static void *s_start(struct seq_file *m, loff_t *pos) 341static void *s_start(struct seq_file *m, loff_t *pos)
281{ 342{
282 return s_next(m, NULL, pos); 343 struct ftrace_event_call *call = NULL;
344 loff_t l;
345
346 mutex_lock(&event_mutex);
347
348 m->private = ftrace_events.next;
349 for (l = 0; l <= *pos; ) {
350 call = s_next(m, NULL, &l);
351 if (!call)
352 break;
353 }
354 return call;
283} 355}
284 356
285static int t_show(struct seq_file *m, void *v) 357static int t_show(struct seq_file *m, void *v)
@@ -295,26 +367,20 @@ static int t_show(struct seq_file *m, void *v)
295 367
296static void t_stop(struct seq_file *m, void *p) 368static void t_stop(struct seq_file *m, void *p)
297{ 369{
370 mutex_unlock(&event_mutex);
298} 371}
299 372
300static int 373static int
301ftrace_event_seq_open(struct inode *inode, struct file *file) 374ftrace_event_seq_open(struct inode *inode, struct file *file)
302{ 375{
303 int ret;
304 const struct seq_operations *seq_ops; 376 const struct seq_operations *seq_ops;
305 377
306 if ((file->f_mode & FMODE_WRITE) && 378 if ((file->f_mode & FMODE_WRITE) &&
307 !(file->f_flags & O_APPEND)) 379 (file->f_flags & O_TRUNC))
308 ftrace_clear_events(); 380 ftrace_clear_events();
309 381
310 seq_ops = inode->i_private; 382 seq_ops = inode->i_private;
311 ret = seq_open(file, seq_ops); 383 return seq_open(file, seq_ops);
312 if (!ret) {
313 struct seq_file *m = file->private_data;
314
315 m->private = __start_ftrace_events;
316 }
317 return ret;
318} 384}
319 385
320static ssize_t 386static ssize_t
@@ -374,8 +440,93 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
374 return cnt; 440 return cnt;
375} 441}
376 442
443static ssize_t
444system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
445 loff_t *ppos)
446{
447 const char set_to_char[4] = { '?', '0', '1', 'X' };
448 const char *system = filp->private_data;
449 struct ftrace_event_call *call;
450 char buf[2];
451 int set = 0;
452 int ret;
453
454 mutex_lock(&event_mutex);
455 list_for_each_entry(call, &ftrace_events, list) {
456 if (!call->name || !call->regfunc)
457 continue;
458
459 if (system && strcmp(call->system, system) != 0)
460 continue;
461
462 /*
463 * We need to find out if all the events are set
464 * or if all events or cleared, or if we have
465 * a mixture.
466 */
467 set |= (1 << !!call->enabled);
468
469 /*
470 * If we have a mixture, no need to look further.
471 */
472 if (set == 3)
473 break;
474 }
475 mutex_unlock(&event_mutex);
476
477 buf[0] = set_to_char[set];
478 buf[1] = '\n';
479
480 ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
481
482 return ret;
483}
484
485static ssize_t
486system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
487 loff_t *ppos)
488{
489 const char *system = filp->private_data;
490 unsigned long val;
491 char buf[64];
492 ssize_t ret;
493
494 if (cnt >= sizeof(buf))
495 return -EINVAL;
496
497 if (copy_from_user(&buf, ubuf, cnt))
498 return -EFAULT;
499
500 buf[cnt] = 0;
501
502 ret = strict_strtoul(buf, 10, &val);
503 if (ret < 0)
504 return ret;
505
506 ret = tracing_update_buffers();
507 if (ret < 0)
508 return ret;
509
510 if (val != 0 && val != 1)
511 return -EINVAL;
512
513 ret = __ftrace_set_clr_event(NULL, system, NULL, val);
514 if (ret)
515 goto out;
516
517 ret = cnt;
518
519out:
520 *ppos += cnt;
521
522 return ret;
523}
524
525extern char *__bad_type_size(void);
526
377#undef FIELD 527#undef FIELD
378#define FIELD(type, name) \ 528#define FIELD(type, name) \
529 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
379 #type, "common_" #name, offsetof(typeof(field), name), \ 530 #type, "common_" #name, offsetof(typeof(field), name), \
380 sizeof(field.name) 531 sizeof(field.name)
381 532
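
In the system_enable_read() added above, the per-event state is folded into two bits: set |= 1 << !!call->enabled raises bit 0 when a disabled event is seen and bit 1 when an enabled one is seen, so the final value indexes straight into { '?', '0', '1', 'X' }, 'X' reporting a mixture, and the walk stops early once both bits are set. The trick in isolation, with toy data:

    #include <stdio.h>

    int main(void)
    {
        const char set_to_char[4] = { '?', '0', '1', 'X' };
        int enabled[] = { 1, 0, 1 };              /* per-event enabled flags (toy data) */
        int set = 0;
        unsigned int i;

        for (i = 0; i < sizeof(enabled) / sizeof(enabled[0]); i++) {
            set |= 1 << !!enabled[i];             /* bit 0: saw disabled, bit 1: saw enabled */
            if (set == 3)                         /* both seen: mixture, no need to look further */
                break;
        }

        printf("%c\n", set_to_char[set]);         /* prints 'X' for this mix */
        return 0;
    }
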
@@ -391,7 +542,7 @@ static int trace_write_header(struct trace_seq *s)
391 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 542 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
392 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 543 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
393 "\n", 544 "\n",
394 FIELD(unsigned char, type), 545 FIELD(unsigned short, type),
395 FIELD(unsigned char, flags), 546 FIELD(unsigned char, flags),
396 FIELD(unsigned char, preempt_count), 547 FIELD(unsigned char, preempt_count),
397 FIELD(int, pid), 548 FIELD(int, pid),
@@ -481,7 +632,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
481 632
482 trace_seq_init(s); 633 trace_seq_init(s);
483 634
484 filter_print_preds(call->preds, s); 635 print_event_filter(call, s);
485 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 636 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
486 637
487 kfree(s); 638 kfree(s);
@@ -494,38 +645,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
494 loff_t *ppos) 645 loff_t *ppos)
495{ 646{
496 struct ftrace_event_call *call = filp->private_data; 647 struct ftrace_event_call *call = filp->private_data;
497 char buf[64], *pbuf = buf; 648 char *buf;
498 struct filter_pred *pred;
499 int err; 649 int err;
500 650
501 if (cnt >= sizeof(buf)) 651 if (cnt >= PAGE_SIZE)
502 return -EINVAL; 652 return -EINVAL;
503 653
504 if (copy_from_user(&buf, ubuf, cnt)) 654 buf = (char *)__get_free_page(GFP_TEMPORARY);
505 return -EFAULT; 655 if (!buf)
506 buf[cnt] = '\0';
507
508 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
509 if (!pred)
510 return -ENOMEM; 656 return -ENOMEM;
511 657
512 err = filter_parse(&pbuf, pred); 658 if (copy_from_user(buf, ubuf, cnt)) {
513 if (err < 0) { 659 free_page((unsigned long) buf);
514 filter_free_pred(pred); 660 return -EFAULT;
515 return err;
516 }
517
518 if (pred->clear) {
519 filter_free_preds(call);
520 filter_free_pred(pred);
521 return cnt;
522 } 661 }
662 buf[cnt] = '\0';
523 663
524 err = filter_add_pred(call, pred); 664 err = apply_event_filter(call, buf);
525 if (err < 0) { 665 free_page((unsigned long) buf);
526 filter_free_pred(pred); 666 if (err < 0)
527 return err; 667 return err;
528 }
529 668
530 *ppos += cnt; 669 *ppos += cnt;
531 670
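
event_filter_write() now takes up to a page of text, copies it from userspace, NUL-terminates it and hands the whole string to apply_event_filter(), which parses expressions built from the operators listed later in trace_events_filter.c (==, !=, <, <=, >, >=, && and ||); the old one-predicate-per-write parsing is gone. A hedged userspace sketch of setting such a filter; the debugfs path and the field names are assumptions about a typical tracing setup, not something defined by this patch.

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        /* Path and fields are assumptions (typical debugfs layout); adjust for the local system. */
        const char *path = "/sys/kernel/debug/tracing/events/sched/sched_switch/filter";
        const char *expr = "prev_pid == 0 || next_pid == 0";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, expr, strlen(expr)) < 0)    /* one write; the kernel parses the whole string */
            perror("write");
        close(fd);
        return 0;
    }
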
@@ -549,7 +688,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
549 688
550 trace_seq_init(s); 689 trace_seq_init(s);
551 690
552 filter_print_preds(system->preds, s); 691 print_subsystem_event_filter(system, s);
553 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 692 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
554 693
555 kfree(s); 694 kfree(s);
@@ -562,45 +701,56 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
562 loff_t *ppos) 701 loff_t *ppos)
563{ 702{
564 struct event_subsystem *system = filp->private_data; 703 struct event_subsystem *system = filp->private_data;
565 char buf[64], *pbuf = buf; 704 char *buf;
566 struct filter_pred *pred;
567 int err; 705 int err;
568 706
569 if (cnt >= sizeof(buf)) 707 if (cnt >= PAGE_SIZE)
570 return -EINVAL; 708 return -EINVAL;
571 709
572 if (copy_from_user(&buf, ubuf, cnt)) 710 buf = (char *)__get_free_page(GFP_TEMPORARY);
573 return -EFAULT; 711 if (!buf)
574 buf[cnt] = '\0';
575
576 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
577 if (!pred)
578 return -ENOMEM; 712 return -ENOMEM;
579 713
580 err = filter_parse(&pbuf, pred); 714 if (copy_from_user(buf, ubuf, cnt)) {
581 if (err < 0) { 715 free_page((unsigned long) buf);
582 filter_free_pred(pred); 716 return -EFAULT;
583 return err;
584 }
585
586 if (pred->clear) {
587 filter_free_subsystem_preds(system);
588 filter_free_pred(pred);
589 return cnt;
590 } 717 }
718 buf[cnt] = '\0';
591 719
592 err = filter_add_subsystem_pred(system, pred); 720 err = apply_subsystem_event_filter(system, buf);
593 if (err < 0) { 721 free_page((unsigned long) buf);
594 filter_free_subsystem_preds(system); 722 if (err < 0)
595 filter_free_pred(pred);
596 return err; 723 return err;
597 }
598 724
599 *ppos += cnt; 725 *ppos += cnt;
600 726
601 return cnt; 727 return cnt;
602} 728}
603 729
730static ssize_t
731show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
732{
733 int (*func)(struct trace_seq *s) = filp->private_data;
734 struct trace_seq *s;
735 int r;
736
737 if (*ppos)
738 return 0;
739
740 s = kmalloc(sizeof(*s), GFP_KERNEL);
741 if (!s)
742 return -ENOMEM;
743
744 trace_seq_init(s);
745
746 func(s);
747 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
748
749 kfree(s);
750
751 return r;
752}
753
604static const struct seq_operations show_event_seq_ops = { 754static const struct seq_operations show_event_seq_ops = {
605 .start = t_start, 755 .start = t_start,
606 .next = t_next, 756 .next = t_next,
@@ -658,6 +808,17 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
658 .write = subsystem_filter_write, 808 .write = subsystem_filter_write,
659}; 809};
660 810
811static const struct file_operations ftrace_system_enable_fops = {
812 .open = tracing_open_generic,
813 .read = system_enable_read,
814 .write = system_enable_write,
815};
816
817static const struct file_operations ftrace_show_header_fops = {
818 .open = tracing_open_generic,
819 .read = show_header,
820};
821
661static struct dentry *event_trace_events_dir(void) 822static struct dentry *event_trace_events_dir(void)
662{ 823{
663 static struct dentry *d_tracer; 824 static struct dentry *d_tracer;
@@ -684,6 +845,7 @@ static struct dentry *
684event_subsystem_dir(const char *name, struct dentry *d_events) 845event_subsystem_dir(const char *name, struct dentry *d_events)
685{ 846{
686 struct event_subsystem *system; 847 struct event_subsystem *system;
848 struct dentry *entry;
687 849
688 /* First see if we did not already create this dir */ 850 /* First see if we did not already create this dir */
689 list_for_each_entry(system, &event_subsystems, list) { 851 list_for_each_entry(system, &event_subsystems, list) {
@@ -707,16 +869,46 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
707 return d_events; 869 return d_events;
708 } 870 }
709 871
710 system->name = name; 872 system->name = kstrdup(name, GFP_KERNEL);
873 if (!system->name) {
874 debugfs_remove(system->entry);
875 kfree(system);
876 return d_events;
877 }
878
711 list_add(&system->list, &event_subsystems); 879 list_add(&system->list, &event_subsystems);
712 880
713 system->preds = NULL; 881 system->filter = NULL;
882
883 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
884 if (!system->filter) {
885 pr_warning("Could not allocate filter for subsystem "
886 "'%s'\n", name);
887 return system->entry;
888 }
889
890 entry = debugfs_create_file("filter", 0644, system->entry, system,
891 &ftrace_subsystem_filter_fops);
892 if (!entry) {
893 kfree(system->filter);
894 system->filter = NULL;
895 pr_warning("Could not create debugfs "
896 "'%s/filter' entry\n", name);
897 }
898
899 entry = trace_create_file("enable", 0644, system->entry,
900 (void *)system->name,
901 &ftrace_system_enable_fops);
714 902
715 return system->entry; 903 return system->entry;
716} 904}
717 905
718static int 906static int
719event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) 907event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
908 const struct file_operations *id,
909 const struct file_operations *enable,
910 const struct file_operations *filter,
911 const struct file_operations *format)
720{ 912{
721 struct dentry *entry; 913 struct dentry *entry;
722 int ret; 914 int ret;
@@ -725,7 +917,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
725 * If the trace point header did not define TRACE_SYSTEM 917 * If the trace point header did not define TRACE_SYSTEM
726 * then the system would be called "TRACE_SYSTEM". 918 * then the system would be called "TRACE_SYSTEM".
727 */ 919 */
728 if (strcmp(call->system, "TRACE_SYSTEM") != 0) 920 if (strcmp(call->system, TRACE_SYSTEM) != 0)
729 d_events = event_subsystem_dir(call->system, d_events); 921 d_events = event_subsystem_dir(call->system, d_events);
730 922
731 if (call->raw_init) { 923 if (call->raw_init) {
@@ -744,21 +936,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
744 return -1; 936 return -1;
745 } 937 }
746 938
747 if (call->regfunc) { 939 if (call->regfunc)
748 entry = debugfs_create_file("enable", 0644, call->dir, call, 940 entry = trace_create_file("enable", 0644, call->dir, call,
749 &ftrace_enable_fops); 941 enable);
750 if (!entry)
751 pr_warning("Could not create debugfs "
752 "'%s/enable' entry\n", call->name);
753 }
754 942
755 if (call->id) { 943 if (call->id && call->profile_enable)
756 entry = debugfs_create_file("id", 0444, call->dir, call, 944 entry = trace_create_file("id", 0444, call->dir, call,
757 &ftrace_event_id_fops); 945 id);
758 if (!entry)
759 pr_warning("Could not create debugfs '%s/id' entry\n",
760 call->name);
761 }
762 946
763 if (call->define_fields) { 947 if (call->define_fields) {
764 ret = call->define_fields(); 948 ret = call->define_fields();
@@ -767,32 +951,195 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
767 " events/%s\n", call->name); 951 " events/%s\n", call->name);
768 return ret; 952 return ret;
769 } 953 }
770 entry = debugfs_create_file("filter", 0644, call->dir, call, 954 entry = trace_create_file("filter", 0644, call->dir, call,
771 &ftrace_event_filter_fops); 955 filter);
772 if (!entry)
773 pr_warning("Could not create debugfs "
774 "'%s/filter' entry\n", call->name);
775 } 956 }
776 957
777 /* A trace may not want to export its format */ 958 /* A trace may not want to export its format */
778 if (!call->show_format) 959 if (!call->show_format)
779 return 0; 960 return 0;
780 961
781 entry = debugfs_create_file("format", 0444, call->dir, call, 962 entry = trace_create_file("format", 0444, call->dir, call,
782 &ftrace_event_format_fops); 963 format);
783 if (!entry)
784 pr_warning("Could not create debugfs "
785 "'%s/format' entry\n", call->name);
786 964
787 return 0; 965 return 0;
788} 966}
789 967
968#define for_each_event(event, start, end) \
969 for (event = start; \
970 (unsigned long)event < (unsigned long)end; \
971 event++)
972
973#ifdef CONFIG_MODULES
974
975static LIST_HEAD(ftrace_module_file_list);
976
977/*
978 * Modules must own their file_operations to keep up with
979 * reference counting.
980 */
981struct ftrace_module_file_ops {
982 struct list_head list;
983 struct module *mod;
984 struct file_operations id;
985 struct file_operations enable;
986 struct file_operations format;
987 struct file_operations filter;
988};
989
990static struct ftrace_module_file_ops *
991trace_create_file_ops(struct module *mod)
992{
993 struct ftrace_module_file_ops *file_ops;
994
995 /*
996 * This is a bit of a PITA. To allow for correct reference
997 * counting, modules must "own" their file_operations.
998 * To do this, we allocate the file operations that will be
999 * used in the event directory.
1000 */
1001
1002 file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
1003 if (!file_ops)
1004 return NULL;
1005
1006 file_ops->mod = mod;
1007
1008 file_ops->id = ftrace_event_id_fops;
1009 file_ops->id.owner = mod;
1010
1011 file_ops->enable = ftrace_enable_fops;
1012 file_ops->enable.owner = mod;
1013
1014 file_ops->filter = ftrace_event_filter_fops;
1015 file_ops->filter.owner = mod;
1016
1017 file_ops->format = ftrace_event_format_fops;
1018 file_ops->format.owner = mod;
1019
1020 list_add(&file_ops->list, &ftrace_module_file_list);
1021
1022 return file_ops;
1023}
1024
1025static void trace_module_add_events(struct module *mod)
1026{
1027 struct ftrace_module_file_ops *file_ops = NULL;
1028 struct ftrace_event_call *call, *start, *end;
1029 struct dentry *d_events;
1030
1031 start = mod->trace_events;
1032 end = mod->trace_events + mod->num_trace_events;
1033
1034 if (start == end)
1035 return;
1036
1037 d_events = event_trace_events_dir();
1038 if (!d_events)
1039 return;
1040
1041 for_each_event(call, start, end) {
1042 /* The linker may leave blanks */
1043 if (!call->name)
1044 continue;
1045
1046 /*
1047 * This module has events, create file ops for this module
1048 * if not already done.
1049 */
1050 if (!file_ops) {
1051 file_ops = trace_create_file_ops(mod);
1052 if (!file_ops)
1053 return;
1054 }
1055 call->mod = mod;
1056 list_add(&call->list, &ftrace_events);
1057 event_create_dir(call, d_events,
1058 &file_ops->id, &file_ops->enable,
1059 &file_ops->filter, &file_ops->format);
1060 }
1061}
1062
1063static void trace_module_remove_events(struct module *mod)
1064{
1065 struct ftrace_module_file_ops *file_ops;
1066 struct ftrace_event_call *call, *p;
1067 bool found = false;
1068
1069 down_write(&trace_event_mutex);
1070 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1071 if (call->mod == mod) {
1072 found = true;
1073 ftrace_event_enable_disable(call, 0);
1074 if (call->event)
1075 __unregister_ftrace_event(call->event);
1076 debugfs_remove_recursive(call->dir);
1077 list_del(&call->list);
1078 trace_destroy_fields(call);
1079 destroy_preds(call);
1080 }
1081 }
1082
1083 /* Now free the file_operations */
1084 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1085 if (file_ops->mod == mod)
1086 break;
1087 }
1088 if (&file_ops->list != &ftrace_module_file_list) {
1089 list_del(&file_ops->list);
1090 kfree(file_ops);
1091 }
1092
1093 /*
1094 * It is safest to reset the ring buffer if the module being unloaded
1095 * registered any events.
1096 */
1097 if (found)
1098 tracing_reset_current_online_cpus();
1099 up_write(&trace_event_mutex);
1100}
1101
1102static int trace_module_notify(struct notifier_block *self,
1103 unsigned long val, void *data)
1104{
1105 struct module *mod = data;
1106
1107 mutex_lock(&event_mutex);
1108 switch (val) {
1109 case MODULE_STATE_COMING:
1110 trace_module_add_events(mod);
1111 break;
1112 case MODULE_STATE_GOING:
1113 trace_module_remove_events(mod);
1114 break;
1115 }
1116 mutex_unlock(&event_mutex);
1117
1118 return 0;
1119}
1120#else
1121static int trace_module_notify(struct notifier_block *self,
1122 unsigned long val, void *data)
1123{
1124 return 0;
1125}
1126#endif /* CONFIG_MODULES */
1127
1128struct notifier_block trace_module_nb = {
1129 .notifier_call = trace_module_notify,
1130 .priority = 0,
1131};
1132
1133extern struct ftrace_event_call __start_ftrace_events[];
1134extern struct ftrace_event_call __stop_ftrace_events[];
1135
790static __init int event_trace_init(void) 1136static __init int event_trace_init(void)
791{ 1137{
792 struct ftrace_event_call *call = __start_ftrace_events; 1138 struct ftrace_event_call *call;
793 struct dentry *d_tracer; 1139 struct dentry *d_tracer;
794 struct dentry *entry; 1140 struct dentry *entry;
795 struct dentry *d_events; 1141 struct dentry *d_events;
1142 int ret;
796 1143
797 d_tracer = tracing_init_dentry(); 1144 d_tracer = tracing_init_dentry();
798 if (!d_tracer) 1145 if (!d_tracer)
@@ -816,13 +1163,243 @@ static __init int event_trace_init(void)
816 if (!d_events) 1163 if (!d_events)
817 return 0; 1164 return 0;
818 1165
819 for_each_event(call) { 1166 /* ring buffer internal formats */
1167 trace_create_file("header_page", 0444, d_events,
1168 ring_buffer_print_page_header,
1169 &ftrace_show_header_fops);
1170
1171 trace_create_file("header_event", 0444, d_events,
1172 ring_buffer_print_entry_header,
1173 &ftrace_show_header_fops);
1174
1175 trace_create_file("enable", 0644, d_events,
1176 NULL, &ftrace_system_enable_fops);
1177
1178 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
820 /* The linker may leave blanks */ 1179 /* The linker may leave blanks */
821 if (!call->name) 1180 if (!call->name)
822 continue; 1181 continue;
823 event_create_dir(call, d_events); 1182 list_add(&call->list, &ftrace_events);
1183 event_create_dir(call, d_events, &ftrace_event_id_fops,
1184 &ftrace_enable_fops, &ftrace_event_filter_fops,
1185 &ftrace_event_format_fops);
824 } 1186 }
825 1187
1188 ret = register_module_notifier(&trace_module_nb);
1189 if (ret)
1190 pr_warning("Failed to register trace events module notifier\n");
1191
826 return 0; 1192 return 0;
827} 1193}
828fs_initcall(event_trace_init); 1194fs_initcall(event_trace_init);
1195
1196#ifdef CONFIG_FTRACE_STARTUP_TEST
1197
1198static DEFINE_SPINLOCK(test_spinlock);
1199static DEFINE_SPINLOCK(test_spinlock_irq);
1200static DEFINE_MUTEX(test_mutex);
1201
1202static __init void test_work(struct work_struct *dummy)
1203{
1204 spin_lock(&test_spinlock);
1205 spin_lock_irq(&test_spinlock_irq);
1206 udelay(1);
1207 spin_unlock_irq(&test_spinlock_irq);
1208 spin_unlock(&test_spinlock);
1209
1210 mutex_lock(&test_mutex);
1211 msleep(1);
1212 mutex_unlock(&test_mutex);
1213}
1214
1215static __init int event_test_thread(void *unused)
1216{
1217 void *test_malloc;
1218
1219 test_malloc = kmalloc(1234, GFP_KERNEL);
1220 if (!test_malloc)
1221 pr_info("failed to kmalloc\n");
1222
1223 schedule_on_each_cpu(test_work);
1224
1225 kfree(test_malloc);
1226
1227 set_current_state(TASK_INTERRUPTIBLE);
1228 while (!kthread_should_stop())
1229 schedule();
1230
1231 return 0;
1232}
1233
1234/*
1235 * Do various things that may trigger events.
1236 */
1237static __init void event_test_stuff(void)
1238{
1239 struct task_struct *test_thread;
1240
1241 test_thread = kthread_run(event_test_thread, NULL, "test-events");
1242 msleep(1);
1243 kthread_stop(test_thread);
1244}
1245
1246/*
1247 * For every trace event defined, we will test each trace point separately,
1248 * and then by groups, and finally all trace points.
1249 */
1250static __init void event_trace_self_tests(void)
1251{
1252 struct ftrace_event_call *call;
1253 struct event_subsystem *system;
1254 int ret;
1255
1256 pr_info("Running tests on trace events:\n");
1257
1258 list_for_each_entry(call, &ftrace_events, list) {
1259
1260 /* Only test those that have a regfunc */
1261 if (!call->regfunc)
1262 continue;
1263
1264 pr_info("Testing event %s: ", call->name);
1265
1266 /*
1267 * If an event is already enabled, someone is using
1268 * it and the self test should not be on.
1269 */
1270 if (call->enabled) {
1271 pr_warning("Enabled event during self test!\n");
1272 WARN_ON_ONCE(1);
1273 continue;
1274 }
1275
1276 ftrace_event_enable_disable(call, 1);
1277 event_test_stuff();
1278 ftrace_event_enable_disable(call, 0);
1279
1280 pr_cont("OK\n");
1281 }
1282
1283 /* Now test at the sub system level */
1284
1285 pr_info("Running tests on trace event systems:\n");
1286
1287 list_for_each_entry(system, &event_subsystems, list) {
1288
1289 /* the ftrace system is special, skip it */
1290 if (strcmp(system->name, "ftrace") == 0)
1291 continue;
1292
1293 pr_info("Testing event system %s: ", system->name);
1294
1295 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
1296 if (WARN_ON_ONCE(ret)) {
1297 pr_warning("error enabling system %s\n",
1298 system->name);
1299 continue;
1300 }
1301
1302 event_test_stuff();
1303
1304 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
1305 if (WARN_ON_ONCE(ret))
1306 pr_warning("error disabling system %s\n",
1307 system->name);
1308
1309 pr_cont("OK\n");
1310 }
1311
1312 /* Test with all events enabled */
1313
1314 pr_info("Running tests on all trace events:\n");
1315 pr_info("Testing all events: ");
1316
1317 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
1318 if (WARN_ON_ONCE(ret)) {
1319 pr_warning("error enabling all events\n");
1320 return;
1321 }
1322
1323 event_test_stuff();
1324
1325	/* now disable all the events again */
1326 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
1327 if (WARN_ON_ONCE(ret)) {
1328 pr_warning("error disabling all events\n");
1329 return;
1330 }
1331
1332 pr_cont("OK\n");
1333}
1334
1335#ifdef CONFIG_FUNCTION_TRACER
1336
1337static DEFINE_PER_CPU(atomic_t, test_event_disable);
1338
1339static void
1340function_test_events_call(unsigned long ip, unsigned long parent_ip)
1341{
1342 struct ring_buffer_event *event;
1343 struct ftrace_entry *entry;
1344 unsigned long flags;
1345 long disabled;
1346 int resched;
1347 int cpu;
1348 int pc;
1349
1350 pc = preempt_count();
1351 resched = ftrace_preempt_disable();
1352 cpu = raw_smp_processor_id();
1353 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
1354
1355 if (disabled != 1)
1356 goto out;
1357
1358 local_save_flags(flags);
1359
1360 event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
1361 flags, pc);
1362 if (!event)
1363 goto out;
1364 entry = ring_buffer_event_data(event);
1365 entry->ip = ip;
1366 entry->parent_ip = parent_ip;
1367
1368 trace_nowake_buffer_unlock_commit(event, flags, pc);
1369
1370 out:
1371 atomic_dec(&per_cpu(test_event_disable, cpu));
1372 ftrace_preempt_enable(resched);
1373}
1374
1375static struct ftrace_ops trace_ops __initdata =
1376{
1377 .func = function_test_events_call,
1378};
1379
1380static __init void event_trace_self_test_with_function(void)
1381{
1382 register_ftrace_function(&trace_ops);
1383 pr_info("Running tests again, along with the function tracer\n");
1384 event_trace_self_tests();
1385 unregister_ftrace_function(&trace_ops);
1386}
1387#else
1388static __init void event_trace_self_test_with_function(void)
1389{
1390}
1391#endif
1392
1393static __init int event_trace_self_tests_init(void)
1394{
1395
1396 event_trace_self_tests();
1397
1398 event_trace_self_test_with_function();
1399
1400 return 0;
1401}
1402
1403late_initcall(event_trace_self_tests_init);
1404
1405#endif
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e03cbf1e38f3..f32dc9d1ea7b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,119 +22,295 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/ctype.h> 24#include <linux/ctype.h>
25#include <linux/mutex.h>
25 26
26#include "trace.h" 27#include "trace.h"
27#include "trace_output.h" 28#include "trace_output.h"
28 29
29static int filter_pred_64(struct filter_pred *pred, void *event) 30enum filter_op_ids
30{ 31{
31 u64 *addr = (u64 *)(event + pred->offset); 32 OP_OR,
32 u64 val = (u64)pred->val; 33 OP_AND,
33 int match; 34 OP_NE,
35 OP_EQ,
36 OP_LT,
37 OP_LE,
38 OP_GT,
39 OP_GE,
40 OP_NONE,
41 OP_OPEN_PAREN,
42};
43
44struct filter_op {
45 int id;
46 char *string;
47 int precedence;
48};
49
50static struct filter_op filter_ops[] = {
51 { OP_OR, "||", 1 },
52 { OP_AND, "&&", 2 },
53 { OP_NE, "!=", 4 },
54 { OP_EQ, "==", 4 },
55 { OP_LT, "<", 5 },
56 { OP_LE, "<=", 5 },
57 { OP_GT, ">", 5 },
58 { OP_GE, ">=", 5 },
59 { OP_NONE, "OP_NONE", 0 },
60 { OP_OPEN_PAREN, "(", 0 },
61};
62
63enum {
64 FILT_ERR_NONE,
65 FILT_ERR_INVALID_OP,
66 FILT_ERR_UNBALANCED_PAREN,
67 FILT_ERR_TOO_MANY_OPERANDS,
68 FILT_ERR_OPERAND_TOO_LONG,
69 FILT_ERR_FIELD_NOT_FOUND,
70 FILT_ERR_ILLEGAL_FIELD_OP,
71 FILT_ERR_ILLEGAL_INTVAL,
72 FILT_ERR_BAD_SUBSYS_FILTER,
73 FILT_ERR_TOO_MANY_PREDS,
74 FILT_ERR_MISSING_FIELD,
75 FILT_ERR_INVALID_FILTER,
76};
77
78static char *err_text[] = {
79 "No error",
80 "Invalid operator",
81 "Unbalanced parens",
82 "Too many operands",
83 "Operand too long",
84 "Field not found",
85 "Illegal operation for field type",
86 "Illegal integer value",
87 "Couldn't find or set field in one of a subsystem's events",
88 "Too many terms in predicate expression",
89 "Missing field name and/or value",
90 "Meaningless filter expression",
91};
92
93struct opstack_op {
94 int op;
95 struct list_head list;
96};
97
98struct postfix_elt {
99 int op;
100 char *operand;
101 struct list_head list;
102};
103
104struct filter_parse_state {
105 struct filter_op *ops;
106 struct list_head opstack;
107 struct list_head postfix;
108 int lasterr;
109 int lasterr_pos;
110
111 struct {
112 char *string;
113 unsigned int cnt;
114 unsigned int tail;
115 } infix;
116
117 struct {
118 char string[MAX_FILTER_STR_VAL];
119 int pos;
120 unsigned int tail;
121 } operand;
122};
123
124DEFINE_COMPARISON_PRED(s64);
125DEFINE_COMPARISON_PRED(u64);
126DEFINE_COMPARISON_PRED(s32);
127DEFINE_COMPARISON_PRED(u32);
128DEFINE_COMPARISON_PRED(s16);
129DEFINE_COMPARISON_PRED(u16);
130DEFINE_COMPARISON_PRED(s8);
131DEFINE_COMPARISON_PRED(u8);
132
133DEFINE_EQUALITY_PRED(64);
134DEFINE_EQUALITY_PRED(32);
135DEFINE_EQUALITY_PRED(16);
136DEFINE_EQUALITY_PRED(8);
137
138static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
139 void *event __attribute((unused)),
140 int val1, int val2)
141{
142 return val1 && val2;
143}
144
145static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
146 void *event __attribute((unused)),
147 int val1, int val2)
148{
149 return val1 || val2;
150}
151
152/* Filter predicate for fixed sized arrays of characters */
153static int filter_pred_string(struct filter_pred *pred, void *event,
154 int val1, int val2)
155{
156 char *addr = (char *)(event + pred->offset);
157 int cmp, match;
34 158
35 match = (val == *addr) ^ pred->not; 159 cmp = strncmp(addr, pred->str_val, pred->str_len);
160
161 match = (!cmp) ^ pred->not;
36 162
37 return match; 163 return match;
38} 164}
39 165
40static int filter_pred_32(struct filter_pred *pred, void *event) 166/*
167 * Filter predicate for dynamic sized arrays of characters.
168 * These are implemented through a list of strings at the end
169 * of the entry.
170 * Each of these strings also has a field in the entry which
171 * contains its offset from the beginning of the entry.
172 * So we first read that offset field, dereference it,
173 * and add it to the address of the entry; that gives us
174 * the address of the string.
175 */
176static int filter_pred_strloc(struct filter_pred *pred, void *event,
177 int val1, int val2)
41{ 178{
42 u32 *addr = (u32 *)(event + pred->offset); 179 unsigned short str_loc = *(unsigned short *)(event + pred->offset);
43 u32 val = (u32)pred->val; 180 char *addr = (char *)(event + str_loc);
44 int match; 181 int cmp, match;
45 182
46 match = (val == *addr) ^ pred->not; 183 cmp = strncmp(addr, pred->str_val, pred->str_len);
184
185 match = (!cmp) ^ pred->not;
47 186
48 return match; 187 return match;
49} 188}
50 189
51static int filter_pred_16(struct filter_pred *pred, void *event) 190static int filter_pred_none(struct filter_pred *pred, void *event,
191 int val1, int val2)
192{
193 return 0;
194}
195
196/* return 1 if event matches, 0 otherwise (discard) */
197int filter_match_preds(struct ftrace_event_call *call, void *rec)
52{ 198{
53 u16 *addr = (u16 *)(event + pred->offset); 199 struct event_filter *filter = call->filter;
54 u16 val = (u16)pred->val; 200 int match, top = 0, val1 = 0, val2 = 0;
55 int match; 201 int stack[MAX_FILTER_PRED];
202 struct filter_pred *pred;
203 int i;
56 204
57 match = (val == *addr) ^ pred->not; 205 for (i = 0; i < filter->n_preds; i++) {
206 pred = filter->preds[i];
207 if (!pred->pop_n) {
208 match = pred->fn(pred, rec, val1, val2);
209 stack[top++] = match;
210 continue;
211 }
212 if (pred->pop_n > top) {
213 WARN_ON_ONCE(1);
214 return 0;
215 }
216 val1 = stack[--top];
217 val2 = stack[--top];
218 match = pred->fn(pred, rec, val1, val2);
219 stack[top++] = match;
220 }
58 221
59 return match; 222 return stack[--top];
60} 223}
224EXPORT_SYMBOL_GPL(filter_match_preds);
61 225
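
The rewritten filter_match_preds() evaluates the predicate array as a postfix program: a leaf predicate (pop_n of zero) pushes its own result, while a logical-operator entry pops two results and pushes their combination, so an infix filter such as (a == 1 || b == 2) && c > 3 is stored with the OR after its two comparisons and the AND last. A compact userspace model of that stack walk; the encoding below is a toy, not the kernel's filter_pred layout.

    #include <stdio.h>

    #define MAX_PREDS 8

    /*
     * Toy postfix predicate program: LEAF entries carry a precomputed truth
     * value, AND/OR entries pop two results and push the combination, the
     * same stack discipline filter_match_preds() uses.
     */
    enum { LEAF, AND, OR };

    struct toy_pred {
        int kind;
        int value;                    /* only meaningful for LEAF */
    };

    static int eval(const struct toy_pred *prog, int n)
    {
        int stack[MAX_PREDS];
        int top = 0, i, a, b;

        for (i = 0; i < n; i++) {
            if (prog[i].kind == LEAF) {
                stack[top++] = prog[i].value;
                continue;
            }
            a = stack[--top];
            b = stack[--top];
            stack[top++] = (prog[i].kind == AND) ? (a && b) : (a || b);
        }
        return stack[--top];
    }

    int main(void)
    {
        /* (0 || 1) && 1, written in postfix order */
        const struct toy_pred prog[] = {
            { LEAF, 0 }, { LEAF, 1 }, { OR, 0 },
            { LEAF, 1 }, { AND, 0 },
        };

        printf("match: %d\n", eval(prog, 5));     /* prints 1 */
        return 0;
    }
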
62static int filter_pred_8(struct filter_pred *pred, void *event) 226static void parse_error(struct filter_parse_state *ps, int err, int pos)
63{ 227{
64 u8 *addr = (u8 *)(event + pred->offset); 228 ps->lasterr = err;
65 u8 val = (u8)pred->val; 229 ps->lasterr_pos = pos;
66 int match; 230}
67 231
68 match = (val == *addr) ^ pred->not; 232static void remove_filter_string(struct event_filter *filter)
233{
234 kfree(filter->filter_string);
235 filter->filter_string = NULL;
236}
69 237
70 return match; 238static int replace_filter_string(struct event_filter *filter,
239 char *filter_string)
240{
241 kfree(filter->filter_string);
242 filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
243 if (!filter->filter_string)
244 return -ENOMEM;
245
246 return 0;
71} 247}
72 248
73static int filter_pred_string(struct filter_pred *pred, void *event) 249static int append_filter_string(struct event_filter *filter,
250 char *string)
74{ 251{
75 char *addr = (char *)(event + pred->offset); 252 int newlen;
76 int cmp, match; 253 char *new_filter_string;
77 254
78 cmp = strncmp(addr, pred->str_val, pred->str_len); 255 BUG_ON(!filter->filter_string);
256 newlen = strlen(filter->filter_string) + strlen(string) + 1;
257 new_filter_string = kmalloc(newlen, GFP_KERNEL);
258 if (!new_filter_string)
259 return -ENOMEM;
79 260
80 match = (!cmp) ^ pred->not; 261 strcpy(new_filter_string, filter->filter_string);
262 strcat(new_filter_string, string);
263 kfree(filter->filter_string);
264 filter->filter_string = new_filter_string;
81 265
82 return match; 266 return 0;
83} 267}
84 268
85/* return 1 if event matches, 0 otherwise (discard) */ 269static void append_filter_err(struct filter_parse_state *ps,
86int filter_match_preds(struct ftrace_event_call *call, void *rec) 270 struct event_filter *filter)
87{ 271{
88 int i, matched, and_failed = 0; 272 int pos = ps->lasterr_pos;
89 struct filter_pred *pred; 273 char *buf, *pbuf;
90 274
91 for (i = 0; i < MAX_FILTER_PRED; i++) { 275 buf = (char *)__get_free_page(GFP_TEMPORARY);
92 if (call->preds[i]) { 276 if (!buf)
93 pred = call->preds[i]; 277 return;
94 if (and_failed && !pred->or)
95 continue;
96 matched = pred->fn(pred, rec);
97 if (!matched && !pred->or) {
98 and_failed = 1;
99 continue;
100 } else if (matched && pred->or)
101 return 1;
102 } else
103 break;
104 }
105 278
106 if (and_failed) 279 append_filter_string(filter, "\n");
107 return 0; 280 memset(buf, ' ', PAGE_SIZE);
281 if (pos > PAGE_SIZE - 128)
282 pos = 0;
283 buf[pos] = '^';
284 pbuf = &buf[pos] + 1;
108 285
109 return 1; 286 sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
287 append_filter_string(filter, buf);
288 free_page((unsigned long) buf);
110} 289}
111 290
112void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) 291void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
113{ 292{
114 char *field_name; 293 struct event_filter *filter = call->filter;
115 struct filter_pred *pred;
116 int i;
117 294
118 if (!preds) { 295 mutex_lock(&event_mutex);
296 if (filter->filter_string)
297 trace_seq_printf(s, "%s\n", filter->filter_string);
298 else
119 trace_seq_printf(s, "none\n"); 299 trace_seq_printf(s, "none\n");
120 return; 300 mutex_unlock(&event_mutex);
121 } 301}
122 302
123 for (i = 0; i < MAX_FILTER_PRED; i++) { 303void print_subsystem_event_filter(struct event_subsystem *system,
124 if (preds[i]) { 304 struct trace_seq *s)
125 pred = preds[i]; 305{
126 field_name = pred->field_name; 306 struct event_filter *filter = system->filter;
127 if (i) 307
128 trace_seq_printf(s, pred->or ? "|| " : "&& "); 308 mutex_lock(&event_mutex);
129 trace_seq_printf(s, "%s ", field_name); 309 if (filter->filter_string)
130 trace_seq_printf(s, pred->not ? "!= " : "== "); 310 trace_seq_printf(s, "%s\n", filter->filter_string);
131 if (pred->str_val) 311 else
132 trace_seq_printf(s, "%s\n", pred->str_val); 312 trace_seq_printf(s, "none\n");
133 else 313 mutex_unlock(&event_mutex);
134 trace_seq_printf(s, "%llu\n", pred->val);
135 } else
136 break;
137 }
138} 314}
139 315
140static struct ftrace_event_field * 316static struct ftrace_event_field *
@@ -150,284 +326,839 @@ find_event_field(struct ftrace_event_call *call, char *name)
150 return NULL; 326 return NULL;
151} 327}
152 328
153void filter_free_pred(struct filter_pred *pred) 329static void filter_free_pred(struct filter_pred *pred)
154{ 330{
155 if (!pred) 331 if (!pred)
156 return; 332 return;
157 333
158 kfree(pred->field_name); 334 kfree(pred->field_name);
159 kfree(pred->str_val);
160 kfree(pred); 335 kfree(pred);
161} 336}
162 337
163void filter_free_preds(struct ftrace_event_call *call) 338static void filter_clear_pred(struct filter_pred *pred)
164{ 339{
165 int i; 340 kfree(pred->field_name);
341 pred->field_name = NULL;
342 pred->str_len = 0;
343}
166 344
167 if (call->preds) { 345static int filter_set_pred(struct filter_pred *dest,
168 for (i = 0; i < MAX_FILTER_PRED; i++) 346 struct filter_pred *src,
169 filter_free_pred(call->preds[i]); 347 filter_pred_fn_t fn)
170 kfree(call->preds); 348{
171 call->preds = NULL; 349 *dest = *src;
350 if (src->field_name) {
351 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
352 if (!dest->field_name)
353 return -ENOMEM;
172 } 354 }
355 dest->fn = fn;
356
357 return 0;
173} 358}
174 359
175void filter_free_subsystem_preds(struct event_subsystem *system) 360static void filter_disable_preds(struct ftrace_event_call *call)
176{ 361{
177 struct ftrace_event_call *call = __start_ftrace_events; 362 struct event_filter *filter = call->filter;
178 int i; 363 int i;
179 364
180 if (system->preds) { 365 call->filter_active = 0;
181 for (i = 0; i < MAX_FILTER_PRED; i++) 366 filter->n_preds = 0;
182 filter_free_pred(system->preds[i]);
183 kfree(system->preds);
184 system->preds = NULL;
185 }
186 367
187 events_for_each(call) { 368 for (i = 0; i < MAX_FILTER_PRED; i++)
188 if (!call->name || !call->regfunc) 369 filter->preds[i]->fn = filter_pred_none;
189 continue; 370}
371
372void destroy_preds(struct ftrace_event_call *call)
373{
374 struct event_filter *filter = call->filter;
375 int i;
190 376
191 if (!strcmp(call->system, system->name)) 377 for (i = 0; i < MAX_FILTER_PRED; i++) {
192 filter_free_preds(call); 378 if (filter->preds[i])
379 filter_free_pred(filter->preds[i]);
193 } 380 }
381 kfree(filter->preds);
382 kfree(filter->filter_string);
383 kfree(filter);
384 call->filter = NULL;
194} 385}
195 386
196static int __filter_add_pred(struct ftrace_event_call *call, 387int init_preds(struct ftrace_event_call *call)
197 struct filter_pred *pred)
198{ 388{
389 struct event_filter *filter;
390 struct filter_pred *pred;
199 int i; 391 int i;
200 392
201 if (call->preds && !pred->compound) 393 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
202 filter_free_preds(call); 394 if (!call->filter)
395 return -ENOMEM;
203 396
204 if (!call->preds) { 397 call->filter_active = 0;
205 call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), 398 filter->n_preds = 0;
206 GFP_KERNEL); 399
207 if (!call->preds) 400 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
208 return -ENOMEM; 401 if (!filter->preds)
209 } 402 goto oom;
210 403
211 for (i = 0; i < MAX_FILTER_PRED; i++) { 404 for (i = 0; i < MAX_FILTER_PRED; i++) {
212 if (!call->preds[i]) { 405 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
213 call->preds[i] = pred; 406 if (!pred)
214 return 0; 407 goto oom;
408 pred->fn = filter_pred_none;
409 filter->preds[i] = pred;
410 }
411
412 return 0;
413
414oom:
415 destroy_preds(call);
416
417 return -ENOMEM;
418}
419EXPORT_SYMBOL_GPL(init_preds);
420
421static void filter_free_subsystem_preds(struct event_subsystem *system)
422{
423 struct event_filter *filter = system->filter;
424 struct ftrace_event_call *call;
425 int i;
426
427 if (filter->n_preds) {
428 for (i = 0; i < filter->n_preds; i++)
429 filter_free_pred(filter->preds[i]);
430 kfree(filter->preds);
431 filter->preds = NULL;
432 filter->n_preds = 0;
433 }
434
435 list_for_each_entry(call, &ftrace_events, list) {
436 if (!call->define_fields)
437 continue;
438
439 if (!strcmp(call->system, system->name)) {
440 filter_disable_preds(call);
441 remove_filter_string(call->filter);
215 } 442 }
216 } 443 }
444}
445
446static int filter_add_pred_fn(struct filter_parse_state *ps,
447 struct ftrace_event_call *call,
448 struct filter_pred *pred,
449 filter_pred_fn_t fn)
450{
451 struct event_filter *filter = call->filter;
452 int idx, err;
453
454 if (filter->n_preds == MAX_FILTER_PRED) {
455 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
456 return -ENOSPC;
457 }
217 458
218 return -ENOSPC; 459 idx = filter->n_preds;
460 filter_clear_pred(filter->preds[idx]);
461 err = filter_set_pred(filter->preds[idx], pred, fn);
462 if (err)
463 return err;
464
465 filter->n_preds++;
466 call->filter_active = 1;
467
468 return 0;
219} 469}
220 470
471enum {
472 FILTER_STATIC_STRING = 1,
473 FILTER_DYN_STRING
474};
475
221static int is_string_field(const char *type) 476static int is_string_field(const char *type)
222{ 477{
478 if (strstr(type, "__data_loc") && strstr(type, "char"))
479 return FILTER_DYN_STRING;
480
223 if (strchr(type, '[') && strstr(type, "char")) 481 if (strchr(type, '[') && strstr(type, "char"))
224 return 1; 482 return FILTER_STATIC_STRING;
225 483
226 return 0; 484 return 0;
227} 485}
228 486
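The two return values distinguish fixed-size char arrays from __data_loc (dynamically located) string fields; both checks only inspect the type string recorded for the field. A self-contained sketch of the same classification, with example type strings chosen purely for illustration:

#include <stdio.h>
#include <string.h>

enum { NOT_STRING = 0, STATIC_STRING, DYN_STRING };	/* mirrors the FILTER_* values above */

static int classify(const char *type)
{
	if (strstr(type, "__data_loc") && strstr(type, "char"))
		return DYN_STRING;
	if (strchr(type, '[') && strstr(type, "char"))
		return STATIC_STRING;
	return NOT_STRING;
}

int main(void)
{
	printf("%d\n", classify("char comm[16]"));		/* 1: static string  */
	printf("%d\n", classify("__data_loc char[] file"));	/* 2: dynamic string */
	printf("%d\n", classify("unsigned long ip"));		/* 0: not a string   */
	return 0;
}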
229int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) 487static int is_legal_op(struct ftrace_event_field *field, int op)
488{
489 if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
490 return 0;
491
492 return 1;
493}
494
495static filter_pred_fn_t select_comparison_fn(int op, int field_size,
496 int field_is_signed)
497{
498 filter_pred_fn_t fn = NULL;
499
500 switch (field_size) {
501 case 8:
502 if (op == OP_EQ || op == OP_NE)
503 fn = filter_pred_64;
504 else if (field_is_signed)
505 fn = filter_pred_s64;
506 else
507 fn = filter_pred_u64;
508 break;
509 case 4:
510 if (op == OP_EQ || op == OP_NE)
511 fn = filter_pred_32;
512 else if (field_is_signed)
513 fn = filter_pred_s32;
514 else
515 fn = filter_pred_u32;
516 break;
517 case 2:
518 if (op == OP_EQ || op == OP_NE)
519 fn = filter_pred_16;
520 else if (field_is_signed)
521 fn = filter_pred_s16;
522 else
523 fn = filter_pred_u16;
524 break;
525 case 1:
526 if (op == OP_EQ || op == OP_NE)
527 fn = filter_pred_8;
528 else if (field_is_signed)
529 fn = filter_pred_s8;
530 else
531 fn = filter_pred_u8;
532 break;
533 }
534
535 return fn;
536}
537
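The filter_pred_{s,u}{8,16,32,64} callbacks selected here are generated earlier in this file and are not part of this hunk, and their real signature differs from what is shown below. As a rough, self-contained sketch of what an unsigned 32-bit comparison predicate does (simplified signature, made-up operator values, illustrative record layout):

#include <stdint.h>
#include <stdio.h>

enum { OP_LT = 1, OP_LE, OP_GT, OP_GE };	/* illustrative values only */

struct sketch_pred {
	int	 op;	 /* comparison operator                    */
	int	 offset; /* byte offset of the field in the record */
	uint64_t val;	 /* constant parsed from the filter string */
};

static int pred_u32(struct sketch_pred *pred, void *event)
{
	uint32_t v = *(uint32_t *)((char *)event + pred->offset);

	switch (pred->op) {
	case OP_LT: return v <  (uint32_t)pred->val;
	case OP_LE: return v <= (uint32_t)pred->val;
	case OP_GT: return v >  (uint32_t)pred->val;
	case OP_GE: return v >= (uint32_t)pred->val;
	}
	return 0;
}

int main(void)
{
	struct { uint32_t pid; uint32_t prio; } rec = { 42, 120 };
	struct sketch_pred p = { OP_LT, 4 /* offset of .prio */, 100 };

	printf("%d\n", pred_u32(&p, &rec));	/* 0: prio (120) is not < 100 */
	return 0;
}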
538static int filter_add_pred(struct filter_parse_state *ps,
539 struct ftrace_event_call *call,
540 struct filter_pred *pred)
230{ 541{
231 struct ftrace_event_field *field; 542 struct ftrace_event_field *field;
543 filter_pred_fn_t fn;
544 unsigned long long val;
545 int string_type;
546 int ret;
547
548 pred->fn = filter_pred_none;
549
550 if (pred->op == OP_AND) {
551 pred->pop_n = 2;
552 return filter_add_pred_fn(ps, call, pred, filter_pred_and);
553 } else if (pred->op == OP_OR) {
554 pred->pop_n = 2;
555 return filter_add_pred_fn(ps, call, pred, filter_pred_or);
556 }
232 557
233 field = find_event_field(call, pred->field_name); 558 field = find_event_field(call, pred->field_name);
234 if (!field) 559 if (!field) {
560 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
235 return -EINVAL; 561 return -EINVAL;
562 }
236 563
237 pred->offset = field->offset; 564 pred->offset = field->offset;
238 565
239 if (is_string_field(field->type)) { 566 if (!is_legal_op(field, pred->op)) {
240 if (!pred->str_val) 567 parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
241 return -EINVAL; 568 return -EINVAL;
242 pred->fn = filter_pred_string; 569 }
570
571 string_type = is_string_field(field->type);
572 if (string_type) {
573 if (string_type == FILTER_STATIC_STRING)
574 fn = filter_pred_string;
575 else
576 fn = filter_pred_strloc;
243 pred->str_len = field->size; 577 pred->str_len = field->size;
244 return __filter_add_pred(call, pred); 578 if (pred->op == OP_NE)
579 pred->not = 1;
580 return filter_add_pred_fn(ps, call, pred, fn);
245 } else { 581 } else {
246 if (pred->str_val) 582 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val);
584 else
585 ret = strict_strtoull(pred->str_val, 0, &val);
586 if (ret) {
587 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
247 return -EINVAL; 588 return -EINVAL;
589 }
590 pred->val = val;
248 } 591 }
249 592
250 switch (field->size) { 593 fn = select_comparison_fn(pred->op, field->size, field->is_signed);
251 case 8: 594 if (!fn) {
252 pred->fn = filter_pred_64; 595 parse_error(ps, FILT_ERR_INVALID_OP, 0);
253 break;
254 case 4:
255 pred->fn = filter_pred_32;
256 break;
257 case 2:
258 pred->fn = filter_pred_16;
259 break;
260 case 1:
261 pred->fn = filter_pred_8;
262 break;
263 default:
264 return -EINVAL; 596 return -EINVAL;
265 } 597 }
266 598
267 return __filter_add_pred(call, pred); 599 if (pred->op == OP_NE)
600 pred->not = 1;
601
602 return filter_add_pred_fn(ps, call, pred, fn);
268} 603}
269 604
270static struct filter_pred *copy_pred(struct filter_pred *pred) 605static int filter_add_subsystem_pred(struct filter_parse_state *ps,
606 struct event_subsystem *system,
607 struct filter_pred *pred,
608 char *filter_string)
271{ 609{
272 struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); 610 struct event_filter *filter = system->filter;
273 if (!new_pred) 611 struct ftrace_event_call *call;
274 return NULL; 612 int err = 0;
275 613
276 memcpy(new_pred, pred, sizeof(*pred)); 614 if (!filter->preds) {
615 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
616 GFP_KERNEL);
277 617
278 if (pred->field_name) { 618 if (!filter->preds)
279 new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); 619 return -ENOMEM;
280 if (!new_pred->field_name) {
281 kfree(new_pred);
282 return NULL;
283 }
284 } 620 }
285 621
286 if (pred->str_val) { 622 if (filter->n_preds == MAX_FILTER_PRED) {
287 new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); 623 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
288 if (!new_pred->str_val) { 624 return -ENOSPC;
289 filter_free_pred(new_pred); 625 }
290 return NULL; 626
627 list_for_each_entry(call, &ftrace_events, list) {
628
629 if (!call->define_fields)
630 continue;
631
632 if (strcmp(call->system, system->name))
633 continue;
634
635 err = filter_add_pred(ps, call, pred);
636 if (err) {
637 filter_free_subsystem_preds(system);
638 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
639 goto out;
291 } 640 }
641 replace_filter_string(call->filter, filter_string);
292 } 642 }
293 643
294 return new_pred; 644 filter->preds[filter->n_preds] = pred;
645 filter->n_preds++;
646out:
647 return err;
295} 648}
296 649
297int filter_add_subsystem_pred(struct event_subsystem *system, 650static void parse_init(struct filter_parse_state *ps,
298 struct filter_pred *pred) 651 struct filter_op *ops,
652 char *infix_string)
299{ 653{
300 struct ftrace_event_call *call = __start_ftrace_events; 654 memset(ps, '\0', sizeof(*ps));
301 struct filter_pred *event_pred;
302 int i;
303 655
304 if (system->preds && !pred->compound) 656 ps->infix.string = infix_string;
305 filter_free_subsystem_preds(system); 657 ps->infix.cnt = strlen(infix_string);
658 ps->ops = ops;
306 659
307 if (!system->preds) { 660 INIT_LIST_HEAD(&ps->opstack);
308 system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), 661 INIT_LIST_HEAD(&ps->postfix);
309 GFP_KERNEL); 662}
310 if (!system->preds) 663
311 return -ENOMEM; 664static char infix_next(struct filter_parse_state *ps)
665{
666 ps->infix.cnt--;
667
668 return ps->infix.string[ps->infix.tail++];
669}
670
671static char infix_peek(struct filter_parse_state *ps)
672{
673 if (ps->infix.tail == strlen(ps->infix.string))
674 return 0;
675
676 return ps->infix.string[ps->infix.tail];
677}
678
679static void infix_advance(struct filter_parse_state *ps)
680{
681 ps->infix.cnt--;
682 ps->infix.tail++;
683}
684
685static inline int is_precedence_lower(struct filter_parse_state *ps,
686 int a, int b)
687{
688 return ps->ops[a].precedence < ps->ops[b].precedence;
689}
690
691static inline int is_op_char(struct filter_parse_state *ps, char c)
692{
693 int i;
694
695 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
696 if (ps->ops[i].string[0] == c)
697 return 1;
312 } 698 }
313 699
314 for (i = 0; i < MAX_FILTER_PRED; i++) { 700 return 0;
315 if (!system->preds[i]) { 701}
316 system->preds[i] = pred; 702
317 break; 703static int infix_get_op(struct filter_parse_state *ps, char firstc)
704{
705 char nextc = infix_peek(ps);
706 char opstr[3];
707 int i;
708
709 opstr[0] = firstc;
710 opstr[1] = nextc;
711 opstr[2] = '\0';
712
713 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
714 if (!strcmp(opstr, ps->ops[i].string)) {
715 infix_advance(ps);
716 return ps->ops[i].id;
318 } 717 }
319 } 718 }
320 719
321 if (i == MAX_FILTER_PRED) 720 opstr[1] = '\0';
322 return -ENOSPC; 721
722 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
723 if (!strcmp(opstr, ps->ops[i].string))
724 return ps->ops[i].id;
725 }
726
727 return OP_NONE;
728}
729
730static inline void clear_operand_string(struct filter_parse_state *ps)
731{
732 memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
733 ps->operand.tail = 0;
734}
735
736static inline int append_operand_char(struct filter_parse_state *ps, char c)
737{
738 if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
739 return -EINVAL;
740
741 ps->operand.string[ps->operand.tail++] = c;
742
743 return 0;
744}
745
746static int filter_opstack_push(struct filter_parse_state *ps, int op)
747{
748 struct opstack_op *opstack_op;
323 749
324 events_for_each(call) { 750 opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
325 int err; 751 if (!opstack_op)
752 return -ENOMEM;
753
754 opstack_op->op = op;
755 list_add(&opstack_op->list, &ps->opstack);
756
757 return 0;
758}
759
760static int filter_opstack_empty(struct filter_parse_state *ps)
761{
762 return list_empty(&ps->opstack);
763}
764
765static int filter_opstack_top(struct filter_parse_state *ps)
766{
767 struct opstack_op *opstack_op;
768
769 if (filter_opstack_empty(ps))
770 return OP_NONE;
771
772 opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
773
774 return opstack_op->op;
775}
776
777static int filter_opstack_pop(struct filter_parse_state *ps)
778{
779 struct opstack_op *opstack_op;
780 int op;
781
782 if (filter_opstack_empty(ps))
783 return OP_NONE;
784
785 opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
786 op = opstack_op->op;
787 list_del(&opstack_op->list);
788
789 kfree(opstack_op);
790
791 return op;
792}
793
794static void filter_opstack_clear(struct filter_parse_state *ps)
795{
796 while (!filter_opstack_empty(ps))
797 filter_opstack_pop(ps);
798}
326 799
327 if (!call->name || !call->regfunc) 800static char *curr_operand(struct filter_parse_state *ps)
801{
802 return ps->operand.string;
803}
804
805static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
806{
807 struct postfix_elt *elt;
808
809 elt = kmalloc(sizeof(*elt), GFP_KERNEL);
810 if (!elt)
811 return -ENOMEM;
812
813 elt->op = OP_NONE;
814 elt->operand = kstrdup(operand, GFP_KERNEL);
815 if (!elt->operand) {
816 kfree(elt);
817 return -ENOMEM;
818 }
819
820 list_add_tail(&elt->list, &ps->postfix);
821
822 return 0;
823}
824
825static int postfix_append_op(struct filter_parse_state *ps, int op)
826{
827 struct postfix_elt *elt;
828
829 elt = kmalloc(sizeof(*elt), GFP_KERNEL);
830 if (!elt)
831 return -ENOMEM;
832
833 elt->op = op;
834 elt->operand = NULL;
835
836 list_add_tail(&elt->list, &ps->postfix);
837
838 return 0;
839}
840
841static void postfix_clear(struct filter_parse_state *ps)
842{
843 struct postfix_elt *elt;
844
845 while (!list_empty(&ps->postfix)) {
846 elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
847 kfree(elt->operand);
848 list_del(&elt->list);
849 }
850}
851
852static int filter_parse(struct filter_parse_state *ps)
853{
854 int in_string = 0;
855 int op, top_op;
856 char ch;
857
858 while ((ch = infix_next(ps))) {
859 if (ch == '"') {
860 in_string ^= 1;
328 continue; 861 continue;
862 }
329 863
330 if (strcmp(call->system, system->name)) 864 if (in_string)
865 goto parse_operand;
866
867 if (isspace(ch))
331 continue; 868 continue;
332 869
333 if (!find_event_field(call, pred->field_name)) 870 if (is_op_char(ps, ch)) {
871 op = infix_get_op(ps, ch);
872 if (op == OP_NONE) {
873 parse_error(ps, FILT_ERR_INVALID_OP, 0);
874 return -EINVAL;
875 }
876
877 if (strlen(curr_operand(ps))) {
878 postfix_append_operand(ps, curr_operand(ps));
879 clear_operand_string(ps);
880 }
881
882 while (!filter_opstack_empty(ps)) {
883 top_op = filter_opstack_top(ps);
884 if (!is_precedence_lower(ps, top_op, op)) {
885 top_op = filter_opstack_pop(ps);
886 postfix_append_op(ps, top_op);
887 continue;
888 }
889 break;
890 }
891
892 filter_opstack_push(ps, op);
334 continue; 893 continue;
894 }
335 895
336 event_pred = copy_pred(pred); 896 if (ch == '(') {
337 if (!event_pred) 897 filter_opstack_push(ps, OP_OPEN_PAREN);
338 goto oom; 898 continue;
899 }
339 900
340 err = filter_add_pred(call, event_pred); 901 if (ch == ')') {
341 if (err) 902 if (strlen(curr_operand(ps))) {
342 filter_free_pred(event_pred); 903 postfix_append_operand(ps, curr_operand(ps));
343 if (err == -ENOMEM) 904 clear_operand_string(ps);
344 goto oom; 905 }
906
907 top_op = filter_opstack_pop(ps);
908 while (top_op != OP_NONE) {
909 if (top_op == OP_OPEN_PAREN)
910 break;
911 postfix_append_op(ps, top_op);
912 top_op = filter_opstack_pop(ps);
913 }
914 if (top_op == OP_NONE) {
915 parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
916 return -EINVAL;
917 }
918 continue;
919 }
920parse_operand:
921 if (append_operand_char(ps, ch)) {
922 parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
923 return -EINVAL;
924 }
925 }
926
927 if (strlen(curr_operand(ps)))
928 postfix_append_operand(ps, curr_operand(ps));
929
930 while (!filter_opstack_empty(ps)) {
931 top_op = filter_opstack_pop(ps);
932 if (top_op == OP_NONE)
933 break;
934 if (top_op == OP_OPEN_PAREN) {
935 parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
936 return -EINVAL;
937 }
938 postfix_append_op(ps, top_op);
345 } 939 }
346 940
347 return 0; 941 return 0;
942}
348 943
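filter_parse() is a small shunting-yard pass: operands are flushed to ps->postfix as they complete, operators wait on ps->opstack until an incoming operator of equal or lower precedence (or a closing parenthesis) forces them out, and quote characters are stripped while their contents are kept. Assuming the usual precedence from filter_ops (comparisons bind tighter than &&, which binds tighter than ||; filter_ops itself is defined earlier in this file), an illustrative conversion with made-up field names:

/*
 * infix, as written to the filter file:
 *
 *     (bytes >= 4096 && prio == 5) || comm == "bash"
 *
 * postfix, as queued on ps->postfix:
 *
 *     bytes 4096 >=   prio 5 ==   &&   comm bash ==   ||
 *
 * replace_preds() below pairs each operand couple with its comparison
 * operator to build one filter_pred, and turns each && / || into a
 * logical predicate via create_logical_pred().
 */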
349oom: 944static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
350 system->preds[i] = NULL; 945{
351 return -ENOMEM; 946 struct filter_pred *pred;
947
948 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
949 if (!pred)
950 return NULL;
951
952 pred->field_name = kstrdup(operand1, GFP_KERNEL);
953 if (!pred->field_name) {
954 kfree(pred);
955 return NULL;
956 }
957
958 strcpy(pred->str_val, operand2);
959 pred->str_len = strlen(operand2);
960
961 pred->op = op;
962
963 return pred;
352} 964}
353 965
354int filter_parse(char **pbuf, struct filter_pred *pred) 966static struct filter_pred *create_logical_pred(int op)
355{ 967{
356 char *tmp, *tok, *val_str = NULL; 968 struct filter_pred *pred;
357 int tok_n = 0; 969
358 970 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
359 /* field ==/!= number, or/and field ==/!= number, number */ 971 if (!pred)
360 while ((tok = strsep(pbuf, " \n"))) { 972 return NULL;
361 if (tok_n == 0) { 973
362 if (!strcmp(tok, "0")) { 974 pred->op = op;
363 pred->clear = 1; 975
364 return 0; 976 return pred;
365 } else if (!strcmp(tok, "&&")) { 977}
366 pred->or = 0; 978
367 pred->compound = 1; 979static int check_preds(struct filter_parse_state *ps)
368 } else if (!strcmp(tok, "||")) { 980{
369 pred->or = 1; 981 int n_normal_preds = 0, n_logical_preds = 0;
370 pred->compound = 1; 982 struct postfix_elt *elt;
371 } else 983
372 pred->field_name = tok; 984 list_for_each_entry(elt, &ps->postfix, list) {
373 tok_n = 1; 985 if (elt->op == OP_NONE)
986 continue;
987
988 if (elt->op == OP_AND || elt->op == OP_OR) {
989 n_logical_preds++;
374 continue; 990 continue;
375 } 991 }
376 if (tok_n == 1) { 992 n_normal_preds++;
377 if (!pred->field_name) 993 }
378 pred->field_name = tok; 994
379 else if (!strcmp(tok, "!=")) 995 if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
380 pred->not = 1; 996 parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
381 else if (!strcmp(tok, "==")) 997 return -EINVAL;
382 pred->not = 0; 998 }
999
1000 return 0;
1001}
1002
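Expressed on the postfix list, the check above requires at least one comparison and strictly more comparisons than &&/|| operators. Two illustrative inputs, again with made-up field names and the filter_ops precedence assumed earlier:

/*
 *   "bytes >= 4096 && prio == 5"  ->  postfix: bytes 4096 >= prio 5 == &&
 *                                     2 comparisons, 1 logical op: accepted
 *
 *   "&& prio == 5"                ->  postfix: prio 5 == &&
 *                                     1 comparison, 1 logical op: rejected
 *                                     with FILT_ERR_INVALID_FILTER
 */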
1003static int replace_preds(struct event_subsystem *system,
1004 struct ftrace_event_call *call,
1005 struct filter_parse_state *ps,
1006 char *filter_string)
1007{
1008 char *operand1 = NULL, *operand2 = NULL;
1009 struct filter_pred *pred;
1010 struct postfix_elt *elt;
1011 int err;
1012
1013 err = check_preds(ps);
1014 if (err)
1015 return err;
1016
1017 list_for_each_entry(elt, &ps->postfix, list) {
1018 if (elt->op == OP_NONE) {
1019 if (!operand1)
1020 operand1 = elt->operand;
1021 else if (!operand2)
1022 operand2 = elt->operand;
383 else { 1023 else {
384 pred->field_name = NULL; 1024 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
385 return -EINVAL; 1025 return -EINVAL;
386 } 1026 }
387 tok_n = 2;
388 continue; 1027 continue;
389 } 1028 }
390 if (tok_n == 2) { 1029
391 if (pred->compound) { 1030 if (elt->op == OP_AND || elt->op == OP_OR) {
392 if (!strcmp(tok, "!=")) 1031 pred = create_logical_pred(elt->op);
393 pred->not = 1; 1032 if (!pred)
394 else if (!strcmp(tok, "==")) 1033 return -ENOMEM;
395 pred->not = 0; 1034 if (call) {
396 else { 1035 err = filter_add_pred(ps, call, pred);
397 pred->field_name = NULL; 1036 filter_free_pred(pred);
398 return -EINVAL;
399 }
400 } else { 1037 } else {
401 val_str = tok; 1038 err = filter_add_subsystem_pred(ps, system,
402 break; /* done */ 1039 pred, filter_string);
1040 if (err)
1041 filter_free_pred(pred);
403 } 1042 }
404 tok_n = 3; 1043 if (err)
1044 return err;
1045
1046 operand1 = operand2 = NULL;
405 continue; 1047 continue;
406 } 1048 }
407 if (tok_n == 3) { 1049
408 val_str = tok; 1050 if (!operand1 || !operand2) {
409 break; /* done */ 1051 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1052 return -EINVAL;
1053 }
1054
1055 pred = create_pred(elt->op, operand1, operand2);
1056 if (!pred)
1057 return -ENOMEM;
1058 if (call) {
1059 err = filter_add_pred(ps, call, pred);
1060 filter_free_pred(pred);
1061 } else {
1062 err = filter_add_subsystem_pred(ps, system, pred,
1063 filter_string);
1064 if (err)
1065 filter_free_pred(pred);
410 } 1066 }
1067 if (err)
1068 return err;
1069
1070 operand1 = operand2 = NULL;
411 } 1071 }
412 1072
413 if (!val_str) { 1073 return 0;
414 pred->field_name = NULL; 1074}
415 return -EINVAL; 1075
1076int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1077{
1078 int err;
1079
1080 struct filter_parse_state *ps;
1081
1082 mutex_lock(&event_mutex);
1083
1084 if (!strcmp(strstrip(filter_string), "0")) {
1085 filter_disable_preds(call);
1086 remove_filter_string(call->filter);
1087 mutex_unlock(&event_mutex);
1088 return 0;
416 } 1089 }
417 1090
418 pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); 1091 err = -ENOMEM;
419 if (!pred->field_name) 1092 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
420 return -ENOMEM; 1093 if (!ps)
1094 goto out_unlock;
421 1095
422 pred->val = simple_strtoull(val_str, &tmp, 0); 1096 filter_disable_preds(call);
423 if (tmp == val_str) { 1097 replace_filter_string(call->filter, filter_string);
424 pred->str_val = kstrdup(val_str, GFP_KERNEL);
425 if (!pred->str_val)
426 return -ENOMEM;
427 } else if (*tmp != '\0')
428 return -EINVAL;
429 1098
430 return 0; 1099 parse_init(ps, filter_ops, filter_string);
1100 err = filter_parse(ps);
1101 if (err) {
1102 append_filter_err(ps, call->filter);
1103 goto out;
1104 }
1105
1106 err = replace_preds(NULL, call, ps, filter_string);
1107 if (err)
1108 append_filter_err(ps, call->filter);
1109
1110out:
1111 filter_opstack_clear(ps);
1112 postfix_clear(ps);
1113 kfree(ps);
1114out_unlock:
1115 mutex_unlock(&event_mutex);
1116
1117 return err;
431} 1118}
432 1119
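apply_event_filter() is the entry point behind the per-event 'filter' file: writing a predicate string installs it, and writing "0" (as handled above) clears it again. A minimal user-space sketch of that flow; the event path and the filter string are examples only and assume debugfs at /sys/kernel/debug:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static void write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return;
	n = write(fd, s, strlen(s));	/* parsed by apply_event_filter() */
	(void)n;
	close(fd);
}

int main(void)
{
	const char *f =
		"/sys/kernel/debug/tracing/events/sched/sched_wakeup/filter";

	write_str(f, "prio < 100");	/* install a filter */
	write_str(f, "0");		/* "0" disables it again */
	return 0;
}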
1120int apply_subsystem_event_filter(struct event_subsystem *system,
1121 char *filter_string)
1122{
1123 int err;
1124
1125 struct filter_parse_state *ps;
1126
1127 mutex_lock(&event_mutex);
1128
1129 if (!strcmp(strstrip(filter_string), "0")) {
1130 filter_free_subsystem_preds(system);
1131 remove_filter_string(system->filter);
1132 mutex_unlock(&event_mutex);
1133 return 0;
1134 }
1135
1136 err = -ENOMEM;
1137 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1138 if (!ps)
1139 goto out_unlock;
1140
1141 filter_free_subsystem_preds(system);
1142 replace_filter_string(system->filter, filter_string);
1143
1144 parse_init(ps, filter_ops, filter_string);
1145 err = filter_parse(ps);
1146 if (err) {
1147 append_filter_err(ps, system->filter);
1148 goto out;
1149 }
1150
1151 err = replace_preds(system, NULL, ps, filter_string);
1152 if (err)
1153 append_filter_err(ps, system->filter);
1154
1155out:
1156 filter_opstack_clear(ps);
1157 postfix_clear(ps);
1158 kfree(ps);
1159out_unlock:
1160 mutex_unlock(&event_mutex);
1161
1162 return err;
1163}
433 1164
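The subsystem variant above applies one string to every event under events/<subsystem>/: a predicate on a field they all define (one of the common_* fields, for instance) is installed everywhere, while a field missing from any single event makes the whole write fail with FILT_ERR_BAD_SUBSYS_FILTER. Reusing write_str() from the previous sketch, and again with illustrative paths:

	write_str("/sys/kernel/debug/tracing/events/irq/filter", "common_pid != 0");
	write_str("/sys/kernel/debug/tracing/events/irq/filter", "0");	/* clear */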
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
deleted file mode 100644
index 38985f9b379c..000000000000
--- a/kernel/trace/trace_events_stage_1.h
+++ /dev/null
@@ -1,39 +0,0 @@
1/*
2 * Stage 1 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * struct ftrace_raw_<call> {
7 * struct trace_entry ent;
8 * <type> <item>;
9 * <type2> <item2>[<len>];
10 * [...]
11 * };
12 *
13 * The <type> <item> is created by the __field(type, item) macro or
14 * the __array(type2, item2, len) macro.
15 * We simply do "type item;", and that will create the fields
16 * in the structure.
17 */
18
19#undef TRACE_FORMAT
20#define TRACE_FORMAT(call, proto, args, fmt)
21
22#undef __array
23#define __array(type, item, len) type item[len];
24
25#undef __field
26#define __field(type, item) type item;
27
28#undef TP_STRUCT__entry
29#define TP_STRUCT__entry(args...) args
30
31#undef TRACE_EVENT
32#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
33 struct ftrace_raw_##name { \
34 struct trace_entry ent; \
35 tstruct \
36 }; \
37 static struct ftrace_event_call event_##name
38
39#include <trace/trace_event_types.h>
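For a hypothetical event declared as TRACE_EVENT(foo_bar, ..., TP_STRUCT__entry(__field(int, bar) __array(char, name, 16)), ...), the stage-1 macros in the file removed above expanded to roughly:

struct ftrace_raw_foo_bar {
	struct trace_entry	ent;
	int			bar;
	char			name[16];
};
static struct ftrace_event_call event_foo_bar;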
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
deleted file mode 100644
index d363c6672c6c..000000000000
--- a/kernel/trace/trace_events_stage_2.h
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Stage 2 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * enum print_line_t
7 * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
8 * {
9 * struct trace_seq *s = &iter->seq;
10 * struct ftrace_raw_<call> *field; <-- defined in stage 1
11 * struct trace_entry *entry;
12 * int ret;
13 *
14 * entry = iter->ent;
15 *
16 * if (entry->type != event_<call>.id) {
17 * WARN_ON_ONCE(1);
18 * return TRACE_TYPE_UNHANDLED;
19 * }
20 *
21 * field = (typeof(field))entry;
22 *
23 * ret = trace_seq_printf(s, <TP_printk> "\n");
24 * if (!ret)
25 * return TRACE_TYPE_PARTIAL_LINE;
26 *
27 * return TRACE_TYPE_HANDLED;
28 * }
29 *
30 * This is the method used to print the raw event to the trace
31 * output format. Note, this is not needed if the data is read
32 * in binary.
33 */
34
35#undef __entry
36#define __entry field
37
38#undef TP_printk
39#define TP_printk(fmt, args...) fmt "\n", args
40
41#undef TRACE_EVENT
42#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
43enum print_line_t \
44ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
45{ \
46 struct trace_seq *s = &iter->seq; \
47 struct ftrace_raw_##call *field; \
48 struct trace_entry *entry; \
49 int ret; \
50 \
51 entry = iter->ent; \
52 \
53 if (entry->type != event_##call.id) { \
54 WARN_ON_ONCE(1); \
55 return TRACE_TYPE_UNHANDLED; \
56 } \
57 \
58 field = (typeof(field))entry; \
59 \
60 ret = trace_seq_printf(s, #call ": " print); \
61 if (!ret) \
62 return TRACE_TYPE_PARTIAL_LINE; \
63 \
64 return TRACE_TYPE_HANDLED; \
65}
66
67#include <trace/trace_event_types.h>
68
69/*
70 * Setup the showing format of trace point.
71 *
72 * int
73 * ftrace_format_##call(struct trace_seq *s)
74 * {
75 * struct ftrace_raw_##call field;
76 * int ret;
77 *
78 * ret = trace_seq_printf(s, #type " " #item ";"
79 * " offset:%u; size:%u;\n",
80 * offsetof(struct ftrace_raw_##call, item),
81 * sizeof(field.type));
82 *
83 * }
84 */
85
86#undef TP_STRUCT__entry
87#define TP_STRUCT__entry(args...) args
88
89#undef __field
90#define __field(type, item) \
91 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
92 "offset:%u;\tsize:%u;\n", \
93 (unsigned int)offsetof(typeof(field), item), \
94 (unsigned int)sizeof(field.item)); \
95 if (!ret) \
96 return 0;
97
98#undef __array
99#define __array(type, item, len) \
100 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
101 "offset:%u;\tsize:%u;\n", \
102 (unsigned int)offsetof(typeof(field), item), \
103 (unsigned int)sizeof(field.item)); \
104 if (!ret) \
105 return 0;
106
107#undef __entry
108#define __entry REC
109
110#undef TP_printk
111#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
112
113#undef TP_fast_assign
114#define TP_fast_assign(args...) args
115
116#undef TRACE_EVENT
117#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
118static int \
119ftrace_format_##call(struct trace_seq *s) \
120{ \
121 struct ftrace_raw_##call field; \
122 int ret; \
123 \
124 tstruct; \
125 \
126 trace_seq_printf(s, "\nprint fmt: " print); \
127 \
128 return ret; \
129}
130
131#include <trace/trace_event_types.h>
132
133#undef __field
134#define __field(type, item) \
135 ret = trace_define_field(event_call, #type, #item, \
136 offsetof(typeof(field), item), \
137 sizeof(field.item)); \
138 if (ret) \
139 return ret;
140
141#undef __array
142#define __array(type, item, len) \
143 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
144 offsetof(typeof(field), item), \
145 sizeof(field.item)); \
146 if (ret) \
147 return ret;
148
149#define __common_field(type, item) \
150 ret = trace_define_field(event_call, #type, "common_" #item, \
151 offsetof(typeof(field.ent), item), \
152 sizeof(field.ent.item)); \
153 if (ret) \
154 return ret;
155
156#undef TRACE_EVENT
157#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
158int \
159ftrace_define_fields_##call(void) \
160{ \
161 struct ftrace_raw_##call field; \
162 struct ftrace_event_call *event_call = &event_##call; \
163 int ret; \
164 \
165 __common_field(unsigned char, type); \
166 __common_field(unsigned char, flags); \
167 __common_field(unsigned char, preempt_count); \
168 __common_field(int, pid); \
169 __common_field(int, tgid); \
170 \
171 tstruct; \
172 \
173 return ret; \
174}
175
176#include <trace/trace_event_types.h>
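Continuing the hypothetical foo_bar example, the generated ftrace_format_foo_bar() emitted one description line per field into the event's debugfs 'format' file, followed by the print format line; the offsets shown are only indicative, since they depend on struct trace_entry and padding on the build machine:

	field:int bar;	offset:12;	size:4;
	field:char name[16];	offset:16;	size:16;

print fmt: ...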
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
deleted file mode 100644
index 9d2fa78cecca..000000000000
--- a/kernel/trace/trace_events_stage_3.h
+++ /dev/null
@@ -1,281 +0,0 @@
1/*
2 * Stage 3 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * static void ftrace_event_<call>(proto)
7 * {
8 * event_trace_printk(_RET_IP_, "<call>: " <fmt>);
9 * }
10 *
11 * static int ftrace_reg_event_<call>(void)
12 * {
13 * int ret;
14 *
15 * ret = register_trace_<call>(ftrace_event_<call>);
16 * if (!ret)
17 * pr_info("event trace: Could not activate trace point "
18 * "probe to <call>");
19 * return ret;
20 * }
21 *
22 * static void ftrace_unreg_event_<call>(void)
23 * {
24 * unregister_trace_<call>(ftrace_event_<call>);
25 * }
26 *
27 * For those macros defined with TRACE_FORMAT:
28 *
29 * static struct ftrace_event_call __used
30 * __attribute__((__aligned__(4)))
31 * __attribute__((section("_ftrace_events"))) event_<call> = {
32 * .name = "<call>",
33 * .regfunc = ftrace_reg_event_<call>,
34 * .unregfunc = ftrace_unreg_event_<call>,
35 * }
36 *
37 *
38 * For those macros defined with TRACE_EVENT:
39 *
40 * static struct ftrace_event_call event_<call>;
41 *
42 * static void ftrace_raw_event_<call>(proto)
43 * {
44 * struct ring_buffer_event *event;
45 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
46 * unsigned long irq_flags;
47 * int pc;
48 *
49 * local_save_flags(irq_flags);
50 * pc = preempt_count();
51 *
52 * event = trace_current_buffer_lock_reserve(event_<call>.id,
53 * sizeof(struct ftrace_raw_<call>),
54 * irq_flags, pc);
55 * if (!event)
56 * return;
57 * entry = ring_buffer_event_data(event);
58 *
59 * <assign>; <-- Here we assign the entries by the __field and
60 * __array macros.
61 *
62 * trace_current_buffer_unlock_commit(event, irq_flags, pc);
63 * }
64 *
65 * static int ftrace_raw_reg_event_<call>(void)
66 * {
67 * int ret;
68 *
69 * ret = register_trace_<call>(ftrace_raw_event_<call>);
70 * if (!ret)
71 * pr_info("event trace: Could not activate trace point "
72 * "probe to <call>");
73 * return ret;
74 * }
75 *
76 * static void ftrace_unreg_event_<call>(void)
77 * {
78 * unregister_trace_<call>(ftrace_raw_event_<call>);
79 * }
80 *
81 * static struct trace_event ftrace_event_type_<call> = {
82 * .trace = ftrace_raw_output_<call>, <-- stage 2
83 * };
84 *
85 * static int ftrace_raw_init_event_<call>(void)
86 * {
87 * int id;
88 *
89 * id = register_ftrace_event(&ftrace_event_type_<call>);
90 * if (!id)
91 * return -ENODEV;
92 * event_<call>.id = id;
93 * return 0;
94 * }
95 *
96 * static struct ftrace_event_call __used
97 * __attribute__((__aligned__(4)))
98 * __attribute__((section("_ftrace_events"))) event_<call> = {
99 * .name = "<call>",
100 * .system = "<system>",
101 * .raw_init = ftrace_raw_init_event_<call>,
102 * .regfunc = ftrace_reg_event_<call>,
103 * .unregfunc = ftrace_unreg_event_<call>,
104 * .show_format = ftrace_format_<call>,
105 * }
106 *
107 */
108
109#undef TP_FMT
110#define TP_FMT(fmt, args...) fmt "\n", ##args
111
112#ifdef CONFIG_EVENT_PROFILE
113#define _TRACE_PROFILE(call, proto, args) \
114static void ftrace_profile_##call(proto) \
115{ \
116 extern void perf_tpcounter_event(int); \
117 perf_tpcounter_event(event_##call.id); \
118} \
119 \
120static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
121{ \
122 int ret = 0; \
123 \
124 if (!atomic_inc_return(&call->profile_count)) \
125 ret = register_trace_##call(ftrace_profile_##call); \
126 \
127 return ret; \
128} \
129 \
130static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
131{ \
132 if (atomic_add_negative(-1, &call->profile_count)) \
133 unregister_trace_##call(ftrace_profile_##call); \
134}
135
136#define _TRACE_PROFILE_INIT(call) \
137 .profile_count = ATOMIC_INIT(-1), \
138 .profile_enable = ftrace_profile_enable_##call, \
139 .profile_disable = ftrace_profile_disable_##call,
140
141#else
142#define _TRACE_PROFILE(call, proto, args)
143#define _TRACE_PROFILE_INIT(call)
144#endif
145
146#define _TRACE_FORMAT(call, proto, args, fmt) \
147static void ftrace_event_##call(proto) \
148{ \
149 event_trace_printk(_RET_IP_, #call ": " fmt); \
150} \
151 \
152static int ftrace_reg_event_##call(void) \
153{ \
154 int ret; \
155 \
156 ret = register_trace_##call(ftrace_event_##call); \
157 if (ret) \
158 pr_info("event trace: Could not activate trace point " \
159 "probe to " #call "\n"); \
160 return ret; \
161} \
162 \
163static void ftrace_unreg_event_##call(void) \
164{ \
165 unregister_trace_##call(ftrace_event_##call); \
166} \
167 \
168static struct ftrace_event_call event_##call; \
169 \
170static int ftrace_init_event_##call(void) \
171{ \
172 int id; \
173 \
174 id = register_ftrace_event(NULL); \
175 if (!id) \
176 return -ENODEV; \
177 event_##call.id = id; \
178 return 0; \
179}
180
181#undef TRACE_FORMAT
182#define TRACE_FORMAT(call, proto, args, fmt) \
183_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \
184_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
185static struct ftrace_event_call __used \
186__attribute__((__aligned__(4))) \
187__attribute__((section("_ftrace_events"))) event_##call = { \
188 .name = #call, \
189 .system = __stringify(TRACE_SYSTEM), \
190 .raw_init = ftrace_init_event_##call, \
191 .regfunc = ftrace_reg_event_##call, \
192 .unregfunc = ftrace_unreg_event_##call, \
193 _TRACE_PROFILE_INIT(call) \
194}
195
196#undef __entry
197#define __entry entry
198
199#undef TRACE_EVENT
200#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
201_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
202 \
203static struct ftrace_event_call event_##call; \
204 \
205static void ftrace_raw_event_##call(proto) \
206{ \
207 struct ftrace_event_call *call = &event_##call; \
208 struct ring_buffer_event *event; \
209 struct ftrace_raw_##call *entry; \
210 unsigned long irq_flags; \
211 int pc; \
212 \
213 local_save_flags(irq_flags); \
214 pc = preempt_count(); \
215 \
216 event = trace_current_buffer_lock_reserve(event_##call.id, \
217 sizeof(struct ftrace_raw_##call), \
218 irq_flags, pc); \
219 if (!event) \
220 return; \
221 entry = ring_buffer_event_data(event); \
222 \
223 assign; \
224 \
225 if (call->preds && !filter_match_preds(call, entry)) \
226 ring_buffer_event_discard(event); \
227 \
228 trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
229 \
230} \
231 \
232static int ftrace_raw_reg_event_##call(void) \
233{ \
234 int ret; \
235 \
236 ret = register_trace_##call(ftrace_raw_event_##call); \
237 if (ret) \
238 pr_info("event trace: Could not activate trace point " \
239 "probe to " #call "\n"); \
240 return ret; \
241} \
242 \
243static void ftrace_raw_unreg_event_##call(void) \
244{ \
245 unregister_trace_##call(ftrace_raw_event_##call); \
246} \
247 \
248static struct trace_event ftrace_event_type_##call = { \
249 .trace = ftrace_raw_output_##call, \
250}; \
251 \
252static int ftrace_raw_init_event_##call(void) \
253{ \
254 int id; \
255 \
256 id = register_ftrace_event(&ftrace_event_type_##call); \
257 if (!id) \
258 return -ENODEV; \
259 event_##call.id = id; \
260 INIT_LIST_HEAD(&event_##call.fields); \
261 return 0; \
262} \
263 \
264static struct ftrace_event_call __used \
265__attribute__((__aligned__(4))) \
266__attribute__((section("_ftrace_events"))) event_##call = { \
267 .name = #call, \
268 .system = __stringify(TRACE_SYSTEM), \
269 .raw_init = ftrace_raw_init_event_##call, \
270 .regfunc = ftrace_raw_reg_event_##call, \
271 .unregfunc = ftrace_raw_unreg_event_##call, \
272 .show_format = ftrace_format_##call, \
273 .define_fields = ftrace_define_fields_##call, \
274 _TRACE_PROFILE_INIT(call) \
275}
276
277#include <trace/trace_event_types.h>
278
279#undef _TRACE_PROFILE
280#undef _TRACE_PROFILE_INIT
281
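For completeness, a hypothetical tracepoint definition of the kind these three stages consumed; the event name, prototype and fields are invented for illustration:

TRACE_EVENT(foo_bar,

	TP_PROTO(int bar, const char *name),

	TP_ARGS(bar, name),

	TP_STRUCT__entry(
		__field(int,	bar)
		__array(char,	name,	16)
	),

	TP_fast_assign(
		__entry->bar = bar;
		strncpy(__entry->name, name, 16);
	),

	TP_printk("bar=%d name=%s", __entry->bar, __entry->name)
);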
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 07a22c33ebf3..d06cf898dc86 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -19,8 +19,12 @@
19#undef TRACE_STRUCT 19#undef TRACE_STRUCT
20#define TRACE_STRUCT(args...) args 20#define TRACE_STRUCT(args...) args
21 21
22extern void __bad_type_size(void);
23
22#undef TRACE_FIELD 24#undef TRACE_FIELD
23#define TRACE_FIELD(type, item, assign) \ 25#define TRACE_FIELD(type, item, assign) \
26 if (sizeof(type) != sizeof(field.item)) \
27 __bad_type_size(); \
24 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
25 "offset:%u;\tsize:%u;\n", \ 29 "offset:%u;\tsize:%u;\n", \
26 (unsigned int)offsetof(typeof(field), item), \ 30 (unsigned int)offsetof(typeof(field), item), \
@@ -30,7 +34,7 @@
30 34
31 35
32#undef TRACE_FIELD_SPECIAL 36#undef TRACE_FIELD_SPECIAL
33#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ 37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
34 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
35 "offset:%u;\tsize:%u;\n", \ 39 "offset:%u;\tsize:%u;\n", \
36 (unsigned int)offsetof(typeof(field), item), \ 40 (unsigned int)offsetof(typeof(field), item), \
@@ -46,6 +50,9 @@
46 if (!ret) \ 50 if (!ret) \
47 return 0; 51 return 0;
48 52
53#undef TRACE_FIELD_SIGN
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
55 TRACE_FIELD(type, item, assign)
49 56
50#undef TP_RAW_FMT 57#undef TP_RAW_FMT
51#define TP_RAW_FMT(args...) args 58#define TP_RAW_FMT(args...) args
@@ -65,6 +72,22 @@ ftrace_format_##call(struct trace_seq *s) \
65 return ret; \ 72 return ret; \
66} 73}
67 74
75#undef TRACE_EVENT_FORMAT_NOFILTER
76#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
77 tpfmt) \
78static int \
79ftrace_format_##call(struct trace_seq *s) \
80{ \
81 struct args field; \
82 int ret; \
83 \
84 tstruct; \
85 \
86 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
87 \
88 return ret; \
89}
90
68#include "trace_event_types.h" 91#include "trace_event_types.h"
69 92
70#undef TRACE_ZERO_CHAR 93#undef TRACE_ZERO_CHAR
@@ -78,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \
78#define TRACE_FIELD(type, item, assign)\ 101#define TRACE_FIELD(type, item, assign)\
79 entry->item = assign; 102 entry->item = assign;
80 103
104#undef TRACE_FIELD_SIGN
105#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
106 TRACE_FIELD(type, item, assign)
107
81#undef TP_CMD 108#undef TP_CMD
82#define TP_CMD(cmd...) cmd 109#define TP_CMD(cmd...) cmd
83 110
@@ -85,18 +112,95 @@ ftrace_format_##call(struct trace_seq *s) \
85#define TRACE_ENTRY entry 112#define TRACE_ENTRY entry
86 113
87#undef TRACE_FIELD_SPECIAL 114#undef TRACE_FIELD_SPECIAL
88#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ 115#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
89 cmd; 116 cmd;
90 117
91#undef TRACE_EVENT_FORMAT 118#undef TRACE_EVENT_FORMAT
92#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 119#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
120int ftrace_define_fields_##call(void); \
121static int ftrace_raw_init_event_##call(void); \
122 \
123struct ftrace_event_call __used \
124__attribute__((__aligned__(4))) \
125__attribute__((section("_ftrace_events"))) event_##call = { \
126 .name = #call, \
127 .id = proto, \
128 .system = __stringify(TRACE_SYSTEM), \
129 .raw_init = ftrace_raw_init_event_##call, \
130 .show_format = ftrace_format_##call, \
131 .define_fields = ftrace_define_fields_##call, \
132}; \
133static int ftrace_raw_init_event_##call(void) \
134{ \
135 INIT_LIST_HEAD(&event_##call.fields); \
136 init_preds(&event_##call); \
137 return 0; \
138} \
139
140#undef TRACE_EVENT_FORMAT_NOFILTER
141#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
142 tpfmt) \
93 \ 143 \
94static struct ftrace_event_call __used \ 144struct ftrace_event_call __used \
95__attribute__((__aligned__(4))) \ 145__attribute__((__aligned__(4))) \
96__attribute__((section("_ftrace_events"))) event_##call = { \ 146__attribute__((section("_ftrace_events"))) event_##call = { \
97 .name = #call, \ 147 .name = #call, \
98 .id = proto, \ 148 .id = proto, \
99 .system = __stringify(TRACE_SYSTEM), \ 149 .system = __stringify(TRACE_SYSTEM), \
100 .show_format = ftrace_format_##call, \ 150 .show_format = ftrace_format_##call, \
151};
152
153#include "trace_event_types.h"
154
155#undef TRACE_FIELD
156#define TRACE_FIELD(type, item, assign) \
157 ret = trace_define_field(event_call, #type, #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), is_signed_type(type)); \
160 if (ret) \
161 return ret;
162
163#undef TRACE_FIELD_SPECIAL
164#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
166 offsetof(typeof(field), item), \
167 sizeof(field.item), 0); \
168 if (ret) \
169 return ret;
170
171#undef TRACE_FIELD_SIGN
172#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
173 ret = trace_define_field(event_call, #type, #item, \
174 offsetof(typeof(field), item), \
175 sizeof(field.item), is_signed); \
176 if (ret) \
177 return ret;
178
179#undef TRACE_FIELD_ZERO_CHAR
180#define TRACE_FIELD_ZERO_CHAR(item)
181
182#undef TRACE_EVENT_FORMAT
183#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
184int \
185ftrace_define_fields_##call(void) \
186{ \
187 struct ftrace_event_call *event_call = &event_##call; \
188 struct args field; \
189 int ret; \
190 \
191 __common_field(unsigned char, type, 0); \
192 __common_field(unsigned char, flags, 0); \
193 __common_field(unsigned char, preempt_count, 0); \
194 __common_field(int, pid, 1); \
195 __common_field(int, tgid, 1); \
196 \
197 tstruct; \
198 \
199 return ret; \
101} 200}
201
202#undef TRACE_EVENT_FORMAT_NOFILTER
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
204 tpfmt)
205
102#include "trace_event_types.h" 206#include "trace_event_types.h"
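is_signed_type(), used in the TRACE_FIELD expansion above, is defined in the tracing headers rather than in this file; the idea is a compile-time test of whether the type's -1 compares below +1. A standalone sketch, with the macro name suffixed to make clear it is not the kernel's own definition:

#include <stdio.h>

#define is_signed_type_sketch(type)	(((type)-1) < (type)1)

int main(void)
{
	printf("%d\n", is_signed_type_sketch(int));		/* 1 */
	printf("%d\n", is_signed_type_sketch(unsigned char));	/* 0 */
	return 0;
}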
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c9a0b7df44ff..75ef000613c3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -193,9 +193,11 @@ static void tracing_start_function_trace(void)
193static void tracing_stop_function_trace(void) 193static void tracing_stop_function_trace(void)
194{ 194{
195 ftrace_function_enabled = 0; 195 ftrace_function_enabled = 0;
196 /* OK if they are not registered */ 196
197 unregister_ftrace_function(&trace_stack_ops); 197 if (func_flags.val & TRACE_FUNC_OPT_STACK)
198 unregister_ftrace_function(&trace_ops); 198 unregister_ftrace_function(&trace_stack_ops);
199 else
200 unregister_ftrace_function(&trace_ops);
199} 201}
200 202
201static int func_set_flag(u32 old_flags, u32 bit, int set) 203static int func_set_flag(u32 old_flags, u32 bit, int set)
@@ -300,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
300 if (count == -1) 302 if (count == -1)
301 seq_printf(m, ":unlimited\n"); 303 seq_printf(m, ":unlimited\n");
302 else 304 else
303 seq_printf(m, ":count=%ld", count); 305 seq_printf(m, ":count=%ld\n", count);
304 seq_putc(m, '\n');
305 306
306 return 0; 307 return 0;
307} 308}
@@ -362,7 +363,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
362 out_reg: 363 out_reg:
363 ret = register_ftrace_function_probe(glob, ops, count); 364 ret = register_ftrace_function_probe(glob, ops, count);
364 365
365 return ret; 366 return ret < 0 ? ret : 0;
366} 367}
367 368
368static struct ftrace_func_command ftrace_traceon_cmd = { 369static struct ftrace_func_command ftrace_traceon_cmd = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d28687e7b3a7..420ec3487579 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -57,7 +57,8 @@ static struct tracer_flags tracer_flags = {
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
59int 59int
60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) 60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
61 unsigned long frame_pointer)
61{ 62{
62 unsigned long long calltime; 63 unsigned long long calltime;
63 int index; 64 int index;
@@ -65,6 +66,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
65 if (!current->ret_stack) 66 if (!current->ret_stack)
66 return -EBUSY; 67 return -EBUSY;
67 68
69 /*
70 * We must make sure the ret_stack is tested before we read
71 * anything else.
72 */
73 smp_rmb();
74
68 /* The return trace stack is full */ 75 /* The return trace stack is full */
69 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { 76 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
70 atomic_inc(&current->trace_overrun); 77 atomic_inc(&current->trace_overrun);
@@ -78,14 +85,17 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
78 current->ret_stack[index].ret = ret; 85 current->ret_stack[index].ret = ret;
79 current->ret_stack[index].func = func; 86 current->ret_stack[index].func = func;
80 current->ret_stack[index].calltime = calltime; 87 current->ret_stack[index].calltime = calltime;
88 current->ret_stack[index].subtime = 0;
89 current->ret_stack[index].fp = frame_pointer;
81 *depth = index; 90 *depth = index;
82 91
83 return 0; 92 return 0;
84} 93}
85 94
86/* Retrieve a function return address to the trace stack on thread info.*/ 95/* Retrieve a function return address to the trace stack on thread info.*/
87void 96static void
88ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) 97ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
98 unsigned long frame_pointer)
89{ 99{
90 int index; 100 int index;
91 101
@@ -99,28 +109,52 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
99 return; 109 return;
100 } 110 }
101 111
112#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
113 /*
114 * The arch may choose to record the frame pointer used
115 * and check it here to make sure that it is what we expect it
116 * to be. If gcc does not set the place holder of the return
117 * address in the frame pointer, and does a copy instead, then
118 * the function graph trace will fail. This test detects this
119 * case.
120 *
121 * Currently, x86_32 with optimize for size (-Os) makes the latest
122 * gcc do the above.
123 */
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n",
128 current->ret_stack[index].fp,
129 frame_pointer,
130 (void *)current->ret_stack[index].func,
131 current->ret_stack[index].ret);
132 *ret = (unsigned long)panic;
133 return;
134 }
135#endif
136
102 *ret = current->ret_stack[index].ret; 137 *ret = current->ret_stack[index].ret;
103 trace->func = current->ret_stack[index].func; 138 trace->func = current->ret_stack[index].func;
104 trace->calltime = current->ret_stack[index].calltime; 139 trace->calltime = current->ret_stack[index].calltime;
105 trace->overrun = atomic_read(&current->trace_overrun); 140 trace->overrun = atomic_read(&current->trace_overrun);
106 trace->depth = index; 141 trace->depth = index;
107 barrier();
108 current->curr_ret_stack--;
109
110} 142}
111 143
112/* 144/*
113 * Send the trace to the ring-buffer. 145 * Send the trace to the ring-buffer.
114 * @return the original return address. 146 * @return the original return address.
115 */ 147 */
116unsigned long ftrace_return_to_handler(void) 148unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
117{ 149{
118 struct ftrace_graph_ret trace; 150 struct ftrace_graph_ret trace;
119 unsigned long ret; 151 unsigned long ret;
120 152
121 ftrace_pop_return_trace(&trace, &ret); 153 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
122 trace.rettime = trace_clock_local(); 154 trace.rettime = trace_clock_local();
123 ftrace_graph_return(&trace); 155 ftrace_graph_return(&trace);
156 barrier();
157 current->curr_ret_stack--;
124 158
125 if (unlikely(!ret)) { 159 if (unlikely(!ret)) {
126 ftrace_graph_stop(); 160 ftrace_graph_stop();
@@ -426,8 +460,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
426 return TRACE_TYPE_HANDLED; 460 return TRACE_TYPE_HANDLED;
427} 461}
428 462
429static enum print_line_t 463enum print_line_t
430print_graph_duration(unsigned long long duration, struct trace_seq *s) 464trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
431{ 465{
432 unsigned long nsecs_rem = do_div(duration, 1000); 466 unsigned long nsecs_rem = do_div(duration, 1000);
433 /* log10(ULONG_MAX) + '\0' */ 467 /* log10(ULONG_MAX) + '\0' */
@@ -464,12 +498,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
464 if (!ret) 498 if (!ret)
465 return TRACE_TYPE_PARTIAL_LINE; 499 return TRACE_TYPE_PARTIAL_LINE;
466 } 500 }
501 return TRACE_TYPE_HANDLED;
502}
503
504static enum print_line_t
505print_graph_duration(unsigned long long duration, struct trace_seq *s)
506{
507 int ret;
508
509 ret = trace_print_graph_duration(duration, s);
510 if (ret != TRACE_TYPE_HANDLED)
511 return ret;
467 512
468 ret = trace_seq_printf(s, "| "); 513 ret = trace_seq_printf(s, "| ");
469 if (!ret) 514 if (!ret)
470 return TRACE_TYPE_PARTIAL_LINE; 515 return TRACE_TYPE_PARTIAL_LINE;
471 return TRACE_TYPE_HANDLED;
472 516
517 return TRACE_TYPE_HANDLED;
473} 518}
474 519
475/* Case of a leaf function on its call entry */ 520/* Case of a leaf function on its call entry */
@@ -798,9 +843,16 @@ print_graph_function(struct trace_iterator *iter)
798 843
799 switch (entry->type) { 844 switch (entry->type) {
800 case TRACE_GRAPH_ENT: { 845 case TRACE_GRAPH_ENT: {
801 struct ftrace_graph_ent_entry *field; 846 /*
847 * print_graph_entry() may consume the current event,
848 * thus @field may become invalid, so we need to save it.
849 * sizeof(struct ftrace_graph_ent_entry) is very small,
850 * it can be safely saved at the stack.
851 */
852 struct ftrace_graph_ent_entry *field, saved;
802 trace_assign_type(field, entry); 853 trace_assign_type(field, entry);
803 return print_graph_entry(field, s, iter); 854 saved = *field;
855 return print_graph_entry(&saved, s, iter);
804 } 856 }
805 case TRACE_GRAPH_RET: { 857 case TRACE_GRAPH_RET: {
806 struct ftrace_graph_ret_entry *field; 858 struct ftrace_graph_ret_entry *field;
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 7bfdf4c2347f..ca7d7c4d0c2a 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,10 +1,9 @@
1/* 1/*
2 * h/w branch tracer for x86 based on bts 2 * h/w branch tracer for x86 based on BTS
3 * 3 *
4 * Copyright (C) 2008-2009 Intel Corporation. 4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009 5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */ 6 */
7#include <linux/spinlock.h>
8#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
9#include <linux/debugfs.h> 8#include <linux/debugfs.h>
10#include <linux/ftrace.h> 9#include <linux/ftrace.h>
@@ -15,110 +14,119 @@
15 14
16#include <asm/ds.h> 15#include <asm/ds.h>
17 16
18#include "trace.h"
19#include "trace_output.h" 17#include "trace_output.h"
18#include "trace.h"
20 19
21 20
22#define SIZEOF_BTS (1 << 13) 21#define BTS_BUFFER_SIZE (1 << 13)
23 22
24/*
25 * The tracer lock protects the below per-cpu tracer array.
26 * It needs to be held to:
27 * - start tracing on all cpus
28 * - stop tracing on all cpus
29 * - start tracing on a single hotplug cpu
30 * - stop tracing on a single hotplug cpu
31 * - read the trace from all cpus
32 * - read the trace from a single cpu
33 */
34static DEFINE_SPINLOCK(bts_tracer_lock);
35static DEFINE_PER_CPU(struct bts_tracer *, tracer); 23static DEFINE_PER_CPU(struct bts_tracer *, tracer);
36static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); 24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
37 25
38#define this_tracer per_cpu(tracer, smp_processor_id()) 26#define this_tracer per_cpu(tracer, smp_processor_id())
39#define this_buffer per_cpu(buffer, smp_processor_id())
40 27
41static int __read_mostly trace_hw_branches_enabled; 28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
42static struct trace_array *hw_branch_trace __read_mostly; 30static struct trace_array *hw_branch_trace __read_mostly;
43 31
44 32
45/* 33static void bts_trace_init_cpu(int cpu)
46 * Start tracing on the current cpu.
47 * The argument is ignored.
48 *
49 * pre: bts_tracer_lock must be locked.
50 */
51static void bts_trace_start_cpu(void *arg)
52{ 34{
53 if (this_tracer) 35 per_cpu(tracer, cpu) =
54 ds_release_bts(this_tracer); 36 ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
55 37 NULL, (size_t)-1, BTS_KERNEL);
56 this_tracer = 38
57 ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, 39 if (IS_ERR(per_cpu(tracer, cpu)))
58 /* ovfl = */ NULL, /* th = */ (size_t)-1, 40 per_cpu(tracer, cpu) = NULL;
59 BTS_KERNEL);
60 if (IS_ERR(this_tracer)) {
61 this_tracer = NULL;
62 return;
63 }
64} 41}
65 42
66static void bts_trace_start(struct trace_array *tr) 43static int bts_trace_init(struct trace_array *tr)
67{ 44{
68 spin_lock(&bts_tracer_lock); 45 int cpu;
46
47 hw_branch_trace = tr;
48 trace_hw_branches_enabled = 0;
69 49
70 on_each_cpu(bts_trace_start_cpu, NULL, 1); 50 get_online_cpus();
71 trace_hw_branches_enabled = 1; 51 for_each_online_cpu(cpu) {
52 bts_trace_init_cpu(cpu);
72 53
73 spin_unlock(&bts_tracer_lock); 54 if (likely(per_cpu(tracer, cpu)))
55 trace_hw_branches_enabled = 1;
56 }
57 trace_hw_branches_suspended = 0;
58 put_online_cpus();
59
60 /* If we could not enable tracing on a single cpu, we fail. */
61 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
74} 62}
75 63
76/* 64static void bts_trace_reset(struct trace_array *tr)
77 * Stop tracing on the current cpu.
78 * The argument is ignored.
79 *
80 * pre: bts_tracer_lock must be locked.
81 */
82static void bts_trace_stop_cpu(void *arg)
83{ 65{
84 if (this_tracer) { 66 int cpu;
85 ds_release_bts(this_tracer); 67
86 this_tracer = NULL; 68 get_online_cpus();
69 for_each_online_cpu(cpu) {
70 if (likely(per_cpu(tracer, cpu))) {
71 ds_release_bts(per_cpu(tracer, cpu));
72 per_cpu(tracer, cpu) = NULL;
73 }
87 } 74 }
75 trace_hw_branches_enabled = 0;
76 trace_hw_branches_suspended = 0;
77 put_online_cpus();
88} 78}
89 79
90static void bts_trace_stop(struct trace_array *tr) 80static void bts_trace_start(struct trace_array *tr)
91{ 81{
92 spin_lock(&bts_tracer_lock); 82 int cpu;
93 83
94 trace_hw_branches_enabled = 0; 84 get_online_cpus();
95 on_each_cpu(bts_trace_stop_cpu, NULL, 1); 85 for_each_online_cpu(cpu)
86 if (likely(per_cpu(tracer, cpu)))
87 ds_resume_bts(per_cpu(tracer, cpu));
88 trace_hw_branches_suspended = 0;
89 put_online_cpus();
90}
96 91
97 spin_unlock(&bts_tracer_lock); 92static void bts_trace_stop(struct trace_array *tr)
93{
94 int cpu;
95
96 get_online_cpus();
97 for_each_online_cpu(cpu)
98 if (likely(per_cpu(tracer, cpu)))
99 ds_suspend_bts(per_cpu(tracer, cpu));
100 trace_hw_branches_suspended = 1;
101 put_online_cpus();
98} 102}
99 103
100static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, 104static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
101 unsigned long action, void *hcpu) 105 unsigned long action, void *hcpu)
102{ 106{
103 unsigned int cpu = (unsigned long)hcpu; 107 int cpu = (long)hcpu;
104
105 spin_lock(&bts_tracer_lock);
106
107 if (!trace_hw_branches_enabled)
108 goto out;
109 108
110 switch (action) { 109 switch (action) {
111 case CPU_ONLINE: 110 case CPU_ONLINE:
112 case CPU_DOWN_FAILED: 111 case CPU_DOWN_FAILED:
113 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); 112 /* The notification is sent with interrupts enabled. */
113 if (trace_hw_branches_enabled) {
114 bts_trace_init_cpu(cpu);
115
116 if (trace_hw_branches_suspended &&
117 likely(per_cpu(tracer, cpu)))
118 ds_suspend_bts(per_cpu(tracer, cpu));
119 }
114 break; 120 break;
121
115 case CPU_DOWN_PREPARE: 122 case CPU_DOWN_PREPARE:
116 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); 123 /* The notification is sent with interrupts enabled. */
117 break; 124 if (likely(per_cpu(tracer, cpu))) {
125 ds_release_bts(per_cpu(tracer, cpu));
126 per_cpu(tracer, cpu) = NULL;
127 }
118 } 128 }
119 129
120 out:
121 spin_unlock(&bts_tracer_lock);
122 return NOTIFY_DONE; 130 return NOTIFY_DONE;
123} 131}
124 132
@@ -126,20 +134,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
126 .notifier_call = bts_hotcpu_handler 134 .notifier_call = bts_hotcpu_handler
127}; 135};
128 136
129static int bts_trace_init(struct trace_array *tr)
130{
131 hw_branch_trace = tr;
132
133 bts_trace_start(tr);
134
135 return 0;
136}
137
138static void bts_trace_reset(struct trace_array *tr)
139{
140 bts_trace_stop(tr);
141}
142
143static void bts_trace_print_header(struct seq_file *m) 137static void bts_trace_print_header(struct seq_file *m)
144{ 138{
145 seq_puts(m, "# CPU# TO <- FROM\n"); 139 seq_puts(m, "# CPU# TO <- FROM\n");
@@ -147,10 +141,10 @@ static void bts_trace_print_header(struct seq_file *m)
147 141
148static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) 142static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
149{ 143{
144 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
150 struct trace_entry *entry = iter->ent; 145 struct trace_entry *entry = iter->ent;
151 struct trace_seq *seq = &iter->seq; 146 struct trace_seq *seq = &iter->seq;
152 struct hw_branch_entry *it; 147 struct hw_branch_entry *it;
153 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
154 148
155 trace_assign_type(it, entry); 149 trace_assign_type(it, entry);
156 150
@@ -168,6 +162,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
168 162
169void trace_hw_branch(u64 from, u64 to) 163void trace_hw_branch(u64 from, u64 to)
170{ 164{
165 struct ftrace_event_call *call = &event_hw_branch;
171 struct trace_array *tr = hw_branch_trace; 166 struct trace_array *tr = hw_branch_trace;
172 struct ring_buffer_event *event; 167 struct ring_buffer_event *event;
173 struct hw_branch_entry *entry; 168 struct hw_branch_entry *entry;
@@ -194,7 +189,8 @@ void trace_hw_branch(u64 from, u64 to)
194 entry->ent.type = TRACE_HW_BRANCHES; 189 entry->ent.type = TRACE_HW_BRANCHES;
195 entry->from = from; 190 entry->from = from;
196 entry->to = to; 191 entry->to = to;
197 trace_buffer_unlock_commit(tr, event, 0, 0); 192 if (!filter_check_discard(call, entry, tr->buffer, event))
193 trace_buffer_unlock_commit(tr, event, 0, 0);
198 194
199 out: 195 out:
200 atomic_dec(&tr->data[cpu]->disabled); 196 atomic_dec(&tr->data[cpu]->disabled);
@@ -224,11 +220,11 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
224/* 220/*
225 * Collect the trace on the current cpu and write it into the ftrace buffer. 221 * Collect the trace on the current cpu and write it into the ftrace buffer.
226 * 222 *
227 * pre: bts_tracer_lock must be locked 223 * pre: tracing must be suspended on the current cpu
228 */ 224 */
229static void trace_bts_cpu(void *arg) 225static void trace_bts_cpu(void *arg)
230{ 226{
231 struct trace_array *tr = (struct trace_array *) arg; 227 struct trace_array *tr = (struct trace_array *)arg;
232 const struct bts_trace *trace; 228 const struct bts_trace *trace;
233 unsigned char *at; 229 unsigned char *at;
234 230
@@ -241,10 +237,9 @@ static void trace_bts_cpu(void *arg)
241 if (unlikely(!this_tracer)) 237 if (unlikely(!this_tracer))
242 return; 238 return;
243 239
244 ds_suspend_bts(this_tracer);
245 trace = ds_read_bts(this_tracer); 240 trace = ds_read_bts(this_tracer);
246 if (!trace) 241 if (!trace)
247 goto out; 242 return;
248 243
249 for (at = trace->ds.top; (void *)at < trace->ds.end; 244 for (at = trace->ds.top; (void *)at < trace->ds.end;
250 at += trace->ds.size) 245 at += trace->ds.size)
@@ -253,18 +248,27 @@ static void trace_bts_cpu(void *arg)
253 for (at = trace->ds.begin; (void *)at < trace->ds.top; 248 for (at = trace->ds.begin; (void *)at < trace->ds.top;
254 at += trace->ds.size) 249 at += trace->ds.size)
255 trace_bts_at(trace, at); 250 trace_bts_at(trace, at);
256
257out:
258 ds_resume_bts(this_tracer);
259} 251}
260 252
261static void trace_bts_prepare(struct trace_iterator *iter) 253static void trace_bts_prepare(struct trace_iterator *iter)
262{ 254{
263 spin_lock(&bts_tracer_lock); 255 int cpu;
264 256
257 get_online_cpus();
258 for_each_online_cpu(cpu)
259 if (likely(per_cpu(tracer, cpu)))
260 ds_suspend_bts(per_cpu(tracer, cpu));
261 /*
262 * We need to collect the trace on the respective cpu since ftrace
263 * implicitly adds the record for the current cpu.
264 * Once that is more flexible, we could collect the data from any cpu.
265 */
265 on_each_cpu(trace_bts_cpu, iter->tr, 1); 266 on_each_cpu(trace_bts_cpu, iter->tr, 1);
266 267
267 spin_unlock(&bts_tracer_lock); 268 for_each_online_cpu(cpu)
269 if (likely(per_cpu(tracer, cpu)))
270 ds_resume_bts(per_cpu(tracer, cpu));
271 put_online_cpus();
268} 272}
269 273
270static void trace_bts_close(struct trace_iterator *iter) 274static void trace_bts_close(struct trace_iterator *iter)
@@ -274,11 +278,11 @@ static void trace_bts_close(struct trace_iterator *iter)
274 278
275void trace_hw_branch_oops(void) 279void trace_hw_branch_oops(void)
276{ 280{
277 spin_lock(&bts_tracer_lock); 281 if (this_tracer) {
278 282 ds_suspend_bts_noirq(this_tracer);
279 trace_bts_cpu(hw_branch_trace); 283 trace_bts_cpu(hw_branch_trace);
280 284 ds_resume_bts_noirq(this_tracer);
281 spin_unlock(&bts_tracer_lock); 285 }
282} 286}
283 287
284struct tracer bts_tracer __read_mostly = 288struct tracer bts_tracer __read_mostly =
@@ -291,7 +295,10 @@ struct tracer bts_tracer __read_mostly =
291 .start = bts_trace_start, 295 .start = bts_trace_start,
292 .stop = bts_trace_stop, 296 .stop = bts_trace_stop,
293 .open = trace_bts_prepare, 297 .open = trace_bts_prepare,
294 .close = trace_bts_close 298 .close = trace_bts_close,
299#ifdef CONFIG_FTRACE_SELFTEST
300 .selftest = trace_selftest_startup_hw_branches,
301#endif /* CONFIG_FTRACE_SELFTEST */
295}; 302};
296 303
297__init static int init_bts_trace(void) 304__init static int init_bts_trace(void)
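A note on the locking change above: the per-cpu BTS setup no longer takes bts_tracer_lock and sends IPIs; it simply brackets plain per-cpu accesses with get_online_cpus()/put_online_cpus() so CPUs cannot come or go underneath it. A minimal sketch of that pattern, with a made-up my_request_resource() standing in for ds_request_bts_cpu():

static DEFINE_PER_CPU(void *, my_res);

static int my_init_all(void)
{
	int cpu, ok = 0;

	get_online_cpus();			/* block cpu hotplug */
	for_each_online_cpu(cpu) {
		/* hypothetical per-cpu resource request */
		per_cpu(my_res, cpu) = my_request_resource(cpu);
		if (IS_ERR(per_cpu(my_res, cpu)))
			per_cpu(my_res, cpu) = NULL;
		else
			ok = 1;
	}
	put_online_cpus();

	/* like bts_trace_init(): fail only if no cpu could be set up */
	return ok ? 0 : -EOPNOTSUPP;
}

The hotplug notifier then only has to deal with the one CPU that is actually coming or going, which is why bts_hotcpu_handler() above shrank to a plain switch on the action.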
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 8e37fcddd8b4..d53b45ed0806 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,8 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/time.h>
13
12#include <asm/atomic.h> 14#include <asm/atomic.h>
13 15
14#include "trace.h" 16#include "trace.h"
@@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
174 struct mmiotrace_rw *rw; 176 struct mmiotrace_rw *rw;
175 struct trace_seq *s = &iter->seq; 177 struct trace_seq *s = &iter->seq;
176 unsigned long long t = ns2usecs(iter->ts); 178 unsigned long long t = ns2usecs(iter->ts);
177 unsigned long usec_rem = do_div(t, 1000000ULL); 179 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
178 unsigned secs = (unsigned long)t; 180 unsigned secs = (unsigned long)t;
179 int ret = 1; 181 int ret = 1;
180 182
@@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
221 struct mmiotrace_map *m; 223 struct mmiotrace_map *m;
222 struct trace_seq *s = &iter->seq; 224 struct trace_seq *s = &iter->seq;
223 unsigned long long t = ns2usecs(iter->ts); 225 unsigned long long t = ns2usecs(iter->ts);
224 unsigned long usec_rem = do_div(t, 1000000ULL); 226 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
225 unsigned secs = (unsigned long)t; 227 unsigned secs = (unsigned long)t;
226 int ret; 228 int ret;
227 229
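The substitution of USEC_PER_SEC for the literal 1000000ULL is purely cosmetic; do_div() still divides its 64-bit argument in place and returns the remainder. A small sketch of that contract, assuming <asm/div64.h> and <linux/time.h>:

#include <linux/kernel.h>
#include <linux/time.h>
#include <asm/div64.h>

/* split a microsecond count into "seconds.microseconds" */
static void split_usecs(unsigned long long usecs)
{
	unsigned long usec_rem;

	/* do_div() stores the quotient back into its first argument
	 * and returns the remainder */
	usec_rem = do_div(usecs, USEC_PER_SEC);

	pr_debug("%lu.%06lu\n", (unsigned long)usecs, usec_rem);
}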
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 64b54a59c55b..e0c2545622e8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,11 +14,24 @@
14/* must be a power of 2 */ 14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128 15#define EVENT_HASHSIZE 128
16 16
17static DEFINE_MUTEX(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
21
18static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
19 23
20static int next_event_type = __TRACE_LAST_TYPE + 1; 24static int next_event_type = __TRACE_LAST_TYPE + 1;
21 25
26void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29
30 seq_write(m, s->buffer, len);
31
32 trace_seq_init(s);
33}
34
22enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 35enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
23{ 36{
24 struct trace_seq *s = &iter->seq; 37 struct trace_seq *s = &iter->seq;
@@ -84,6 +97,39 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
84 97
85 return len; 98 return len;
86} 99}
100EXPORT_SYMBOL_GPL(trace_seq_printf);
101
102/**
103 * trace_seq_vprintf - sequence printing of trace information
104 * @s: trace sequence descriptor
105 * @fmt: printf format string
106 *
107 * The tracer may use either sequence operations or its own
 108 * copy to user routines. To simplify formatting of a trace,
109 * trace_seq_printf is used to store strings into a special
110 * buffer (@s). Then the output may be either used by
111 * the sequencer or pulled into another buffer.
112 */
113int
114trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
115{
116 int len = (PAGE_SIZE - 1) - s->len;
117 int ret;
118
119 if (!len)
120 return 0;
121
122 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
123
124 /* If we can't write it all, don't bother writing anything */
125 if (ret >= len)
126 return 0;
127
128 s->len += ret;
129
130 return len;
131}
132EXPORT_SYMBOL_GPL(trace_seq_vprintf);
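trace_seq_vprintf() writes into the fixed page-sized trace_seq buffer and deliberately drops output that would not fit rather than truncating it. A hedged usage sketch; my_trace_note() is made up and assumes the declarations from <linux/trace_seq.h>:

static int my_trace_note(struct trace_seq *s, const char *fmt, ...)
{
	va_list ap;
	int ret;

	va_start(ap, fmt);
	/* returns 0 when the formatted string would not fit in s */
	ret = trace_seq_vprintf(s, fmt, ap);
	va_end(ap);

	return ret;
}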
87 133
88int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) 134int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
89{ 135{
@@ -201,6 +247,67 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
201 return 0; 247 return 0;
202} 248}
203 249
250const char *
251ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
252 unsigned long flags,
253 const struct trace_print_flags *flag_array)
254{
255 unsigned long mask;
256 const char *str;
257 const char *ret = p->buffer + p->len;
258 int i;
259
260 for (i = 0; flag_array[i].name && flags; i++) {
261
262 mask = flag_array[i].mask;
263 if ((flags & mask) != mask)
264 continue;
265
266 str = flag_array[i].name;
267 flags &= ~mask;
268 if (p->len && delim)
269 trace_seq_puts(p, delim);
270 trace_seq_puts(p, str);
271 }
272
273 /* check for left over flags */
274 if (flags) {
275 if (p->len && delim)
276 trace_seq_puts(p, delim);
277 trace_seq_printf(p, "0x%lx", flags);
278 }
279
280 trace_seq_putc(p, 0);
281
282 return ret;
283}
284EXPORT_SYMBOL(ftrace_print_flags_seq);
285
286const char *
287ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
288 const struct trace_print_flags *symbol_array)
289{
290 int i;
291 const char *ret = p->buffer + p->len;
292
293 for (i = 0; symbol_array[i].name; i++) {
294
295 if (val != symbol_array[i].mask)
296 continue;
297
298 trace_seq_puts(p, symbol_array[i].name);
299 break;
300 }
301
302 if (!p->len)
303 trace_seq_printf(p, "0x%lx", val);
304
305 trace_seq_putc(p, 0);
306
307 return ret;
308}
309EXPORT_SYMBOL(ftrace_print_symbols_seq);
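ftrace_print_flags_seq() walks a trace_print_flags table, emitting the name of every set bit separated by the delimiter and falling back to a hex value for any bits left without a name; ftrace_print_symbols_seq() does the same for a single value. A sketch of the kind of table a caller would pass (the flag values and names here are invented):

static const struct trace_print_flags mydrv_flags[] = {
	{ 0x01, "READ"  },
	{ 0x02, "WRITE" },
	{ 0x04, "SYNC"  },
	{ -1,   NULL    }	/* a NULL name terminates the walk */
};

/*
 * In an event's print handler, with p being a scratch trace_seq:
 *
 *	ftrace_print_flags_seq(p, "|", rec->flags, mydrv_flags);
 *
 * renders e.g. "READ|SYNC", or "READ|0x40" when bit 0x40 has no name.
 */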
310
204#ifdef CONFIG_KRETPROBES 311#ifdef CONFIG_KRETPROBES
205static inline const char *kretprobed(const char *name) 312static inline const char *kretprobed(const char *name)
206{ 313{
@@ -311,17 +418,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
311 418
312 if (ip == ULONG_MAX || !ret) 419 if (ip == ULONG_MAX || !ret)
313 break; 420 break;
314 if (i && ret) 421 if (ret)
315 ret = trace_seq_puts(s, " <- "); 422 ret = trace_seq_puts(s, " => ");
316 if (!ip) { 423 if (!ip) {
317 if (ret) 424 if (ret)
318 ret = trace_seq_puts(s, "??"); 425 ret = trace_seq_puts(s, "??");
426 if (ret)
427 ret = trace_seq_puts(s, "\n");
319 continue; 428 continue;
320 } 429 }
321 if (!ret) 430 if (!ret)
322 break; 431 break;
323 if (ret) 432 if (ret)
324 ret = seq_print_user_ip(s, mm, ip, sym_flags); 433 ret = seq_print_user_ip(s, mm, ip, sym_flags);
434 ret = trace_seq_puts(s, "\n");
325 } 435 }
326 436
327 if (mm) 437 if (mm)
@@ -455,6 +565,7 @@ static int task_state_char(unsigned long state)
455 * @type: the type of event to look for 565 * @type: the type of event to look for
456 * 566 *
457 * Returns an event of type @type otherwise NULL 567 * Returns an event of type @type otherwise NULL
568 * Called with trace_event_read_lock() held.
458 */ 569 */
459struct trace_event *ftrace_find_event(int type) 570struct trace_event *ftrace_find_event(int type)
460{ 571{
@@ -464,7 +575,7 @@ struct trace_event *ftrace_find_event(int type)
464 575
465 key = type & (EVENT_HASHSIZE - 1); 576 key = type & (EVENT_HASHSIZE - 1);
466 577
467 hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { 578 hlist_for_each_entry(event, n, &event_hash[key], node) {
468 if (event->type == type) 579 if (event->type == type)
469 return event; 580 return event;
470 } 581 }
@@ -472,6 +583,46 @@ struct trace_event *ftrace_find_event(int type)
472 return NULL; 583 return NULL;
473} 584}
474 585
586static LIST_HEAD(ftrace_event_list);
587
588static int trace_search_list(struct list_head **list)
589{
590 struct trace_event *e;
591 int last = __TRACE_LAST_TYPE;
592
593 if (list_empty(&ftrace_event_list)) {
594 *list = &ftrace_event_list;
595 return last + 1;
596 }
597
598 /*
 599 * We have used up all of the possible events,
 600 * let's see if somebody freed one.
601 */
602 list_for_each_entry(e, &ftrace_event_list, list) {
603 if (e->type != last + 1)
604 break;
605 last++;
606 }
607
 608 /* Did we use up all 65 thousand events? */
609 if ((last + 1) > FTRACE_MAX_EVENT)
610 return 0;
611
612 *list = &e->list;
613 return last + 1;
614}
615
616void trace_event_read_lock(void)
617{
618 down_read(&trace_event_mutex);
619}
620
621void trace_event_read_unlock(void)
622{
623 up_read(&trace_event_mutex);
624}
625
475/** 626/**
476 * register_ftrace_event - register output for an event type 627 * register_ftrace_event - register output for an event type
477 * @event: the event type to register 628 * @event: the event type to register
@@ -492,22 +643,42 @@ int register_ftrace_event(struct trace_event *event)
492 unsigned key; 643 unsigned key;
493 int ret = 0; 644 int ret = 0;
494 645
495 mutex_lock(&trace_event_mutex); 646 down_write(&trace_event_mutex);
496 647
497 if (!event) { 648 if (WARN_ON(!event))
498 ret = next_event_type++;
499 goto out; 649 goto out;
500 }
501 650
502 if (!event->type) 651 INIT_LIST_HEAD(&event->list);
503 event->type = next_event_type++; 652
504 else if (event->type > __TRACE_LAST_TYPE) { 653 if (!event->type) {
654 struct list_head *list = NULL;
655
656 if (next_event_type > FTRACE_MAX_EVENT) {
657
658 event->type = trace_search_list(&list);
659 if (!event->type)
660 goto out;
661
662 } else {
663
664 event->type = next_event_type++;
665 list = &ftrace_event_list;
666 }
667
668 if (WARN_ON(ftrace_find_event(event->type)))
669 goto out;
670
671 list_add_tail(&event->list, list);
672
673 } else if (event->type > __TRACE_LAST_TYPE) {
505 printk(KERN_WARNING "Need to add type to trace.h\n"); 674 printk(KERN_WARNING "Need to add type to trace.h\n");
506 WARN_ON(1); 675 WARN_ON(1);
507 }
508
509 if (ftrace_find_event(event->type))
510 goto out; 676 goto out;
677 } else {
 678 /* Is this event already in use? */
679 if (ftrace_find_event(event->type))
680 goto out;
681 }
511 682
512 if (event->trace == NULL) 683 if (event->trace == NULL)
513 event->trace = trace_nop_print; 684 event->trace = trace_nop_print;
@@ -520,14 +691,25 @@ int register_ftrace_event(struct trace_event *event)
520 691
521 key = event->type & (EVENT_HASHSIZE - 1); 692 key = event->type & (EVENT_HASHSIZE - 1);
522 693
523 hlist_add_head_rcu(&event->node, &event_hash[key]); 694 hlist_add_head(&event->node, &event_hash[key]);
524 695
525 ret = event->type; 696 ret = event->type;
526 out: 697 out:
527 mutex_unlock(&trace_event_mutex); 698 up_write(&trace_event_mutex);
528 699
529 return ret; 700 return ret;
530} 701}
702EXPORT_SYMBOL_GPL(register_ftrace_event);
703
704/*
705 * Used by module code with the trace_event_mutex held for write.
706 */
707int __unregister_ftrace_event(struct trace_event *event)
708{
709 hlist_del(&event->node);
710 list_del(&event->list);
711 return 0;
712}
531 713
532/** 714/**
533 * unregister_ftrace_event - remove a no longer used event 715 * unregister_ftrace_event - remove a no longer used event
@@ -535,12 +717,13 @@ int register_ftrace_event(struct trace_event *event)
535 */ 717 */
536int unregister_ftrace_event(struct trace_event *event) 718int unregister_ftrace_event(struct trace_event *event)
537{ 719{
538 mutex_lock(&trace_event_mutex); 720 down_write(&trace_event_mutex);
539 hlist_del(&event->node); 721 __unregister_ftrace_event(event);
540 mutex_unlock(&trace_event_mutex); 722 up_write(&trace_event_mutex);
541 723
542 return 0; 724 return 0;
543} 725}
726EXPORT_SYMBOL_GPL(unregister_ftrace_event);
544 727
545/* 728/*
546 * Standard events 729 * Standard events
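register_ftrace_event() now assigns dynamic type numbers from ftrace_event_list, recycling freed ones once next_event_type would pass FTRACE_MAX_EVENT, and the hash is protected by the trace_event_mutex rw_semaphore rather than a mutex. A hedged sketch of a module using the newly exported pair (all my_* names are invented):

static enum print_line_t my_event_print(struct trace_iterator *iter,
					int flags)
{
	trace_seq_printf(&iter->seq, "my event\n");
	return TRACE_TYPE_HANDLED;
}

static struct trace_event my_event = {
	.type	= 0,			/* 0 = please assign a type */
	.trace	= my_event_print,	/* unset handlers default to trace_nop_print */
};

static int __init my_mod_init(void)
{
	/* returns the assigned type on success, 0 on failure */
	return register_ftrace_event(&my_event) ? 0 : -ENOSPC;
}

static void __exit my_mod_exit(void)
{
	unregister_ftrace_event(&my_event);
}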
@@ -833,14 +1016,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
833 1016
834 trace_assign_type(field, iter->ent); 1017 trace_assign_type(field, iter->ent);
835 1018
1019 if (!trace_seq_puts(s, "<stack trace>\n"))
1020 goto partial;
836 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1021 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
837 if (i) { 1022 if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
838 if (!trace_seq_puts(s, " <= ")) 1023 break;
839 goto partial; 1024 if (!trace_seq_puts(s, " => "))
1025 goto partial;
840 1026
841 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1027 if (!seq_print_ip_sym(s, field->caller[i], flags))
842 goto partial; 1028 goto partial;
843 }
844 if (!trace_seq_puts(s, "\n")) 1029 if (!trace_seq_puts(s, "\n"))
845 goto partial; 1030 goto partial;
846 } 1031 }
@@ -868,10 +1053,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
868 1053
869 trace_assign_type(field, iter->ent); 1054 trace_assign_type(field, iter->ent);
870 1055
871 if (!seq_print_userip_objs(field, s, flags)) 1056 if (!trace_seq_puts(s, "<user stack trace>\n"))
872 goto partial; 1057 goto partial;
873 1058
874 if (!trace_seq_putc(s, '\n')) 1059 if (!seq_print_userip_objs(field, s, flags))
875 goto partial; 1060 goto partial;
876 1061
877 return TRACE_TYPE_HANDLED; 1062 return TRACE_TYPE_HANDLED;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index e0bde39c2dd9..d38bec4a9c30 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -1,41 +1,17 @@
1#ifndef __TRACE_EVENTS_H 1#ifndef __TRACE_EVENTS_H
2#define __TRACE_EVENTS_H 2#define __TRACE_EVENTS_H
3 3
4#include <linux/trace_seq.h>
4#include "trace.h" 5#include "trace.h"
5 6
6typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
7 int flags);
8
9struct trace_event {
10 struct hlist_node node;
11 int type;
12 trace_print_func trace;
13 trace_print_func raw;
14 trace_print_func hex;
15 trace_print_func binary;
16};
17
18extern enum print_line_t 7extern enum print_line_t
19trace_print_bprintk_msg_only(struct trace_iterator *iter); 8trace_print_bprintk_msg_only(struct trace_iterator *iter);
20extern enum print_line_t 9extern enum print_line_t
21trace_print_printk_msg_only(struct trace_iterator *iter); 10trace_print_printk_msg_only(struct trace_iterator *iter);
22 11
23extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
24 __attribute__ ((format (printf, 2, 3)));
25extern int
26trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
27extern int 12extern int
28seq_print_ip_sym(struct trace_seq *s, unsigned long ip, 13seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
29 unsigned long sym_flags); 14 unsigned long sym_flags);
30extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
31 size_t cnt);
32extern int trace_seq_puts(struct trace_seq *s, const char *str);
33extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
34extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
35extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
36 size_t len);
37extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
38extern int trace_seq_path(struct trace_seq *s, struct path *path);
39extern int seq_print_userip_objs(const struct userstack_entry *entry, 15extern int seq_print_userip_objs(const struct userstack_entry *entry,
40 struct trace_seq *s, unsigned long sym_flags); 16 struct trace_seq *s, unsigned long sym_flags);
41extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, 17extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
@@ -44,13 +20,17 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
44extern int trace_print_context(struct trace_iterator *iter); 20extern int trace_print_context(struct trace_iterator *iter);
45extern int trace_print_lat_context(struct trace_iterator *iter); 21extern int trace_print_lat_context(struct trace_iterator *iter);
46 22
23extern void trace_event_read_lock(void);
24extern void trace_event_read_unlock(void);
47extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
48extern int register_ftrace_event(struct trace_event *event);
49extern int unregister_ftrace_event(struct trace_event *event);
50 26
51extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
52 int flags); 28 int flags);
53 29
30/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event);
32extern struct rw_semaphore trace_event_mutex;
33
54#define MAX_MEMHEX_BYTES 8 34#define MAX_MEMHEX_BYTES 8
55#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) 35#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
56 36
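With trace_event_mutex exposed as an rw_semaphore, output paths are expected to hold it for reading across ftrace_find_event(), matching the "Called with trace_event_read_lock() held" note added above. A minimal, hypothetical caller:

static enum print_line_t my_print_entry(struct trace_iterator *iter)
{
	enum print_line_t ret = TRACE_TYPE_UNHANDLED;
	struct trace_event *ev;

	trace_event_read_lock();
	ev = ftrace_find_event(iter->ent->type);
	if (ev)
		ret = ev->trace(iter, 0);	/* call while still locked */
	trace_event_read_unlock();

	return ret;
}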
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 118439709fb7..8a30d9874cd4 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type,
36 36
37static void probe_power_end(struct power_trace *it) 37static void probe_power_end(struct power_trace *it)
38{ 38{
39 struct ftrace_event_call *call = &event_power;
39 struct ring_buffer_event *event; 40 struct ring_buffer_event *event;
40 struct trace_power *entry; 41 struct trace_power *entry;
41 struct trace_array_cpu *data; 42 struct trace_array_cpu *data;
@@ -54,7 +55,8 @@ static void probe_power_end(struct power_trace *it)
54 goto out; 55 goto out;
55 entry = ring_buffer_event_data(event); 56 entry = ring_buffer_event_data(event);
56 entry->state_data = *it; 57 entry->state_data = *it;
57 trace_buffer_unlock_commit(tr, event, 0, 0); 58 if (!filter_check_discard(call, entry, tr->buffer, event))
59 trace_buffer_unlock_commit(tr, event, 0, 0);
58 out: 60 out:
59 preempt_enable(); 61 preempt_enable();
60} 62}
@@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it)
62static void probe_power_mark(struct power_trace *it, unsigned int type, 64static void probe_power_mark(struct power_trace *it, unsigned int type,
63 unsigned int level) 65 unsigned int level)
64{ 66{
67 struct ftrace_event_call *call = &event_power;
65 struct ring_buffer_event *event; 68 struct ring_buffer_event *event;
66 struct trace_power *entry; 69 struct trace_power *entry;
67 struct trace_array_cpu *data; 70 struct trace_array_cpu *data;
@@ -84,7 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
84 goto out; 87 goto out;
85 entry = ring_buffer_event_data(event); 88 entry = ring_buffer_event_data(event);
86 entry->state_data = *it; 89 entry->state_data = *it;
87 trace_buffer_unlock_commit(tr, event, 0, 0); 90 if (!filter_check_discard(call, entry, tr->buffer, event))
91 trace_buffer_unlock_commit(tr, event, 0, 0);
88 out: 92 out:
89 preempt_enable(); 93 preempt_enable();
90} 94}
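Both power probes now pass the filled entry through filter_check_discard() before committing, so events rejected by the per-event filter never reach the ring buffer. A condensed sketch of the sequence, assuming the trace_buffer_lock_reserve() call these probes already use; my_power_commit() is a made-up helper:

static void my_power_commit(struct trace_array *tr,
			    struct ftrace_event_call *call,
			    struct power_trace *sample)
{
	struct ring_buffer_event *event;
	struct trace_power *entry;

	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
					  sizeof(*entry), 0, 0);
	if (!event)
		return;
	entry = ring_buffer_event_data(event);
	entry->state_data = *sample;

	/* dropped silently when the event's filter says no */
	if (!filter_check_discard(call, entry, tr->buffer, event))
		trace_buffer_unlock_commit(tr, event, 0, 0);
}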
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index eb81556107fe..687699d365ae 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
155EXPORT_SYMBOL_GPL(__ftrace_vprintk); 155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156 156
157static void * 157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos) 158t_start(struct seq_file *m, loff_t *pos)
159{ 159{
160 const char **fmt = m->private; 160 const char **fmt = __start___trace_bprintk_fmt + *pos;
161 const char **next = fmt;
162
163 (*pos)++;
164 161
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) 162 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL; 163 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt; 164 return fmt;
172} 165}
173 166
174static void *t_start(struct seq_file *m, loff_t *pos) 167static void *t_next(struct seq_file *m, void * v, loff_t *pos)
175{ 168{
176 return t_next(m, NULL, pos); 169 (*pos)++;
170 return t_start(m, pos);
177} 171}
178 172
179static int t_show(struct seq_file *m, void *v) 173static int t_show(struct seq_file *m, void *v)
@@ -182,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)
182 const char *str = *fmt; 176 const char *str = *fmt;
183 int i; 177 int i;
184 178
185 seq_printf(m, "0x%lx : \"", (unsigned long)fmt); 179 seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
186 180
187 /* 181 /*
188 * Tabs and new lines need to be converted. 182 * Tabs and new lines need to be converted.
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
224static int 218static int
225ftrace_formats_open(struct inode *inode, struct file *file) 219ftrace_formats_open(struct inode *inode, struct file *file)
226{ 220{
227 int ret; 221 return seq_open(file, &show_format_seq_ops);
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236} 222}
237 223
238static const struct file_operations ftrace_formats_fops = { 224static const struct file_operations ftrace_formats_fops = {
@@ -245,17 +231,13 @@ static const struct file_operations ftrace_formats_fops = {
245static __init int init_trace_printk_function_export(void) 231static __init int init_trace_printk_function_export(void)
246{ 232{
247 struct dentry *d_tracer; 233 struct dentry *d_tracer;
248 struct dentry *entry;
249 234
250 d_tracer = tracing_init_dentry(); 235 d_tracer = tracing_init_dentry();
251 if (!d_tracer) 236 if (!d_tracer)
252 return 0; 237 return 0;
253 238
254 entry = debugfs_create_file("printk_formats", 0444, d_tracer, 239 trace_create_file("printk_formats", 0444, d_tracer,
255 NULL, &ftrace_formats_fops); 240 NULL, &ftrace_formats_fops);
256 if (!entry)
257 pr_warning("Could not create debugfs "
258 "'printk_formats' entry\n");
259 241
260 return 0; 242 return 0;
261} 243}
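The printk_formats rework makes t_start() compute its cursor directly from *pos against the linker-section bounds, so the iterator keeps no state in m->private and ftrace_formats_open() shrinks to a bare seq_open(). The same stateless shape for a hypothetical array-backed seq_file:

static const char *my_items[] = { "one", "two", "three" };

static void *my_start(struct seq_file *m, loff_t *pos)
{
	if (*pos >= ARRAY_SIZE(my_items))
		return NULL;
	return (void *)&my_items[*pos];
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return my_start(m, pos);	/* recompute from *pos, no saved state */
}

static void my_stop(struct seq_file *m, void *v)
{
}

static int my_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", *(const char **)v);
	return 0;
}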
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9117cea6f1ae..a98106dd979c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -10,7 +10,7 @@
10#include <linux/kallsyms.h> 10#include <linux/kallsyms.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/ftrace.h> 12#include <linux/ftrace.h>
13#include <trace/sched.h> 13#include <trace/events/sched.h>
14 14
15#include "trace.h" 15#include "trace.h"
16 16
@@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
29 int cpu; 29 int cpu;
30 int pc; 30 int pc;
31 31
32 if (!sched_ref || sched_stopped) 32 if (unlikely(!sched_ref))
33 return; 33 return;
34 34
35 tracing_record_cmdline(prev); 35 tracing_record_cmdline(prev);
36 tracing_record_cmdline(next); 36 tracing_record_cmdline(next);
37 37
38 if (!tracer_enabled) 38 if (!tracer_enabled || sched_stopped)
39 return; 39 return;
40 40
41 pc = preempt_count(); 41 pc = preempt_count();
@@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
56 unsigned long flags; 56 unsigned long flags;
57 int cpu, pc; 57 int cpu, pc;
58 58
59 if (!likely(tracer_enabled)) 59 if (unlikely(!sched_ref))
60 return; 60 return;
61 61
62 pc = preempt_count();
63 tracing_record_cmdline(current); 62 tracing_record_cmdline(current);
64 63
65 if (sched_stopped) 64 if (!tracer_enabled || sched_stopped)
66 return; 65 return;
67 66
67 pc = preempt_count();
68 local_irq_save(flags); 68 local_irq_save(flags);
69 cpu = raw_smp_processor_id(); 69 cpu = raw_smp_processor_id();
70 data = ctx_trace->data[cpu]; 70 data = ctx_trace->data[cpu];
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5bc00e8f153e..eacb27225173 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <trace/sched.h> 18#include <trace/events/sched.h>
19 19
20#include "trace.h" 20#include "trace.h"
21 21
@@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
138 138
139 pc = preempt_count(); 139 pc = preempt_count();
140 140
141 /* The task we are waiting for is waking up */
142 data = wakeup_trace->data[wakeup_cpu];
143
144 /* disable local data, not wakeup_cpu data */ 141 /* disable local data, not wakeup_cpu data */
145 cpu = raw_smp_processor_id(); 142 cpu = raw_smp_processor_id();
146 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 143 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
@@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
154 if (unlikely(!tracer_enabled || next != wakeup_task)) 151 if (unlikely(!tracer_enabled || next != wakeup_task))
155 goto out_unlock; 152 goto out_unlock;
156 153
154 /* The task we are waiting for is waking up */
155 data = wakeup_trace->data[wakeup_cpu];
156
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 159
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 08f4eb2763d1..00dd6485bdd7 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
16 case TRACE_BRANCH: 16 case TRACE_BRANCH:
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES:
19 return 1; 20 return 1;
20 } 21 }
21 return 0; 22 return 0;
@@ -188,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
188#else 189#else
189# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) 190# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
190#endif /* CONFIG_DYNAMIC_FTRACE */ 191#endif /* CONFIG_DYNAMIC_FTRACE */
192
191/* 193/*
192 * Simple verification test of ftrace function tracer. 194 * Simple verification test of ftrace function tracer.
193 * Enable ftrace, sleep 1/10 second, and then read the trace 195 * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -749,3 +751,59 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
749 return ret; 751 return ret;
750} 752}
751#endif /* CONFIG_BRANCH_TRACER */ 753#endif /* CONFIG_BRANCH_TRACER */
754
755#ifdef CONFIG_HW_BRANCH_TRACER
756int
757trace_selftest_startup_hw_branches(struct tracer *trace,
758 struct trace_array *tr)
759{
760 struct trace_iterator *iter;
761 struct tracer tracer;
762 unsigned long count;
763 int ret;
764
765 if (!trace->open) {
766 printk(KERN_CONT "missing open function...");
767 return -1;
768 }
769
770 ret = tracer_init(trace, tr);
771 if (ret) {
772 warn_failed_init_tracer(trace, ret);
773 return ret;
774 }
775
776 /*
777 * The hw-branch tracer needs to collect the trace from the various
778 * cpu trace buffers - before tracing is stopped.
779 */
780 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
781 if (!iter)
782 return -ENOMEM;
783
784 memcpy(&tracer, trace, sizeof(tracer));
785
786 iter->trace = &tracer;
787 iter->tr = tr;
788 iter->pos = -1;
789 mutex_init(&iter->mutex);
790
791 trace->open(iter);
792
793 mutex_destroy(&iter->mutex);
794 kfree(iter);
795
796 tracing_stop();
797
798 ret = trace_test_buffer(tr, &count);
799 trace->reset(tr);
800 tracing_start();
801
802 if (!ret && !count) {
803 printk(KERN_CONT "no entries found..");
804 ret = -1;
805 }
806
807 return ret;
808}
809#endif /* CONFIG_HW_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c750f65f9661..6a2a9d484cd6 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
265 seq_printf(m, " Depth Size Location" 265 seq_printf(m, " Depth Size Location"
266 " (%d entries)\n" 266 " (%d entries)\n"
267 " ----- ---- --------\n", 267 " ----- ---- --------\n",
268 max_stack_trace.nr_entries); 268 max_stack_trace.nr_entries - 1);
269 269
270 if (!stack_tracer_enabled && !max_stack_size) 270 if (!stack_tracer_enabled && !max_stack_size)
271 print_disabled(m); 271 print_disabled(m);
@@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = {
301 301
302static int stack_trace_open(struct inode *inode, struct file *file) 302static int stack_trace_open(struct inode *inode, struct file *file)
303{ 303{
304 int ret; 304 return seq_open(file, &stack_trace_seq_ops);
305
306 ret = seq_open(file, &stack_trace_seq_ops);
307
308 return ret;
309} 305}
310 306
311static const struct file_operations stack_trace_fops = { 307static const struct file_operations stack_trace_fops = {
312 .open = stack_trace_open, 308 .open = stack_trace_open,
313 .read = seq_read, 309 .read = seq_read,
314 .llseek = seq_lseek, 310 .llseek = seq_lseek,
311 .release = seq_release,
315}; 312};
316 313
317int 314int
@@ -326,10 +323,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
326 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 323 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
327 324
328 if (ret || !write || 325 if (ret || !write ||
329 (last_stack_tracer_enabled == stack_tracer_enabled)) 326 (last_stack_tracer_enabled == !!stack_tracer_enabled))
330 goto out; 327 goto out;
331 328
332 last_stack_tracer_enabled = stack_tracer_enabled; 329 last_stack_tracer_enabled = !!stack_tracer_enabled;
333 330
334 if (stack_tracer_enabled) 331 if (stack_tracer_enabled)
335 register_ftrace_function(&trace_ops); 332 register_ftrace_function(&trace_ops);
@@ -352,19 +349,14 @@ __setup("stacktrace", enable_stacktrace);
352static __init int stack_trace_init(void) 349static __init int stack_trace_init(void)
353{ 350{
354 struct dentry *d_tracer; 351 struct dentry *d_tracer;
355 struct dentry *entry;
356 352
357 d_tracer = tracing_init_dentry(); 353 d_tracer = tracing_init_dentry();
358 354
359 entry = debugfs_create_file("stack_max_size", 0644, d_tracer, 355 trace_create_file("stack_max_size", 0644, d_tracer,
360 &max_stack_size, &stack_max_size_fops); 356 &max_stack_size, &stack_max_size_fops);
361 if (!entry)
362 pr_warning("Could not create debugfs 'stack_max_size' entry\n");
363 357
364 entry = debugfs_create_file("stack_trace", 0444, d_tracer, 358 trace_create_file("stack_trace", 0444, d_tracer,
365 NULL, &stack_trace_fops); 359 NULL, &stack_trace_fops);
366 if (!entry)
367 pr_warning("Could not create debugfs 'stack_trace' entry\n");
368 360
369 if (stack_tracer_enabled) 361 if (stack_tracer_enabled)
370 register_ftrace_function(&trace_ops); 362 register_ftrace_function(&trace_ops);
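In the stack_trace_sysctl() hunk above, stack_tracer_enabled is folded through !! before it is compared and cached, so writing 2 or 10 to the sysctl behaves exactly like writing 1 and does not re-register the ftrace ops. The idiom in isolation (my_update() is invented):

/* treat any non-zero sysctl value as "on" */
static int my_update(int *last, int raw_value)
{
	int enabled = !!raw_value;	/* fold 2, 10, ... down to 1 */

	if (*last == enabled)
		return 0;		/* no state change */

	*last = enabled;
	return 1;			/* caller should (un)register */
}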
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index acdebd771a93..aea321c82fa0 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Infrastructure for statistic tracing (histogram output). 2 * Infrastructure for statistic tracing (histogram output).
3 * 3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> 4 * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
5 * 5 *
6 * Based on the code from trace_branch.c which is 6 * Based on the code from trace_branch.c which is
7 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 7 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
@@ -10,22 +10,27 @@
10 10
11 11
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/rbtree.h>
13#include <linux/debugfs.h> 14#include <linux/debugfs.h>
14#include "trace_stat.h" 15#include "trace_stat.h"
15#include "trace.h" 16#include "trace.h"
16 17
17 18
18/* List of stat entries from a tracer */ 19/*
19struct trace_stat_list { 20 * List of stat red-black nodes from a tracer
 20 struct list_head list; 21 * We use such a tree to quickly sort the stat
22 * entries from the tracer.
23 */
24struct stat_node {
25 struct rb_node node;
21 void *stat; 26 void *stat;
22}; 27};
23 28
24/* A stat session is the stats output in one file */ 29/* A stat session is the stats output in one file */
25struct tracer_stat_session { 30struct stat_session {
26 struct list_head session_list; 31 struct list_head session_list;
27 struct tracer_stat *ts; 32 struct tracer_stat *ts;
28 struct list_head stat_list; 33 struct rb_root stat_root;
29 struct mutex stat_mutex; 34 struct mutex stat_mutex;
30 struct dentry *file; 35 struct dentry *file;
31}; 36};
@@ -37,77 +42,136 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
37/* The root directory for all stat files */ 42/* The root directory for all stat files */
38static struct dentry *stat_dir; 43static struct dentry *stat_dir;
39 44
45/*
46 * Iterate through the rbtree using a post order traversal path
47 * to release the next node.
 48 * It won't necessarily release one at each iteration
49 * but it will at least advance closer to the next one
50 * to be released.
51 */
52static struct rb_node *release_next(struct rb_node *node)
53{
54 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node);
56
57 if (node->rb_left)
58 return node->rb_left;
59 else if (node->rb_right)
60 return node->rb_right;
61 else {
62 if (!parent)
63 ;
64 else if (parent->rb_left == node)
65 parent->rb_left = NULL;
66 else
67 parent->rb_right = NULL;
68
69 snode = container_of(node, struct stat_node, node);
70 kfree(snode);
71
72 return parent;
73 }
74}
40 75
41static void reset_stat_session(struct tracer_stat_session *session) 76static void __reset_stat_session(struct stat_session *session)
42{ 77{
43 struct trace_stat_list *node, *next; 78 struct rb_node *node = session->stat_root.rb_node;
44 79
45 list_for_each_entry_safe(node, next, &session->stat_list, list) 80 while (node)
46 kfree(node); 81 node = release_next(node);
47 82
48 INIT_LIST_HEAD(&session->stat_list); 83 session->stat_root = RB_ROOT;
49} 84}
50 85
51static void destroy_session(struct tracer_stat_session *session) 86static void reset_stat_session(struct stat_session *session)
87{
88 mutex_lock(&session->stat_mutex);
89 __reset_stat_session(session);
90 mutex_unlock(&session->stat_mutex);
91}
92
93static void destroy_session(struct stat_session *session)
52{ 94{
53 debugfs_remove(session->file); 95 debugfs_remove(session->file);
54 reset_stat_session(session); 96 __reset_stat_session(session);
55 mutex_destroy(&session->stat_mutex); 97 mutex_destroy(&session->stat_mutex);
56 kfree(session); 98 kfree(session);
57} 99}
58 100
101typedef int (*cmp_stat_t)(void *, void *);
102
103static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
104{
105 struct rb_node **new = &(root->rb_node), *parent = NULL;
106 struct stat_node *data;
107
108 data = kzalloc(sizeof(*data), GFP_KERNEL);
109 if (!data)
110 return -ENOMEM;
111 data->stat = stat;
112
113 /*
 114 * Figure out where to put the new node.
 115 * This is a descending sort.
116 */
117 while (*new) {
118 struct stat_node *this;
119 int result;
120
121 this = container_of(*new, struct stat_node, node);
122 result = cmp(data->stat, this->stat);
123
124 parent = *new;
125 if (result >= 0)
126 new = &((*new)->rb_left);
127 else
128 new = &((*new)->rb_right);
129 }
130
131 rb_link_node(&data->node, parent, new);
132 rb_insert_color(&data->node, root);
133 return 0;
134}
135
59/* 136/*
60 * For tracers that don't provide a stat_cmp callback. 137 * For tracers that don't provide a stat_cmp callback.
61 * This one will force an immediate insertion on tail of 138 * This one will force an insertion as right-most node
62 * the list. 139 * in the rbtree.
63 */ 140 */
64static int dummy_cmp(void *p1, void *p2) 141static int dummy_cmp(void *p1, void *p2)
65{ 142{
66 return 1; 143 return -1;
67} 144}
68 145
69/* 146/*
70 * Initialize the stat list at each trace_stat file opening. 147 * Initialize the stat rbtree at each trace_stat file opening.
71 * All of these copies and sorting are required on all opening 148 * All of these copies and sorting are required on all opening
72 * since the stats could have changed between two file sessions. 149 * since the stats could have changed between two file sessions.
73 */ 150 */
74static int stat_seq_init(struct tracer_stat_session *session) 151static int stat_seq_init(struct stat_session *session)
75{ 152{
76 struct trace_stat_list *iter_entry, *new_entry;
77 struct tracer_stat *ts = session->ts; 153 struct tracer_stat *ts = session->ts;
154 struct rb_root *root = &session->stat_root;
78 void *stat; 155 void *stat;
79 int ret = 0; 156 int ret = 0;
80 int i; 157 int i;
81 158
82 mutex_lock(&session->stat_mutex); 159 mutex_lock(&session->stat_mutex);
83 reset_stat_session(session); 160 __reset_stat_session(session);
84 161
85 if (!ts->stat_cmp) 162 if (!ts->stat_cmp)
86 ts->stat_cmp = dummy_cmp; 163 ts->stat_cmp = dummy_cmp;
87 164
88 stat = ts->stat_start(); 165 stat = ts->stat_start(ts);
89 if (!stat) 166 if (!stat)
90 goto exit; 167 goto exit;
91 168
92 /* 169 ret = insert_stat(root, stat, ts->stat_cmp);
93 * The first entry. Actually this is the second, but the first 170 if (ret)
94 * one (the stat_list head) is pointless.
95 */
96 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
97 if (!new_entry) {
98 ret = -ENOMEM;
99 goto exit; 171 goto exit;
100 }
101
102 INIT_LIST_HEAD(&new_entry->list);
103
104 list_add(&new_entry->list, &session->stat_list);
105
106 new_entry->stat = stat;
107 172
108 /* 173 /*
109 * Iterate over the tracer stat entries and store them in a sorted 174 * Iterate over the tracer stat entries and store them in an rbtree.
110 * list.
111 */ 175 */
112 for (i = 1; ; i++) { 176 for (i = 1; ; i++) {
113 stat = ts->stat_next(stat, i); 177 stat = ts->stat_next(stat, i);
@@ -116,37 +180,17 @@ static int stat_seq_init(struct tracer_stat_session *session)
116 if (!stat) 180 if (!stat)
117 break; 181 break;
118 182
119 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); 183 ret = insert_stat(root, stat, ts->stat_cmp);
120 if (!new_entry) { 184 if (ret)
121 ret = -ENOMEM; 185 goto exit_free_rbtree;
122 goto exit_free_list;
123 }
124
125 INIT_LIST_HEAD(&new_entry->list);
126 new_entry->stat = stat;
127
128 list_for_each_entry_reverse(iter_entry, &session->stat_list,
129 list) {
130
131 /* Insertion with a descendent sorting */
132 if (ts->stat_cmp(iter_entry->stat,
133 new_entry->stat) >= 0) {
134
135 list_add(&new_entry->list, &iter_entry->list);
136 break;
137 }
138 }
139
140 /* The current larger value */
141 if (list_empty(&new_entry->list))
142 list_add(&new_entry->list, &session->stat_list);
143 } 186 }
187
144exit: 188exit:
145 mutex_unlock(&session->stat_mutex); 189 mutex_unlock(&session->stat_mutex);
146 return ret; 190 return ret;
147 191
148exit_free_list: 192exit_free_rbtree:
149 reset_stat_session(session); 193 __reset_stat_session(session);
150 mutex_unlock(&session->stat_mutex); 194 mutex_unlock(&session->stat_mutex);
151 return ret; 195 return ret;
152} 196}
@@ -154,38 +198,47 @@ exit_free_list:
154 198
155static void *stat_seq_start(struct seq_file *s, loff_t *pos) 199static void *stat_seq_start(struct seq_file *s, loff_t *pos)
156{ 200{
157 struct tracer_stat_session *session = s->private; 201 struct stat_session *session = s->private;
202 struct rb_node *node;
203 int i;
158 204
159 /* Prevent from tracer switch or stat_list modification */ 205 /* Prevent from tracer switch or rbtree modification */
160 mutex_lock(&session->stat_mutex); 206 mutex_lock(&session->stat_mutex);
161 207
162 /* If we are in the beginning of the file, print the headers */ 208 /* If we are in the beginning of the file, print the headers */
163 if (!*pos && session->ts->stat_headers) 209 if (!*pos && session->ts->stat_headers)
164 return SEQ_START_TOKEN; 210 return SEQ_START_TOKEN;
165 211
166 return seq_list_start(&session->stat_list, *pos); 212 node = rb_first(&session->stat_root);
213 for (i = 0; node && i < *pos; i++)
214 node = rb_next(node);
215
216 return node;
167} 217}
168 218
169static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) 219static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
170{ 220{
171 struct tracer_stat_session *session = s->private; 221 struct stat_session *session = s->private;
222 struct rb_node *node = p;
223
224 (*pos)++;
172 225
173 if (p == SEQ_START_TOKEN) 226 if (p == SEQ_START_TOKEN)
174 return seq_list_start(&session->stat_list, *pos); 227 return rb_first(&session->stat_root);
175 228
176 return seq_list_next(p, &session->stat_list, pos); 229 return rb_next(node);
177} 230}
178 231
179static void stat_seq_stop(struct seq_file *s, void *p) 232static void stat_seq_stop(struct seq_file *s, void *p)
180{ 233{
181 struct tracer_stat_session *session = s->private; 234 struct stat_session *session = s->private;
182 mutex_unlock(&session->stat_mutex); 235 mutex_unlock(&session->stat_mutex);
183} 236}
184 237
185static int stat_seq_show(struct seq_file *s, void *v) 238static int stat_seq_show(struct seq_file *s, void *v)
186{ 239{
187 struct tracer_stat_session *session = s->private; 240 struct stat_session *session = s->private;
188 struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); 241 struct stat_node *l = container_of(v, struct stat_node, node);
189 242
190 if (v == SEQ_START_TOKEN) 243 if (v == SEQ_START_TOKEN)
191 return session->ts->stat_headers(s); 244 return session->ts->stat_headers(s);
@@ -204,31 +257,34 @@ static const struct seq_operations trace_stat_seq_ops = {
204static int tracing_stat_open(struct inode *inode, struct file *file) 257static int tracing_stat_open(struct inode *inode, struct file *file)
205{ 258{
206 int ret; 259 int ret;
260 struct seq_file *m;
261 struct stat_session *session = inode->i_private;
207 262
208 struct tracer_stat_session *session = inode->i_private; 263 ret = stat_seq_init(session);
264 if (ret)
265 return ret;
209 266
210 ret = seq_open(file, &trace_stat_seq_ops); 267 ret = seq_open(file, &trace_stat_seq_ops);
211 if (!ret) { 268 if (ret) {
212 struct seq_file *m = file->private_data; 269 reset_stat_session(session);
213 m->private = session; 270 return ret;
214 ret = stat_seq_init(session);
215 } 271 }
216 272
273 m = file->private_data;
274 m->private = session;
217 return ret; 275 return ret;
218} 276}
219 277
220/* 278/*
221 * Avoid consuming memory with our now useless list. 279 * Avoid consuming memory with our now useless rbtree.
222 */ 280 */
223static int tracing_stat_release(struct inode *i, struct file *f) 281static int tracing_stat_release(struct inode *i, struct file *f)
224{ 282{
225 struct tracer_stat_session *session = i->i_private; 283 struct stat_session *session = i->i_private;
226 284
227 mutex_lock(&session->stat_mutex);
228 reset_stat_session(session); 285 reset_stat_session(session);
229 mutex_unlock(&session->stat_mutex);
230 286
231 return 0; 287 return seq_release(i, f);
232} 288}
233 289
234static const struct file_operations tracing_stat_fops = { 290static const struct file_operations tracing_stat_fops = {
@@ -251,7 +307,7 @@ static int tracing_stat_init(void)
251 return 0; 307 return 0;
252} 308}
253 309
254static int init_stat_file(struct tracer_stat_session *session) 310static int init_stat_file(struct stat_session *session)
255{ 311{
256 if (!stat_dir && tracing_stat_init()) 312 if (!stat_dir && tracing_stat_init())
257 return -ENODEV; 313 return -ENODEV;
@@ -266,7 +322,7 @@ static int init_stat_file(struct tracer_stat_session *session)
266 322
267int register_stat_tracer(struct tracer_stat *trace) 323int register_stat_tracer(struct tracer_stat *trace)
268{ 324{
269 struct tracer_stat_session *session, *node, *tmp; 325 struct stat_session *session, *node;
270 int ret; 326 int ret;
271 327
272 if (!trace) 328 if (!trace)
@@ -277,7 +333,7 @@ int register_stat_tracer(struct tracer_stat *trace)
277 333
278 /* Already registered? */ 334 /* Already registered? */
279 mutex_lock(&all_stat_sessions_mutex); 335 mutex_lock(&all_stat_sessions_mutex);
280 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { 336 list_for_each_entry(node, &all_stat_sessions, session_list) {
281 if (node->ts == trace) { 337 if (node->ts == trace) {
282 mutex_unlock(&all_stat_sessions_mutex); 338 mutex_unlock(&all_stat_sessions_mutex);
283 return -EINVAL; 339 return -EINVAL;
@@ -286,15 +342,13 @@ int register_stat_tracer(struct tracer_stat *trace)
286 mutex_unlock(&all_stat_sessions_mutex); 342 mutex_unlock(&all_stat_sessions_mutex);
287 343
288 /* Init the session */ 344 /* Init the session */
289 session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); 345 session = kzalloc(sizeof(*session), GFP_KERNEL);
290 if (!session) 346 if (!session)
291 return -ENOMEM; 347 return -ENOMEM;
292 348
293 session->ts = trace; 349 session->ts = trace;
294 INIT_LIST_HEAD(&session->session_list); 350 INIT_LIST_HEAD(&session->session_list);
295 INIT_LIST_HEAD(&session->stat_list);
296 mutex_init(&session->stat_mutex); 351 mutex_init(&session->stat_mutex);
297 session->file = NULL;
298 352
299 ret = init_stat_file(session); 353 ret = init_stat_file(session);
300 if (ret) { 354 if (ret) {
@@ -312,7 +366,7 @@ int register_stat_tracer(struct tracer_stat *trace)
312 366
313void unregister_stat_tracer(struct tracer_stat *trace) 367void unregister_stat_tracer(struct tracer_stat *trace)
314{ 368{
315 struct tracer_stat_session *node, *tmp; 369 struct stat_session *node, *tmp;
316 370
317 mutex_lock(&all_stat_sessions_mutex); 371 mutex_lock(&all_stat_sessions_mutex);
318 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { 372 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
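With the stat entries held in an rbtree, insert_stat() places each node using the tracer's stat_cmp() (a result >= 0 descends left, so larger entries come first), and dummy_cmp()'s constant -1 pushes uncompared entries to the right-most slot, preserving insertion order. A hedged sketch of a comparison callback and of walking the finished tree (struct my_stat is invented):

struct my_stat {
	unsigned long hits;
};

/* sort descending by hit count */
static int my_stat_cmp(void *p1, void *p2)
{
	struct my_stat *a = p1, *b = p2;

	if (a->hits > b->hits)
		return 1;
	if (a->hits < b->hits)
		return -1;
	return 0;
}

/* in-order walk: largest first, thanks to the descending insertion */
static void my_dump(struct rb_root *root)
{
	struct rb_node *node;

	for (node = rb_first(root); node; node = rb_next(node)) {
		struct stat_node *sn = rb_entry(node, struct stat_node, node);
		struct my_stat *st = sn->stat;

		pr_info("%lu hits\n", st->hits);
	}
}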
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 202274cf7f3d..f3546a2cd826 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -12,7 +12,7 @@ struct tracer_stat {
12 /* The name of your stat file */ 12 /* The name of your stat file */
13 const char *name; 13 const char *name;
14 /* Iteration over statistic entries */ 14 /* Iteration over statistic entries */
15 void *(*stat_start)(void); 15 void *(*stat_start)(struct tracer_stat *trace);
16 void *(*stat_next)(void *prev, int idx); 16 void *(*stat_next)(void *prev, int idx);
17 /* Compare two entries for stats sorting */ 17 /* Compare two entries for stats sorting */
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
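Since stat_start() now receives the tracer_stat itself, one set of callbacks can back several registered stat files. A minimal, hypothetical tracer_stat built on the callbacks declared here (stat_show and register_stat_tracer() come from the same trace_stat API):

static unsigned long my_counts[3];

static void *my_stat_start(struct tracer_stat *trace)
{
	return &my_counts[0];
}

static void *my_stat_next(void *prev, int idx)
{
	if (idx >= (int)ARRAY_SIZE(my_counts))
		return NULL;
	return &my_counts[idx];
}

static int my_stat_show(struct seq_file *s, void *p)
{
	seq_printf(s, "%lu\n", *(unsigned long *)p);
	return 0;
}

static struct tracer_stat my_stats __read_mostly = {
	.name		= "my_stats",
	.stat_start	= my_stat_start,
	.stat_next	= my_stat_next,
	.stat_show	= my_stat_show,
	/* no .stat_cmp: dummy_cmp() keeps insertion order */
};

/* register_stat_tracer(&my_stats) creates trace_stat/my_stats in debugfs */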
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 91fd19c2149f..f6693969287d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unused)
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 205
206 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 206 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
207 HRTIMER_MODE_REL_PINNED);
207} 208}
208 209
209static void start_stack_timers(void) 210static void start_stack_timers(void)
@@ -321,11 +322,7 @@ static const struct file_operations sysprof_sample_fops = {
321 322
322void init_tracer_sysprof_debugfs(struct dentry *d_tracer) 323void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
323{ 324{
324 struct dentry *entry;
325 325
326 entry = debugfs_create_file("sysprof_sample_period", 0644, 326 trace_create_file("sysprof_sample_period", 0644,
327 d_tracer, NULL, &sysprof_sample_fops); 327 d_tracer, NULL, &sysprof_sample_fops);
328 if (entry)
329 return;
330 pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
331} 328}
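start_stack_timer() now arms each per-cpu hrtimer with HRTIMER_MODE_REL_PINNED so the sampling timer is not migrated off the CPU it is meant to profile. The pattern in isolation, with invented names and a fixed 1 ms period:

static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
{
	/* per-cpu sampling work would go here */
	hrtimer_forward_now(timer, ns_to_ktime(1000000));
	return HRTIMER_RESTART;
}

static void my_start_timer(struct hrtimer *timer)
{
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	timer->function = my_timer_fn;

	/* _PINNED keeps the timer on the cpu that started it */
	hrtimer_start(timer, ns_to_ktime(1000000),
		      HRTIMER_MODE_REL_PINNED);
}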
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 797201e4a137..97fcea4acce1 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8 8
9#include <trace/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include "trace_stat.h" 12#include "trace_stat.h"
@@ -16,8 +16,6 @@
16/* A cpu workqueue thread */ 16/* A cpu workqueue thread */
17struct cpu_workqueue_stats { 17struct cpu_workqueue_stats {
18 struct list_head list; 18 struct list_head list;
19/* Useful to know if we print the cpu headers */
20 bool first_entry;
21 int cpu; 19 int cpu;
22 pid_t pid; 20 pid_t pid;
23/* Can be inserted from interrupt or user context, need to be atomic */ 21/* Can be inserted from interrupt or user context, need to be atomic */
@@ -47,12 +45,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
47 struct work_struct *work) 45 struct work_struct *work)
48{ 46{
49 int cpu = cpumask_first(&wq_thread->cpus_allowed); 47 int cpu = cpumask_first(&wq_thread->cpus_allowed);
50 struct cpu_workqueue_stats *node, *next; 48 struct cpu_workqueue_stats *node;
51 unsigned long flags; 49 unsigned long flags;
52 50
53 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 51 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
54 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, 52 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
55 list) {
56 if (node->pid == wq_thread->pid) { 53 if (node->pid == wq_thread->pid) {
57 atomic_inc(&node->inserted); 54 atomic_inc(&node->inserted);
58 goto found; 55 goto found;
@@ -69,12 +66,11 @@ probe_workqueue_execution(struct task_struct *wq_thread,
69 struct work_struct *work) 66 struct work_struct *work)
70{ 67{
71 int cpu = cpumask_first(&wq_thread->cpus_allowed); 68 int cpu = cpumask_first(&wq_thread->cpus_allowed);
72 struct cpu_workqueue_stats *node, *next; 69 struct cpu_workqueue_stats *node;
73 unsigned long flags; 70 unsigned long flags;
74 71
75 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 72 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
76 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, 73 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
77 list) {
78 if (node->pid == wq_thread->pid) { 74 if (node->pid == wq_thread->pid) {
79 node->executed++; 75 node->executed++;
80 goto found; 76 goto found;
@@ -105,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
105 cws->pid = wq_thread->pid; 101 cws->pid = wq_thread->pid;
106 102
107 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 103 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
108 if (list_empty(&workqueue_cpu_stat(cpu)->list))
109 cws->first_entry = true;
110 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); 104 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
111 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 105 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
112} 106}
@@ -152,7 +146,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
152 return ret; 146 return ret;
153} 147}
154 148
155static void *workqueue_stat_start(void) 149static void *workqueue_stat_start(struct tracer_stat *trace)
156{ 150{
157 int cpu; 151 int cpu;
158 void *ret = NULL; 152 void *ret = NULL;
@@ -191,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx)
191static int workqueue_stat_show(struct seq_file *s, void *p) 185static int workqueue_stat_show(struct seq_file *s, void *p)
192{ 186{
193 struct cpu_workqueue_stats *cws = p; 187 struct cpu_workqueue_stats *cws = p;
194 unsigned long flags;
195 int cpu = cws->cpu;
196 struct pid *pid; 188 struct pid *pid;
197 struct task_struct *tsk; 189 struct task_struct *tsk;
198 190
199 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
200 if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
201 seq_printf(s, "\n");
202 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
203
204 pid = find_get_pid(cws->pid); 191 pid = find_get_pid(cws->pid);
205 if (pid) { 192 if (pid) {
206 tsk = get_pid_task(pid, PIDTYPE_PID); 193 tsk = get_pid_task(pid, PIDTYPE_PID);
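
Besides following the tracepoint header move (trace/events/workqueue.h) and the new stat_start() signature, trace_workqueue.c drops the first_entry bookkeeping and switches both probes from list_for_each_entry_safe() to list_for_each_entry(): the _safe variant is only needed when the loop may delete the node it is standing on, which these read-only lookups never do. A generic illustration of the distinction, using example types (assumes <linux/list.h> and <linux/slab.h>):

struct example_item {
        struct list_head list;
        int key;
};

/* read-only search: the plain iterator is enough */
static struct example_item *example_lookup(struct list_head *head, int key)
{
        struct example_item *it;

        list_for_each_entry(it, head, list)
                if (it->key == key)
                        return it;
        return NULL;
}

/* deletes nodes while walking: must use the _safe variant */
static void example_remove_all(struct list_head *head, int key)
{
        struct example_item *it, *next;

        list_for_each_entry_safe(it, next, head, list)
                if (it->key == key) {
                        list_del(&it->list);
                        kfree(it);
                }
}
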
diff --git a/kernel/user.c b/kernel/user.c
index 850e0ba41c1e..2c000e7132ac 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -75,21 +75,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 75 put_user_ns(up->user_ns);
76} 76}
77 77
78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
79{
80 struct user_struct *user;
81 struct hlist_node *h;
82
83 hlist_for_each_entry(user, h, hashent, uidhash_node) {
84 if (user->uid == uid) {
85 atomic_inc(&user->__count);
86 return user;
87 }
88 }
89
90 return NULL;
91}
92
93#ifdef CONFIG_USER_SCHED 78#ifdef CONFIG_USER_SCHED
94 79
95static void sched_destroy_user(struct user_struct *up) 80static void sched_destroy_user(struct user_struct *up)
@@ -119,6 +104,23 @@ static int sched_create_user(struct user_struct *up) { return 0; }
119 104
120#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) 105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
121 106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
122static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ 124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
123static DEFINE_MUTEX(uids_mutex); 125static DEFINE_MUTEX(uids_mutex);
124 126
@@ -283,12 +285,12 @@ int __init uids_sysfs_init(void)
283 return uids_user_create(&root_user); 285 return uids_user_create(&root_user);
284} 286}
285 287
286/* work function to remove sysfs directory for a user and free up 288/* delayed work function to remove sysfs directory for a user and free up
287 * corresponding structures. 289 * corresponding structures.
288 */ 290 */
289static void cleanup_user_struct(struct work_struct *w) 291static void cleanup_user_struct(struct work_struct *w)
290{ 292{
291 struct user_struct *up = container_of(w, struct user_struct, work); 293 struct user_struct *up = container_of(w, struct user_struct, work.work);
292 unsigned long flags; 294 unsigned long flags;
293 int remove_user = 0; 295 int remove_user = 0;
294 296
@@ -297,15 +299,12 @@ static void cleanup_user_struct(struct work_struct *w)
297 */ 299 */
298 uids_mutex_lock(); 300 uids_mutex_lock();
299 301
300 local_irq_save(flags); 302 spin_lock_irqsave(&uidhash_lock, flags);
301 303 if (atomic_read(&up->__count) == 0) {
302 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
303 uid_hash_remove(up); 304 uid_hash_remove(up);
304 remove_user = 1; 305 remove_user = 1;
305 spin_unlock_irqrestore(&uidhash_lock, flags);
306 } else {
307 local_irq_restore(flags);
308 } 306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
309 308
310 if (!remove_user) 309 if (!remove_user)
311 goto done; 310 goto done;
@@ -331,16 +330,28 @@ done:
331 */ 330 */
332static void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
333{ 332{
334 /* restore back the count */
335 atomic_inc(&up->__count);
336 spin_unlock_irqrestore(&uidhash_lock, flags); 333 spin_unlock_irqrestore(&uidhash_lock, flags);
337 334 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
338 INIT_WORK(&up->work, cleanup_user_struct); 335 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
339 schedule_work(&up->work);
340} 336}
341 337
342#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ 338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
343 339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{
342 struct user_struct *user;
343 struct hlist_node *h;
344
345 hlist_for_each_entry(user, h, hashent, uidhash_node) {
346 if (user->uid == uid) {
347 atomic_inc(&user->__count);
348 return user;
349 }
350 }
351
352 return NULL;
353}
354
344int uids_sysfs_init(void) { return 0; } 355int uids_sysfs_init(void) { return 0; }
345static inline int uids_user_create(struct user_struct *up) { return 0; } 356static inline int uids_user_create(struct user_struct *up) { return 0; }
346static inline void uids_mutex_lock(void) { } 357static inline void uids_mutex_lock(void) { }
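
The user.c changes turn user_struct teardown into delayed work: free_user() no longer re-takes the reference it just dropped, cleanup_user_struct() re-checks the count under uidhash_lock before removing the hash entry, and the sysfs-aware uid_hash_find() can resurrect an object whose cleanup is still pending by cancelling the delayed work when the count goes 0 -> 1. A stripped-down sketch of that deferred-free pattern, with illustrative types (the real code additionally serializes against uidhash_lock and uids_mutex, which this sketch elides; assumes <linux/workqueue.h>, <linux/slab.h>):

struct example_cached {
        atomic_t                refcnt;
        struct delayed_work     cleanup;
        /* ... payload ... */
};

static void example_cleanup(struct work_struct *w)
{
        struct example_cached *obj =
                container_of(w, struct example_cached, cleanup.work);

        /* nobody resurrected it while the work was queued */
        if (atomic_read(&obj->refcnt) == 0)
                kfree(obj);
}

static void example_put(struct example_cached *obj)
{
        if (atomic_dec_and_test(&obj->refcnt)) {
                INIT_DELAYED_WORK(&obj->cleanup, example_cleanup);
                schedule_delayed_work(&obj->cleanup, msecs_to_jiffies(1000));
        }
}

static struct example_cached *example_get(struct example_cached *obj)
{
        /* 0 -> 1 means a cleanup is pending: call it off */
        if (atomic_inc_return(&obj->refcnt) == 1)
                cancel_delayed_work(&obj->cleanup);
        return obj;
}
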
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 815237a55af8..8a82b4b8ea52 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,16 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17 17
18static struct uts_namespace *create_uts_ns(void)
19{
20 struct uts_namespace *uts_ns;
21
22 uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
23 if (uts_ns)
24 kref_init(&uts_ns->kref);
25 return uts_ns;
26}
27
18/* 28/*
19 * Clone a new ns copying an original utsname, setting refcount to 1 29 * Clone a new ns copying an original utsname, setting refcount to 1
20 * @old_ns: namespace to clone 30 * @old_ns: namespace to clone
@@ -24,14 +34,13 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
24{ 34{
25 struct uts_namespace *ns; 35 struct uts_namespace *ns;
26 36
27 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); 37 ns = create_uts_ns();
28 if (!ns) 38 if (!ns)
29 return ERR_PTR(-ENOMEM); 39 return ERR_PTR(-ENOMEM);
30 40
31 down_read(&uts_sem); 41 down_read(&uts_sem);
32 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
33 up_read(&uts_sem); 43 up_read(&uts_sem);
34 kref_init(&ns->kref);
35 return ns; 44 return ns;
36} 45}
37 46
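
Factoring allocation and kref_init() into create_uts_ns() means every allocation path starts with a properly initialised refcount, instead of each caller having to remember kref_init() after kmalloc(). For completeness, the matching put side of such a kref lifetime looks roughly like this (example_release / example_put_uts_ns are illustrative names, not the helpers utsname.c actually uses; assumes <linux/kref.h> and <linux/slab.h>):

static void example_release(struct kref *kref)
{
        struct uts_namespace *ns =
                container_of(kref, struct uts_namespace, kref);

        kfree(ns);
}

static void example_put_uts_ns(struct uts_namespace *ns)
{
        kref_put(&ns->kref, example_release);
}
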
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c8..c4bd3d825f35 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void init_waitqueue_head(wait_queue_head_t *q) 13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key);
16 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
17} 18}
18 19
19EXPORT_SYMBOL(init_waitqueue_head); 20EXPORT_SYMBOL(__init_waitqueue_head);
20 21
21void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 22void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
22{ 23{
@@ -154,7 +155,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
154 if (!list_empty(&wait->task_list)) 155 if (!list_empty(&wait->task_list))
155 list_del_init(&wait->task_list); 156 list_del_init(&wait->task_list);
156 else if (waitqueue_active(q)) 157 else if (waitqueue_active(q))
157 __wake_up_common(q, mode, 1, 0, key); 158 __wake_up_locked_key(q, mode, key);
158 spin_unlock_irqrestore(&q->lock, flags); 159 spin_unlock_irqrestore(&q->lock, flags);
159} 160}
160EXPORT_SYMBOL(abort_exclusive_wait); 161EXPORT_SYMBOL(abort_exclusive_wait);
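
kernel/wait.c now exports __init_waitqueue_head(), which takes a lock_class_key and assigns it to the waitqueue spinlock, so every init site gets its own lockdep class and unrelated waitqueues no longer share (and cross-report) lock dependencies; abort_exclusive_wait() likewise switches to the key-aware __wake_up_locked_key(). The caller-facing macro lives in <linux/wait.h>, outside this hunk, but presumably supplies a static per-call-site key along these lines:

#define init_waitqueue_head(q)                          \
        do {                                            \
                static struct lock_class_key __key;     \
                                                        \
                __init_waitqueue_head((q), &__key);     \
        } while (0)
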
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f71fb2a08950..0668795d8818 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,7 +33,8 @@
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 35#include <linux/lockdep.h>
36#include <trace/workqueue.h> 36#define CREATE_TRACE_POINTS
37#include <trace/events/workqueue.h>
37 38
38/* 39/*
39 * The per-CPU workqueue (if single thread, we always use the first 40 * The per-CPU workqueue (if single thread, we always use the first
@@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
124 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 125 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
125} 126}
126 127
127DEFINE_TRACE(workqueue_insertion);
128
129static void insert_work(struct cpu_workqueue_struct *cwq, 128static void insert_work(struct cpu_workqueue_struct *cwq,
130 struct work_struct *work, struct list_head *head) 129 struct work_struct *work, struct list_head *head)
131{ 130{
@@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
262} 261}
263EXPORT_SYMBOL_GPL(queue_delayed_work_on); 262EXPORT_SYMBOL_GPL(queue_delayed_work_on);
264 263
265DEFINE_TRACE(workqueue_execution);
266
267static void run_workqueue(struct cpu_workqueue_struct *cwq) 264static void run_workqueue(struct cpu_workqueue_struct *cwq)
268{ 265{
269 spin_lock_irq(&cwq->lock); 266 spin_lock_irq(&cwq->lock);
@@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
753 return cwq; 750 return cwq;
754} 751}
755 752
756DEFINE_TRACE(workqueue_creation);
757
758static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 753static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
759{ 754{
760 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 755 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
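
With trace/events/workqueue.h included under CREATE_TRACE_POINTS at the top of the file, the generated header now emits the tracepoint definitions itself, so the hand-written DEFINE_TRACE(workqueue_insertion/execution/creation) lines removed here (and workqueue_destruction in the hunk below) become redundant. The events-header convention looks roughly like the following hypothetical include/trace/events/foo.h; the event is illustrative, not the real workqueue one:

#undef TRACE_SYSTEM
#define TRACE_SYSTEM foo

#if !defined(_TRACE_FOO_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FOO_H

#include <linux/tracepoint.h>

TRACE_EVENT(foo_event,
        TP_PROTO(int value),
        TP_ARGS(value),
        TP_STRUCT__entry(
                __field(int, value)
        ),
        TP_fast_assign(
                __entry->value = value;
        ),
        TP_printk("value=%d", __entry->value)
);

#endif /* _TRACE_FOO_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

Exactly one compilation unit then defines CREATE_TRACE_POINTS before including the header — the role the first workqueue.c hunk above takes on — while every other user simply includes the header to get the trace_*() calls.
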
@@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
860} 855}
861EXPORT_SYMBOL_GPL(__create_workqueue_key); 856EXPORT_SYMBOL_GPL(__create_workqueue_key);
862 857
863DEFINE_TRACE(workqueue_destruction);
864
865static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) 858static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
866{ 859{
867 /* 860 /*