path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt | 3
-rw-r--r--  kernel/Makefile | 8
-rw-r--r--  kernel/async.c | 12
-rw-r--r--  kernel/audit.c | 31
-rw-r--r--  kernel/audit_tree.c | 8
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/cgroup.c | 241
-rw-r--r--  kernel/cgroup_freezer.c | 11
-rw-r--r--  kernel/compat.c | 7
-rw-r--r--  kernel/configs.c | 4
-rw-r--r--  kernel/cpu.c | 94
-rw-r--r--  kernel/cpuset.c | 10
-rw-r--r--  kernel/cred.c | 6
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/debug/gdbstub.c | 22
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 5
-rw-r--r--  kernel/debug/kdb/kdb_cmds | 4
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 21
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 36
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 3
-rw-r--r--  kernel/delayacct.c | 2
-rw-r--r--  kernel/events/Makefile | 2
-rw-r--r--  kernel/events/core.c | 1001
-rw-r--r--  kernel/events/hw_breakpoint.c | 10
-rw-r--r--  kernel/events/internal.h | 96
-rw-r--r--  kernel/events/ring_buffer.c | 380
-rw-r--r--  kernel/exit.c | 103
-rw-r--r--  kernel/fork.c | 141
-rw-r--r--  kernel/futex.c | 74
-rw-r--r--  kernel/gcov/Kconfig | 10
-rw-r--r--  kernel/gcov/gcc_3_4.c | 88
-rw-r--r--  kernel/gcov/gcov.h | 42
-rw-r--r--  kernel/hrtimer.c | 6
-rw-r--r--  kernel/hung_task.c | 14
-rw-r--r--  kernel/irq/Kconfig | 4
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/chip.c | 2
-rw-r--r--  kernel/irq/devres.c | 2
-rw-r--r--  kernel/irq/generic-chip.c | 4
-rw-r--r--  kernel/irq/irqdesc.c | 37
-rw-r--r--  kernel/irq/irqdomain.c | 184
-rw-r--r--  kernel/irq/manage.c | 22
-rw-r--r--  kernel/irq/pm.c | 55
-rw-r--r--  kernel/irq/resend.c | 19
-rw-r--r--  kernel/irq/spurious.c | 6
-rw-r--r--  kernel/jump_label.c | 3
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 6
-rw-r--r--  kernel/kprobes.c | 33
-rw-r--r--  kernel/lockdep.c | 74
-rw-r--r--  kernel/module.c | 82
-rw-r--r--  kernel/notifier.c | 31
-rw-r--r--  kernel/nsproxy.c | 4
-rw-r--r--  kernel/panic.c | 15
-rw-r--r--  kernel/params.c | 18
-rw-r--r--  kernel/pid.c | 1
-rw-r--r--  kernel/pm_qos_params.c | 79
-rw-r--r--  kernel/posix-cpu-timers.c | 12
-rw-r--r--  kernel/power/Kconfig | 90
-rw-r--r--  kernel/power/Makefile | 6
-rw-r--r--  kernel/power/consoleearlysuspend.c | 78
-rw-r--r--  kernel/power/earlysuspend.c | 187
-rw-r--r--  kernel/power/fbearlysuspend.c | 153
-rw-r--r--  kernel/power/main.c | 25
-rw-r--r--  kernel/power/power.h | 24
-rw-r--r--  kernel/power/process.c | 27
-rw-r--r--  kernel/power/suspend.c | 25
-rw-r--r--  kernel/power/suspend_time.c | 111
-rw-r--r--  kernel/power/userwakelock.c | 219
-rw-r--r--  kernel/power/wakelock.c | 634
-rw-r--r--  kernel/printk.c | 88
-rw-r--r--  kernel/ptrace.c | 207
-rw-r--r--  kernel/rcupdate.c | 2
-rw-r--r--  kernel/rcutorture.c | 4
-rw-r--r--  kernel/rcutree_trace.c | 2
-rw-r--r--  kernel/resource.c | 28
-rw-r--r--  kernel/rtmutex.c | 2
-rw-r--r--  kernel/rwsem.c | 18
-rw-r--r--  kernel/sched.c | 381
-rw-r--r--  kernel/sched_autogroup.h | 1
-rw-r--r--  kernel/sched_fair.c | 72
-rw-r--r--  kernel/sched_features.h | 4
-rw-r--r--  kernel/sched_rt.c | 30
-rw-r--r--  kernel/signal.c | 442
-rw-r--r--  kernel/stacktrace.c | 12
-rw-r--r--  kernel/stop_machine.c | 80
-rw-r--r--  kernel/sys.c | 85
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 19
-rw-r--r--  kernel/sysctl_binary.c | 4
-rw-r--r--  kernel/sysctl_check.c | 2
-rw-r--r--  kernel/taskstats.c | 21
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 20
-rw-r--r--  kernel/time/clocksource.c | 62
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 39
-rw-r--r--  kernel/trace/Kconfig | 35
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 21
-rw-r--r--  kernel/trace/ftrace.c | 179
-rw-r--r--  kernel/trace/ring_buffer.c | 66
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 2
-rw-r--r--  kernel/trace/trace.c | 330
-rw-r--r--  kernel/trace/trace.h | 63
-rw-r--r--  kernel/trace/trace_entries.h | 3
-rw-r--r--  kernel/trace/trace_events.c | 138
-rw-r--r--  kernel/trace/trace_events_filter.c | 12
-rw-r--r--  kernel/trace/trace_functions.c | 3
-rw-r--r--  kernel/trace/trace_functions_graph.c | 225
-rw-r--r--  kernel/trace/trace_irqsoff.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 378
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 2
-rw-r--r--  kernel/trace/trace_output.c | 11
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 4
-rw-r--r--  kernel/trace/trace_stack.c | 13
-rw-r--r--  kernel/trace/tracedump.c | 682
-rw-r--r--  kernel/trace/tracelevel.c | 142
-rw-r--r--  kernel/tsacct.c | 15
-rw-r--r--  kernel/watchdog.c | 8
-rw-r--r--  kernel/workqueue.c | 86
123 files changed, 6628 insertions, 2206 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b35..24e7cb0ba26 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
35 35
36config PREEMPT 36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)" 37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 select PREEMPT_COUNT
38 help 39 help
39 This option reduces the latency of the kernel by making 40 This option reduces the latency of the kernel by making
40 all kernel code (that is not executing in a critical section) 41 all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
52 53
53endchoice 54endchoice
54 55
56config PREEMPT_COUNT
57	bool
\ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d64cfcc8b4..eca595e2fd5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o jump_label.o 13 async.o range.o
14obj-y += groups.o 14obj-y += groups.o
15 15
16ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += events/
107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
108obj-$(CONFIG_PADATA) += padata.o 108obj-$(CONFIG_PADATA) += padata.o
109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
110 111
111ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
112# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -125,11 +126,10 @@ targets += config_data.gz
125$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE 126$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
126 $(call if_changed,gzip) 127 $(call if_changed,gzip)
127 128
128quiet_cmd_ikconfiggz = IKCFG $@ 129 filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
129 cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
130targets += config_data.h 130targets += config_data.h
131$(obj)/config_data.h: $(obj)/config_data.gz FORCE 131$(obj)/config_data.h: $(obj)/config_data.gz FORCE
132 $(call if_changed,ikconfiggz) 132 $(call filechk,ikconfiggz)
133 133
134$(obj)/time.o: $(obj)/timeconst.h 134$(obj)/time.o: $(obj)/timeconst.h
135 135
diff --git a/kernel/async.c b/kernel/async.c
index cd9dbb913c7..d5fe7af0de2 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h>
53#include <linux/ktime.h>
52#include <linux/module.h> 54#include <linux/module.h>
53#include <linux/wait.h> 55#include <linux/wait.h>
54#include <linux/sched.h> 56#include <linux/sched.h>
55#include <linux/slab.h> 57#include <linux/slab.h>
56#include <linux/workqueue.h> 58#include <linux/workqueue.h>
57#include <asm/atomic.h>
58 59
59static async_cookie_t next_cookie = 1; 60static async_cookie_t next_cookie = 1;
60 61
@@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work)
128 129
129 /* 2) run (and print duration) */ 130 /* 2) run (and print duration) */
130 if (initcall_debug && system_state == SYSTEM_BOOTING) { 131 if (initcall_debug && system_state == SYSTEM_BOOTING) {
131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 132 printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
133 (long long)entry->cookie,
132 entry->func, task_pid_nr(current)); 134 entry->func, task_pid_nr(current));
133 calltime = ktime_get(); 135 calltime = ktime_get();
134 } 136 }
@@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work)
136 if (initcall_debug && system_state == SYSTEM_BOOTING) { 138 if (initcall_debug && system_state == SYSTEM_BOOTING) {
137 rettime = ktime_get(); 139 rettime = ktime_get();
138 delta = ktime_sub(rettime, calltime); 140 delta = ktime_sub(rettime, calltime);
139 printk("initcall %lli_%pF returned 0 after %lld usecs\n", 141 printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
140 (long long)entry->cookie, 142 (long long)entry->cookie,
141 entry->func, 143 entry->func,
142 (long long)ktime_to_ns(delta) >> 10); 144 (long long)ktime_to_ns(delta) >> 10);
@@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
270 ktime_t starttime, delta, endtime; 272 ktime_t starttime, delta, endtime;
271 273
272 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
273 printk("async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
274 starttime = ktime_get(); 276 starttime = ktime_get();
275 } 277 }
276 278
@@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
280 endtime = ktime_get(); 282 endtime = ktime_get();
281 delta = ktime_sub(endtime, starttime); 283 delta = ktime_sub(endtime, starttime);
282 284
283 printk("async_continuing @ %i after %lli usec\n", 285 printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
284 task_pid_nr(current), 286 task_pid_nr(current),
285 (long long)ktime_to_ns(delta) >> 10); 287 (long long)ktime_to_ns(delta) >> 10);
286 } 288 }
diff --git a/kernel/audit.c b/kernel/audit.c
index 93950031706..0a1355ca3d7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -43,7 +43,7 @@
43 43
44#include <linux/init.h> 44#include <linux/init.h>
45#include <asm/types.h> 45#include <asm/types.h>
46#include <asm/atomic.h> 46#include <linux/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
@@ -55,6 +55,9 @@
55#include <net/sock.h> 55#include <net/sock.h>
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#ifdef CONFIG_SECURITY
59#include <linux/security.h>
60#endif
58#include <linux/netlink.h> 61#include <linux/netlink.h>
59#include <linux/freezer.h> 62#include <linux/freezer.h>
60#include <linux/tty.h> 63#include <linux/tty.h>
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
1502 } 1505 }
1503} 1506}
1504 1507
1508#ifdef CONFIG_SECURITY
1509/**
1510 * audit_log_secctx - Converts and logs SELinux context
1511 * @ab: audit_buffer
1512 * @secid: security number
1513 *
1514 * This is a helper function that calls security_secid_to_secctx to convert
1515 * secid to secctx and then adds the (converted) SELinux context to the audit
1516 * log by calling audit_log_format, thus also preventing leak of internal secid
1517 * to userspace. If secid cannot be converted audit_panic is called.
1518 */
1519void audit_log_secctx(struct audit_buffer *ab, u32 secid)
1520{
1521 u32 len;
1522 char *secctx;
1523
1524 if (security_secid_to_secctx(secid, &secctx, &len)) {
1525 audit_panic("Cannot convert secid to context");
1526 } else {
1527 audit_log_format(ab, " obj=%s", secctx);
1528 security_release_secctx(secctx, len);
1529 }
1530}
1531EXPORT_SYMBOL(audit_log_secctx);
1532#endif
1533
1505EXPORT_SYMBOL(audit_log_start); 1534EXPORT_SYMBOL(audit_log_start);
1506EXPORT_SYMBOL(audit_log_end); 1535EXPORT_SYMBOL(audit_log_end);
1507EXPORT_SYMBOL(audit_log_format); 1536EXPORT_SYMBOL(audit_log_format);
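
For context, a minimal sketch of how a caller might use the new audit_log_secctx() helper. The record type (AUDIT_AVC), the format string and the way the secid is obtained are illustrative assumptions, not part of this diff.

	/* Hypothetical caller; the secid is assumed to come from the LSM. */
	struct audit_buffer *ab;
	u32 secid;

	security_task_getsecid(current, &secid);
	ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_AVC);
	if (ab) {
		audit_log_format(ab, "example pid=%d", task_pid_nr(current));
		audit_log_secctx(ab, secid);	/* appends " obj=<secctx>" or calls audit_panic() */
		audit_log_end(ab);
	}
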
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e99dda04b12..5bf0790497e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree)
93 atomic_inc(&tree->count); 93 atomic_inc(&tree->count);
94} 94}
95 95
96static void __put_tree(struct rcu_head *rcu)
97{
98 struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
99 kfree(tree);
100}
101
102static inline void put_tree(struct audit_tree *tree) 96static inline void put_tree(struct audit_tree *tree)
103{ 97{
104 if (atomic_dec_and_test(&tree->count)) 98 if (atomic_dec_and_test(&tree->count))
105 call_rcu(&tree->head, __put_tree); 99 kfree_rcu(tree, head);
106} 100}
107 101
108/* to avoid bringing the entire thing in audit.h */ 102/* to avoid bringing the entire thing in audit.h */
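
The hunk above replaces an open-coded call_rcu() callback with kfree_rcu(). A minimal sketch of that pattern, using a made-up structure purely for illustration:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct example {
		int payload;
		struct rcu_head rcu;	/* the second kfree_rcu() argument names this field */
	};

	static void example_release(struct example *e)
	{
		/* equivalent to call_rcu() with a callback that only kfree()s the object */
		kfree_rcu(e, rcu);
	}
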
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 00d79df03e7..ce4b054acee 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -44,7 +44,7 @@
44 44
45#include <linux/init.h> 45#include <linux/init.h>
46#include <asm/types.h> 46#include <asm/types.h>
47#include <asm/atomic.h> 47#include <linux/atomic.h>
48#include <linux/fs.h> 48#include <linux/fs.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2731d115d72..54a36fe288f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -27,9 +27,11 @@
27 */ 27 */
28 28
29#include <linux/cgroup.h> 29#include <linux/cgroup.h>
30#include <linux/cred.h>
30#include <linux/ctype.h> 31#include <linux/ctype.h>
31#include <linux/errno.h> 32#include <linux/errno.h>
32#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/init_task.h>
33#include <linux/kernel.h> 35#include <linux/kernel.h>
34#include <linux/list.h> 36#include <linux/list.h>
35#include <linux/mm.h> 37#include <linux/mm.h>
@@ -59,7 +61,7 @@
59#include <linux/poll.h> 61#include <linux/poll.h>
60#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 62#include <linux/flex_array.h> /* used in cgroup_attach_proc */
61 63
62#include <asm/atomic.h> 64#include <linux/atomic.h>
63 65
64static DEFINE_MUTEX(cgroup_mutex); 66static DEFINE_MUTEX(cgroup_mutex);
65 67
@@ -268,6 +270,33 @@ static void cgroup_release_agent(struct work_struct *work);
268static DECLARE_WORK(release_agent_work, cgroup_release_agent); 270static DECLARE_WORK(release_agent_work, cgroup_release_agent);
269static void check_for_release(struct cgroup *cgrp); 271static void check_for_release(struct cgroup *cgrp);
270 272
273/*
274 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
275 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
276 * reference to css->refcnt. In general, this refcnt is expected to goes down
277 * to zero, soon.
278 *
279 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
280 */
281DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
282
283static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
284{
285 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
286 wake_up_all(&cgroup_rmdir_waitq);
287}
288
289void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
290{
291 css_get(css);
292}
293
294void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
295{
296 cgroup_wakeup_rmdir_waiter(css->cgroup);
297 css_put(css);
298}
299
271/* Link structure for associating css_set objects with cgroups */ 300/* Link structure for associating css_set objects with cgroups */
272struct cg_cgroup_link { 301struct cg_cgroup_link {
273 /* 302 /*
@@ -327,52 +356,43 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
327 return &css_set_table[index]; 356 return &css_set_table[index];
328} 357}
329 358
330/* We don't maintain the lists running through each css_set to its 359static void free_css_set_work(struct work_struct *work)
331 * task until after the first call to cgroup_iter_start(). This
332 * reduces the fork()/exit() overhead for people who have cgroups
333 * compiled into their kernel but not actually in use */
334static int use_task_css_set_links __read_mostly;
335
336static void __put_css_set(struct css_set *cg, int taskexit)
337{ 360{
361 struct css_set *cg = container_of(work, struct css_set, work);
338 struct cg_cgroup_link *link; 362 struct cg_cgroup_link *link;
339 struct cg_cgroup_link *saved_link; 363 struct cg_cgroup_link *saved_link;
340 /*
341 * Ensure that the refcount doesn't hit zero while any readers
342 * can see it. Similar to atomic_dec_and_lock(), but for an
343 * rwlock
344 */
345 if (atomic_add_unless(&cg->refcount, -1, 1))
346 return;
347 write_lock(&css_set_lock);
348 if (!atomic_dec_and_test(&cg->refcount)) {
349 write_unlock(&css_set_lock);
350 return;
351 }
352
353 /* This css_set is dead. unlink it and release cgroup refcounts */
354 hlist_del(&cg->hlist);
355 css_set_count--;
356 364
365 write_lock(&css_set_lock);
357 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 366 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
358 cg_link_list) { 367 cg_link_list) {
359 struct cgroup *cgrp = link->cgrp; 368 struct cgroup *cgrp = link->cgrp;
360 list_del(&link->cg_link_list); 369 list_del(&link->cg_link_list);
361 list_del(&link->cgrp_link_list); 370 list_del(&link->cgrp_link_list);
362 if (atomic_dec_and_test(&cgrp->count) && 371 if (atomic_dec_and_test(&cgrp->count)) {
363 notify_on_release(cgrp)) {
364 if (taskexit)
365 set_bit(CGRP_RELEASABLE, &cgrp->flags);
366 check_for_release(cgrp); 372 check_for_release(cgrp);
373 cgroup_wakeup_rmdir_waiter(cgrp);
367 } 374 }
368
369 kfree(link); 375 kfree(link);
370 } 376 }
371
372 write_unlock(&css_set_lock); 377 write_unlock(&css_set_lock);
373 kfree_rcu(cg, rcu_head); 378
379 kfree(cg);
380}
381
382static void free_css_set_rcu(struct rcu_head *obj)
383{
384 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
385
386 INIT_WORK(&cg->work, free_css_set_work);
387 schedule_work(&cg->work);
374} 388}
375 389
390/* We don't maintain the lists running through each css_set to its
391 * task until after the first call to cgroup_iter_start(). This
392 * reduces the fork()/exit() overhead for people who have cgroups
393 * compiled into their kernel but not actually in use */
394static int use_task_css_set_links __read_mostly;
395
376/* 396/*
377 * refcounted get/put for css_set objects 397 * refcounted get/put for css_set objects
378 */ 398 */
@@ -381,14 +401,26 @@ static inline void get_css_set(struct css_set *cg)
381 atomic_inc(&cg->refcount); 401 atomic_inc(&cg->refcount);
382} 402}
383 403
384static inline void put_css_set(struct css_set *cg) 404static void put_css_set(struct css_set *cg)
385{ 405{
386 __put_css_set(cg, 0); 406 /*
387} 407 * Ensure that the refcount doesn't hit zero while any readers
408 * can see it. Similar to atomic_dec_and_lock(), but for an
409 * rwlock
410 */
411 if (atomic_add_unless(&cg->refcount, -1, 1))
412 return;
413 write_lock(&css_set_lock);
414 if (!atomic_dec_and_test(&cg->refcount)) {
415 write_unlock(&css_set_lock);
416 return;
417 }
388 418
389static inline void put_css_set_taskexit(struct css_set *cg) 419 hlist_del(&cg->hlist);
390{ 420 css_set_count--;
391 __put_css_set(cg, 1); 421
422 write_unlock(&css_set_lock);
423 call_rcu(&cg->rcu_head, free_css_set_rcu);
392} 424}
393 425
394/* 426/*
@@ -720,9 +752,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
720 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with 752 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
721 * another. It does so using cgroup_mutex, however there are 753 * another. It does so using cgroup_mutex, however there are
722 * several performance critical places that need to reference 754 * several performance critical places that need to reference
723 * task->cgroup without the expense of grabbing a system global 755 * task->cgroups without the expense of grabbing a system global
724 * mutex. Therefore except as noted below, when dereferencing or, as 756 * mutex. Therefore except as noted below, when dereferencing or, as
725 * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use 757 * in cgroup_attach_task(), modifying a task's cgroups pointer we use
726 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 758 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
727 * the task_struct routinely used for such matters. 759 * the task_struct routinely used for such matters.
728 * 760 *
@@ -912,33 +944,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
912} 944}
913 945
914/* 946/*
915 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
916 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
917 * reference to css->refcnt. In general, this refcnt is expected to goes down
918 * to zero, soon.
919 *
920 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
921 */
922DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
923
924static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
925{
926 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
927 wake_up_all(&cgroup_rmdir_waitq);
928}
929
930void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
931{
932 css_get(css);
933}
934
935void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
936{
937 cgroup_wakeup_rmdir_waiter(css->cgroup);
938 css_put(css);
939}
940
941/*
942 * Call with cgroup_mutex held. Drops reference counts on modules, including 947 * Call with cgroup_mutex held. Drops reference counts on modules, including
943 * any duplicate ones that parse_cgroupfs_options took. If this function 948 * any duplicate ones that parse_cgroupfs_options took. If this function
944 * returns an error, no reference counts are touched. 949 * returns an error, no reference counts are touched.
@@ -1173,10 +1178,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1173 1178
1174 /* 1179 /*
1175 * If the 'all' option was specified select all the subsystems, 1180 * If the 'all' option was specified select all the subsystems,
1176 * otherwise 'all, 'none' and a subsystem name options were not 1181 * otherwise if 'none', 'name=' and a subsystem name options
1177 * specified, let's default to 'all' 1182 * were not specified, let's default to 'all'
1178 */ 1183 */
1179 if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1184 if (all_ss || (!one_ss && !opts->none && !opts->name)) {
1180 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1185 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1181 struct cgroup_subsys *ss = subsys[i]; 1186 struct cgroup_subsys *ss = subsys[i];
1182 if (ss == NULL) 1187 if (ss == NULL)
@@ -1514,6 +1519,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1514 struct cgroup *root_cgrp = &root->top_cgroup; 1519 struct cgroup *root_cgrp = &root->top_cgroup;
1515 struct inode *inode; 1520 struct inode *inode;
1516 struct cgroupfs_root *existing_root; 1521 struct cgroupfs_root *existing_root;
1522 const struct cred *cred;
1517 int i; 1523 int i;
1518 1524
1519 BUG_ON(sb->s_root != NULL); 1525 BUG_ON(sb->s_root != NULL);
@@ -1593,7 +1599,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1593 BUG_ON(!list_empty(&root_cgrp->children)); 1599 BUG_ON(!list_empty(&root_cgrp->children));
1594 BUG_ON(root->number_of_cgroups != 1); 1600 BUG_ON(root->number_of_cgroups != 1);
1595 1601
1602 cred = override_creds(&init_cred);
1596 cgroup_populate_dir(root_cgrp); 1603 cgroup_populate_dir(root_cgrp);
1604 revert_creds(cred);
1597 mutex_unlock(&cgroup_mutex); 1605 mutex_unlock(&cgroup_mutex);
1598 mutex_unlock(&inode->i_mutex); 1606 mutex_unlock(&inode->i_mutex);
1599 } else { 1607 } else {
@@ -1697,7 +1705,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1697{ 1705{
1698 char *start; 1706 char *start;
1699 struct dentry *dentry = rcu_dereference_check(cgrp->dentry, 1707 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1700 rcu_read_lock_held() ||
1701 cgroup_lock_is_held()); 1708 cgroup_lock_is_held());
1702 1709
1703 if (!dentry || cgrp == dummytop) { 1710 if (!dentry || cgrp == dummytop) {
@@ -1723,7 +1730,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1723 break; 1730 break;
1724 1731
1725 dentry = rcu_dereference_check(cgrp->dentry, 1732 dentry = rcu_dereference_check(cgrp->dentry,
1726 rcu_read_lock_held() ||
1727 cgroup_lock_is_held()); 1733 cgroup_lock_is_held());
1728 if (!cgrp->parent) 1734 if (!cgrp->parent)
1729 continue; 1735 continue;
@@ -1820,6 +1826,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1820 struct cgroup_subsys *ss, *failed_ss = NULL; 1826 struct cgroup_subsys *ss, *failed_ss = NULL;
1821 struct cgroup *oldcgrp; 1827 struct cgroup *oldcgrp;
1822 struct cgroupfs_root *root = cgrp->root; 1828 struct cgroupfs_root *root = cgrp->root;
1829 struct css_set *cg;
1823 1830
1824 /* Nothing to do if the task is already in that cgroup */ 1831 /* Nothing to do if the task is already in that cgroup */
1825 oldcgrp = task_cgroup_from_root(tsk, root); 1832 oldcgrp = task_cgroup_from_root(tsk, root);
@@ -1849,6 +1856,11 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1849 } 1856 }
1850 } 1857 }
1851 1858
1859 task_lock(tsk);
1860 cg = tsk->cgroups;
1861 get_css_set(cg);
1862 task_unlock(tsk);
1863
1852 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1864 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1853 if (retval) 1865 if (retval)
1854 goto out; 1866 goto out;
@@ -1861,8 +1873,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1861 if (ss->attach) 1873 if (ss->attach)
1862 ss->attach(ss, cgrp, oldcgrp, tsk); 1874 ss->attach(ss, cgrp, oldcgrp, tsk);
1863 } 1875 }
1864 1876 set_bit(CGRP_RELEASABLE, &cgrp->flags);
1865 synchronize_rcu(); 1877 /* put_css_set will not destroy cg until after an RCU grace period */
1878 put_css_set(cg);
1866 1879
1867 /* 1880 /*
1868 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1881 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -2095,11 +2108,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2095 continue; 2108 continue;
2096 /* get old css_set pointer */ 2109 /* get old css_set pointer */
2097 task_lock(tsk); 2110 task_lock(tsk);
2098 if (tsk->flags & PF_EXITING) {
2099 /* ignore this task if it's going away */
2100 task_unlock(tsk);
2101 continue;
2102 }
2103 oldcg = tsk->cgroups; 2111 oldcg = tsk->cgroups;
2104 get_css_set(oldcg); 2112 get_css_set(oldcg);
2105 task_unlock(tsk); 2113 task_unlock(tsk);
@@ -2189,6 +2197,24 @@ out_free_group_list:
2189 return retval; 2197 return retval;
2190} 2198}
2191 2199
2200static int cgroup_allow_attach(struct cgroup *cgrp, struct task_struct *tsk)
2201{
2202 struct cgroup_subsys *ss;
2203 int ret;
2204
2205 for_each_subsys(cgrp->root, ss) {
2206 if (ss->allow_attach) {
2207 ret = ss->allow_attach(cgrp, tsk);
2208 if (ret)
2209 return ret;
2210 } else {
2211 return -EACCES;
2212 }
2213 }
2214
2215 return 0;
2216}
2217
2192/* 2218/*
2193 * Find the task_struct of the task to attach by vpid and pass it along to the 2219 * Find the task_struct of the task to attach by vpid and pass it along to the
2194 * function to attach either it or all tasks in its threadgroup. Will take 2220 * function to attach either it or all tasks in its threadgroup. Will take
@@ -2234,9 +2260,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2234 if (cred->euid && 2260 if (cred->euid &&
2235 cred->euid != tcred->uid && 2261 cred->euid != tcred->uid &&
2236 cred->euid != tcred->suid) { 2262 cred->euid != tcred->suid) {
2237 rcu_read_unlock(); 2263 /*
2238 cgroup_unlock(); 2264 * if the default permission check fails, give each
2239 return -EACCES; 2265 * cgroup a chance to extend the permission check
2266 */
2267 ret = cgroup_allow_attach(cgrp, tsk);
2268 if (ret) {
2269 rcu_read_unlock();
2270 cgroup_unlock();
2271 return ret;
2272 }
2240 } 2273 }
2241 get_task_struct(tsk); 2274 get_task_struct(tsk);
2242 rcu_read_unlock(); 2275 rcu_read_unlock();
@@ -3542,7 +3575,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3542 } 3575 }
3543 3576
3544 /* the process need read permission on control file */ 3577 /* the process need read permission on control file */
3545 ret = file_permission(cfile, MAY_READ); 3578 /* AV: shouldn't we check that it's been opened for read instead? */
3579 ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
3546 if (ret < 0) 3580 if (ret < 0)
3547 goto fail; 3581 goto fail;
3548 3582
@@ -3810,6 +3844,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3810 if (err < 0) 3844 if (err < 0)
3811 goto err_remove; 3845 goto err_remove;
3812 3846
3847 set_bit(CGRP_RELEASABLE, &parent->flags);
3848
3813 /* The cgroup directory was pre-locked for us */ 3849 /* The cgroup directory was pre-locked for us */
3814 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 3850 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
3815 3851
@@ -3941,6 +3977,21 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
3941 return !failed; 3977 return !failed;
3942} 3978}
3943 3979
3980/* checks if all of the css_sets attached to a cgroup have a refcount of 0.
3981 * Must be called with css_set_lock held */
3982static int cgroup_css_sets_empty(struct cgroup *cgrp)
3983{
3984 struct cg_cgroup_link *link;
3985
3986 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
3987 struct css_set *cg = link->cg;
3988 if (atomic_read(&cg->refcount) > 0)
3989 return 0;
3990 }
3991
3992 return 1;
3993}
3994
3944static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 3995static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3945{ 3996{
3946 struct cgroup *cgrp = dentry->d_fsdata; 3997 struct cgroup *cgrp = dentry->d_fsdata;
@@ -3953,7 +4004,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3953 /* the vfs holds both inode->i_mutex already */ 4004 /* the vfs holds both inode->i_mutex already */
3954again: 4005again:
3955 mutex_lock(&cgroup_mutex); 4006 mutex_lock(&cgroup_mutex);
3956 if (atomic_read(&cgrp->count) != 0) { 4007 if (!cgroup_css_sets_empty(cgrp)) {
3957 mutex_unlock(&cgroup_mutex); 4008 mutex_unlock(&cgroup_mutex);
3958 return -EBUSY; 4009 return -EBUSY;
3959 } 4010 }
@@ -3986,7 +4037,7 @@ again:
3986 4037
3987 mutex_lock(&cgroup_mutex); 4038 mutex_lock(&cgroup_mutex);
3988 parent = cgrp->parent; 4039 parent = cgrp->parent;
3989 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 4040 if (!cgroup_css_sets_empty(cgrp) || !list_empty(&cgrp->children)) {
3990 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4041 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
3991 mutex_unlock(&cgroup_mutex); 4042 mutex_unlock(&cgroup_mutex);
3992 return -EBUSY; 4043 return -EBUSY;
@@ -4026,7 +4077,6 @@ again:
4026 cgroup_d_remove_dir(d); 4077 cgroup_d_remove_dir(d);
4027 dput(d); 4078 dput(d);
4028 4079
4029 set_bit(CGRP_RELEASABLE, &parent->flags);
4030 check_for_release(parent); 4080 check_for_release(parent);
4031 4081
4032 /* 4082 /*
@@ -4626,7 +4676,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4626 task_unlock(tsk); 4676 task_unlock(tsk);
4627 4677
4628 if (cg) 4678 if (cg)
4629 put_css_set_taskexit(cg); 4679 put_css_set(cg);
4630} 4680}
4631 4681
4632/** 4682/**
@@ -4680,6 +4730,14 @@ static void check_for_release(struct cgroup *cgrp)
4680} 4730}
4681 4731
4682/* Caller must verify that the css is not for root cgroup */ 4732/* Caller must verify that the css is not for root cgroup */
4733void __css_get(struct cgroup_subsys_state *css, int count)
4734{
4735 atomic_add(count, &css->refcnt);
4736 set_bit(CGRP_RELEASABLE, &css->cgroup->flags);
4737}
4738EXPORT_SYMBOL_GPL(__css_get);
4739
4740/* Caller must verify that the css is not for root cgroup */
4683void __css_put(struct cgroup_subsys_state *css, int count) 4741void __css_put(struct cgroup_subsys_state *css, int count)
4684{ 4742{
4685 struct cgroup *cgrp = css->cgroup; 4743 struct cgroup *cgrp = css->cgroup;
@@ -4687,10 +4745,7 @@ void __css_put(struct cgroup_subsys_state *css, int count)
4687 rcu_read_lock(); 4745 rcu_read_lock();
4688 val = atomic_sub_return(count, &css->refcnt); 4746 val = atomic_sub_return(count, &css->refcnt);
4689 if (val == 1) { 4747 if (val == 1) {
4690 if (notify_on_release(cgrp)) { 4748 check_for_release(cgrp);
4691 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4692 check_for_release(cgrp);
4693 }
4694 cgroup_wakeup_rmdir_waiter(cgrp); 4749 cgroup_wakeup_rmdir_waiter(cgrp);
4695 } 4750 }
4696 rcu_read_unlock(); 4751 rcu_read_unlock();
@@ -4813,8 +4868,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
4813 * on this or this is under rcu_read_lock(). Once css->id is allocated, 4868 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4814 * it's unchanged until freed. 4869 * it's unchanged until freed.
4815 */ 4870 */
4816 cssid = rcu_dereference_check(css->id, 4871 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4817 rcu_read_lock_held() || atomic_read(&css->refcnt));
4818 4872
4819 if (cssid) 4873 if (cssid)
4820 return cssid->id; 4874 return cssid->id;
@@ -4826,8 +4880,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4826{ 4880{
4827 struct css_id *cssid; 4881 struct css_id *cssid;
4828 4882
4829 cssid = rcu_dereference_check(css->id, 4883 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4830 rcu_read_lock_held() || atomic_read(&css->refcnt));
4831 4884
4832 if (cssid) 4885 if (cssid)
4833 return cssid->depth; 4886 return cssid->depth;
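
The new cgroup_allow_attach() path lets a subsystem extend the default uid-based attach permission check through an allow_attach() callback. A hedged sketch of such a callback follows; the capability test and uid comparison are illustrative assumptions, not taken from this diff.

	static int example_allow_attach(struct cgroup *cgrp, struct task_struct *tsk)
	{
		const struct cred *tcred = __task_cred(tsk);	/* caller holds rcu_read_lock() */

		/* e.g. let a capable system daemon move tasks it does not own */
		if (capable(CAP_SYS_NICE) || current_euid() == tcred->uid)
			return 0;
		return -EACCES;
	}
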
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e691818d7e4..a3f638ac3de 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -153,6 +153,13 @@ static void freezer_destroy(struct cgroup_subsys *ss,
153 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
154} 154}
155 155
156/* task is frozen or will freeze immediately when next it gets woken */
157static bool is_task_frozen_enough(struct task_struct *task)
158{
159 return frozen(task) ||
160 (task_is_stopped_or_traced(task) && freezing(task));
161}
162
156/* 163/*
157 * The call to cgroup_lock() in the freezer.state write method prevents 164 * The call to cgroup_lock() in the freezer.state write method prevents
158 * a write to that file racing against an attach, and hence the 165 * a write to that file racing against an attach, and hence the
@@ -231,7 +238,7 @@ static void update_if_frozen(struct cgroup *cgroup,
231 cgroup_iter_start(cgroup, &it); 238 cgroup_iter_start(cgroup, &it);
232 while ((task = cgroup_iter_next(cgroup, &it))) { 239 while ((task = cgroup_iter_next(cgroup, &it))) {
233 ntotal++; 240 ntotal++;
234 if (frozen(task)) 241 if (is_task_frozen_enough(task))
235 nfrozen++; 242 nfrozen++;
236 } 243 }
237 244
@@ -284,7 +291,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
284 while ((task = cgroup_iter_next(cgroup, &it))) { 291 while ((task = cgroup_iter_next(cgroup, &it))) {
285 if (!freeze_task(task, true)) 292 if (!freeze_task(task, true))
286 continue; 293 continue;
287 if (frozen(task)) 294 if (is_task_frozen_enough(task))
288 continue; 295 continue;
289 if (!freezing(task) && !freezer_should_skip(task)) 296 if (!freezing(task) && !freezer_should_skip(task))
290 num_cant_freeze_now++; 297 num_cant_freeze_now++;
diff --git a/kernel/compat.c b/kernel/compat.c
index fc9eb093acd..e2435ee9993 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -158,6 +158,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
158 __put_user(ts->tv_sec, &cts->tv_sec) || 158 __put_user(ts->tv_sec, &cts->tv_sec) ||
159 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 159 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
160} 160}
161EXPORT_SYMBOL_GPL(put_compat_timespec);
161 162
162static long compat_nanosleep_restart(struct restart_block *restart) 163static long compat_nanosleep_restart(struct restart_block *restart)
163{ 164{
@@ -890,6 +891,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
890 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 891 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
891 } 892 }
892} 893}
894EXPORT_SYMBOL_GPL(sigset_from_compat);
893 895
894asmlinkage long 896asmlinkage long
895compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 897compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
@@ -991,11 +993,8 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
991 sigset_from_compat(&newset, &newset32); 993 sigset_from_compat(&newset, &newset32);
992 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 994 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
993 995
994 spin_lock_irq(&current->sighand->siglock);
995 current->saved_sigmask = current->blocked; 996 current->saved_sigmask = current->blocked;
996 current->blocked = newset; 997 set_current_blocked(&newset);
997 recalc_sigpending();
998 spin_unlock_irq(&current->sighand->siglock);
999 998
1000 current->state = TASK_INTERRUPTIBLE; 999 current->state = TASK_INTERRUPTIBLE;
1001 schedule(); 1000 schedule();
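
The sigsuspend hunk above switches from open-coded siglock handling to set_current_blocked(), which updates the blocked mask and recalculates pending signals itself. A minimal sketch of the same pattern, with an arbitrary example mask:

	sigset_t newset;

	siginitset(&newset, sigmask(SIGUSR1));		/* arbitrary example signal */
	sigdelsetmask(&newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
	current->saved_sigmask = current->blocked;
	set_current_blocked(&newset);			/* takes siglock internally */
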
diff --git a/kernel/configs.c b/kernel/configs.c
index b4066b44a99..42e8fa075ee 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -92,8 +92,8 @@ static void __exit ikconfig_cleanup(void)
92module_init(ikconfig_init); 92module_init(ikconfig_init);
93module_exit(ikconfig_cleanup); 93module_exit(ikconfig_cleanup);
94 94
95#endif /* CONFIG_IKCONFIG_PROC */
96
95MODULE_LICENSE("GPL"); 97MODULE_LICENSE("GPL");
96MODULE_AUTHOR("Randy Dunlap"); 98MODULE_AUTHOR("Randy Dunlap");
97MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); 99MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
98
99#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 12b7458f23b..eae3d9b3957 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,7 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h> 17#include <linux/gfp.h>
18#include <linux/suspend.h>
18 19
19#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
20/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 21/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -476,6 +477,79 @@ static int alloc_frozen_cpus(void)
476 return 0; 477 return 0;
477} 478}
478core_initcall(alloc_frozen_cpus); 479core_initcall(alloc_frozen_cpus);
480
481/*
482 * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
483 * hotplug when tasks are about to be frozen. Also, don't allow the freezer
484 * to continue until any currently running CPU hotplug operation gets
485 * completed.
486 * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
487 * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
488 * CPU hotplug path and released only after it is complete. Thus, we
489 * (and hence the freezer) will block here until any currently running CPU
490 * hotplug operation gets completed.
491 */
492void cpu_hotplug_disable_before_freeze(void)
493{
494 cpu_maps_update_begin();
495 cpu_hotplug_disabled = 1;
496 cpu_maps_update_done();
497}
498
499
500/*
501 * When tasks have been thawed, re-enable regular CPU hotplug (which had been
502 * disabled while beginning to freeze tasks).
503 */
504void cpu_hotplug_enable_after_thaw(void)
505{
506 cpu_maps_update_begin();
507 cpu_hotplug_disabled = 0;
508 cpu_maps_update_done();
509}
510
511/*
512 * When callbacks for CPU hotplug notifications are being executed, we must
513 * ensure that the state of the system with respect to the tasks being frozen
514 * or not, as reported by the notification, remains unchanged *throughout the
515 * duration* of the execution of the callbacks.
516 * Hence we need to prevent the freezer from racing with regular CPU hotplug.
517 *
518 * This synchronization is implemented by mutually excluding regular CPU
519 * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
520 * Hibernate notifications.
521 */
522static int
523cpu_hotplug_pm_callback(struct notifier_block *nb,
524 unsigned long action, void *ptr)
525{
526 switch (action) {
527
528 case PM_SUSPEND_PREPARE:
529 case PM_HIBERNATION_PREPARE:
530 cpu_hotplug_disable_before_freeze();
531 break;
532
533 case PM_POST_SUSPEND:
534 case PM_POST_HIBERNATION:
535 cpu_hotplug_enable_after_thaw();
536 break;
537
538 default:
539 return NOTIFY_DONE;
540 }
541
542 return NOTIFY_OK;
543}
544
545
546int cpu_hotplug_pm_sync_init(void)
547{
548 pm_notifier(cpu_hotplug_pm_callback, 0);
549 return 0;
550}
551core_initcall(cpu_hotplug_pm_sync_init);
552
479#endif /* CONFIG_PM_SLEEP_SMP */ 553#endif /* CONFIG_PM_SLEEP_SMP */
480 554
481/** 555/**
@@ -594,3 +668,23 @@ void init_cpu_online(const struct cpumask *src)
594{ 668{
595 cpumask_copy(to_cpumask(cpu_online_bits), src); 669 cpumask_copy(to_cpumask(cpu_online_bits), src);
596} 670}
671
672static ATOMIC_NOTIFIER_HEAD(idle_notifier);
673
674void idle_notifier_register(struct notifier_block *n)
675{
676 atomic_notifier_chain_register(&idle_notifier, n);
677}
678EXPORT_SYMBOL_GPL(idle_notifier_register);
679
680void idle_notifier_unregister(struct notifier_block *n)
681{
682 atomic_notifier_chain_unregister(&idle_notifier, n);
683}
684EXPORT_SYMBOL_GPL(idle_notifier_unregister);
685
686void idle_notifier_call_chain(unsigned long val)
687{
688 atomic_notifier_call_chain(&idle_notifier, val, NULL);
689}
690EXPORT_SYMBOL_GPL(idle_notifier_call_chain);
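
For the idle notifier hooks added at the end of cpu.c, a hedged usage sketch; the event value is whatever the architecture's idle loop passes to idle_notifier_call_chain(), so it is left uninterpreted here.

	static int example_idle_notify(struct notifier_block *nb,
				       unsigned long val, void *data)
	{
		/* val is defined by the arch idle loop calling idle_notifier_call_chain() */
		return NOTIFY_OK;
	}

	static struct notifier_block example_idle_nb = {
		.notifier_call = example_idle_notify,
	};

	static int __init example_idle_init(void)
	{
		idle_notifier_register(&example_idle_nb);
		return 0;
	}
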
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9c9b7545c81..10131fdaff7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -55,7 +55,7 @@
55#include <linux/sort.h> 55#include <linux/sort.h>
56 56
57#include <asm/uaccess.h> 57#include <asm/uaccess.h>
58#include <asm/atomic.h> 58#include <linux/atomic.h>
59#include <linux/mutex.h> 59#include <linux/mutex.h>
60#include <linux/workqueue.h> 60#include <linux/workqueue.h>
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
@@ -2460,11 +2460,19 @@ static int cpuset_spread_node(int *rotor)
2460 2460
2461int cpuset_mem_spread_node(void) 2461int cpuset_mem_spread_node(void)
2462{ 2462{
2463 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
2464 current->cpuset_mem_spread_rotor =
2465 node_random(&current->mems_allowed);
2466
2463 return cpuset_spread_node(&current->cpuset_mem_spread_rotor); 2467 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2464} 2468}
2465 2469
2466int cpuset_slab_spread_node(void) 2470int cpuset_slab_spread_node(void)
2467{ 2471{
2472 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
2473 current->cpuset_slab_spread_rotor =
2474 node_random(&current->mems_allowed);
2475
2468 return cpuset_spread_node(&current->cpuset_slab_spread_rotor); 2476 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2469} 2477}
2470 2478
diff --git a/kernel/cred.c b/kernel/cred.c
index 174fa84eca3..8ef31f53c44 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -508,10 +508,8 @@ int commit_creds(struct cred *new)
508 key_fsgid_changed(task); 508 key_fsgid_changed(task);
509 509
510 /* do it 510 /* do it
511 * - What if a process setreuid()'s and this brings the 511 * RLIMIT_NPROC limits on user->processes have already been checked
512 * new uid over his NPROC rlimit? We can check this now 512 * in set_user().
513 * cheaply with the new uid cache, so if it matters
514 * we should be checking for it. -DaveM
515 */ 513 */
516 alter_cred_subscribers(new, 2); 514 alter_cred_subscribers(new, 2);
517 if (new->user != old->user) 515 if (new->user != old->user)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index bad6786dee8..0d7c08784ef 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -51,7 +51,7 @@
51 51
52#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
53#include <asm/byteorder.h> 53#include <asm/byteorder.h>
54#include <asm/atomic.h> 54#include <linux/atomic.h>
55#include <asm/system.h> 55#include <asm/system.h>
56 56
57#include "debug_core.h" 57#include "debug_core.h"
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index a11db956dd6..34872482315 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -42,6 +42,8 @@
42/* Our I/O buffers. */ 42/* Our I/O buffers. */
43static char remcom_in_buffer[BUFMAX]; 43static char remcom_in_buffer[BUFMAX];
44static char remcom_out_buffer[BUFMAX]; 44static char remcom_out_buffer[BUFMAX];
45static int gdbstub_use_prev_in_buf;
46static int gdbstub_prev_in_buf_pos;
45 47
46/* Storage for the registers, in GDB format. */ 48/* Storage for the registers, in GDB format. */
47static unsigned long gdb_regs[(NUMREGBYTES + 49static unsigned long gdb_regs[(NUMREGBYTES +
@@ -58,6 +60,13 @@ static int gdbstub_read_wait(void)
58 int ret = -1; 60 int ret = -1;
59 int i; 61 int i;
60 62
63 if (unlikely(gdbstub_use_prev_in_buf)) {
64 if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf)
65 return remcom_in_buffer[gdbstub_prev_in_buf_pos++];
66 else
67 gdbstub_use_prev_in_buf = 0;
68 }
69
61 /* poll any additional I/O interfaces that are defined */ 70 /* poll any additional I/O interfaces that are defined */
62 while (ret < 0) 71 while (ret < 0)
63 for (i = 0; kdb_poll_funcs[i] != NULL; i++) { 72 for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
@@ -109,7 +118,6 @@ static void get_packet(char *buffer)
109 buffer[count] = ch; 118 buffer[count] = ch;
110 count = count + 1; 119 count = count + 1;
111 } 120 }
112 buffer[count] = 0;
113 121
114 if (ch == '#') { 122 if (ch == '#') {
115 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; 123 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
@@ -124,6 +132,7 @@ static void get_packet(char *buffer)
124 if (dbg_io_ops->flush) 132 if (dbg_io_ops->flush)
125 dbg_io_ops->flush(); 133 dbg_io_ops->flush();
126 } 134 }
135 buffer[count] = 0;
127 } while (checksum != xmitcsum); 136 } while (checksum != xmitcsum);
128} 137}
129 138
@@ -1082,12 +1091,11 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1082 case 'c': 1091 case 'c':
1083 strcpy(remcom_in_buffer, cmd); 1092 strcpy(remcom_in_buffer, cmd);
1084 return 0; 1093 return 0;
1085 case '?': 1094 case '$':
1086 gdb_cmd_status(ks); 1095 strcpy(remcom_in_buffer, cmd);
1087 break; 1096 gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
1088 case '\0': 1097 gdbstub_prev_in_buf_pos = 0;
1089 strcpy(remcom_out_buffer, ""); 1098 return 0;
1090 break;
1091 } 1099 }
1092 dbg_io_ops->write_char('+'); 1100 dbg_io_ops->write_char('+');
1093 put_packet(remcom_out_buffer); 1101 put_packet(remcom_out_buffer);
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 2f62fe85f16..7179eac7b41 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -112,9 +112,8 @@ kdb_bt(int argc, const char **argv)
112 unsigned long addr; 112 unsigned long addr;
113 long offset; 113 long offset;
114 114
115 kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ 115 /* Prompt after each proc in bta */
116 kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each 116 kdbgetintenv("BTAPROMPT", &btaprompt);
117 * proc in bta */
118 117
119 if (strcmp(argv[0], "bta") == 0) { 118 if (strcmp(argv[0], "bta") == 0) {
120 struct task_struct *g, *p; 119 struct task_struct *g, *p;
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
index 56c88e4db30..9834ad303ab 100644
--- a/kernel/debug/kdb/kdb_cmds
+++ b/kernel/debug/kdb/kdb_cmds
@@ -18,16 +18,12 @@ defcmd dumpcommon "" "Common kdb debugging"
18endefcmd 18endefcmd
19 19
20defcmd dumpall "" "First line debugging" 20defcmd dumpall "" "First line debugging"
21 set BTSYMARG 1
22 set BTARGS 9
23 pid R 21 pid R
24 -dumpcommon 22 -dumpcommon
25 -bta 23 -bta
26endefcmd 24endefcmd
27 25
28defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" 26defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
29 set BTSYMARG 1
30 set BTARGS 9
31 pid R 27 pid R
32 -dumpcommon 28 -dumpcommon
33 -btc 29 -btc
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index dd0b1b7dd02..d9ca9aa481e 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(kdb_poll_funcs);
30int kdb_poll_idx = 1; 30int kdb_poll_idx = 1;
31EXPORT_SYMBOL_GPL(kdb_poll_idx); 31EXPORT_SYMBOL_GPL(kdb_poll_idx);
32 32
33static struct kgdb_state *kdb_ks;
34
33int kdb_stub(struct kgdb_state *ks) 35int kdb_stub(struct kgdb_state *ks)
34{ 36{
35 int error = 0; 37 int error = 0;
@@ -39,6 +41,7 @@ int kdb_stub(struct kgdb_state *ks)
39 kdb_dbtrap_t db_result = KDB_DB_NOBPT; 41 kdb_dbtrap_t db_result = KDB_DB_NOBPT;
40 int i; 42 int i;
41 43
44 kdb_ks = ks;
42 if (KDB_STATE(REENTRY)) { 45 if (KDB_STATE(REENTRY)) {
43 reason = KDB_REASON_SWITCH; 46 reason = KDB_REASON_SWITCH;
44 KDB_STATE_CLEAR(REENTRY); 47 KDB_STATE_CLEAR(REENTRY);
@@ -123,20 +126,8 @@ int kdb_stub(struct kgdb_state *ks)
123 KDB_STATE_CLEAR(PAGER); 126 KDB_STATE_CLEAR(PAGER);
124 kdbnearsym_cleanup(); 127 kdbnearsym_cleanup();
125 if (error == KDB_CMD_KGDB) { 128 if (error == KDB_CMD_KGDB) {
126 if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { 129 if (KDB_STATE(DOING_KGDB))
127 /*
128 * This inteface glue which allows kdb to transition in into
129 * the gdb stub. In order to do this the '?' or '' gdb serial
130 * packet response is processed here. And then control is
131 * passed to the gdbstub.
132 */
133 if (KDB_STATE(DOING_KGDB))
134 gdbstub_state(ks, "?");
135 else
136 gdbstub_state(ks, "");
137 KDB_STATE_CLEAR(DOING_KGDB); 130 KDB_STATE_CLEAR(DOING_KGDB);
138 KDB_STATE_CLEAR(DOING_KGDB2);
139 }
140 return DBG_PASS_EVENT; 131 return DBG_PASS_EVENT;
141 } 132 }
142 kdb_bp_install(ks->linux_regs); 133 kdb_bp_install(ks->linux_regs);
@@ -166,3 +157,7 @@ int kdb_stub(struct kgdb_state *ks)
166 return kgdb_info[ks->cpu].ret_state; 157 return kgdb_info[ks->cpu].ret_state;
167} 158}
168 159
160void kdb_gdb_state_pass(char *buf)
161{
162 gdbstub_state(kdb_ks, buf);
163}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 96fdaac46a8..4802eb5840e 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -31,15 +31,21 @@ char kdb_prompt_str[CMD_BUFLEN];
31 31
32int kdb_trap_printk; 32int kdb_trap_printk;
33 33
34static void kgdb_transition_check(char *buffer) 34static int kgdb_transition_check(char *buffer)
35{ 35{
36 int slen = strlen(buffer); 36 if (buffer[0] != '+' && buffer[0] != '$') {
37 if (strncmp(buffer, "$?#3f", slen) != 0 &&
38 strncmp(buffer, "$qSupported#37", slen) != 0 &&
39 strncmp(buffer, "+$qSupported#37", slen) != 0) {
40 KDB_STATE_SET(KGDB_TRANS); 37 KDB_STATE_SET(KGDB_TRANS);
41 kdb_printf("%s", buffer); 38 kdb_printf("%s", buffer);
39 } else {
40 int slen = strlen(buffer);
41 if (slen > 3 && buffer[slen - 3] == '#') {
42 kdb_gdb_state_pass(buffer);
43 strcpy(buffer, "kgdb");
44 KDB_STATE_SET(DOING_KGDB);
45 return 1;
46 }
42 } 47 }
48 return 0;
43} 49}
44 50
45static int kdb_read_get_key(char *buffer, size_t bufsize) 51static int kdb_read_get_key(char *buffer, size_t bufsize)
@@ -251,6 +257,10 @@ poll_again:
251 case 13: /* enter */ 257 case 13: /* enter */
252 *lastchar++ = '\n'; 258 *lastchar++ = '\n';
253 *lastchar++ = '\0'; 259 *lastchar++ = '\0';
260 if (!KDB_STATE(KGDB_TRANS)) {
261 KDB_STATE_SET(KGDB_TRANS);
262 kdb_printf("%s", buffer);
263 }
254 kdb_printf("\n"); 264 kdb_printf("\n");
255 return buffer; 265 return buffer;
256 case 4: /* Del */ 266 case 4: /* Del */
@@ -382,22 +392,26 @@ poll_again:
382 * printed characters if we think that 392 * printed characters if we think that
383 * kgdb is connecting, until the check 393 * kgdb is connecting, until the check
384 * fails */ 394 * fails */
385 if (!KDB_STATE(KGDB_TRANS)) 395 if (!KDB_STATE(KGDB_TRANS)) {
386 kgdb_transition_check(buffer); 396 if (kgdb_transition_check(buffer))
387 else 397 return buffer;
398 } else {
388 kdb_printf("%c", key); 399 kdb_printf("%c", key);
400 }
389 } 401 }
390 /* Special escape to kgdb */ 402 /* Special escape to kgdb */
391 if (lastchar - buffer >= 5 && 403 if (lastchar - buffer >= 5 &&
392 strcmp(lastchar - 5, "$?#3f") == 0) { 404 strcmp(lastchar - 5, "$?#3f") == 0) {
405 kdb_gdb_state_pass(lastchar - 5);
393 strcpy(buffer, "kgdb"); 406 strcpy(buffer, "kgdb");
394 KDB_STATE_SET(DOING_KGDB); 407 KDB_STATE_SET(DOING_KGDB);
395 return buffer; 408 return buffer;
396 } 409 }
397 if (lastchar - buffer >= 14 && 410 if (lastchar - buffer >= 11 &&
398 strcmp(lastchar - 14, "$qSupported#37") == 0) { 411 strcmp(lastchar - 11, "$qSupported") == 0) {
412 kdb_gdb_state_pass(lastchar - 11);
399 strcpy(buffer, "kgdb"); 413 strcpy(buffer, "kgdb");
400 KDB_STATE_SET(DOING_KGDB2); 414 KDB_STATE_SET(DOING_KGDB);
401 return buffer; 415 return buffer;
402 } 416 }
403 } 417 }
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index be14779bcef..63786e71a3c 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -145,7 +145,6 @@ static char *__env[] = {
145#endif 145#endif
146 "RADIX=16", 146 "RADIX=16",
147 "MDCOUNT=8", /* lines of md output */ 147 "MDCOUNT=8", /* lines of md output */
148 "BTARGS=9", /* 9 possible args in bt */
149 KDB_PLATFORM_ENV, 148 KDB_PLATFORM_ENV,
150 "DTABCOUNT=30", 149 "DTABCOUNT=30",
151 "NOSECT=1", 150 "NOSECT=1",
@@ -172,6 +171,7 @@ static char *__env[] = {
172 (char *)0, 171 (char *)0,
173 (char *)0, 172 (char *)0,
174 (char *)0, 173 (char *)0,
174 (char *)0,
175}; 175};
176 176
177static const int __nenv = (sizeof(__env) / sizeof(char *)); 177static const int __nenv = (sizeof(__env) / sizeof(char *));
@@ -1386,7 +1386,7 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1386 } 1386 }
1387 1387
1388 if (result == KDB_CMD_KGDB) { 1388 if (result == KDB_CMD_KGDB) {
1389 if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) 1389 if (!KDB_STATE(DOING_KGDB))
1390 kdb_printf("Entering please attach debugger " 1390 kdb_printf("Entering please attach debugger "
1391 "or use $D#44+ or $3#33\n"); 1391 "or use $D#44+ or $3#33\n");
1392 break; 1392 break;
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 35d69ed1dfb..e381d105b40 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -21,7 +21,6 @@
21#define KDB_CMD_SS (-1003) 21#define KDB_CMD_SS (-1003)
22#define KDB_CMD_SSB (-1004) 22#define KDB_CMD_SSB (-1004)
23#define KDB_CMD_KGDB (-1005) 23#define KDB_CMD_KGDB (-1005)
24#define KDB_CMD_KGDB2 (-1006)
25 24
26/* Internal debug flags */ 25/* Internal debug flags */
27#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ 26#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
@@ -146,7 +145,6 @@ extern int kdb_state;
146 * keyboard on this cpu */ 145 * keyboard on this cpu */
147#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ 146#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
148#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ 147#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
149#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
150#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ 148#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
151#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch 149#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
152 * specific use */ 150 * specific use */
@@ -218,6 +216,7 @@ extern void kdb_print_nameval(const char *name, unsigned long val);
218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 216extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
219extern void kdb_meminfo_proc_show(void); 217extern void kdb_meminfo_proc_show(void);
220extern char *kdb_getstr(char *, size_t, char *); 218extern char *kdb_getstr(char *, size_t, char *);
219extern void kdb_gdb_state_pass(char *buf);
221 220
222/* Defines for kdb_symbol_print */ 221/* Defines for kdb_symbol_print */
223#define KDB_SP_SPACEB 0x0001 /* Space before string */ 222#define KDB_SP_SPACEB 0x0001 /* Space before string */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa7..418b3f7053a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/delayacct.h> 21#include <linux/delayacct.h>
22#include <linux/module.h>
22 23
23int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 24int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
25EXPORT_SYMBOL_GPL(delayacct_on);
24struct kmem_cache *delayacct_cache; 26struct kmem_cache *delayacct_cache;
25 27
26static int __init delayacct_setup_disable(char *str) 28static int __init delayacct_setup_disable(char *str)
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 1ce23d3d839..89e5e8aa4c3 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o 5obj-y := core.o ring_buffer.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9efe7108cca..0f857782d06 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,8 @@
36#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h> 37#include <linux/hw_breakpoint.h>
38 38
39#include "internal.h"
40
39#include <asm/irq_regs.h> 41#include <asm/irq_regs.h>
40 42
41struct remote_function_call { 43struct remote_function_call {
@@ -200,6 +202,22 @@ __get_cpu_context(struct perf_event_context *ctx)
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 202 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201} 203}
202 204
205static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
206 struct perf_event_context *ctx)
207{
208 raw_spin_lock(&cpuctx->ctx.lock);
209 if (ctx)
210 raw_spin_lock(&ctx->lock);
211}
212
213static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
214 struct perf_event_context *ctx)
215{
216 if (ctx)
217 raw_spin_unlock(&ctx->lock);
218 raw_spin_unlock(&cpuctx->ctx.lock);
219}
220
203#ifdef CONFIG_CGROUP_PERF 221#ifdef CONFIG_CGROUP_PERF
204 222
205/* 223/*
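The new perf_ctx_lock()/perf_ctx_unlock() pair fixes the nesting order: the per-CPU context lock is always taken before the optional task context lock and released in reverse. Below is a minimal userspace model of that ordering discipline; the struct and helper names are made up for illustration, not taken from the kernel.

#include <pthread.h>
#include <stddef.h>

struct cpu_ctx  { pthread_mutex_t lock; };
struct task_ctx { pthread_mutex_t lock; };

/* Always take the CPU-level lock first, then the optional task-level lock. */
static void ctx_lock_pair(struct cpu_ctx *cpu, struct task_ctx *task)
{
        pthread_mutex_lock(&cpu->lock);
        if (task)
                pthread_mutex_lock(&task->lock);
}

/* Release in the reverse order so the nesting can never invert. */
static void ctx_unlock_pair(struct cpu_ctx *cpu, struct task_ctx *task)
{
        if (task)
                pthread_mutex_unlock(&task->lock);
        pthread_mutex_unlock(&cpu->lock);
}

int main(void)
{
        struct cpu_ctx cpu   = { PTHREAD_MUTEX_INITIALIZER };
        struct task_ctx task = { PTHREAD_MUTEX_INITIALIZER };

        ctx_lock_pair(&cpu, &task);
        ctx_unlock_pair(&cpu, &task);
        ctx_lock_pair(&cpu, NULL);      /* the task context may be absent */
        ctx_unlock_pair(&cpu, NULL);
        return 0;
}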
@@ -340,11 +358,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
340 rcu_read_lock(); 358 rcu_read_lock();
341 359
342 list_for_each_entry_rcu(pmu, &pmus, entry) { 360 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 361 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345 362
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /* 363 /*
349 * perf_cgroup_events says at least one 364 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events. 365 * context on this CPU has cgroup events.
@@ -353,6 +368,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
353 * events for a context. 368 * events for a context.
354 */ 369 */
355 if (cpuctx->ctx.nr_cgroups > 0) { 370 if (cpuctx->ctx.nr_cgroups > 0) {
371 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
372 perf_pmu_disable(cpuctx->ctx.pmu);
356 373
357 if (mode & PERF_CGROUP_SWOUT) { 374 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 375 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -372,9 +389,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
372 cpuctx->cgrp = perf_cgroup_from_task(task); 389 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 390 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 } 391 }
392 perf_pmu_enable(cpuctx->ctx.pmu);
393 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
375 } 394 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 } 395 }
379 396
380 rcu_read_unlock(); 397 rcu_read_unlock();
@@ -382,14 +399,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
382 local_irq_restore(flags); 399 local_irq_restore(flags);
383} 400}
384 401
385static inline void perf_cgroup_sched_out(struct task_struct *task) 402static inline void perf_cgroup_sched_out(struct task_struct *task,
403 struct task_struct *next)
386{ 404{
387 perf_cgroup_switch(task, PERF_CGROUP_SWOUT); 405 struct perf_cgroup *cgrp1;
406 struct perf_cgroup *cgrp2 = NULL;
407
408 /*
409 * we come here when we know perf_cgroup_events > 0
410 */
411 cgrp1 = perf_cgroup_from_task(task);
412
413 /*
414 * next is NULL when called from perf_event_enable_on_exec()
415 * that will systematically cause a cgroup_switch()
416 */
417 if (next)
418 cgrp2 = perf_cgroup_from_task(next);
419
420 /*
421 * only schedule out current cgroup events if we know
422 * that we are switching to a different cgroup. Otherwise,
 423 * do not touch the cgroup events.
424 */
425 if (cgrp1 != cgrp2)
426 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
388} 427}
389 428
390static inline void perf_cgroup_sched_in(struct task_struct *task) 429static inline void perf_cgroup_sched_in(struct task_struct *prev,
430 struct task_struct *task)
391{ 431{
392 perf_cgroup_switch(task, PERF_CGROUP_SWIN); 432 struct perf_cgroup *cgrp1;
433 struct perf_cgroup *cgrp2 = NULL;
434
435 /*
436 * we come here when we know perf_cgroup_events > 0
437 */
438 cgrp1 = perf_cgroup_from_task(task);
439
440 /* prev can never be NULL */
441 cgrp2 = perf_cgroup_from_task(prev);
442
443 /*
444 * only need to schedule in cgroup events if we are changing
445 * cgroup during ctxsw. Cgroup events were not scheduled
 446 * out during the ctxsw-out if that was not the case.
447 */
448 if (cgrp1 != cgrp2)
449 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
393} 450}
394 451
395static inline int perf_cgroup_connect(int fd, struct perf_event *event, 452static inline int perf_cgroup_connect(int fd, struct perf_event *event,
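The reworked sched-out/sched-in hooks above only pay for a full perf_cgroup_switch() when the outgoing and incoming tasks actually belong to different perf cgroups. A hedged sketch of that comparison, with a stand-in for perf_cgroup_from_task():

#include <stdbool.h>
#include <stdio.h>

struct cgroup { int id; };
struct task   { struct cgroup *cgrp; };

/* Stand-in for perf_cgroup_from_task(); next may be NULL (enable-on-exec path). */
static struct cgroup *cgroup_of(struct task *t)
{
        return t ? t->cgrp : NULL;
}

/* Only trigger the expensive switch when the cgroup really changes. */
static bool need_cgroup_switch(struct task *prev, struct task *next)
{
        return cgroup_of(prev) != cgroup_of(next);
}

int main(void)
{
        struct cgroup a = { 1 }, b = { 2 };
        struct task t1 = { &a }, t2 = { &a }, t3 = { &b };

        printf("%d %d\n", need_cgroup_switch(&t1, &t2),  /* 0: same cgroup, skip */
                          need_cgroup_switch(&t1, &t3)); /* 1: different, switch */
        return 0;
}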
@@ -501,11 +558,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
501{ 558{
502} 559}
503 560
504static inline void perf_cgroup_sched_out(struct task_struct *task) 561static inline void perf_cgroup_sched_out(struct task_struct *task,
562 struct task_struct *next)
505{ 563{
506} 564}
507 565
508static inline void perf_cgroup_sched_in(struct task_struct *task) 566static inline void perf_cgroup_sched_in(struct task_struct *prev,
567 struct task_struct *task)
509{ 568{
510} 569}
511 570
@@ -731,6 +790,7 @@ static u64 perf_event_time(struct perf_event *event)
731 790
732/* 791/*
733 * Update the total_time_enabled and total_time_running fields for a event. 792 * Update the total_time_enabled and total_time_running fields for a event.
793 * The caller of this function needs to hold the ctx->lock.
734 */ 794 */
735static void update_event_times(struct perf_event *event) 795static void update_event_times(struct perf_event *event)
736{ 796{
@@ -1105,6 +1165,10 @@ static int __perf_remove_from_context(void *info)
1105 raw_spin_lock(&ctx->lock); 1165 raw_spin_lock(&ctx->lock);
1106 event_sched_out(event, cpuctx, ctx); 1166 event_sched_out(event, cpuctx, ctx);
1107 list_del_event(event, ctx); 1167 list_del_event(event, ctx);
1168 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1169 ctx->is_active = 0;
1170 cpuctx->task_ctx = NULL;
1171 }
1108 raw_spin_unlock(&ctx->lock); 1172 raw_spin_unlock(&ctx->lock);
1109 1173
1110 return 0; 1174 return 0;
@@ -1454,8 +1518,24 @@ static void add_event_to_ctx(struct perf_event *event,
1454 event->tstamp_stopped = tstamp; 1518 event->tstamp_stopped = tstamp;
1455} 1519}
1456 1520
1457static void perf_event_context_sched_in(struct perf_event_context *ctx, 1521static void task_ctx_sched_out(struct perf_event_context *ctx);
1458 struct task_struct *tsk); 1522static void
1523ctx_sched_in(struct perf_event_context *ctx,
1524 struct perf_cpu_context *cpuctx,
1525 enum event_type_t event_type,
1526 struct task_struct *task);
1527
1528static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1529 struct perf_event_context *ctx,
1530 struct task_struct *task)
1531{
1532 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1533 if (ctx)
1534 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1535 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1536 if (ctx)
1537 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1538}
1459 1539
1460/* 1540/*
1461 * Cross CPU call to install and enable a performance event 1541 * Cross CPU call to install and enable a performance event
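The new perf_event_sched_in() helper encodes the scheduling priority in one place: CPU pinned groups, then task pinned, then CPU flexible, then task flexible. The sketch below only models that ordering; the event classes and callback are illustrative, not the kernel API.

#include <stdio.h>

enum evt_class { EVT_PINNED, EVT_FLEXIBLE };

static void class_sched_in(const char *ctx_name, enum evt_class c)
{
        printf("sched in %s %s\n", ctx_name,
               c == EVT_PINNED ? "pinned" : "flexible");
}

/* Pinned groups of both contexts go first so they get the best chance of
 * claiming counters; flexible groups only fill what is left over. */
static void sched_in_both(int have_task_ctx)
{
        class_sched_in("cpu", EVT_PINNED);
        if (have_task_ctx)
                class_sched_in("task", EVT_PINNED);
        class_sched_in("cpu", EVT_FLEXIBLE);
        if (have_task_ctx)
                class_sched_in("task", EVT_FLEXIBLE);
}

int main(void)
{
        sched_in_both(1);
        return 0;
}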
@@ -1466,20 +1546,37 @@ static int __perf_install_in_context(void *info)
1466{ 1546{
1467 struct perf_event *event = info; 1547 struct perf_event *event = info;
1468 struct perf_event_context *ctx = event->ctx; 1548 struct perf_event_context *ctx = event->ctx;
1469 struct perf_event *leader = event->group_leader;
1470 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1549 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1471 int err; 1550 struct perf_event_context *task_ctx = cpuctx->task_ctx;
1551 struct task_struct *task = current;
1552
1553 perf_ctx_lock(cpuctx, task_ctx);
1554 perf_pmu_disable(cpuctx->ctx.pmu);
1472 1555
1473 /* 1556 /*
1474 * In case we're installing a new context to an already running task, 1557 * If there was an active task_ctx schedule it out.
1475 * could also happen before perf_event_task_sched_in() on architectures
1476 * which do context switches with IRQs enabled.
1477 */ 1558 */
1478 if (ctx->task && !cpuctx->task_ctx) 1559 if (task_ctx)
1479 perf_event_context_sched_in(ctx, ctx->task); 1560 task_ctx_sched_out(task_ctx);
1561
1562 /*
1563 * If the context we're installing events in is not the
1564 * active task_ctx, flip them.
1565 */
1566 if (ctx->task && task_ctx != ctx) {
1567 if (task_ctx)
1568 raw_spin_unlock(&task_ctx->lock);
1569 raw_spin_lock(&ctx->lock);
1570 task_ctx = ctx;
1571 }
1572
1573 if (task_ctx) {
1574 cpuctx->task_ctx = task_ctx;
1575 task = task_ctx->task;
1576 }
1577
1578 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1480 1579
1481 raw_spin_lock(&ctx->lock);
1482 ctx->is_active = 1;
1483 update_context_time(ctx); 1580 update_context_time(ctx);
1484 /* 1581 /*
1485 * update cgrp time only if current cgrp 1582 * update cgrp time only if current cgrp
@@ -1490,43 +1587,13 @@ static int __perf_install_in_context(void *info)
1490 1587
1491 add_event_to_ctx(event, ctx); 1588 add_event_to_ctx(event, ctx);
1492 1589
1493 if (!event_filter_match(event))
1494 goto unlock;
1495
1496 /* 1590 /*
1497 * Don't put the event on if it is disabled or if 1591 * Schedule everything back in
1498 * it is in a group and the group isn't on.
1499 */ 1592 */
1500 if (event->state != PERF_EVENT_STATE_INACTIVE || 1593 perf_event_sched_in(cpuctx, task_ctx, task);
1501 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
1502 goto unlock;
1503 1594
1504 /* 1595 perf_pmu_enable(cpuctx->ctx.pmu);
1505 * An exclusive event can't go on if there are already active 1596 perf_ctx_unlock(cpuctx, task_ctx);
1506 * hardware events, and no hardware event can go on if there
1507 * is already an exclusive event on.
1508 */
1509 if (!group_can_go_on(event, cpuctx, 1))
1510 err = -EEXIST;
1511 else
1512 err = event_sched_in(event, cpuctx, ctx);
1513
1514 if (err) {
1515 /*
1516 * This event couldn't go on. If it is in a group
1517 * then we have to pull the whole group off.
1518 * If the event group is pinned then put it in error state.
1519 */
1520 if (leader != event)
1521 group_sched_out(leader, cpuctx, ctx);
1522 if (leader->attr.pinned) {
1523 update_group_times(leader);
1524 leader->state = PERF_EVENT_STATE_ERROR;
1525 }
1526 }
1527
1528unlock:
1529 raw_spin_unlock(&ctx->lock);
1530 1597
1531 return 0; 1598 return 0;
1532} 1599}
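Rather than trying to squeeze the new event into a live context, __perf_install_in_context() now quiesces everything, adds the event, and lets the normal sched-in path decide what fits. A userspace caricature of that quiesce/mutate/resume pattern, under obviously simplified assumptions:

#include <stdio.h>

#define MAX_EVENTS 8

struct ctx {
        int nr_events;
        int events[MAX_EVENTS];
        int active;             /* 1 while the context is scheduled in */
};

static void ctx_sched_out(struct ctx *c) { c->active = 0; }
static void ctx_sched_in(struct ctx *c)  { c->active = 1; }

/* Mutate only while the context is quiesced, then resume it as a whole. */
static void install_event(struct ctx *c, int ev)
{
        ctx_sched_out(c);
        if (c->nr_events < MAX_EVENTS)
                c->events[c->nr_events++] = ev;
        ctx_sched_in(c);
}

int main(void)
{
        struct ctx c = { 0 };

        install_event(&c, 42);
        printf("events=%d active=%d\n", c.nr_events, c.active);
        return 0;
}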
@@ -1739,7 +1806,7 @@ out:
1739 raw_spin_unlock_irq(&ctx->lock); 1806 raw_spin_unlock_irq(&ctx->lock);
1740} 1807}
1741 1808
1742static int perf_event_refresh(struct perf_event *event, int refresh) 1809int perf_event_refresh(struct perf_event *event, int refresh)
1743{ 1810{
1744 /* 1811 /*
1745 * not supported on inherited events 1812 * not supported on inherited events
@@ -1752,36 +1819,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1752 1819
1753 return 0; 1820 return 0;
1754} 1821}
1822EXPORT_SYMBOL_GPL(perf_event_refresh);
1755 1823
1756static void ctx_sched_out(struct perf_event_context *ctx, 1824static void ctx_sched_out(struct perf_event_context *ctx,
1757 struct perf_cpu_context *cpuctx, 1825 struct perf_cpu_context *cpuctx,
1758 enum event_type_t event_type) 1826 enum event_type_t event_type)
1759{ 1827{
1760 struct perf_event *event; 1828 struct perf_event *event;
1829 int is_active = ctx->is_active;
1761 1830
1762 raw_spin_lock(&ctx->lock); 1831 ctx->is_active &= ~event_type;
1763 perf_pmu_disable(ctx->pmu);
1764 ctx->is_active = 0;
1765 if (likely(!ctx->nr_events)) 1832 if (likely(!ctx->nr_events))
1766 goto out; 1833 return;
1834
1767 update_context_time(ctx); 1835 update_context_time(ctx);
1768 update_cgrp_time_from_cpuctx(cpuctx); 1836 update_cgrp_time_from_cpuctx(cpuctx);
1769
1770 if (!ctx->nr_active) 1837 if (!ctx->nr_active)
1771 goto out; 1838 return;
1772 1839
1773 if (event_type & EVENT_PINNED) { 1840 perf_pmu_disable(ctx->pmu);
1841 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
1774 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1842 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1775 group_sched_out(event, cpuctx, ctx); 1843 group_sched_out(event, cpuctx, ctx);
1776 } 1844 }
1777 1845
1778 if (event_type & EVENT_FLEXIBLE) { 1846 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
1779 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1847 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1780 group_sched_out(event, cpuctx, ctx); 1848 group_sched_out(event, cpuctx, ctx);
1781 } 1849 }
1782out:
1783 perf_pmu_enable(ctx->pmu); 1850 perf_pmu_enable(ctx->pmu);
1784 raw_spin_unlock(&ctx->lock);
1785} 1851}
1786 1852
1787/* 1853/*
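ctx->is_active is now a bitmask of event classes instead of a boolean, so pinned and flexible groups can be scheduled out (and, in the matching ctx_sched_in() change further down, scheduled in) independently and at most once. A standalone model of that bookkeeping, with illustrative flag values:

#include <stdio.h>

#define EVENT_PINNED   0x1
#define EVENT_FLEXIBLE 0x2
#define EVENT_ALL      (EVENT_PINNED | EVENT_FLEXIBLE)

static int is_active;   /* which event classes are currently scheduled in */

static void sched_out(int event_type)
{
        int was_active = is_active;

        is_active &= ~event_type;
        if ((was_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
                printf("sched out pinned\n");
        if ((was_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
                printf("sched out flexible\n");
}

static void sched_in(int event_type)
{
        int was_active = is_active;

        is_active |= event_type;
        if (!(was_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
                printf("sched in pinned\n");
        if (!(was_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
                printf("sched in flexible\n");
}

int main(void)
{
        sched_in(EVENT_ALL);
        sched_out(EVENT_FLEXIBLE);      /* pinned stays in */
        sched_in(EVENT_ALL);            /* only flexible comes back */
        return 0;
}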
@@ -1929,8 +1995,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1929 rcu_read_unlock(); 1995 rcu_read_unlock();
1930 1996
1931 if (do_switch) { 1997 if (do_switch) {
1998 raw_spin_lock(&ctx->lock);
1932 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 1999 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1933 cpuctx->task_ctx = NULL; 2000 cpuctx->task_ctx = NULL;
2001 raw_spin_unlock(&ctx->lock);
1934 } 2002 }
1935} 2003}
1936 2004
@@ -1962,11 +2030,10 @@ void __perf_event_task_sched_out(struct task_struct *task,
1962 * cgroup event are system-wide mode only 2030 * cgroup event are system-wide mode only
1963 */ 2031 */
1964 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2032 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1965 perf_cgroup_sched_out(task); 2033 perf_cgroup_sched_out(task, next);
1966} 2034}
1967 2035
1968static void task_ctx_sched_out(struct perf_event_context *ctx, 2036static void task_ctx_sched_out(struct perf_event_context *ctx)
1969 enum event_type_t event_type)
1970{ 2037{
1971 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2038 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1972 2039
@@ -1976,7 +2043,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1976 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2043 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1977 return; 2044 return;
1978 2045
1979 ctx_sched_out(ctx, cpuctx, event_type); 2046 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1980 cpuctx->task_ctx = NULL; 2047 cpuctx->task_ctx = NULL;
1981} 2048}
1982 2049
@@ -2055,11 +2122,11 @@ ctx_sched_in(struct perf_event_context *ctx,
2055 struct task_struct *task) 2122 struct task_struct *task)
2056{ 2123{
2057 u64 now; 2124 u64 now;
2125 int is_active = ctx->is_active;
2058 2126
2059 raw_spin_lock(&ctx->lock); 2127 ctx->is_active |= event_type;
2060 ctx->is_active = 1;
2061 if (likely(!ctx->nr_events)) 2128 if (likely(!ctx->nr_events))
2062 goto out; 2129 return;
2063 2130
2064 now = perf_clock(); 2131 now = perf_clock();
2065 ctx->timestamp = now; 2132 ctx->timestamp = now;
@@ -2068,15 +2135,12 @@ ctx_sched_in(struct perf_event_context *ctx,
2068 * First go through the list and put on any pinned groups 2135 * First go through the list and put on any pinned groups
2069 * in order to give them the best chance of going on. 2136 * in order to give them the best chance of going on.
2070 */ 2137 */
2071 if (event_type & EVENT_PINNED) 2138 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2072 ctx_pinned_sched_in(ctx, cpuctx); 2139 ctx_pinned_sched_in(ctx, cpuctx);
2073 2140
2074 /* Then walk through the lower prio flexible groups */ 2141 /* Then walk through the lower prio flexible groups */
2075 if (event_type & EVENT_FLEXIBLE) 2142 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2076 ctx_flexible_sched_in(ctx, cpuctx); 2143 ctx_flexible_sched_in(ctx, cpuctx);
2077
2078out:
2079 raw_spin_unlock(&ctx->lock);
2080} 2144}
2081 2145
2082static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2146static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -2088,19 +2152,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2088 ctx_sched_in(ctx, cpuctx, event_type, task); 2152 ctx_sched_in(ctx, cpuctx, event_type, task);
2089} 2153}
2090 2154
2091static void task_ctx_sched_in(struct perf_event_context *ctx,
2092 enum event_type_t event_type)
2093{
2094 struct perf_cpu_context *cpuctx;
2095
2096 cpuctx = __get_cpu_context(ctx);
2097 if (cpuctx->task_ctx == ctx)
2098 return;
2099
2100 ctx_sched_in(ctx, cpuctx, event_type, NULL);
2101 cpuctx->task_ctx = ctx;
2102}
2103
2104static void perf_event_context_sched_in(struct perf_event_context *ctx, 2155static void perf_event_context_sched_in(struct perf_event_context *ctx,
2105 struct task_struct *task) 2156 struct task_struct *task)
2106{ 2157{
@@ -2110,6 +2161,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2110 if (cpuctx->task_ctx == ctx) 2161 if (cpuctx->task_ctx == ctx)
2111 return; 2162 return;
2112 2163
2164 perf_ctx_lock(cpuctx, ctx);
2113 perf_pmu_disable(ctx->pmu); 2165 perf_pmu_disable(ctx->pmu);
2114 /* 2166 /*
2115 * We want to keep the following priority order: 2167 * We want to keep the following priority order:
@@ -2118,18 +2170,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2118 */ 2170 */
2119 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2171 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2120 2172
2121 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); 2173 perf_event_sched_in(cpuctx, ctx, task);
2122 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2123 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2124 2174
2125 cpuctx->task_ctx = ctx; 2175 cpuctx->task_ctx = ctx;
2126 2176
2177 perf_pmu_enable(ctx->pmu);
2178 perf_ctx_unlock(cpuctx, ctx);
2179
2127 /* 2180 /*
2128 * Since these rotations are per-cpu, we need to ensure the 2181 * Since these rotations are per-cpu, we need to ensure the
2129 * cpu-context we got scheduled on is actually rotating. 2182 * cpu-context we got scheduled on is actually rotating.
2130 */ 2183 */
2131 perf_pmu_rotate_start(ctx->pmu); 2184 perf_pmu_rotate_start(ctx->pmu);
2132 perf_pmu_enable(ctx->pmu);
2133} 2185}
2134 2186
2135/* 2187/*
@@ -2143,7 +2195,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2143 * accessing the event control register. If a NMI hits, then it will 2195 * accessing the event control register. If a NMI hits, then it will
2144 * keep the event running. 2196 * keep the event running.
2145 */ 2197 */
2146void __perf_event_task_sched_in(struct task_struct *task) 2198void __perf_event_task_sched_in(struct task_struct *prev,
2199 struct task_struct *task)
2147{ 2200{
2148 struct perf_event_context *ctx; 2201 struct perf_event_context *ctx;
2149 int ctxn; 2202 int ctxn;
@@ -2161,7 +2214,7 @@ void __perf_event_task_sched_in(struct task_struct *task)
2161 * cgroup event are system-wide mode only 2214 * cgroup event are system-wide mode only
2162 */ 2215 */
2163 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2216 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2164 perf_cgroup_sched_in(task); 2217 perf_cgroup_sched_in(prev, task);
2165} 2218}
2166 2219
2167static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2220static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2269,7 +2322,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2269 u64 interrupts, now; 2322 u64 interrupts, now;
2270 s64 delta; 2323 s64 delta;
2271 2324
2272 raw_spin_lock(&ctx->lock);
2273 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2325 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2274 if (event->state != PERF_EVENT_STATE_ACTIVE) 2326 if (event->state != PERF_EVENT_STATE_ACTIVE)
2275 continue; 2327 continue;
@@ -2301,7 +2353,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2301 if (delta > 0) 2353 if (delta > 0)
2302 perf_adjust_period(event, period, delta); 2354 perf_adjust_period(event, period, delta);
2303 } 2355 }
2304 raw_spin_unlock(&ctx->lock);
2305} 2356}
2306 2357
2307/* 2358/*
@@ -2309,16 +2360,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2309 */ 2360 */
2310static void rotate_ctx(struct perf_event_context *ctx) 2361static void rotate_ctx(struct perf_event_context *ctx)
2311{ 2362{
2312 raw_spin_lock(&ctx->lock);
2313
2314 /* 2363 /*
2315 * Rotate the first entry last of non-pinned groups. Rotation might be 2364 * Rotate the first entry last of non-pinned groups. Rotation might be
2316 * disabled by the inheritance code. 2365 * disabled by the inheritance code.
2317 */ 2366 */
2318 if (!ctx->rotate_disable) 2367 if (!ctx->rotate_disable)
2319 list_rotate_left(&ctx->flexible_groups); 2368 list_rotate_left(&ctx->flexible_groups);
2320
2321 raw_spin_unlock(&ctx->lock);
2322} 2369}
2323 2370
2324/* 2371/*
@@ -2345,6 +2392,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2345 rotate = 1; 2392 rotate = 1;
2346 } 2393 }
2347 2394
2395 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2348 perf_pmu_disable(cpuctx->ctx.pmu); 2396 perf_pmu_disable(cpuctx->ctx.pmu);
2349 perf_ctx_adjust_freq(&cpuctx->ctx, interval); 2397 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2350 if (ctx) 2398 if (ctx)
@@ -2355,21 +2403,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2355 2403
2356 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2404 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2357 if (ctx) 2405 if (ctx)
2358 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 2406 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2359 2407
2360 rotate_ctx(&cpuctx->ctx); 2408 rotate_ctx(&cpuctx->ctx);
2361 if (ctx) 2409 if (ctx)
2362 rotate_ctx(ctx); 2410 rotate_ctx(ctx);
2363 2411
2364 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); 2412 perf_event_sched_in(cpuctx, ctx, current);
2365 if (ctx)
2366 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
2367 2413
2368done: 2414done:
2369 if (remove) 2415 if (remove)
2370 list_del_init(&cpuctx->rotation_list); 2416 list_del_init(&cpuctx->rotation_list);
2371 2417
2372 perf_pmu_enable(cpuctx->ctx.pmu); 2418 perf_pmu_enable(cpuctx->ctx.pmu);
2419 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2373} 2420}
2374 2421
2375void perf_event_task_tick(void) 2422void perf_event_task_tick(void)
@@ -2423,10 +2470,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2423 * ctxswin cgroup events which are already scheduled 2470 * ctxswin cgroup events which are already scheduled
2424 * in. 2471 * in.
2425 */ 2472 */
2426 perf_cgroup_sched_out(current); 2473 perf_cgroup_sched_out(current, NULL);
2427 task_ctx_sched_out(ctx, EVENT_ALL);
2428 2474
2429 raw_spin_lock(&ctx->lock); 2475 raw_spin_lock(&ctx->lock);
2476 task_ctx_sched_out(ctx);
2430 2477
2431 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2478 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2432 ret = event_enable_on_exec(event, ctx); 2479 ret = event_enable_on_exec(event, ctx);
@@ -2835,16 +2882,12 @@ retry:
2835 unclone_ctx(ctx); 2882 unclone_ctx(ctx);
2836 ++ctx->pin_count; 2883 ++ctx->pin_count;
2837 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2884 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2838 } 2885 } else {
2839
2840 if (!ctx) {
2841 ctx = alloc_perf_context(pmu, task); 2886 ctx = alloc_perf_context(pmu, task);
2842 err = -ENOMEM; 2887 err = -ENOMEM;
2843 if (!ctx) 2888 if (!ctx)
2844 goto errout; 2889 goto errout;
2845 2890
2846 get_ctx(ctx);
2847
2848 err = 0; 2891 err = 0;
2849 mutex_lock(&task->perf_event_mutex); 2892 mutex_lock(&task->perf_event_mutex);
2850 /* 2893 /*
@@ -2856,14 +2899,14 @@ retry:
2856 else if (task->perf_event_ctxp[ctxn]) 2899 else if (task->perf_event_ctxp[ctxn])
2857 err = -EAGAIN; 2900 err = -EAGAIN;
2858 else { 2901 else {
2902 get_ctx(ctx);
2859 ++ctx->pin_count; 2903 ++ctx->pin_count;
2860 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2904 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2861 } 2905 }
2862 mutex_unlock(&task->perf_event_mutex); 2906 mutex_unlock(&task->perf_event_mutex);
2863 2907
2864 if (unlikely(err)) { 2908 if (unlikely(err)) {
2865 put_task_struct(task); 2909 put_ctx(ctx);
2866 kfree(ctx);
2867 2910
2868 if (err == -EAGAIN) 2911 if (err == -EAGAIN)
2869 goto retry; 2912 goto retry;
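find_get_context() now takes its extra context reference only once installation under perf_event_mutex has succeeded, and uses put_ctx() rather than a bare kfree() on failure so the refcounting stays balanced. A much-simplified refcount sketch of that allocate/install-or-release flow (the helpers are hypothetical, not the kernel ones):

#include <stdio.h>
#include <stdlib.h>

struct ctx { int refcount; };

static struct ctx *ctx_alloc(void)
{
        struct ctx *c = calloc(1, sizeof(*c));

        if (c)
                c->refcount = 1;        /* the allocation holds the first ref */
        return c;
}

static void ctx_get(struct ctx *c) { c->refcount++; }

static void ctx_put(struct ctx *c)
{
        if (--c->refcount == 0)
                free(c);
}

/* slot plays the role of task->perf_event_ctxp[] */
static int install(struct ctx **slot, struct ctx *c)
{
        if (*slot)
                return -1;              /* somebody else raced us in */
        ctx_get(c);                     /* reference now owned by the slot */
        *slot = c;
        return 0;
}

int main(void)
{
        struct ctx *slot = NULL;
        struct ctx *c = ctx_alloc();

        if (!c)
                return 1;
        if (install(&slot, c) == 0)
                printf("installed, refcount=%d\n", c->refcount);    /* 2 */
        ctx_put(c);                     /* drop the allocation ref either way */
        if (slot)
                ctx_put(slot);          /* teardown: drop the slot's ref */
        return 0;
}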
@@ -2890,7 +2933,7 @@ static void free_event_rcu(struct rcu_head *head)
2890 kfree(event); 2933 kfree(event);
2891} 2934}
2892 2935
2893static void perf_buffer_put(struct perf_buffer *buffer); 2936static void ring_buffer_put(struct ring_buffer *rb);
2894 2937
2895static void free_event(struct perf_event *event) 2938static void free_event(struct perf_event *event)
2896{ 2939{
@@ -2913,9 +2956,9 @@ static void free_event(struct perf_event *event)
2913 } 2956 }
2914 } 2957 }
2915 2958
2916 if (event->buffer) { 2959 if (event->rb) {
2917 perf_buffer_put(event->buffer); 2960 ring_buffer_put(event->rb);
2918 event->buffer = NULL; 2961 event->rb = NULL;
2919 } 2962 }
2920 2963
2921 if (is_cgroup_event(event)) 2964 if (is_cgroup_event(event))
@@ -2934,12 +2977,6 @@ int perf_event_release_kernel(struct perf_event *event)
2934{ 2977{
2935 struct perf_event_context *ctx = event->ctx; 2978 struct perf_event_context *ctx = event->ctx;
2936 2979
2937 /*
2938 * Remove from the PMU, can't get re-enabled since we got
2939 * here because the last ref went.
2940 */
2941 perf_event_disable(event);
2942
2943 WARN_ON_ONCE(ctx->parent_ctx); 2980 WARN_ON_ONCE(ctx->parent_ctx);
2944 /* 2981 /*
2945 * There are two ways this annotation is useful: 2982 * There are two ways this annotation is useful:
@@ -2956,8 +2993,8 @@ int perf_event_release_kernel(struct perf_event *event)
2956 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 2993 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2957 raw_spin_lock_irq(&ctx->lock); 2994 raw_spin_lock_irq(&ctx->lock);
2958 perf_group_detach(event); 2995 perf_group_detach(event);
2959 list_del_event(event, ctx);
2960 raw_spin_unlock_irq(&ctx->lock); 2996 raw_spin_unlock_irq(&ctx->lock);
2997 perf_remove_from_context(event);
2961 mutex_unlock(&ctx->mutex); 2998 mutex_unlock(&ctx->mutex);
2962 2999
2963 free_event(event); 3000 free_event(event);
@@ -3149,13 +3186,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3149static unsigned int perf_poll(struct file *file, poll_table *wait) 3186static unsigned int perf_poll(struct file *file, poll_table *wait)
3150{ 3187{
3151 struct perf_event *event = file->private_data; 3188 struct perf_event *event = file->private_data;
3152 struct perf_buffer *buffer; 3189 struct ring_buffer *rb;
3153 unsigned int events = POLL_HUP; 3190 unsigned int events = POLL_HUP;
3154 3191
3155 rcu_read_lock(); 3192 rcu_read_lock();
3156 buffer = rcu_dereference(event->buffer); 3193 rb = rcu_dereference(event->rb);
3157 if (buffer) 3194 if (rb)
3158 events = atomic_xchg(&buffer->poll, 0); 3195 events = atomic_xchg(&rb->poll, 0);
3159 rcu_read_unlock(); 3196 rcu_read_unlock();
3160 3197
3161 poll_wait(file, &event->waitq, wait); 3198 poll_wait(file, &event->waitq, wait);
@@ -3358,6 +3395,18 @@ static int perf_event_index(struct perf_event *event)
3358 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; 3395 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3359} 3396}
3360 3397
3398static void calc_timer_values(struct perf_event *event,
3399 u64 *enabled,
3400 u64 *running)
3401{
3402 u64 now, ctx_time;
3403
3404 now = perf_clock();
3405 ctx_time = event->shadow_ctx_time + now;
3406 *enabled = ctx_time - event->tstamp_enabled;
3407 *running = ctx_time - event->tstamp_running;
3408}
3409
3361/* 3410/*
3362 * Callers need to ensure there can be no nesting of this function, otherwise 3411 * Callers need to ensure there can be no nesting of this function, otherwise
3363 * the seqlock logic goes bad. We can not serialize this because the arch 3412 * the seqlock logic goes bad. We can not serialize this because the arch
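calc_timer_values() derives the enabled/running times from the shadow_ctx_time snapshot plus the current clock, so it is safe in NMI context where ctx->lock cannot be taken. A minimal arithmetic model, with a faked clock in place of perf_clock():

#include <stdio.h>
#include <stdint.h>

struct event {
        uint64_t shadow_ctx_time;       /* ctx time base captured at sched-in */
        uint64_t tstamp_enabled;        /* ctx time when the event was enabled */
        uint64_t tstamp_running;        /* ctx time when the event last started */
};

static uint64_t fake_clock = 1000;      /* stand-in for perf_clock() */

/* Lock-free: only reads snapshot fields plus the clock. */
static void calc_timer_values(const struct event *e,
                              uint64_t *enabled, uint64_t *running)
{
        uint64_t ctx_time = e->shadow_ctx_time + fake_clock;

        *enabled = ctx_time - e->tstamp_enabled;
        *running = ctx_time - e->tstamp_running;
}

int main(void)
{
        struct event e = { .shadow_ctx_time = 50,
                           .tstamp_enabled = 200, .tstamp_running = 900 };
        uint64_t en, run;

        calc_timer_values(&e, &en, &run);
        printf("enabled=%llu running=%llu\n",
               (unsigned long long)en, (unsigned long long)run);
        return 0;
}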
@@ -3366,14 +3415,25 @@ static int perf_event_index(struct perf_event *event)
3366void perf_event_update_userpage(struct perf_event *event) 3415void perf_event_update_userpage(struct perf_event *event)
3367{ 3416{
3368 struct perf_event_mmap_page *userpg; 3417 struct perf_event_mmap_page *userpg;
3369 struct perf_buffer *buffer; 3418 struct ring_buffer *rb;
3419 u64 enabled, running;
3370 3420
3371 rcu_read_lock(); 3421 rcu_read_lock();
3372 buffer = rcu_dereference(event->buffer); 3422 /*
3373 if (!buffer) 3423 * compute total_time_enabled, total_time_running
3424 * based on snapshot values taken when the event
3425 * was last scheduled in.
3426 *
 3427 * we cannot simply call update_context_time()
 3428 * because of locking issues, as we can be called in
3429 * NMI context
3430 */
3431 calc_timer_values(event, &enabled, &running);
3432 rb = rcu_dereference(event->rb);
3433 if (!rb)
3374 goto unlock; 3434 goto unlock;
3375 3435
3376 userpg = buffer->user_page; 3436 userpg = rb->user_page;
3377 3437
3378 /* 3438 /*
3379 * Disable preemption so as to not let the corresponding user-space 3439 * Disable preemption so as to not let the corresponding user-space
@@ -3387,10 +3447,10 @@ void perf_event_update_userpage(struct perf_event *event)
3387 if (event->state == PERF_EVENT_STATE_ACTIVE) 3447 if (event->state == PERF_EVENT_STATE_ACTIVE)
3388 userpg->offset -= local64_read(&event->hw.prev_count); 3448 userpg->offset -= local64_read(&event->hw.prev_count);
3389 3449
3390 userpg->time_enabled = event->total_time_enabled + 3450 userpg->time_enabled = enabled +
3391 atomic64_read(&event->child_total_time_enabled); 3451 atomic64_read(&event->child_total_time_enabled);
3392 3452
3393 userpg->time_running = event->total_time_running + 3453 userpg->time_running = running +
3394 atomic64_read(&event->child_total_time_running); 3454 atomic64_read(&event->child_total_time_running);
3395 3455
3396 barrier(); 3456 barrier();
@@ -3400,220 +3460,10 @@ unlock:
3400 rcu_read_unlock(); 3460 rcu_read_unlock();
3401} 3461}
3402 3462
3403static unsigned long perf_data_size(struct perf_buffer *buffer);
3404
3405static void
3406perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
3407{
3408 long max_size = perf_data_size(buffer);
3409
3410 if (watermark)
3411 buffer->watermark = min(max_size, watermark);
3412
3413 if (!buffer->watermark)
3414 buffer->watermark = max_size / 2;
3415
3416 if (flags & PERF_BUFFER_WRITABLE)
3417 buffer->writable = 1;
3418
3419 atomic_set(&buffer->refcount, 1);
3420}
3421
3422#ifndef CONFIG_PERF_USE_VMALLOC
3423
3424/*
3425 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
3426 */
3427
3428static struct page *
3429perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3430{
3431 if (pgoff > buffer->nr_pages)
3432 return NULL;
3433
3434 if (pgoff == 0)
3435 return virt_to_page(buffer->user_page);
3436
3437 return virt_to_page(buffer->data_pages[pgoff - 1]);
3438}
3439
3440static void *perf_mmap_alloc_page(int cpu)
3441{
3442 struct page *page;
3443 int node;
3444
3445 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
3446 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
3447 if (!page)
3448 return NULL;
3449
3450 return page_address(page);
3451}
3452
3453static struct perf_buffer *
3454perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3455{
3456 struct perf_buffer *buffer;
3457 unsigned long size;
3458 int i;
3459
3460 size = sizeof(struct perf_buffer);
3461 size += nr_pages * sizeof(void *);
3462
3463 buffer = kzalloc(size, GFP_KERNEL);
3464 if (!buffer)
3465 goto fail;
3466
3467 buffer->user_page = perf_mmap_alloc_page(cpu);
3468 if (!buffer->user_page)
3469 goto fail_user_page;
3470
3471 for (i = 0; i < nr_pages; i++) {
3472 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
3473 if (!buffer->data_pages[i])
3474 goto fail_data_pages;
3475 }
3476
3477 buffer->nr_pages = nr_pages;
3478
3479 perf_buffer_init(buffer, watermark, flags);
3480
3481 return buffer;
3482
3483fail_data_pages:
3484 for (i--; i >= 0; i--)
3485 free_page((unsigned long)buffer->data_pages[i]);
3486
3487 free_page((unsigned long)buffer->user_page);
3488
3489fail_user_page:
3490 kfree(buffer);
3491
3492fail:
3493 return NULL;
3494}
3495
3496static void perf_mmap_free_page(unsigned long addr)
3497{
3498 struct page *page = virt_to_page((void *)addr);
3499
3500 page->mapping = NULL;
3501 __free_page(page);
3502}
3503
3504static void perf_buffer_free(struct perf_buffer *buffer)
3505{
3506 int i;
3507
3508 perf_mmap_free_page((unsigned long)buffer->user_page);
3509 for (i = 0; i < buffer->nr_pages; i++)
3510 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
3511 kfree(buffer);
3512}
3513
3514static inline int page_order(struct perf_buffer *buffer)
3515{
3516 return 0;
3517}
3518
3519#else
3520
3521/*
3522 * Back perf_mmap() with vmalloc memory.
3523 *
3524 * Required for architectures that have d-cache aliasing issues.
3525 */
3526
3527static inline int page_order(struct perf_buffer *buffer)
3528{
3529 return buffer->page_order;
3530}
3531
3532static struct page *
3533perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3534{
3535 if (pgoff > (1UL << page_order(buffer)))
3536 return NULL;
3537
3538 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
3539}
3540
3541static void perf_mmap_unmark_page(void *addr)
3542{
3543 struct page *page = vmalloc_to_page(addr);
3544
3545 page->mapping = NULL;
3546}
3547
3548static void perf_buffer_free_work(struct work_struct *work)
3549{
3550 struct perf_buffer *buffer;
3551 void *base;
3552 int i, nr;
3553
3554 buffer = container_of(work, struct perf_buffer, work);
3555 nr = 1 << page_order(buffer);
3556
3557 base = buffer->user_page;
3558 for (i = 0; i < nr + 1; i++)
3559 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
3560
3561 vfree(base);
3562 kfree(buffer);
3563}
3564
3565static void perf_buffer_free(struct perf_buffer *buffer)
3566{
3567 schedule_work(&buffer->work);
3568}
3569
3570static struct perf_buffer *
3571perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3572{
3573 struct perf_buffer *buffer;
3574 unsigned long size;
3575 void *all_buf;
3576
3577 size = sizeof(struct perf_buffer);
3578 size += sizeof(void *);
3579
3580 buffer = kzalloc(size, GFP_KERNEL);
3581 if (!buffer)
3582 goto fail;
3583
3584 INIT_WORK(&buffer->work, perf_buffer_free_work);
3585
3586 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
3587 if (!all_buf)
3588 goto fail_all_buf;
3589
3590 buffer->user_page = all_buf;
3591 buffer->data_pages[0] = all_buf + PAGE_SIZE;
3592 buffer->page_order = ilog2(nr_pages);
3593 buffer->nr_pages = 1;
3594
3595 perf_buffer_init(buffer, watermark, flags);
3596
3597 return buffer;
3598
3599fail_all_buf:
3600 kfree(buffer);
3601
3602fail:
3603 return NULL;
3604}
3605
3606#endif
3607
3608static unsigned long perf_data_size(struct perf_buffer *buffer)
3609{
3610 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
3611}
3612
3613static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 3463static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3614{ 3464{
3615 struct perf_event *event = vma->vm_file->private_data; 3465 struct perf_event *event = vma->vm_file->private_data;
3616 struct perf_buffer *buffer; 3466 struct ring_buffer *rb;
3617 int ret = VM_FAULT_SIGBUS; 3467 int ret = VM_FAULT_SIGBUS;
3618 3468
3619 if (vmf->flags & FAULT_FLAG_MKWRITE) { 3469 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -3623,14 +3473,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3623 } 3473 }
3624 3474
3625 rcu_read_lock(); 3475 rcu_read_lock();
3626 buffer = rcu_dereference(event->buffer); 3476 rb = rcu_dereference(event->rb);
3627 if (!buffer) 3477 if (!rb)
3628 goto unlock; 3478 goto unlock;
3629 3479
3630 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 3480 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3631 goto unlock; 3481 goto unlock;
3632 3482
3633 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); 3483 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3634 if (!vmf->page) 3484 if (!vmf->page)
3635 goto unlock; 3485 goto unlock;
3636 3486
@@ -3645,35 +3495,35 @@ unlock:
3645 return ret; 3495 return ret;
3646} 3496}
3647 3497
3648static void perf_buffer_free_rcu(struct rcu_head *rcu_head) 3498static void rb_free_rcu(struct rcu_head *rcu_head)
3649{ 3499{
3650 struct perf_buffer *buffer; 3500 struct ring_buffer *rb;
3651 3501
3652 buffer = container_of(rcu_head, struct perf_buffer, rcu_head); 3502 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3653 perf_buffer_free(buffer); 3503 rb_free(rb);
3654} 3504}
3655 3505
3656static struct perf_buffer *perf_buffer_get(struct perf_event *event) 3506static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3657{ 3507{
3658 struct perf_buffer *buffer; 3508 struct ring_buffer *rb;
3659 3509
3660 rcu_read_lock(); 3510 rcu_read_lock();
3661 buffer = rcu_dereference(event->buffer); 3511 rb = rcu_dereference(event->rb);
3662 if (buffer) { 3512 if (rb) {
3663 if (!atomic_inc_not_zero(&buffer->refcount)) 3513 if (!atomic_inc_not_zero(&rb->refcount))
3664 buffer = NULL; 3514 rb = NULL;
3665 } 3515 }
3666 rcu_read_unlock(); 3516 rcu_read_unlock();
3667 3517
3668 return buffer; 3518 return rb;
3669} 3519}
3670 3520
3671static void perf_buffer_put(struct perf_buffer *buffer) 3521static void ring_buffer_put(struct ring_buffer *rb)
3672{ 3522{
3673 if (!atomic_dec_and_test(&buffer->refcount)) 3523 if (!atomic_dec_and_test(&rb->refcount))
3674 return; 3524 return;
3675 3525
3676 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); 3526 call_rcu(&rb->rcu_head, rb_free_rcu);
3677} 3527}
3678 3528
3679static void perf_mmap_open(struct vm_area_struct *vma) 3529static void perf_mmap_open(struct vm_area_struct *vma)
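ring_buffer_get()/ring_buffer_put() are the usual RCU-protected refcount pattern: a reference may only be taken while the count is still non-zero, and the final put defers the free past a grace period. The sketch below models just the inc-not-zero part with C11 atomics; the call_rcu()/rb_free_rcu() deferral is assumed rather than reproduced.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct rb { atomic_int refcount; };

/* Take a reference only if the object is still live (refcount > 0). */
static bool rb_tryget(struct rb *rb)
{
        int old = atomic_load(&rb->refcount);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&rb->refcount, &old, old + 1))
                        return true;
        }
        return false;   /* already on its way to being freed */
}

static void rb_put(struct rb *rb)
{
        if (atomic_fetch_sub(&rb->refcount, 1) == 1)
                printf("last ref dropped: would call_rcu() the free here\n");
}

int main(void)
{
        struct rb rb = { 1 };

        if (rb_tryget(&rb))
                rb_put(&rb);    /* drop the extra ref */
        rb_put(&rb);            /* drop the initial ref; triggers the free path */
        return 0;
}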
@@ -3688,16 +3538,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3688 struct perf_event *event = vma->vm_file->private_data; 3538 struct perf_event *event = vma->vm_file->private_data;
3689 3539
3690 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 3540 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
3691 unsigned long size = perf_data_size(event->buffer); 3541 unsigned long size = perf_data_size(event->rb);
3692 struct user_struct *user = event->mmap_user; 3542 struct user_struct *user = event->mmap_user;
3693 struct perf_buffer *buffer = event->buffer; 3543 struct ring_buffer *rb = event->rb;
3694 3544
3695 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3545 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3696 vma->vm_mm->locked_vm -= event->mmap_locked; 3546 vma->vm_mm->locked_vm -= event->mmap_locked;
3697 rcu_assign_pointer(event->buffer, NULL); 3547 rcu_assign_pointer(event->rb, NULL);
3698 mutex_unlock(&event->mmap_mutex); 3548 mutex_unlock(&event->mmap_mutex);
3699 3549
3700 perf_buffer_put(buffer); 3550 ring_buffer_put(rb);
3701 free_uid(user); 3551 free_uid(user);
3702 } 3552 }
3703} 3553}
@@ -3715,7 +3565,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3715 unsigned long user_locked, user_lock_limit; 3565 unsigned long user_locked, user_lock_limit;
3716 struct user_struct *user = current_user(); 3566 struct user_struct *user = current_user();
3717 unsigned long locked, lock_limit; 3567 unsigned long locked, lock_limit;
3718 struct perf_buffer *buffer; 3568 struct ring_buffer *rb;
3719 unsigned long vma_size; 3569 unsigned long vma_size;
3720 unsigned long nr_pages; 3570 unsigned long nr_pages;
3721 long user_extra, extra; 3571 long user_extra, extra;
@@ -3724,7 +3574,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3724 /* 3574 /*
3725 * Don't allow mmap() of inherited per-task counters. This would 3575 * Don't allow mmap() of inherited per-task counters. This would
3726 * create a performance issue due to all children writing to the 3576 * create a performance issue due to all children writing to the
3727 * same buffer. 3577 * same rb.
3728 */ 3578 */
3729 if (event->cpu == -1 && event->attr.inherit) 3579 if (event->cpu == -1 && event->attr.inherit)
3730 return -EINVAL; 3580 return -EINVAL;
@@ -3736,7 +3586,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3736 nr_pages = (vma_size / PAGE_SIZE) - 1; 3586 nr_pages = (vma_size / PAGE_SIZE) - 1;
3737 3587
3738 /* 3588 /*
3739 * If we have buffer pages ensure they're a power-of-two number, so we 3589 * If we have rb pages ensure they're a power-of-two number, so we
3740 * can do bitmasks instead of modulo. 3590 * can do bitmasks instead of modulo.
3741 */ 3591 */
3742 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 3592 if (nr_pages != 0 && !is_power_of_2(nr_pages))
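The buffer must still be a power-of-two number of data pages so head/tail offsets can wrap with a single AND instead of a modulo. A tiny demonstration of that masking trick with arbitrary sizes:

#include <stdio.h>
#include <stdint.h>

static int is_power_of_2(uint64_t n)
{
        return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
        uint64_t nr_pages = 8, page_size = 4096;
        uint64_t data_size = nr_pages * page_size;
        uint64_t mask = data_size - 1;          /* valid only for powers of two */
        uint64_t head = 3 * data_size + 123;    /* a monotonically growing offset */

        if (!is_power_of_2(nr_pages))
                return 1;

        /* head & mask gives the position inside the buffer, no '%' needed */
        printf("head=%llu wraps to offset %llu\n",
               (unsigned long long)head, (unsigned long long)(head & mask));
        return 0;
}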
@@ -3750,9 +3600,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3750 3600
3751 WARN_ON_ONCE(event->ctx->parent_ctx); 3601 WARN_ON_ONCE(event->ctx->parent_ctx);
3752 mutex_lock(&event->mmap_mutex); 3602 mutex_lock(&event->mmap_mutex);
3753 if (event->buffer) { 3603 if (event->rb) {
3754 if (event->buffer->nr_pages == nr_pages) 3604 if (event->rb->nr_pages == nr_pages)
3755 atomic_inc(&event->buffer->refcount); 3605 atomic_inc(&event->rb->refcount);
3756 else 3606 else
3757 ret = -EINVAL; 3607 ret = -EINVAL;
3758 goto unlock; 3608 goto unlock;
@@ -3782,18 +3632,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3782 goto unlock; 3632 goto unlock;
3783 } 3633 }
3784 3634
3785 WARN_ON(event->buffer); 3635 WARN_ON(event->rb);
3786 3636
3787 if (vma->vm_flags & VM_WRITE) 3637 if (vma->vm_flags & VM_WRITE)
3788 flags |= PERF_BUFFER_WRITABLE; 3638 flags |= RING_BUFFER_WRITABLE;
3639
3640 rb = rb_alloc(nr_pages,
3641 event->attr.watermark ? event->attr.wakeup_watermark : 0,
3642 event->cpu, flags);
3789 3643
3790 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, 3644 if (!rb) {
3791 event->cpu, flags);
3792 if (!buffer) {
3793 ret = -ENOMEM; 3645 ret = -ENOMEM;
3794 goto unlock; 3646 goto unlock;
3795 } 3647 }
3796 rcu_assign_pointer(event->buffer, buffer); 3648 rcu_assign_pointer(event->rb, rb);
3797 3649
3798 atomic_long_add(user_extra, &user->locked_vm); 3650 atomic_long_add(user_extra, &user->locked_vm);
3799 event->mmap_locked = extra; 3651 event->mmap_locked = extra;
@@ -3892,117 +3744,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3892} 3744}
3893EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 3745EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3894 3746
3895/*
3896 * Output
3897 */
3898static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
3899 unsigned long offset, unsigned long head)
3900{
3901 unsigned long mask;
3902
3903 if (!buffer->writable)
3904 return true;
3905
3906 mask = perf_data_size(buffer) - 1;
3907
3908 offset = (offset - tail) & mask;
3909 head = (head - tail) & mask;
3910
3911 if ((int)(head - offset) < 0)
3912 return false;
3913
3914 return true;
3915}
3916
3917static void perf_output_wakeup(struct perf_output_handle *handle)
3918{
3919 atomic_set(&handle->buffer->poll, POLL_IN);
3920
3921 if (handle->nmi) {
3922 handle->event->pending_wakeup = 1;
3923 irq_work_queue(&handle->event->pending);
3924 } else
3925 perf_event_wakeup(handle->event);
3926}
3927
3928/*
3929 * We need to ensure a later event_id doesn't publish a head when a former
3930 * event isn't done writing. However since we need to deal with NMIs we
3931 * cannot fully serialize things.
3932 *
3933 * We only publish the head (and generate a wakeup) when the outer-most
3934 * event completes.
3935 */
3936static void perf_output_get_handle(struct perf_output_handle *handle)
3937{
3938 struct perf_buffer *buffer = handle->buffer;
3939
3940 preempt_disable();
3941 local_inc(&buffer->nest);
3942 handle->wakeup = local_read(&buffer->wakeup);
3943}
3944
3945static void perf_output_put_handle(struct perf_output_handle *handle)
3946{
3947 struct perf_buffer *buffer = handle->buffer;
3948 unsigned long head;
3949
3950again:
3951 head = local_read(&buffer->head);
3952
3953 /*
3954 * IRQ/NMI can happen here, which means we can miss a head update.
3955 */
3956
3957 if (!local_dec_and_test(&buffer->nest))
3958 goto out;
3959
3960 /*
3961 * Publish the known good head. Rely on the full barrier implied
3962 * by atomic_dec_and_test() order the buffer->head read and this
3963 * write.
3964 */
3965 buffer->user_page->data_head = head;
3966
3967 /*
3968 * Now check if we missed an update, rely on the (compiler)
3969 * barrier in atomic_dec_and_test() to re-read buffer->head.
3970 */
3971 if (unlikely(head != local_read(&buffer->head))) {
3972 local_inc(&buffer->nest);
3973 goto again;
3974 }
3975
3976 if (handle->wakeup != local_read(&buffer->wakeup))
3977 perf_output_wakeup(handle);
3978
3979out:
3980 preempt_enable();
3981}
3982
3983__always_inline void perf_output_copy(struct perf_output_handle *handle,
3984 const void *buf, unsigned int len)
3985{
3986 do {
3987 unsigned long size = min_t(unsigned long, handle->size, len);
3988
3989 memcpy(handle->addr, buf, size);
3990
3991 len -= size;
3992 handle->addr += size;
3993 buf += size;
3994 handle->size -= size;
3995 if (!handle->size) {
3996 struct perf_buffer *buffer = handle->buffer;
3997
3998 handle->page++;
3999 handle->page &= buffer->nr_pages - 1;
4000 handle->addr = buffer->data_pages[handle->page];
4001 handle->size = PAGE_SIZE << page_order(buffer);
4002 }
4003 } while (len);
4004}
4005
4006static void __perf_event_header__init_id(struct perf_event_header *header, 3747static void __perf_event_header__init_id(struct perf_event_header *header,
4007 struct perf_sample_data *data, 3748 struct perf_sample_data *data,
4008 struct perf_event *event) 3749 struct perf_event *event)
@@ -4033,9 +3774,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4033 } 3774 }
4034} 3775}
4035 3776
4036static void perf_event_header__init_id(struct perf_event_header *header, 3777void perf_event_header__init_id(struct perf_event_header *header,
4037 struct perf_sample_data *data, 3778 struct perf_sample_data *data,
4038 struct perf_event *event) 3779 struct perf_event *event)
4039{ 3780{
4040 if (event->attr.sample_id_all) 3781 if (event->attr.sample_id_all)
4041 __perf_event_header__init_id(header, data, event); 3782 __perf_event_header__init_id(header, data, event);
@@ -4062,121 +3803,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4062 perf_output_put(handle, data->cpu_entry); 3803 perf_output_put(handle, data->cpu_entry);
4063} 3804}
4064 3805
4065static void perf_event__output_id_sample(struct perf_event *event, 3806void perf_event__output_id_sample(struct perf_event *event,
4066 struct perf_output_handle *handle, 3807 struct perf_output_handle *handle,
4067 struct perf_sample_data *sample) 3808 struct perf_sample_data *sample)
4068{ 3809{
4069 if (event->attr.sample_id_all) 3810 if (event->attr.sample_id_all)
4070 __perf_event__output_id_sample(handle, sample); 3811 __perf_event__output_id_sample(handle, sample);
4071} 3812}
4072 3813
4073int perf_output_begin(struct perf_output_handle *handle,
4074 struct perf_event *event, unsigned int size,
4075 int nmi, int sample)
4076{
4077 struct perf_buffer *buffer;
4078 unsigned long tail, offset, head;
4079 int have_lost;
4080 struct perf_sample_data sample_data;
4081 struct {
4082 struct perf_event_header header;
4083 u64 id;
4084 u64 lost;
4085 } lost_event;
4086
4087 rcu_read_lock();
4088 /*
4089 * For inherited events we send all the output towards the parent.
4090 */
4091 if (event->parent)
4092 event = event->parent;
4093
4094 buffer = rcu_dereference(event->buffer);
4095 if (!buffer)
4096 goto out;
4097
4098 handle->buffer = buffer;
4099 handle->event = event;
4100 handle->nmi = nmi;
4101 handle->sample = sample;
4102
4103 if (!buffer->nr_pages)
4104 goto out;
4105
4106 have_lost = local_read(&buffer->lost);
4107 if (have_lost) {
4108 lost_event.header.size = sizeof(lost_event);
4109 perf_event_header__init_id(&lost_event.header, &sample_data,
4110 event);
4111 size += lost_event.header.size;
4112 }
4113
4114 perf_output_get_handle(handle);
4115
4116 do {
4117 /*
4118 * Userspace could choose to issue a mb() before updating the
4119 * tail pointer. So that all reads will be completed before the
4120 * write is issued.
4121 */
4122 tail = ACCESS_ONCE(buffer->user_page->data_tail);
4123 smp_rmb();
4124 offset = head = local_read(&buffer->head);
4125 head += size;
4126 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
4127 goto fail;
4128 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
4129
4130 if (head - local_read(&buffer->wakeup) > buffer->watermark)
4131 local_add(buffer->watermark, &buffer->wakeup);
4132
4133 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
4134 handle->page &= buffer->nr_pages - 1;
4135 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
4136 handle->addr = buffer->data_pages[handle->page];
4137 handle->addr += handle->size;
4138 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
4139
4140 if (have_lost) {
4141 lost_event.header.type = PERF_RECORD_LOST;
4142 lost_event.header.misc = 0;
4143 lost_event.id = event->id;
4144 lost_event.lost = local_xchg(&buffer->lost, 0);
4145
4146 perf_output_put(handle, lost_event);
4147 perf_event__output_id_sample(event, handle, &sample_data);
4148 }
4149
4150 return 0;
4151
4152fail:
4153 local_inc(&buffer->lost);
4154 perf_output_put_handle(handle);
4155out:
4156 rcu_read_unlock();
4157
4158 return -ENOSPC;
4159}
4160
4161void perf_output_end(struct perf_output_handle *handle)
4162{
4163 struct perf_event *event = handle->event;
4164 struct perf_buffer *buffer = handle->buffer;
4165
4166 int wakeup_events = event->attr.wakeup_events;
4167
4168 if (handle->sample && wakeup_events) {
4169 int events = local_inc_return(&buffer->events);
4170 if (events >= wakeup_events) {
4171 local_sub(wakeup_events, &buffer->events);
4172 local_inc(&buffer->wakeup);
4173 }
4174 }
4175
4176 perf_output_put_handle(handle);
4177 rcu_read_unlock();
4178}
4179
4180static void perf_output_read_one(struct perf_output_handle *handle, 3814static void perf_output_read_one(struct perf_output_handle *handle,
4181 struct perf_event *event, 3815 struct perf_event *event,
4182 u64 enabled, u64 running) 3816 u64 enabled, u64 running)
@@ -4197,7 +3831,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
4197 if (read_format & PERF_FORMAT_ID) 3831 if (read_format & PERF_FORMAT_ID)
4198 values[n++] = primary_event_id(event); 3832 values[n++] = primary_event_id(event);
4199 3833
4200 perf_output_copy(handle, values, n * sizeof(u64)); 3834 __output_copy(handle, values, n * sizeof(u64));
4201} 3835}
4202 3836
4203/* 3837/*
@@ -4227,7 +3861,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4227 if (read_format & PERF_FORMAT_ID) 3861 if (read_format & PERF_FORMAT_ID)
4228 values[n++] = primary_event_id(leader); 3862 values[n++] = primary_event_id(leader);
4229 3863
4230 perf_output_copy(handle, values, n * sizeof(u64)); 3864 __output_copy(handle, values, n * sizeof(u64));
4231 3865
4232 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 3866 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4233 n = 0; 3867 n = 0;
@@ -4239,7 +3873,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4239 if (read_format & PERF_FORMAT_ID) 3873 if (read_format & PERF_FORMAT_ID)
4240 values[n++] = primary_event_id(sub); 3874 values[n++] = primary_event_id(sub);
4241 3875
4242 perf_output_copy(handle, values, n * sizeof(u64)); 3876 __output_copy(handle, values, n * sizeof(u64));
4243 } 3877 }
4244} 3878}
4245 3879
@@ -4249,7 +3883,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4249static void perf_output_read(struct perf_output_handle *handle, 3883static void perf_output_read(struct perf_output_handle *handle,
4250 struct perf_event *event) 3884 struct perf_event *event)
4251{ 3885{
4252 u64 enabled = 0, running = 0, now, ctx_time; 3886 u64 enabled = 0, running = 0;
4253 u64 read_format = event->attr.read_format; 3887 u64 read_format = event->attr.read_format;
4254 3888
4255 /* 3889 /*
@@ -4261,12 +3895,8 @@ static void perf_output_read(struct perf_output_handle *handle,
4261 * because of locking issue as we are called in 3895 * because of locking issue as we are called in
4262 * NMI context 3896 * NMI context
4263 */ 3897 */
4264 if (read_format & PERF_FORMAT_TOTAL_TIMES) { 3898 if (read_format & PERF_FORMAT_TOTAL_TIMES)
4265 now = perf_clock(); 3899 calc_timer_values(event, &enabled, &running);
4266 ctx_time = event->shadow_ctx_time + now;
4267 enabled = ctx_time - event->tstamp_enabled;
4268 running = ctx_time - event->tstamp_running;
4269 }
4270 3900
4271 if (event->attr.read_format & PERF_FORMAT_GROUP) 3901 if (event->attr.read_format & PERF_FORMAT_GROUP)
4272 perf_output_read_group(handle, event, enabled, running); 3902 perf_output_read_group(handle, event, enabled, running);
@@ -4319,7 +3949,7 @@ void perf_output_sample(struct perf_output_handle *handle,
4319 3949
4320 size *= sizeof(u64); 3950 size *= sizeof(u64);
4321 3951
4322 perf_output_copy(handle, data->callchain, size); 3952 __output_copy(handle, data->callchain, size);
4323 } else { 3953 } else {
4324 u64 nr = 0; 3954 u64 nr = 0;
4325 perf_output_put(handle, nr); 3955 perf_output_put(handle, nr);
@@ -4329,8 +3959,8 @@ void perf_output_sample(struct perf_output_handle *handle,
4329 if (sample_type & PERF_SAMPLE_RAW) { 3959 if (sample_type & PERF_SAMPLE_RAW) {
4330 if (data->raw) { 3960 if (data->raw) {
4331 perf_output_put(handle, data->raw->size); 3961 perf_output_put(handle, data->raw->size);
4332 perf_output_copy(handle, data->raw->data, 3962 __output_copy(handle, data->raw->data,
4333 data->raw->size); 3963 data->raw->size);
4334 } else { 3964 } else {
4335 struct { 3965 struct {
4336 u32 size; 3966 u32 size;
@@ -4342,6 +3972,20 @@ void perf_output_sample(struct perf_output_handle *handle,
4342 perf_output_put(handle, raw); 3972 perf_output_put(handle, raw);
4343 } 3973 }
4344 } 3974 }
3975
3976 if (!event->attr.watermark) {
3977 int wakeup_events = event->attr.wakeup_events;
3978
3979 if (wakeup_events) {
3980 struct ring_buffer *rb = handle->rb;
3981 int events = local_inc_return(&rb->events);
3982
3983 if (events >= wakeup_events) {
3984 local_sub(wakeup_events, &rb->events);
3985 local_inc(&rb->wakeup);
3986 }
3987 }
3988 }
4345} 3989}
4346 3990
4347void perf_prepare_sample(struct perf_event_header *header, 3991void perf_prepare_sample(struct perf_event_header *header,
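With the wakeup bookkeeping folded into perf_output_sample(), each sample (when attr.wakeup_events is set and no watermark is in use) bumps a per-buffer counter; on reaching the threshold the counter is wound back and a wakeup is requested. A single-threaded model of that counter, without the kernel's local_t atomics:

#include <stdio.h>

struct rb_counters {
        long events;    /* samples written since the last wakeup */
        long wakeup;    /* how many wakeups have been requested */
};

static void account_sample(struct rb_counters *rb, int wakeup_events)
{
        if (!wakeup_events)
                return;
        if (++rb->events >= wakeup_events) {
                rb->events -= wakeup_events;    /* keep the remainder */
                rb->wakeup++;                   /* reader gets poked */
        }
}

int main(void)
{
        struct rb_counters rb = { 0, 0 };

        for (int i = 0; i < 10; i++)
                account_sample(&rb, 4);
        printf("events=%ld wakeups=%ld\n", rb.events, rb.wakeup);  /* 2 and 2 */
        return 0;
}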
@@ -4386,7 +4030,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4386 } 4030 }
4387} 4031}
4388 4032
4389static void perf_event_output(struct perf_event *event, int nmi, 4033static void perf_event_output(struct perf_event *event,
4390 struct perf_sample_data *data, 4034 struct perf_sample_data *data,
4391 struct pt_regs *regs) 4035 struct pt_regs *regs)
4392{ 4036{
@@ -4398,7 +4042,7 @@ static void perf_event_output(struct perf_event *event, int nmi,
4398 4042
4399 perf_prepare_sample(&header, data, event, regs); 4043 perf_prepare_sample(&header, data, event, regs);
4400 4044
4401 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 4045 if (perf_output_begin(&handle, event, header.size))
4402 goto exit; 4046 goto exit;
4403 4047
4404 perf_output_sample(&handle, &header, data, event); 4048 perf_output_sample(&handle, &header, data, event);
@@ -4438,7 +4082,7 @@ perf_event_read_event(struct perf_event *event,
4438 int ret; 4082 int ret;
4439 4083
4440 perf_event_header__init_id(&read_event.header, &sample, event); 4084 perf_event_header__init_id(&read_event.header, &sample, event);
4441 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 4085 ret = perf_output_begin(&handle, event, read_event.header.size);
4442 if (ret) 4086 if (ret)
4443 return; 4087 return;
4444 4088
@@ -4481,7 +4125,7 @@ static void perf_event_task_output(struct perf_event *event,
4481 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4125 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4482 4126
4483 ret = perf_output_begin(&handle, event, 4127 ret = perf_output_begin(&handle, event,
4484 task_event->event_id.header.size, 0, 0); 4128 task_event->event_id.header.size);
4485 if (ret) 4129 if (ret)
4486 goto out; 4130 goto out;
4487 4131
@@ -4618,7 +4262,7 @@ static void perf_event_comm_output(struct perf_event *event,
4618 4262
4619 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4263 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4620 ret = perf_output_begin(&handle, event, 4264 ret = perf_output_begin(&handle, event,
4621 comm_event->event_id.header.size, 0, 0); 4265 comm_event->event_id.header.size);
4622 4266
4623 if (ret) 4267 if (ret)
4624 goto out; 4268 goto out;
@@ -4627,7 +4271,7 @@ static void perf_event_comm_output(struct perf_event *event,
4627 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4271 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4628 4272
4629 perf_output_put(&handle, comm_event->event_id); 4273 perf_output_put(&handle, comm_event->event_id);
4630 perf_output_copy(&handle, comm_event->comm, 4274 __output_copy(&handle, comm_event->comm,
4631 comm_event->comm_size); 4275 comm_event->comm_size);
4632 4276
4633 perf_event__output_id_sample(event, &handle, &sample); 4277 perf_event__output_id_sample(event, &handle, &sample);
@@ -4765,7 +4409,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4765 4409
4766 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 4410 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4767 ret = perf_output_begin(&handle, event, 4411 ret = perf_output_begin(&handle, event,
4768 mmap_event->event_id.header.size, 0, 0); 4412 mmap_event->event_id.header.size);
4769 if (ret) 4413 if (ret)
4770 goto out; 4414 goto out;
4771 4415
@@ -4773,7 +4417,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4773 mmap_event->event_id.tid = perf_event_tid(event, current); 4417 mmap_event->event_id.tid = perf_event_tid(event, current);
4774 4418
4775 perf_output_put(&handle, mmap_event->event_id); 4419 perf_output_put(&handle, mmap_event->event_id);
4776 perf_output_copy(&handle, mmap_event->file_name, 4420 __output_copy(&handle, mmap_event->file_name,
4777 mmap_event->file_size); 4421 mmap_event->file_size);
4778 4422
4779 perf_event__output_id_sample(event, &handle, &sample); 4423 perf_event__output_id_sample(event, &handle, &sample);
@@ -4829,7 +4473,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4829 4473
4830 if (file) { 4474 if (file) {
4831 /* 4475 /*
4832 * d_path works from the end of the buffer backwards, so we 4476 * d_path works from the end of the rb backwards, so we
4833 * need to add enough zero bytes after the string to handle 4477 * need to add enough zero bytes after the string to handle
4834 * the 64bit alignment we do later. 4478 * the 64bit alignment we do later.
4835 */ 4479 */
@@ -4960,7 +4604,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4960 perf_event_header__init_id(&throttle_event.header, &sample, event); 4604 perf_event_header__init_id(&throttle_event.header, &sample, event);
4961 4605
4962 ret = perf_output_begin(&handle, event, 4606 ret = perf_output_begin(&handle, event,
4963 throttle_event.header.size, 1, 0); 4607 throttle_event.header.size);
4964 if (ret) 4608 if (ret)
4965 return; 4609 return;
4966 4610
@@ -4973,7 +4617,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4973 * Generic event overflow handling, sampling. 4617 * Generic event overflow handling, sampling.
4974 */ 4618 */
4975 4619
4976static int __perf_event_overflow(struct perf_event *event, int nmi, 4620static int __perf_event_overflow(struct perf_event *event,
4977 int throttle, struct perf_sample_data *data, 4621 int throttle, struct perf_sample_data *data,
4978 struct pt_regs *regs) 4622 struct pt_regs *regs)
4979{ 4623{
@@ -5016,34 +4660,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
5016 if (events && atomic_dec_and_test(&event->event_limit)) { 4660 if (events && atomic_dec_and_test(&event->event_limit)) {
5017 ret = 1; 4661 ret = 1;
5018 event->pending_kill = POLL_HUP; 4662 event->pending_kill = POLL_HUP;
5019 if (nmi) { 4663 event->pending_disable = 1;
5020 event->pending_disable = 1; 4664 irq_work_queue(&event->pending);
5021 irq_work_queue(&event->pending);
5022 } else
5023 perf_event_disable(event);
5024 } 4665 }
5025 4666
5026 if (event->overflow_handler) 4667 if (event->overflow_handler)
5027 event->overflow_handler(event, nmi, data, regs); 4668 event->overflow_handler(event, data, regs);
5028 else 4669 else
5029 perf_event_output(event, nmi, data, regs); 4670 perf_event_output(event, data, regs);
5030 4671
5031 if (event->fasync && event->pending_kill) { 4672 if (event->fasync && event->pending_kill) {
5032 if (nmi) { 4673 event->pending_wakeup = 1;
5033 event->pending_wakeup = 1; 4674 irq_work_queue(&event->pending);
5034 irq_work_queue(&event->pending);
5035 } else
5036 perf_event_wakeup(event);
5037 } 4675 }
5038 4676
5039 return ret; 4677 return ret;
5040} 4678}
5041 4679
5042int perf_event_overflow(struct perf_event *event, int nmi, 4680int perf_event_overflow(struct perf_event *event,
5043 struct perf_sample_data *data, 4681 struct perf_sample_data *data,
5044 struct pt_regs *regs) 4682 struct pt_regs *regs)
5045{ 4683{
5046 return __perf_event_overflow(event, nmi, 1, data, regs); 4684 return __perf_event_overflow(event, 1, data, regs);
5047} 4685}
5048 4686
5049/* 4687/*
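
With the nmi argument gone, __perf_event_overflow() no longer chooses between calling perf_event_disable()/perf_event_wakeup() directly and deferring; it always sets the pending flag and queues the event's irq_work, which is safe from any context including NMI. A minimal sketch of that deferral pattern (hypothetical names, not the perf code itself; assumes <linux/irq_work.h> as present in this tree):

#include <linux/irq_work.h>
#include <linux/kernel.h>

struct my_state {
        struct irq_work pending;        /* queued from NMI/IRQ context */
        int need_wakeup;
};

static struct my_state state;

/* runs later, once we are out of NMI context */
static void my_pending_fn(struct irq_work *work)
{
        struct my_state *s = container_of(work, struct my_state, pending);

        if (s->need_wakeup) {
                s->need_wakeup = 0;
                pr_info("deferred wakeup ran\n");
        }
}

void my_setup(void)
{
        init_irq_work(&state.pending, my_pending_fn);
}

/* safe to call from NMI context: just mark and queue */
void my_overflow(void)
{
        state.need_wakeup = 1;
        irq_work_queue(&state.pending);
}
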
@@ -5092,7 +4730,7 @@ again:
5092} 4730}
5093 4731
5094static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 4732static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5095 int nmi, struct perf_sample_data *data, 4733 struct perf_sample_data *data,
5096 struct pt_regs *regs) 4734 struct pt_regs *regs)
5097{ 4735{
5098 struct hw_perf_event *hwc = &event->hw; 4736 struct hw_perf_event *hwc = &event->hw;
@@ -5106,7 +4744,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5106 return; 4744 return;
5107 4745
5108 for (; overflow; overflow--) { 4746 for (; overflow; overflow--) {
5109 if (__perf_event_overflow(event, nmi, throttle, 4747 if (__perf_event_overflow(event, throttle,
5110 data, regs)) { 4748 data, regs)) {
5111 /* 4749 /*
5112 * We inhibit the overflow from happening when 4750 * We inhibit the overflow from happening when
@@ -5119,7 +4757,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5119} 4757}
5120 4758
5121static void perf_swevent_event(struct perf_event *event, u64 nr, 4759static void perf_swevent_event(struct perf_event *event, u64 nr,
5122 int nmi, struct perf_sample_data *data, 4760 struct perf_sample_data *data,
5123 struct pt_regs *regs) 4761 struct pt_regs *regs)
5124{ 4762{
5125 struct hw_perf_event *hwc = &event->hw; 4763 struct hw_perf_event *hwc = &event->hw;
@@ -5133,12 +4771,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
5133 return; 4771 return;
5134 4772
5135 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4773 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5136 return perf_swevent_overflow(event, 1, nmi, data, regs); 4774 return perf_swevent_overflow(event, 1, data, regs);
5137 4775
5138 if (local64_add_negative(nr, &hwc->period_left)) 4776 if (local64_add_negative(nr, &hwc->period_left))
5139 return; 4777 return;
5140 4778
5141 perf_swevent_overflow(event, 0, nmi, data, regs); 4779 perf_swevent_overflow(event, 0, data, regs);
5142} 4780}
5143 4781
5144static int perf_exclude_event(struct perf_event *event, 4782static int perf_exclude_event(struct perf_event *event,
@@ -5226,7 +4864,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5226} 4864}
5227 4865
5228static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4866static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5229 u64 nr, int nmi, 4867 u64 nr,
5230 struct perf_sample_data *data, 4868 struct perf_sample_data *data,
5231 struct pt_regs *regs) 4869 struct pt_regs *regs)
5232{ 4870{
@@ -5242,7 +4880,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5242 4880
5243 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4881 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5244 if (perf_swevent_match(event, type, event_id, data, regs)) 4882 if (perf_swevent_match(event, type, event_id, data, regs))
5245 perf_swevent_event(event, nr, nmi, data, regs); 4883 perf_swevent_event(event, nr, data, regs);
5246 } 4884 }
5247end: 4885end:
5248 rcu_read_unlock(); 4886 rcu_read_unlock();
@@ -5263,8 +4901,7 @@ inline void perf_swevent_put_recursion_context(int rctx)
5263 put_recursion_context(swhash->recursion, rctx); 4901 put_recursion_context(swhash->recursion, rctx);
5264} 4902}
5265 4903
5266void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4904void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5267 struct pt_regs *regs, u64 addr)
5268{ 4905{
5269 struct perf_sample_data data; 4906 struct perf_sample_data data;
5270 int rctx; 4907 int rctx;
@@ -5276,7 +4913,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5276 4913
5277 perf_sample_data_init(&data, addr); 4914 perf_sample_data_init(&data, addr);
5278 4915
5279 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4916 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5280 4917
5281 perf_swevent_put_recursion_context(rctx); 4918 perf_swevent_put_recursion_context(rctx);
5282 preempt_enable_notrace(); 4919 preempt_enable_notrace();
@@ -5524,7 +5161,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5524 5161
5525 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5162 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5526 if (perf_tp_event_match(event, &data, regs)) 5163 if (perf_tp_event_match(event, &data, regs))
5527 perf_swevent_event(event, count, 1, &data, regs); 5164 perf_swevent_event(event, count, &data, regs);
5528 } 5165 }
5529 5166
5530 perf_swevent_put_recursion_context(rctx); 5167 perf_swevent_put_recursion_context(rctx);
@@ -5617,7 +5254,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5617 perf_sample_data_init(&sample, bp->attr.bp_addr); 5254 perf_sample_data_init(&sample, bp->attr.bp_addr);
5618 5255
5619 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5256 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5620 perf_swevent_event(bp, 1, 1, &sample, regs); 5257 perf_swevent_event(bp, 1, &sample, regs);
5621} 5258}
5622#endif 5259#endif
5623 5260
@@ -5646,7 +5283,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5646 5283
5647 if (regs && !perf_exclude_event(event, regs)) { 5284 if (regs && !perf_exclude_event(event, regs)) {
5648 if (!(event->attr.exclude_idle && current->pid == 0)) 5285 if (!(event->attr.exclude_idle && current->pid == 0))
5649 if (perf_event_overflow(event, 0, &data, regs)) 5286 if (perf_event_overflow(event, &data, regs))
5650 ret = HRTIMER_NORESTART; 5287 ret = HRTIMER_NORESTART;
5651 } 5288 }
5652 5289
@@ -5986,6 +5623,7 @@ free_dev:
5986} 5623}
5987 5624
5988static struct lock_class_key cpuctx_mutex; 5625static struct lock_class_key cpuctx_mutex;
5626static struct lock_class_key cpuctx_lock;
5989 5627
5990int perf_pmu_register(struct pmu *pmu, char *name, int type) 5628int perf_pmu_register(struct pmu *pmu, char *name, int type)
5991{ 5629{
@@ -6036,6 +5674,7 @@ skip_type:
6036 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5674 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6037 __perf_event_init_context(&cpuctx->ctx); 5675 __perf_event_init_context(&cpuctx->ctx);
6038 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 5676 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5677 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6039 cpuctx->ctx.type = cpu_context; 5678 cpuctx->ctx.type = cpu_context;
6040 cpuctx->ctx.pmu = pmu; 5679 cpuctx->ctx.pmu = pmu;
6041 cpuctx->jiffies_interval = 1; 5680 cpuctx->jiffies_interval = 1;
@@ -6150,7 +5789,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6150 struct task_struct *task, 5789 struct task_struct *task,
6151 struct perf_event *group_leader, 5790 struct perf_event *group_leader,
6152 struct perf_event *parent_event, 5791 struct perf_event *parent_event,
6153 perf_overflow_handler_t overflow_handler) 5792 perf_overflow_handler_t overflow_handler,
5793 void *context)
6154{ 5794{
6155 struct pmu *pmu; 5795 struct pmu *pmu;
6156 struct perf_event *event; 5796 struct perf_event *event;
@@ -6208,10 +5848,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6208#endif 5848#endif
6209 } 5849 }
6210 5850
6211 if (!overflow_handler && parent_event) 5851 if (!overflow_handler && parent_event) {
6212 overflow_handler = parent_event->overflow_handler; 5852 overflow_handler = parent_event->overflow_handler;
5853 context = parent_event->overflow_handler_context;
5854 }
6213 5855
6214 event->overflow_handler = overflow_handler; 5856 event->overflow_handler = overflow_handler;
5857 event->overflow_handler_context = context;
6215 5858
6216 if (attr->disabled) 5859 if (attr->disabled)
6217 event->state = PERF_EVENT_STATE_OFF; 5860 event->state = PERF_EVENT_STATE_OFF;
@@ -6326,13 +5969,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6326 if (ret) 5969 if (ret)
6327 return -EFAULT; 5970 return -EFAULT;
6328 5971
6329 /*
6330 * If the type exists, the corresponding creation will verify
6331 * the attr->config.
6332 */
6333 if (attr->type >= PERF_TYPE_MAX)
6334 return -EINVAL;
6335
6336 if (attr->__reserved_1) 5972 if (attr->__reserved_1)
6337 return -EINVAL; 5973 return -EINVAL;
6338 5974
@@ -6354,7 +5990,7 @@ err_size:
6354static int 5990static int
6355perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5991perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6356{ 5992{
6357 struct perf_buffer *buffer = NULL, *old_buffer = NULL; 5993 struct ring_buffer *rb = NULL, *old_rb = NULL;
6358 int ret = -EINVAL; 5994 int ret = -EINVAL;
6359 5995
6360 if (!output_event) 5996 if (!output_event)
@@ -6371,7 +6007,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6371 goto out; 6007 goto out;
6372 6008
6373 /* 6009 /*
6374 * If it's not a per-cpu buffer, it must be the same task. 6010 * If it's not a per-cpu rb, it must be the same task.

6375 */ 6011 */
6376 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 6012 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6377 goto out; 6013 goto out;
@@ -6383,20 +6019,20 @@ set:
6383 goto unlock; 6019 goto unlock;
6384 6020
6385 if (output_event) { 6021 if (output_event) {
6386 /* get the buffer we want to redirect to */ 6022 /* get the rb we want to redirect to */
6387 buffer = perf_buffer_get(output_event); 6023 rb = ring_buffer_get(output_event);
6388 if (!buffer) 6024 if (!rb)
6389 goto unlock; 6025 goto unlock;
6390 } 6026 }
6391 6027
6392 old_buffer = event->buffer; 6028 old_rb = event->rb;
6393 rcu_assign_pointer(event->buffer, buffer); 6029 rcu_assign_pointer(event->rb, rb);
6394 ret = 0; 6030 ret = 0;
6395unlock: 6031unlock:
6396 mutex_unlock(&event->mmap_mutex); 6032 mutex_unlock(&event->mmap_mutex);
6397 6033
6398 if (old_buffer) 6034 if (old_rb)
6399 perf_buffer_put(old_buffer); 6035 ring_buffer_put(old_rb);
6400out: 6036out:
6401 return ret; 6037 return ret;
6402} 6038}
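
perf_event_set_output() is what backs the PERF_EVENT_IOC_SET_OUTPUT ioctl; after this rename it swaps the event's struct ring_buffer rather than struct perf_buffer, but the user-visible behaviour is unchanged: both events must share a CPU or a task context, and the target supplies the mmap()ed buffer. A hedged userspace sketch (assumes two already-opened, compatible perf fds):

#include <linux/perf_event.h>
#include <sys/ioctl.h>

/*
 * Redirect the sample stream of 'fd_src' into the ring buffer that was
 * mmap()ed for 'fd_dst'.  Returns 0 on success, -1 with errno set otherwise.
 */
static int redirect_output(int fd_src, int fd_dst)
{
        return ioctl(fd_src, PERF_EVENT_IOC_SET_OUTPUT, fd_dst);
}
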
@@ -6478,7 +6114,8 @@ SYSCALL_DEFINE5(perf_event_open,
6478 } 6114 }
6479 } 6115 }
6480 6116
6481 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); 6117 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6118 NULL, NULL);
6482 if (IS_ERR(event)) { 6119 if (IS_ERR(event)) {
6483 err = PTR_ERR(event); 6120 err = PTR_ERR(event);
6484 goto err_task; 6121 goto err_task;
@@ -6663,7 +6300,8 @@ err_fd:
6663struct perf_event * 6300struct perf_event *
6664perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 6301perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6665 struct task_struct *task, 6302 struct task_struct *task,
6666 perf_overflow_handler_t overflow_handler) 6303 perf_overflow_handler_t overflow_handler,
6304 void *context)
6667{ 6305{
6668 struct perf_event_context *ctx; 6306 struct perf_event_context *ctx;
6669 struct perf_event *event; 6307 struct perf_event *event;
@@ -6673,7 +6311,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6673 * Get the target context (task or percpu): 6311 * Get the target context (task or percpu):
6674 */ 6312 */
6675 6313
6676 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); 6314 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
6315 overflow_handler, context);
6677 if (IS_ERR(event)) { 6316 if (IS_ERR(event)) {
6678 err = PTR_ERR(event); 6317 err = PTR_ERR(event);
6679 goto err; 6318 goto err;
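
perf_event_create_kernel_counter() now takes the caller's context pointer next to the overflow handler, and perf_event_alloc() stores it in event->overflow_handler_context, so a callback can find its private state without a global. A minimal in-kernel sketch (hypothetical watcher; the handler signature is the nmi-less prototype introduced by this series, and attr/cpu are assumed to come from the caller):

#include <linux/perf_event.h>
#include <linux/slab.h>
#include <linux/err.h>

struct my_watch {
        unsigned long hits;
};

static void my_overflow(struct perf_event *event,
                        struct perf_sample_data *data, struct pt_regs *regs)
{
        struct my_watch *w = event->overflow_handler_context;

        w->hits++;      /* private state travels with the event */
}

static struct perf_event *my_start(struct perf_event_attr *attr, int cpu)
{
        struct my_watch *w = kzalloc(sizeof(*w), GFP_KERNEL);

        if (!w)
                return ERR_PTR(-ENOMEM);

        return perf_event_create_kernel_counter(attr, cpu, NULL,
                                                 my_overflow, w);
}
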
@@ -6780,7 +6419,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6780 * our context. 6419 * our context.
6781 */ 6420 */
6782 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 6421 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6783 task_ctx_sched_out(child_ctx, EVENT_ALL);
6784 6422
6785 /* 6423 /*
6786 * Take the context lock here so that if find_get_context is 6424 * Take the context lock here so that if find_get_context is
@@ -6788,6 +6426,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6788 * incremented the context's refcount before we do put_ctx below. 6426 * incremented the context's refcount before we do put_ctx below.
6789 */ 6427 */
6790 raw_spin_lock(&child_ctx->lock); 6428 raw_spin_lock(&child_ctx->lock);
6429 task_ctx_sched_out(child_ctx);
6791 child->perf_event_ctxp[ctxn] = NULL; 6430 child->perf_event_ctxp[ctxn] = NULL;
6792 /* 6431 /*
6793 * If this context is a clone; unclone it so it can't get 6432 * If this context is a clone; unclone it so it can't get
@@ -6957,7 +6596,7 @@ inherit_event(struct perf_event *parent_event,
6957 parent_event->cpu, 6596 parent_event->cpu,
6958 child, 6597 child,
6959 group_leader, parent_event, 6598 group_leader, parent_event,
6960 NULL); 6599 NULL, NULL);
6961 if (IS_ERR(child_event)) 6600 if (IS_ERR(child_event))
6962 return child_event; 6601 return child_event;
6963 get_ctx(child_ctx); 6602 get_ctx(child_ctx);
@@ -6984,6 +6623,8 @@ inherit_event(struct perf_event *parent_event,
6984 6623
6985 child_event->ctx = child_ctx; 6624 child_event->ctx = child_ctx;
6986 child_event->overflow_handler = parent_event->overflow_handler; 6625 child_event->overflow_handler = parent_event->overflow_handler;
6626 child_event->overflow_handler_context
6627 = parent_event->overflow_handler_context;
6987 6628
6988 /* 6629 /*
6989 * Precalculate sample_data sizes 6630 * Precalculate sample_data sizes
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55..b7971d6f38b 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
431struct perf_event * 431struct perf_event *
432register_user_hw_breakpoint(struct perf_event_attr *attr, 432register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 void *context,
434 struct task_struct *tsk) 435 struct task_struct *tsk)
435{ 436{
436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered); 437 return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
438 context);
437} 439}
438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 440EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
439 441
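
register_user_hw_breakpoint() grows the same context argument and simply forwards it to perf_event_create_kernel_counter(). A hedged caller sketch (hypothetical names; assumes <linux/hw_breakpoint.h> and that 'tsk' and 'watched_addr' are supplied by the caller):

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <linux/kernel.h>

static char watch_tag[] = "my watched word";    /* illustrative context */

static void my_bp_triggered(struct perf_event *bp,
                            struct perf_sample_data *data,
                            struct pt_regs *regs)
{
        char *tag = bp->overflow_handler_context;

        pr_info("breakpoint hit: %s\n", tag);
}

static struct perf_event *watch_write(struct task_struct *tsk,
                                      unsigned long watched_addr)
{
        struct perf_event_attr attr;

        hw_breakpoint_init(&attr);
        attr.bp_addr = watched_addr;
        attr.bp_len  = HW_BREAKPOINT_LEN_4;
        attr.bp_type = HW_BREAKPOINT_W;

        return register_user_hw_breakpoint(&attr, my_bp_triggered,
                                           watch_tag, tsk);
}
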
@@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
502 */ 504 */
503struct perf_event * __percpu * 505struct perf_event * __percpu *
504register_wide_hw_breakpoint(struct perf_event_attr *attr, 506register_wide_hw_breakpoint(struct perf_event_attr *attr,
505 perf_overflow_handler_t triggered) 507 perf_overflow_handler_t triggered,
508 void *context)
506{ 509{
507 struct perf_event * __percpu *cpu_events, **pevent, *bp; 510 struct perf_event * __percpu *cpu_events, **pevent, *bp;
508 long err; 511 long err;
@@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
515 get_online_cpus(); 518 get_online_cpus();
516 for_each_online_cpu(cpu) { 519 for_each_online_cpu(cpu) {
517 pevent = per_cpu_ptr(cpu_events, cpu); 520 pevent = per_cpu_ptr(cpu_events, cpu);
518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); 521 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
522 triggered, context);
519 523
520 *pevent = bp; 524 *pevent = bp;
521 525
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 00000000000..09097dd8116
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,96 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H
3
4#define RING_BUFFER_WRITABLE 0x01
5
6struct ring_buffer {
7 atomic_t refcount;
8 struct rcu_head rcu_head;
9#ifdef CONFIG_PERF_USE_VMALLOC
10 struct work_struct work;
11 int page_order; /* allocation order */
12#endif
13 int nr_pages; /* nr of data pages */
14 int writable; /* are we writable */
15
16 atomic_t poll; /* POLL_ for wakeups */
17
18 local_t head; /* write position */
19 local_t nest; /* nested writers */
20 local_t events; /* event limit */
21 local_t wakeup; /* wakeup stamp */
22 local_t lost; /* nr records lost */
23
24 long watermark; /* wakeup watermark */
25
26 struct perf_event_mmap_page *user_page;
27 void *data_pages[0];
28};
29
30extern void rb_free(struct ring_buffer *rb);
31extern struct ring_buffer *
32rb_alloc(int nr_pages, long watermark, int cpu, int flags);
33extern void perf_event_wakeup(struct perf_event *event);
34
35extern void
36perf_event_header__init_id(struct perf_event_header *header,
37 struct perf_sample_data *data,
38 struct perf_event *event);
39extern void
40perf_event__output_id_sample(struct perf_event *event,
41 struct perf_output_handle *handle,
42 struct perf_sample_data *sample);
43
44extern struct page *
45perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
46
47#ifdef CONFIG_PERF_USE_VMALLOC
48/*
49 * Back perf_mmap() with vmalloc memory.
50 *
51 * Required for architectures that have d-cache aliasing issues.
52 */
53
54static inline int page_order(struct ring_buffer *rb)
55{
56 return rb->page_order;
57}
58
59#else
60
61static inline int page_order(struct ring_buffer *rb)
62{
63 return 0;
64}
65#endif
66
67static unsigned long perf_data_size(struct ring_buffer *rb)
68{
69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
70}
71
72static inline void
73__output_copy(struct perf_output_handle *handle,
74 const void *buf, unsigned int len)
75{
76 do {
77 unsigned long size = min_t(unsigned long, handle->size, len);
78
79 memcpy(handle->addr, buf, size);
80
81 len -= size;
82 handle->addr += size;
83 buf += size;
84 handle->size -= size;
85 if (!handle->size) {
86 struct ring_buffer *rb = handle->rb;
87
88 handle->page++;
89 handle->page &= rb->nr_pages - 1;
90 handle->addr = rb->data_pages[handle->page];
91 handle->size = PAGE_SIZE << page_order(rb);
92 }
93 } while (len);
94}
95
96#endif /* _KERNEL_EVENTS_INTERNAL_H */
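
__output_copy() above is the work-horse behind the __output_copy() calls in the core.c hunks: it copies across page boundaries by chasing data_pages[], masking the page index with nr_pages - 1 (hence the data area must be a power-of-two number of pages). A userspace model of the same wrap logic (tiny malloc'd "pages"; all names here are made up for illustration):

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define FAKE_PAGE_SIZE 8        /* tiny pages so the wrap is visible */
#define NR_PAGES       4        /* must be a power of two */

struct fake_rb {
        char *data_pages[NR_PAGES];
};

struct fake_handle {
        struct fake_rb *rb;
        int page;               /* current data page */
        char *addr;             /* write cursor inside that page */
        size_t size;            /* bytes left in the current page */
};

static void output_copy(struct fake_handle *h, const void *buf, size_t len)
{
        while (len) {
                size_t chunk = h->size < len ? h->size : len;

                memcpy(h->addr, buf, chunk);
                len -= chunk;
                buf = (const char *)buf + chunk;
                h->addr += chunk;
                h->size -= chunk;

                if (!h->size) {                         /* page exhausted: wrap */
                        h->page = (h->page + 1) & (NR_PAGES - 1);
                        h->addr = h->rb->data_pages[h->page];
                        h->size = FAKE_PAGE_SIZE;
                }
        }
}

int main(void)
{
        struct fake_rb rb;
        struct fake_handle h = { .rb = &rb, .page = 0 };
        int i;

        for (i = 0; i < NR_PAGES; i++)
                rb.data_pages[i] = calloc(1, FAKE_PAGE_SIZE);
        h.addr = rb.data_pages[0];
        h.size = FAKE_PAGE_SIZE;

        output_copy(&h, "spans-three-fake-pages", 22);
        printf("page1: %.8s\n", rb.data_pages[1]);      /* middle chunk */
        return 0;
}
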
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 00000000000..a2a29205cc0
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,380 @@
1/*
2 * Performance events ring-buffer code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15
16#include "internal.h"
17
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long mask;
22
23 if (!rb->writable)
24 return true;
25
26 mask = perf_data_size(rb) - 1;
27
28 offset = (offset - tail) & mask;
29 head = (head - tail) & mask;
30
31 if ((int)(head - offset) < 0)
32 return false;
33
34 return true;
35}
36
37static void perf_output_wakeup(struct perf_output_handle *handle)
38{
39 atomic_set(&handle->rb->poll, POLL_IN);
40
41 handle->event->pending_wakeup = 1;
42 irq_work_queue(&handle->event->pending);
43}
44
45/*
46 * We need to ensure a later event_id doesn't publish a head when a former
47 * event isn't done writing. However since we need to deal with NMIs we
48 * cannot fully serialize things.
49 *
50 * We only publish the head (and generate a wakeup) when the outer-most
51 * event completes.
52 */
53static void perf_output_get_handle(struct perf_output_handle *handle)
54{
55 struct ring_buffer *rb = handle->rb;
56
57 preempt_disable();
58 local_inc(&rb->nest);
59 handle->wakeup = local_read(&rb->wakeup);
60}
61
62static void perf_output_put_handle(struct perf_output_handle *handle)
63{
64 struct ring_buffer *rb = handle->rb;
65 unsigned long head;
66
67again:
68 head = local_read(&rb->head);
69
70 /*
71 * IRQ/NMI can happen here, which means we can miss a head update.
72 */
73
74 if (!local_dec_and_test(&rb->nest))
75 goto out;
76
77 /*
78 * Publish the known good head. Rely on the full barrier implied
79 * by atomic_dec_and_test() to order the rb->head read and this

80 * write.
81 */
82 rb->user_page->data_head = head;
83
84 /*
85 * Now check if we missed an update, rely on the (compiler)
86 * barrier in atomic_dec_and_test() to re-read rb->head.
87 */
88 if (unlikely(head != local_read(&rb->head))) {
89 local_inc(&rb->nest);
90 goto again;
91 }
92
93 if (handle->wakeup != local_read(&rb->wakeup))
94 perf_output_wakeup(handle);
95
96out:
97 preempt_enable();
98}
99
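
perf_output_get_handle()/perf_output_put_handle() implement the "outer-most writer publishes" rule from the comment above: rb->nest counts writers on this CPU, and only the writer that drops it to zero copies the locally tracked head into user_page->data_head, re-checking afterwards in case an interrupting writer advanced it. A toy single-threaded model of that control flow, where a nested call stands in for an interrupting NMI/IRQ writer (the real code relies on local_t operations and the barrier they imply; none of that is modelled here):

#include <stdio.h>

static int nest;
static long head;               /* private write position            */
static long data_head;          /* what the reader is allowed to see */

static void writer(int depth, long bytes)
{
        long seen;

        nest++;
        head += bytes;          /* reserve + copy would happen here */

        if (depth)              /* simulate an NMI landing mid-write */
                writer(depth - 1, 16);

again:
        seen = head;
        if (--nest)             /* inner writer: leave publishing to the outer one */
                return;

        data_head = seen;       /* outer-most writer publishes */

        if (seen != head) {     /* missed an update from a nested writer? */
                nest++;
                goto again;
        }
}

int main(void)
{
        writer(1, 64);
        printf("head=%ld data_head=%ld\n", head, data_head);   /* 80 80 */
        return 0;
}
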
100int perf_output_begin(struct perf_output_handle *handle,
101 struct perf_event *event, unsigned int size)
102{
103 struct ring_buffer *rb;
104 unsigned long tail, offset, head;
105 int have_lost;
106 struct perf_sample_data sample_data;
107 struct {
108 struct perf_event_header header;
109 u64 id;
110 u64 lost;
111 } lost_event;
112
113 rcu_read_lock();
114 /*
115 * For inherited events we send all the output towards the parent.
116 */
117 if (event->parent)
118 event = event->parent;
119
120 rb = rcu_dereference(event->rb);
121 if (!rb)
122 goto out;
123
124 handle->rb = rb;
125 handle->event = event;
126
127 if (!rb->nr_pages)
128 goto out;
129
130 have_lost = local_read(&rb->lost);
131 if (have_lost) {
132 lost_event.header.size = sizeof(lost_event);
133 perf_event_header__init_id(&lost_event.header, &sample_data,
134 event);
135 size += lost_event.header.size;
136 }
137
138 perf_output_get_handle(handle);
139
140 do {
141 /*
142 * Userspace could choose to issue a mb() before updating the
143 * tail pointer, so that all reads will be completed before the
144 * write is issued.
145 */
146 tail = ACCESS_ONCE(rb->user_page->data_tail);
147 smp_rmb();
148 offset = head = local_read(&rb->head);
149 head += size;
150 if (unlikely(!perf_output_space(rb, tail, offset, head)))
151 goto fail;
152 } while (local_cmpxchg(&rb->head, offset, head) != offset);
153
154 if (head - local_read(&rb->wakeup) > rb->watermark)
155 local_add(rb->watermark, &rb->wakeup);
156
157 handle->page = offset >> (PAGE_SHIFT + page_order(rb));
158 handle->page &= rb->nr_pages - 1;
159 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
160 handle->addr = rb->data_pages[handle->page];
161 handle->addr += handle->size;
162 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
163
164 if (have_lost) {
165 lost_event.header.type = PERF_RECORD_LOST;
166 lost_event.header.misc = 0;
167 lost_event.id = event->id;
168 lost_event.lost = local_xchg(&rb->lost, 0);
169
170 perf_output_put(handle, lost_event);
171 perf_event__output_id_sample(event, handle, &sample_data);
172 }
173
174 return 0;
175
176fail:
177 local_inc(&rb->lost);
178 perf_output_put_handle(handle);
179out:
180 rcu_read_unlock();
181
182 return -ENOSPC;
183}
184
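
perf_output_begin() reserves space by advancing rb->head with local_cmpxchg() in a retry loop and only then lets the caller copy the record; the data_tail read plus smp_rmb() pairs with the barrier userspace is expected to issue before publishing its tail. A hedged sketch of the matching reader side in userspace (this-era layout with the data area starting one page after the control page; __sync_synchronize() stands in for a proper rmb/mb, and records wrapping the buffer end are not re-assembled):

#include <linux/perf_event.h>
#include <stdint.h>
#include <unistd.h>

/*
 * Consume whatever is currently in an mmap()ed perf ring buffer.
 * 'base' is the mmap address, 'data_size' the power-of-two data area size.
 */
static void drain(void *base, uint64_t data_size,
                  void (*cb)(struct perf_event_header *))
{
        struct perf_event_mmap_page *pc = base;
        char *data = (char *)base + sysconf(_SC_PAGESIZE);
        uint64_t tail = pc->data_tail;
        uint64_t head = pc->data_head;

        __sync_synchronize();           /* read head before reading records */

        while (tail < head) {
                struct perf_event_header *hdr =
                        (void *)(data + (tail & (data_size - 1)));

                cb(hdr);
                tail += hdr->size;
        }

        __sync_synchronize();           /* finish reading before freeing space */
        pc->data_tail = tail;
}
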
185void perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len)
187{
188 __output_copy(handle, buf, len);
189}
190
191void perf_output_end(struct perf_output_handle *handle)
192{
193 perf_output_put_handle(handle);
194 rcu_read_unlock();
195}
196
197static void
198ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
199{
200 long max_size = perf_data_size(rb);
201
202 if (watermark)
203 rb->watermark = min(max_size, watermark);
204
205 if (!rb->watermark)
206 rb->watermark = max_size / 2;
207
208 if (flags & RING_BUFFER_WRITABLE)
209 rb->writable = 1;
210
211 atomic_set(&rb->refcount, 1);
212}
213
214#ifndef CONFIG_PERF_USE_VMALLOC
215
216/*
217 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
218 */
219
220struct page *
221perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
222{
223 if (pgoff > rb->nr_pages)
224 return NULL;
225
226 if (pgoff == 0)
227 return virt_to_page(rb->user_page);
228
229 return virt_to_page(rb->data_pages[pgoff - 1]);
230}
231
232static void *perf_mmap_alloc_page(int cpu)
233{
234 struct page *page;
235 int node;
236
237 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
238 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
239 if (!page)
240 return NULL;
241
242 return page_address(page);
243}
244
245struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
246{
247 struct ring_buffer *rb;
248 unsigned long size;
249 int i;
250
251 size = sizeof(struct ring_buffer);
252 size += nr_pages * sizeof(void *);
253
254 rb = kzalloc(size, GFP_KERNEL);
255 if (!rb)
256 goto fail;
257
258 rb->user_page = perf_mmap_alloc_page(cpu);
259 if (!rb->user_page)
260 goto fail_user_page;
261
262 for (i = 0; i < nr_pages; i++) {
263 rb->data_pages[i] = perf_mmap_alloc_page(cpu);
264 if (!rb->data_pages[i])
265 goto fail_data_pages;
266 }
267
268 rb->nr_pages = nr_pages;
269
270 ring_buffer_init(rb, watermark, flags);
271
272 return rb;
273
274fail_data_pages:
275 for (i--; i >= 0; i--)
276 free_page((unsigned long)rb->data_pages[i]);
277
278 free_page((unsigned long)rb->user_page);
279
280fail_user_page:
281 kfree(rb);
282
283fail:
284 return NULL;
285}
286
287static void perf_mmap_free_page(unsigned long addr)
288{
289 struct page *page = virt_to_page((void *)addr);
290
291 page->mapping = NULL;
292 __free_page(page);
293}
294
295void rb_free(struct ring_buffer *rb)
296{
297 int i;
298
299 perf_mmap_free_page((unsigned long)rb->user_page);
300 for (i = 0; i < rb->nr_pages; i++)
301 perf_mmap_free_page((unsigned long)rb->data_pages[i]);
302 kfree(rb);
303}
304
305#else
306
307struct page *
308perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
309{
310 if (pgoff > (1UL << page_order(rb)))
311 return NULL;
312
313 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
314}
315
316static void perf_mmap_unmark_page(void *addr)
317{
318 struct page *page = vmalloc_to_page(addr);
319
320 page->mapping = NULL;
321}
322
323static void rb_free_work(struct work_struct *work)
324{
325 struct ring_buffer *rb;
326 void *base;
327 int i, nr;
328
329 rb = container_of(work, struct ring_buffer, work);
330 nr = 1 << page_order(rb);
331
332 base = rb->user_page;
333 for (i = 0; i < nr + 1; i++)
334 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
335
336 vfree(base);
337 kfree(rb);
338}
339
340void rb_free(struct ring_buffer *rb)
341{
342 schedule_work(&rb->work);
343}
344
345struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
346{
347 struct ring_buffer *rb;
348 unsigned long size;
349 void *all_buf;
350
351 size = sizeof(struct ring_buffer);
352 size += sizeof(void *);
353
354 rb = kzalloc(size, GFP_KERNEL);
355 if (!rb)
356 goto fail;
357
358 INIT_WORK(&rb->work, rb_free_work);
359
360 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
361 if (!all_buf)
362 goto fail_all_buf;
363
364 rb->user_page = all_buf;
365 rb->data_pages[0] = all_buf + PAGE_SIZE;
366 rb->page_order = ilog2(nr_pages);
367 rb->nr_pages = 1;
368
369 ring_buffer_init(rb, watermark, flags);
370
371 return rb;
372
373fail_all_buf:
374 kfree(rb);
375
376fail:
377 return NULL;
378}
379
380#endif
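
Note how the vmalloc variant leaves the rest of the code untouched: it backs the whole buffer with one vmalloc_user() area, then reports nr_pages = 1 with page_order = ilog2(nr_pages), so perf_data_size() still yields the full size and the page masking in __output_copy() degenerates to staying on "page" 0. A quick check of that arithmetic (userspace, hypothetical values):

#include <stdio.h>

int main(void)
{
        unsigned long page_shift = 12;          /* assume 4 KiB pages */
        unsigned long nr_data_pages = 8;        /* what the user mmap()ed */

        /* regular backing: 8 pages of order 0 */
        unsigned long size_a = nr_data_pages << (page_shift + 0);

        /* vmalloc backing: "1 page" of order ilog2(8) == 3 */
        unsigned long size_b = 1UL << (page_shift + 3);

        printf("%lu %lu\n", size_a, size_b);    /* both 32768 */
        return 0;
}
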
diff --git a/kernel/exit.c b/kernel/exit.c
index 64879bdff92..9d13da8a8c2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -87,7 +87,6 @@ static void __exit_signal(struct task_struct *tsk)
87 struct tty_struct *uninitialized_var(tty); 87 struct tty_struct *uninitialized_var(tty);
88 88
89 sighand = rcu_dereference_check(tsk->sighand, 89 sighand = rcu_dereference_check(tsk->sighand,
90 rcu_read_lock_held() ||
91 lockdep_tasklist_lock_is_held()); 90 lockdep_tasklist_lock_is_held());
92 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
93 92
@@ -171,7 +170,6 @@ void release_task(struct task_struct * p)
171 struct task_struct *leader; 170 struct task_struct *leader;
172 int zap_leader; 171 int zap_leader;
173repeat: 172repeat:
174 tracehook_prepare_release_task(p);
175 /* don't need to get the RCU readlock here - the process is dead and 173 /* don't need to get the RCU readlock here - the process is dead and
176 * can't be modifying its own credentials. But shut RCU-lockdep up */ 174 * can't be modifying its own credentials. But shut RCU-lockdep up */
177 rcu_read_lock(); 175 rcu_read_lock();
@@ -181,7 +179,7 @@ repeat:
181 proc_flush_task(p); 179 proc_flush_task(p);
182 180
183 write_lock_irq(&tasklist_lock); 181 write_lock_irq(&tasklist_lock);
184 tracehook_finish_release_task(p); 182 ptrace_release_task(p);
185 __exit_signal(p); 183 __exit_signal(p);
186 184
187 /* 185 /*
@@ -192,22 +190,12 @@ repeat:
192 zap_leader = 0; 190 zap_leader = 0;
193 leader = p->group_leader; 191 leader = p->group_leader;
194 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 192 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
195 BUG_ON(task_detached(leader));
196 do_notify_parent(leader, leader->exit_signal);
197 /* 193 /*
198 * If we were the last child thread and the leader has 194 * If we were the last child thread and the leader has
199 * exited already, and the leader's parent ignores SIGCHLD, 195 * exited already, and the leader's parent ignores SIGCHLD,
200 * then we are the one who should release the leader. 196 * then we are the one who should release the leader.
201 *
202 * do_notify_parent() will have marked it self-reaping in
203 * that case.
204 */
205 zap_leader = task_detached(leader);
206
207 /*
208 * This maintains the invariant that release_task()
209 * only runs on a task in EXIT_DEAD, just for sanity.
210 */ 197 */
198 zap_leader = do_notify_parent(leader, leader->exit_signal);
211 if (zap_leader) 199 if (zap_leader)
212 leader->exit_state = EXIT_DEAD; 200 leader->exit_state = EXIT_DEAD;
213 } 201 }
@@ -279,18 +267,16 @@ int is_current_pgrp_orphaned(void)
279 return retval; 267 return retval;
280} 268}
281 269
282static int has_stopped_jobs(struct pid *pgrp) 270static bool has_stopped_jobs(struct pid *pgrp)
283{ 271{
284 int retval = 0;
285 struct task_struct *p; 272 struct task_struct *p;
286 273
287 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 274 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
288 if (!task_is_stopped(p)) 275 if (p->signal->flags & SIGNAL_STOP_STOPPED)
289 continue; 276 return true;
290 retval = 1;
291 break;
292 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 277 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
293 return retval; 278
279 return false;
294} 280}
295 281
296/* 282/*
@@ -753,7 +739,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
753{ 739{
754 list_move_tail(&p->sibling, &p->real_parent->children); 740 list_move_tail(&p->sibling, &p->real_parent->children);
755 741
756 if (task_detached(p)) 742 if (p->exit_state == EXIT_DEAD)
757 return; 743 return;
758 /* 744 /*
759 * If this is a threaded reparent there is no need to 745 * If this is a threaded reparent there is no need to
@@ -766,10 +752,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
766 p->exit_signal = SIGCHLD; 752 p->exit_signal = SIGCHLD;
767 753
768 /* If it has exited notify the new parent about this child's death. */ 754 /* If it has exited notify the new parent about this child's death. */
769 if (!task_ptrace(p) && 755 if (!p->ptrace &&
770 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 756 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
771 do_notify_parent(p, p->exit_signal); 757 if (do_notify_parent(p, p->exit_signal)) {
772 if (task_detached(p)) {
773 p->exit_state = EXIT_DEAD; 758 p->exit_state = EXIT_DEAD;
774 list_move_tail(&p->sibling, dead); 759 list_move_tail(&p->sibling, dead);
775 } 760 }
@@ -796,7 +781,7 @@ static void forget_original_parent(struct task_struct *father)
796 do { 781 do {
797 t->real_parent = reaper; 782 t->real_parent = reaper;
798 if (t->parent == father) { 783 if (t->parent == father) {
799 BUG_ON(task_ptrace(t)); 784 BUG_ON(t->ptrace);
800 t->parent = t->real_parent; 785 t->parent = t->real_parent;
801 } 786 }
802 if (t->pdeath_signal) 787 if (t->pdeath_signal)
@@ -821,8 +806,7 @@ static void forget_original_parent(struct task_struct *father)
821 */ 806 */
822static void exit_notify(struct task_struct *tsk, int group_dead) 807static void exit_notify(struct task_struct *tsk, int group_dead)
823{ 808{
824 int signal; 809 bool autoreap;
825 void *cookie;
826 810
827 /* 811 /*
828 * This does two things: 812 * This does two things:
@@ -853,26 +837,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
853 * we have changed execution domain as these two values started 837 * we have changed execution domain as these two values started
854 * the same after a fork. 838 * the same after a fork.
855 */ 839 */
856 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && 840 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
857 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 841 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
858 tsk->self_exec_id != tsk->parent_exec_id)) 842 tsk->self_exec_id != tsk->parent_exec_id))
859 tsk->exit_signal = SIGCHLD; 843 tsk->exit_signal = SIGCHLD;
860 844
861 signal = tracehook_notify_death(tsk, &cookie, group_dead); 845 if (unlikely(tsk->ptrace)) {
862 if (signal >= 0) 846 int sig = thread_group_leader(tsk) &&
863 signal = do_notify_parent(tsk, signal); 847 thread_group_empty(tsk) &&
848 !ptrace_reparented(tsk) ?
849 tsk->exit_signal : SIGCHLD;
850 autoreap = do_notify_parent(tsk, sig);
851 } else if (thread_group_leader(tsk)) {
852 autoreap = thread_group_empty(tsk) &&
853 do_notify_parent(tsk, tsk->exit_signal);
854 } else {
855 autoreap = true;
856 }
864 857
865 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 858 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
866 859
867 /* mt-exec, de_thread() is waiting for group leader */ 860 /* mt-exec, de_thread() is waiting for group leader */
868 if (unlikely(tsk->signal->notify_count < 0)) 861 if (unlikely(tsk->signal->notify_count < 0))
869 wake_up_process(tsk->signal->group_exit_task); 862 wake_up_process(tsk->signal->group_exit_task);
870 write_unlock_irq(&tasklist_lock); 863 write_unlock_irq(&tasklist_lock);
871 864
872 tracehook_report_death(tsk, signal, cookie, group_dead);
873
874 /* If the process is dead, release it - nobody will wait for it */ 865 /* If the process is dead, release it - nobody will wait for it */
875 if (signal == DEATH_REAP) 866 if (autoreap)
876 release_task(tsk); 867 release_task(tsk);
877} 868}
878 869
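
exit_notify() now folds the old tracehook_notify_death() logic into an explicit three-way choice: a ptraced task always notifies its tracer (with exit_signal only when it is a whole, non-reparented group leader), an untraced group leader notifies the real parent only once the group is empty, and a plain sub-thread is always auto-reaped; do_notify_parent()'s return value then says whether the parent ignored the death. A condensed restatement of that decision as a fragment (the helpers are the kernel's own; tasklist_lock is assumed held, as in exit_notify()):

/* Would 'tsk' be released immediately, or left as a zombie for wait()? */
static bool exit_autoreap(struct task_struct *tsk)
{
        if (unlikely(tsk->ptrace)) {
                int sig = thread_group_leader(tsk) &&
                          thread_group_empty(tsk) &&
                          !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD;

                /* the tracer always hears about it; it may still ignore it */
                return do_notify_parent(tsk, sig);
        }

        if (thread_group_leader(tsk))
                /* notify only when the last thread goes; autoreap if ignored */
                return thread_group_empty(tsk) &&
                       do_notify_parent(tsk, tsk->exit_signal);

        /* sub-threads never turn into zombies themselves */
        return true;
}
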
@@ -908,7 +899,6 @@ NORET_TYPE void do_exit(long code)
908 899
909 profile_task_exit(tsk); 900 profile_task_exit(tsk);
910 901
911 WARN_ON(atomic_read(&tsk->fs_excl));
912 WARN_ON(blk_needs_flush_plug(tsk)); 902 WARN_ON(blk_needs_flush_plug(tsk));
913 903
914 if (unlikely(in_interrupt())) 904 if (unlikely(in_interrupt()))
@@ -925,7 +915,7 @@ NORET_TYPE void do_exit(long code)
925 */ 915 */
926 set_fs(USER_DS); 916 set_fs(USER_DS);
927 917
928 tracehook_report_exit(&code); 918 ptrace_event(PTRACE_EVENT_EXIT, code);
929 919
930 validate_creds_for_do_exit(tsk); 920 validate_creds_for_do_exit(tsk);
931 921
@@ -994,6 +984,7 @@ NORET_TYPE void do_exit(long code)
994 trace_sched_process_exit(tsk); 984 trace_sched_process_exit(tsk);
995 985
996 exit_sem(tsk); 986 exit_sem(tsk);
987 exit_shm(tsk);
997 exit_files(tsk); 988 exit_files(tsk);
998 exit_fs(tsk); 989 exit_fs(tsk);
999 check_stack_usage(); 990 check_stack_usage();
@@ -1239,9 +1230,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1239 traced = ptrace_reparented(p); 1230 traced = ptrace_reparented(p);
1240 /* 1231 /*
1241 * It can be ptraced but not reparented, check 1232 * It can be ptraced but not reparented, check
1242 * !task_detached() to filter out sub-threads. 1233 * thread_group_leader() to filter out sub-threads.
1243 */ 1234 */
1244 if (likely(!traced) && likely(!task_detached(p))) { 1235 if (likely(!traced) && thread_group_leader(p)) {
1245 struct signal_struct *psig; 1236 struct signal_struct *psig;
1246 struct signal_struct *sig; 1237 struct signal_struct *sig;
1247 unsigned long maxrss; 1238 unsigned long maxrss;
@@ -1349,16 +1340,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1349 /* We dropped tasklist, ptracer could die and untrace */ 1340 /* We dropped tasklist, ptracer could die and untrace */
1350 ptrace_unlink(p); 1341 ptrace_unlink(p);
1351 /* 1342 /*
1352 * If this is not a detached task, notify the parent. 1343 * If this is not a sub-thread, notify the parent.
1353 * If it's still not detached after that, don't release 1344 * If parent wants a zombie, don't release it now.
1354 * it now.
1355 */ 1345 */
1356 if (!task_detached(p)) { 1346 if (thread_group_leader(p) &&
1357 do_notify_parent(p, p->exit_signal); 1347 !do_notify_parent(p, p->exit_signal)) {
1358 if (!task_detached(p)) { 1348 p->exit_state = EXIT_ZOMBIE;
1359 p->exit_state = EXIT_ZOMBIE; 1349 p = NULL;
1360 p = NULL;
1361 }
1362 } 1350 }
1363 write_unlock_irq(&tasklist_lock); 1351 write_unlock_irq(&tasklist_lock);
1364 } 1352 }
@@ -1371,7 +1359,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1371static int *task_stopped_code(struct task_struct *p, bool ptrace) 1359static int *task_stopped_code(struct task_struct *p, bool ptrace)
1372{ 1360{
1373 if (ptrace) { 1361 if (ptrace) {
1374 if (task_is_stopped_or_traced(p)) 1362 if (task_is_stopped_or_traced(p) &&
1363 !(p->jobctl & JOBCTL_LISTENING))
1375 return &p->exit_code; 1364 return &p->exit_code;
1376 } else { 1365 } else {
1377 if (p->signal->flags & SIGNAL_STOP_STOPPED) 1366 if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1557,8 +1546,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1557 } 1546 }
1558 1547
1559 /* dead body doesn't have much to contribute */ 1548 /* dead body doesn't have much to contribute */
1560 if (p->exit_state == EXIT_DEAD) 1549 if (unlikely(p->exit_state == EXIT_DEAD)) {
1550 /*
1551 * But do not ignore this task until the tracer does
1552 * wait_task_zombie()->do_notify_parent().
1553 */
1554 if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
1555 wo->notask_error = 0;
1561 return 0; 1556 return 0;
1557 }
1562 1558
1563 /* slay zombie? */ 1559 /* slay zombie? */
1564 if (p->exit_state == EXIT_ZOMBIE) { 1560 if (p->exit_state == EXIT_ZOMBIE) {
@@ -1567,7 +1563,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1567 * Notification and reaping will be cascaded to the real 1563 * Notification and reaping will be cascaded to the real
1568 * parent when the ptracer detaches. 1564 * parent when the ptracer detaches.
1569 */ 1565 */
1570 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1566 if (likely(!ptrace) && unlikely(p->ptrace)) {
1571 /* it will become visible, clear notask_error */ 1567 /* it will become visible, clear notask_error */
1572 wo->notask_error = 0; 1568 wo->notask_error = 0;
1573 return 0; 1569 return 0;
@@ -1610,8 +1606,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1610 * own children, it should create a separate process which 1606 * own children, it should create a separate process which
1611 * takes the role of real parent. 1607 * takes the role of real parent.
1612 */ 1608 */
1613 if (likely(!ptrace) && task_ptrace(p) && 1609 if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
1614 same_thread_group(p->parent, p->real_parent))
1615 return 0; 1610 return 0;
1616 1611
1617 /* 1612 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 25c6111fe3a..067992d4838 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
37#include <linux/swap.h> 37#include <linux/swap.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/jiffies.h> 39#include <linux/jiffies.h>
40#include <linux/tracehook.h>
41#include <linux/futex.h> 40#include <linux/futex.h>
42#include <linux/compat.h> 41#include <linux/compat.h>
43#include <linux/kthread.h> 42#include <linux/kthread.h>
@@ -84,7 +83,7 @@
84 * Protected counters by write_lock_irq(&tasklist_lock) 83 * Protected counters by write_lock_irq(&tasklist_lock)
85 */ 84 */
86unsigned long total_forks; /* Handle normal Linux uptimes. */ 85unsigned long total_forks; /* Handle normal Linux uptimes. */
87int nr_threads; /* The idle threads do not count.. */ 86int nr_threads; /* The idle threads do not count.. */
88 87
89int max_threads; /* tunable limit on nr_threads */ 88int max_threads; /* tunable limit on nr_threads */
90 89
@@ -157,6 +156,9 @@ struct kmem_cache *vm_area_cachep;
157/* SLAB cache for mm_struct structures (tsk->mm) */ 156/* SLAB cache for mm_struct structures (tsk->mm) */
158static struct kmem_cache *mm_cachep; 157static struct kmem_cache *mm_cachep;
159 158
159/* Notifier list called when a task struct is freed */
160static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
161
160static void account_kernel_stack(struct thread_info *ti, int account) 162static void account_kernel_stack(struct thread_info *ti, int account)
161{ 163{
162 struct zone *zone = page_zone(virt_to_page(ti)); 164 struct zone *zone = page_zone(virt_to_page(ti));
@@ -188,6 +190,18 @@ static inline void put_signal_struct(struct signal_struct *sig)
188 free_signal_struct(sig); 190 free_signal_struct(sig);
189} 191}
190 192
193int task_free_register(struct notifier_block *n)
194{
195 return atomic_notifier_chain_register(&task_free_notifier, n);
196}
197EXPORT_SYMBOL(task_free_register);
198
199int task_free_unregister(struct notifier_block *n)
200{
201 return atomic_notifier_chain_unregister(&task_free_notifier, n);
202}
203EXPORT_SYMBOL(task_free_unregister);
204
191void __put_task_struct(struct task_struct *tsk) 205void __put_task_struct(struct task_struct *tsk)
192{ 206{
193 WARN_ON(!tsk->exit_state); 207 WARN_ON(!tsk->exit_state);
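
The new task_free_notifier chain lets other code hear about every __put_task_struct(); task_free_register()/task_free_unregister() are exported, and callbacks run with the usual atomic-notifier signature, receiving the task as the data pointer. This hook is an addition carried in this tree, not a mainline API. A minimal module-side sketch (assumes the matching declarations this tree adds to sched.h; the callback may run in atomic context, so it only logs):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/sched.h>

static int my_task_free(struct notifier_block *nb, unsigned long action,
                        void *data)
{
        struct task_struct *tsk = data;

        pr_debug("task %d is being freed\n", tsk->pid);
        return NOTIFY_OK;
}

static struct notifier_block my_task_free_nb = {
        .notifier_call = my_task_free,
};

static int __init my_init(void)
{
        return task_free_register(&my_task_free_nb);
}

static void __exit my_exit(void)
{
        task_free_unregister(&my_task_free_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
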
@@ -199,6 +213,7 @@ void __put_task_struct(struct task_struct *tsk)
199 delayacct_tsk_free(tsk); 213 delayacct_tsk_free(tsk);
200 put_signal_struct(tsk->signal); 214 put_signal_struct(tsk->signal);
201 215
216 atomic_notifier_call_chain(&task_free_notifier, 0, tsk);
202 if (!profile_handoff_task(tsk)) 217 if (!profile_handoff_task(tsk))
203 free_task(tsk); 218 free_task(tsk);
204} 219}
@@ -237,7 +252,7 @@ void __init fork_init(unsigned long mempages)
237 /* 252 /*
238 * we need to allow at least 20 threads to boot a system 253 * we need to allow at least 20 threads to boot a system
239 */ 254 */
240 if(max_threads < 20) 255 if (max_threads < 20)
241 max_threads = 20; 256 max_threads = 20;
242 257
243 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; 258 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
@@ -273,7 +288,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
273 return NULL; 288 return NULL;
274 } 289 }
275 290
276 err = arch_dup_task_struct(tsk, orig); 291 err = arch_dup_task_struct(tsk, orig);
277 if (err) 292 if (err)
278 goto out; 293 goto out;
279 294
@@ -296,9 +311,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
296 tsk->stack_canary = get_random_int(); 311 tsk->stack_canary = get_random_int();
297#endif 312#endif
298 313
299 /* One for us, one for whoever does the "release_task()" (usually parent) */ 314 /*
300 atomic_set(&tsk->usage,2); 315 * One for us, one for whoever does the "release_task()" (usually
301 atomic_set(&tsk->fs_excl, 0); 316 * parent)
317 */
318 atomic_set(&tsk->usage, 2);
302#ifdef CONFIG_BLK_DEV_IO_TRACE 319#ifdef CONFIG_BLK_DEV_IO_TRACE
303 tsk->btrace_seq = 0; 320 tsk->btrace_seq = 0;
304#endif 321#endif
@@ -446,7 +463,7 @@ fail_nomem:
446 goto out; 463 goto out;
447} 464}
448 465
449static inline int mm_alloc_pgd(struct mm_struct * mm) 466static inline int mm_alloc_pgd(struct mm_struct *mm)
450{ 467{
451 mm->pgd = pgd_alloc(mm); 468 mm->pgd = pgd_alloc(mm);
452 if (unlikely(!mm->pgd)) 469 if (unlikely(!mm->pgd))
@@ -454,7 +471,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
454 return 0; 471 return 0;
455} 472}
456 473
457static inline void mm_free_pgd(struct mm_struct * mm) 474static inline void mm_free_pgd(struct mm_struct *mm)
458{ 475{
459 pgd_free(mm, mm->pgd); 476 pgd_free(mm, mm->pgd);
460} 477}
@@ -491,7 +508,7 @@ static void mm_init_aio(struct mm_struct *mm)
491#endif 508#endif
492} 509}
493 510
494static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 511static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
495{ 512{
496 atomic_set(&mm->mm_users, 1); 513 atomic_set(&mm->mm_users, 1);
497 atomic_set(&mm->mm_count, 1); 514 atomic_set(&mm->mm_count, 1);
@@ -522,9 +539,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
522/* 539/*
523 * Allocate and initialize an mm_struct. 540 * Allocate and initialize an mm_struct.
524 */ 541 */
525struct mm_struct * mm_alloc(void) 542struct mm_struct *mm_alloc(void)
526{ 543{
527 struct mm_struct * mm; 544 struct mm_struct *mm;
528 545
529 mm = allocate_mm(); 546 mm = allocate_mm();
530 if (!mm) 547 if (!mm)
@@ -592,7 +609,7 @@ void added_exe_file_vma(struct mm_struct *mm)
592void removed_exe_file_vma(struct mm_struct *mm) 609void removed_exe_file_vma(struct mm_struct *mm)
593{ 610{
594 mm->num_exe_file_vmas--; 611 mm->num_exe_file_vmas--;
595 if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ 612 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
596 fput(mm->exe_file); 613 fput(mm->exe_file);
597 mm->exe_file = NULL; 614 mm->exe_file = NULL;
598 } 615 }
@@ -784,9 +801,9 @@ fail_nocontext:
784 return NULL; 801 return NULL;
785} 802}
786 803
787static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) 804static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
788{ 805{
789 struct mm_struct * mm, *oldmm; 806 struct mm_struct *mm, *oldmm;
790 int retval; 807 int retval;
791 808
792 tsk->min_flt = tsk->maj_flt = 0; 809 tsk->min_flt = tsk->maj_flt = 0;
@@ -853,7 +870,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
853 return 0; 870 return 0;
854} 871}
855 872
856static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 873static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
857{ 874{
858 struct files_struct *oldf, *newf; 875 struct files_struct *oldf, *newf;
859 int error = 0; 876 int error = 0;
@@ -1020,7 +1037,7 @@ static void rt_mutex_init_task(struct task_struct *p)
1020{ 1037{
1021 raw_spin_lock_init(&p->pi_lock); 1038 raw_spin_lock_init(&p->pi_lock);
1022#ifdef CONFIG_RT_MUTEXES 1039#ifdef CONFIG_RT_MUTEXES
1023 plist_head_init_raw(&p->pi_waiters, &p->pi_lock); 1040 plist_head_init(&p->pi_waiters);
1024 p->pi_blocked_on = NULL; 1041 p->pi_blocked_on = NULL;
1025#endif 1042#endif
1026} 1043}
@@ -1117,6 +1134,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1117 p->real_cred->user != INIT_USER) 1134 p->real_cred->user != INIT_USER)
1118 goto bad_fork_free; 1135 goto bad_fork_free;
1119 } 1136 }
1137 current->flags &= ~PF_NPROC_EXCEEDED;
1120 1138
1121 retval = copy_creds(p, clone_flags); 1139 retval = copy_creds(p, clone_flags);
1122 if (retval < 0) 1140 if (retval < 0)
@@ -1175,13 +1193,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1175 cgroup_fork(p); 1193 cgroup_fork(p);
1176#ifdef CONFIG_NUMA 1194#ifdef CONFIG_NUMA
1177 p->mempolicy = mpol_dup(p->mempolicy); 1195 p->mempolicy = mpol_dup(p->mempolicy);
1178 if (IS_ERR(p->mempolicy)) { 1196 if (IS_ERR(p->mempolicy)) {
1179 retval = PTR_ERR(p->mempolicy); 1197 retval = PTR_ERR(p->mempolicy);
1180 p->mempolicy = NULL; 1198 p->mempolicy = NULL;
1181 goto bad_fork_cleanup_cgroup; 1199 goto bad_fork_cleanup_cgroup;
1182 } 1200 }
1183 mpol_fix_fork_child_flag(p); 1201 mpol_fix_fork_child_flag(p);
1184#endif 1202#endif
1203#ifdef CONFIG_CPUSETS
1204 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1205 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1206#endif
1185#ifdef CONFIG_TRACE_IRQFLAGS 1207#ifdef CONFIG_TRACE_IRQFLAGS
1186 p->irq_events = 0; 1208 p->irq_events = 0;
1187#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1209#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
@@ -1221,25 +1243,33 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1221 retval = perf_event_init_task(p); 1243 retval = perf_event_init_task(p);
1222 if (retval) 1244 if (retval)
1223 goto bad_fork_cleanup_policy; 1245 goto bad_fork_cleanup_policy;
1224 1246 retval = audit_alloc(p);
1225 if ((retval = audit_alloc(p))) 1247 if (retval)
1226 goto bad_fork_cleanup_policy; 1248 goto bad_fork_cleanup_policy;
1227 /* copy all the process information */ 1249 /* copy all the process information */
1228 if ((retval = copy_semundo(clone_flags, p))) 1250 retval = copy_semundo(clone_flags, p);
1251 if (retval)
1229 goto bad_fork_cleanup_audit; 1252 goto bad_fork_cleanup_audit;
1230 if ((retval = copy_files(clone_flags, p))) 1253 retval = copy_files(clone_flags, p);
1254 if (retval)
1231 goto bad_fork_cleanup_semundo; 1255 goto bad_fork_cleanup_semundo;
1232 if ((retval = copy_fs(clone_flags, p))) 1256 retval = copy_fs(clone_flags, p);
1257 if (retval)
1233 goto bad_fork_cleanup_files; 1258 goto bad_fork_cleanup_files;
1234 if ((retval = copy_sighand(clone_flags, p))) 1259 retval = copy_sighand(clone_flags, p);
1260 if (retval)
1235 goto bad_fork_cleanup_fs; 1261 goto bad_fork_cleanup_fs;
1236 if ((retval = copy_signal(clone_flags, p))) 1262 retval = copy_signal(clone_flags, p);
1263 if (retval)
1237 goto bad_fork_cleanup_sighand; 1264 goto bad_fork_cleanup_sighand;
1238 if ((retval = copy_mm(clone_flags, p))) 1265 retval = copy_mm(clone_flags, p);
1266 if (retval)
1239 goto bad_fork_cleanup_signal; 1267 goto bad_fork_cleanup_signal;
1240 if ((retval = copy_namespaces(clone_flags, p))) 1268 retval = copy_namespaces(clone_flags, p);
1269 if (retval)
1241 goto bad_fork_cleanup_mm; 1270 goto bad_fork_cleanup_mm;
1242 if ((retval = copy_io(clone_flags, p))) 1271 retval = copy_io(clone_flags, p);
1272 if (retval)
1243 goto bad_fork_cleanup_namespaces; 1273 goto bad_fork_cleanup_namespaces;
1244 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); 1274 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1245 if (retval) 1275 if (retval)
@@ -1261,7 +1291,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1261 /* 1291 /*
1262 * Clear TID on mm_release()? 1292 * Clear TID on mm_release()?
1263 */ 1293 */
1264 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1294 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
1265#ifdef CONFIG_BLOCK 1295#ifdef CONFIG_BLOCK
1266 p->plug = NULL; 1296 p->plug = NULL;
1267#endif 1297#endif
@@ -1329,7 +1359,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1329 * its process group. 1359 * its process group.

1330 * A fatal signal pending means that current will exit, so the new 1360 * A fatal signal pending means that current will exit, so the new
1331 * thread can't slip out of an OOM kill (or normal SIGKILL). 1361 * thread can't slip out of an OOM kill (or normal SIGKILL).
1332 */ 1362 */
1333 recalc_sigpending(); 1363 recalc_sigpending();
1334 if (signal_pending(current)) { 1364 if (signal_pending(current)) {
1335 spin_unlock(&current->sighand->siglock); 1365 spin_unlock(&current->sighand->siglock);
@@ -1347,7 +1377,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1347 } 1377 }
1348 1378
1349 if (likely(p->pid)) { 1379 if (likely(p->pid)) {
1350 tracehook_finish_clone(p, clone_flags, trace); 1380 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1351 1381
1352 if (thread_group_leader(p)) { 1382 if (thread_group_leader(p)) {
1353 if (is_child_reaper(pid)) 1383 if (is_child_reaper(pid))
@@ -1488,10 +1518,22 @@ long do_fork(unsigned long clone_flags,
1488 } 1518 }
1489 1519
1490 /* 1520 /*
1491 * When called from kernel_thread, don't do user tracing stuff. 1521 * Determine whether and which event to report to ptracer. When
1522 * called from kernel_thread or CLONE_UNTRACED is explicitly
1523 * requested, no event is reported; otherwise, report if the event
1524 * for the type of forking is enabled.
1492 */ 1525 */
1493 if (likely(user_mode(regs))) 1526 if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
1494 trace = tracehook_prepare_clone(clone_flags); 1527 if (clone_flags & CLONE_VFORK)
1528 trace = PTRACE_EVENT_VFORK;
1529 else if ((clone_flags & CSIGNAL) != SIGCHLD)
1530 trace = PTRACE_EVENT_CLONE;
1531 else
1532 trace = PTRACE_EVENT_FORK;
1533
1534 if (likely(!ptrace_event_enabled(current, trace)))
1535 trace = 0;
1536 }
1495 1537
1496 p = copy_process(clone_flags, stack_start, regs, stack_size, 1538 p = copy_process(clone_flags, stack_start, regs, stack_size,
1497 child_tidptr, NULL, trace); 1539 child_tidptr, NULL, trace);
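
The block above picks the ptrace event from the clone flags: CLONE_UNTRACED (or a kernel_thread caller) suppresses reporting, CLONE_VFORK maps to PTRACE_EVENT_VFORK, a non-SIGCHLD exit signal maps to PTRACE_EVENT_CLONE, and everything else to PTRACE_EVENT_FORK, with the event dropped again if the tracer has not enabled it. The fragment below is a small userspace restatement of that mapping for experimentation only; the #ifndef fallbacks mirror the usual uapi values, and the user_mode() and ptrace_event_enabled() checks are deliberately left out.

/* Minimal sketch of the clone-flag -> ptrace event selection in do_fork().
 * PTRACE_EVENT_* values below are the ones from <linux/ptrace.h>; the
 * #ifndef fallbacks only keep the sketch self-contained. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <signal.h>

#ifndef CSIGNAL
#define CSIGNAL			0x000000ff	/* signal mask to be sent at exit */
#endif
#ifndef CLONE_UNTRACED
#define CLONE_UNTRACED		0x00800000
#endif
#define PTRACE_EVENT_FORK	1
#define PTRACE_EVENT_VFORK	2
#define PTRACE_EVENT_CLONE	3

static int clone_trace_event(unsigned long clone_flags)
{
	if (clone_flags & CLONE_UNTRACED)
		return 0;			/* caller asked not to be reported */
	if (clone_flags & CLONE_VFORK)
		return PTRACE_EVENT_VFORK;
	if ((clone_flags & CSIGNAL) != SIGCHLD)
		return PTRACE_EVENT_CLONE;	/* non-SIGCHLD exit signal, e.g. threads */
	return PTRACE_EVENT_FORK;		/* plain fork()-like clone */
}

int main(void)
{
	printf("fork():  event %d\n", clone_trace_event(SIGCHLD));
	printf("vfork(): event %d\n",
	       clone_trace_event(CLONE_VFORK | CLONE_VM | SIGCHLD));
	printf("thread:  event %d\n",
	       clone_trace_event(CLONE_VM | CLONE_THREAD | CLONE_SIGHAND));
	return 0;
}
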
@@ -1515,26 +1557,26 @@ long do_fork(unsigned long clone_flags,
1515 } 1557 }
1516 1558
1517 audit_finish_fork(p); 1559 audit_finish_fork(p);
1518 tracehook_report_clone(regs, clone_flags, nr, p);
1519 1560
1520 /* 1561 /*
1521 * We set PF_STARTING at creation in case tracing wants to 1562 * We set PF_STARTING at creation in case tracing wants to
1522 * use this to distinguish a fully live task from one that 1563 * use this to distinguish a fully live task from one that
1523 * hasn't gotten to tracehook_report_clone() yet. Now we 1564 * hasn't finished SIGSTOP raising yet. Now we clear it
1524 * clear it and set the child going. 1565 * and set the child going.
1525 */ 1566 */
1526 p->flags &= ~PF_STARTING; 1567 p->flags &= ~PF_STARTING;
1527 1568
1528 wake_up_new_task(p); 1569 wake_up_new_task(p);
1529 1570
1530 tracehook_report_clone_complete(trace, regs, 1571 /* forking complete and child started to run, tell ptracer */
1531 clone_flags, nr, p); 1572 if (unlikely(trace))
1573 ptrace_event(trace, nr);
1532 1574
1533 if (clone_flags & CLONE_VFORK) { 1575 if (clone_flags & CLONE_VFORK) {
1534 freezer_do_not_count(); 1576 freezer_do_not_count();
1535 wait_for_completion(&vfork); 1577 wait_for_completion(&vfork);
1536 freezer_count(); 1578 freezer_count();
1537 tracehook_report_vfork_done(p, nr); 1579 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1538 } 1580 }
1539 } else { 1581 } else {
1540 nr = PTR_ERR(p); 1582 nr = PTR_ERR(p);
@@ -1581,6 +1623,7 @@ void __init proc_caches_init(void)
1581 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1623 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1582 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1624 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1583 mmap_init(); 1625 mmap_init();
1626 nsproxy_cache_init();
1584} 1627}
1585 1628
1586/* 1629/*
@@ -1677,12 +1720,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1677 */ 1720 */
1678 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1721 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1679 do_sysvsem = 1; 1722 do_sysvsem = 1;
1680 if ((err = unshare_fs(unshare_flags, &new_fs))) 1723 err = unshare_fs(unshare_flags, &new_fs);
1724 if (err)
1681 goto bad_unshare_out; 1725 goto bad_unshare_out;
1682 if ((err = unshare_fd(unshare_flags, &new_fd))) 1726 err = unshare_fd(unshare_flags, &new_fd);
1727 if (err)
1683 goto bad_unshare_cleanup_fs; 1728 goto bad_unshare_cleanup_fs;
1684 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1729 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
1685 new_fs))) 1730 if (err)
1686 goto bad_unshare_cleanup_fd; 1731 goto bad_unshare_cleanup_fd;
1687 1732
1688 if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1733 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc282ea..e6160fa842e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -218,6 +218,8 @@ static void drop_futex_key_refs(union futex_key *key)
218 * @uaddr: virtual address of the futex 218 * @uaddr: virtual address of the futex
219 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 219 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
220 * @key: address where result is stored. 220 * @key: address where result is stored.
221 * @rw: mapping needs to be read/write (values: VERIFY_READ,
222 * VERIFY_WRITE)
221 * 223 *
222 * Returns a negative error code or 0 224 * Returns a negative error code or 0
223 * The key words are stored in *key on success. 225 * The key words are stored in *key on success.
@@ -229,12 +231,12 @@ static void drop_futex_key_refs(union futex_key *key)
229 * lock_page() might sleep, the caller should not hold a spinlock. 231 * lock_page() might sleep, the caller should not hold a spinlock.
230 */ 232 */
231static int 233static int
232get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) 234get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
233{ 235{
234 unsigned long address = (unsigned long)uaddr; 236 unsigned long address = (unsigned long)uaddr;
235 struct mm_struct *mm = current->mm; 237 struct mm_struct *mm = current->mm;
236 struct page *page, *page_head; 238 struct page *page, *page_head;
237 int err; 239 int err, ro = 0;
238 240
239 /* 241 /*
240 * The futex address must be "naturally" aligned. 242 * The futex address must be "naturally" aligned.
@@ -262,8 +264,18 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
262 264
263again: 265again:
264 err = get_user_pages_fast(address, 1, 1, &page); 266 err = get_user_pages_fast(address, 1, 1, &page);
267 /*
268 * If write access is not required (eg. FUTEX_WAIT), try
269 * and get read-only access.
270 */
271 if (err == -EFAULT && rw == VERIFY_READ) {
272 err = get_user_pages_fast(address, 1, 0, &page);
273 ro = 1;
274 }
265 if (err < 0) 275 if (err < 0)
266 return err; 276 return err;
277 else
278 err = 0;
267 279
268#ifdef CONFIG_TRANSPARENT_HUGEPAGE 280#ifdef CONFIG_TRANSPARENT_HUGEPAGE
269 page_head = page; 281 page_head = page;
@@ -302,10 +314,29 @@ again:
302#endif 314#endif
303 315
304 lock_page(page_head); 316 lock_page(page_head);
317
318 /*
319 * If page_head->mapping is NULL, then it cannot be a PageAnon
320 * page; but it might be the ZERO_PAGE or in the gate area or
321 * in a special mapping (all cases which we are happy to fail);
322 * or it may have been a good file page when get_user_pages_fast
323 * found it, but truncated or holepunched or subjected to
324 * invalidate_complete_page2 before we got the page lock (also
325 * cases which we are happy to fail). And we hold a reference,
326 * so refcount care in invalidate_complete_page's remove_mapping
327 * prevents drop_caches from setting mapping to NULL beneath us.
328 *
329 * The case we do have to guard against is when memory pressure made
330 * shmem_writepage move it from filecache to swapcache beneath us:
331 * an unlikely race, but we do need to retry for page_head->mapping.
332 */
305 if (!page_head->mapping) { 333 if (!page_head->mapping) {
334 int shmem_swizzled = PageSwapCache(page_head);
306 unlock_page(page_head); 335 unlock_page(page_head);
307 put_page(page_head); 336 put_page(page_head);
308 goto again; 337 if (shmem_swizzled)
338 goto again;
339 return -EFAULT;
309 } 340 }
310 341
311 /* 342 /*
@@ -316,6 +347,15 @@ again:
316 * the object not the particular process. 347 * the object not the particular process.
317 */ 348 */
318 if (PageAnon(page_head)) { 349 if (PageAnon(page_head)) {
350 /*
351 * A RO anonymous page will never change and thus doesn't make
352 * sense for futex operations.
353 */
354 if (ro) {
355 err = -EFAULT;
356 goto out;
357 }
358
319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 359 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
320 key->private.mm = mm; 360 key->private.mm = mm;
321 key->private.address = address; 361 key->private.address = address;
@@ -327,9 +367,10 @@ again:
327 367
328 get_futex_key_refs(key); 368 get_futex_key_refs(key);
329 369
370out:
330 unlock_page(page_head); 371 unlock_page(page_head);
331 put_page(page_head); 372 put_page(page_head);
332 return 0; 373 return err;
333} 374}
334 375
335static inline void put_futex_key(union futex_key *key) 376static inline void put_futex_key(union futex_key *key)
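
With the new rw argument, get_futex_key() can fall back to a read-only gup when the caller only loads the futex word, and it rejects read-only anonymous pages outright since those can never be woken through a store. The table below is only a cheat sheet of the access mode each converted call site in this patch now requests; VERIFY_READ and VERIFY_WRITE stay kernel-internal, so plain strings stand in for them here.

#include <stdio.h>

/* Access mode each get_futex_key() call site requests after this patch.
 * Illustration only; the strings mirror the kernel's VERIFY_* symbols. */
struct futex_key_site {
	const char *site;
	const char *access;
};

static const struct futex_key_site sites[] = {
	{ "futex_wake(uaddr)",			"VERIFY_READ"  },
	{ "futex_wake_op(uaddr1)",		"VERIFY_READ"  },
	{ "futex_wake_op(uaddr2)",		"VERIFY_WRITE" },
	{ "futex_requeue(uaddr1)",		"VERIFY_READ"  },
	{ "futex_requeue(uaddr2), requeue_pi",	"VERIFY_WRITE" },
	{ "futex_requeue(uaddr2), !requeue_pi",	"VERIFY_READ"  },
	{ "futex_wait_setup(uaddr)",		"VERIFY_READ"  },
	{ "futex_lock_pi(uaddr)",		"VERIFY_WRITE" },
	{ "futex_unlock_pi(uaddr)",		"VERIFY_WRITE" },
	{ "futex_wait_requeue_pi(uaddr2)",	"VERIFY_WRITE" },
};

int main(void)
{
	for (unsigned int i = 0; i < sizeof(sites) / sizeof(sites[0]); i++)
		printf("%-38s -> %s\n", sites[i].site, sites[i].access);
	return 0;
}
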
@@ -355,8 +396,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
355 int ret; 396 int ret;
356 397
357 down_read(&mm->mmap_sem); 398 down_read(&mm->mmap_sem);
358 ret = get_user_pages(current, mm, (unsigned long)uaddr, 399 ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
359 1, 1, 0, NULL, NULL); 400 FAULT_FLAG_WRITE);
360 up_read(&mm->mmap_sem); 401 up_read(&mm->mmap_sem);
361 402
362 return ret < 0 ? ret : 0; 403 return ret < 0 ? ret : 0;
@@ -940,7 +981,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
940 if (!bitset) 981 if (!bitset)
941 return -EINVAL; 982 return -EINVAL;
942 983
943 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 984 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
944 if (unlikely(ret != 0)) 985 if (unlikely(ret != 0))
945 goto out; 986 goto out;
946 987
@@ -986,10 +1027,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
986 int ret, op_ret; 1027 int ret, op_ret;
987 1028
988retry: 1029retry:
989 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); 1030 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
990 if (unlikely(ret != 0)) 1031 if (unlikely(ret != 0))
991 goto out; 1032 goto out;
992 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 1033 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
993 if (unlikely(ret != 0)) 1034 if (unlikely(ret != 0))
994 goto out_put_key1; 1035 goto out_put_key1;
995 1036
@@ -1243,10 +1284,11 @@ retry:
1243 pi_state = NULL; 1284 pi_state = NULL;
1244 } 1285 }
1245 1286
1246 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); 1287 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1247 if (unlikely(ret != 0)) 1288 if (unlikely(ret != 0))
1248 goto out; 1289 goto out;
1249 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 1290 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1291 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1250 if (unlikely(ret != 0)) 1292 if (unlikely(ret != 0))
1251 goto out_put_key1; 1293 goto out_put_key1;
1252 1294
@@ -1790,7 +1832,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1790 * while the syscall executes. 1832 * while the syscall executes.
1791 */ 1833 */
1792retry: 1834retry:
1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); 1835 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
1794 if (unlikely(ret != 0)) 1836 if (unlikely(ret != 0))
1795 return ret; 1837 return ret;
1796 1838
@@ -1941,7 +1983,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1941 } 1983 }
1942 1984
1943retry: 1985retry:
1944 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); 1986 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
1945 if (unlikely(ret != 0)) 1987 if (unlikely(ret != 0))
1946 goto out; 1988 goto out;
1947 1989
@@ -2060,7 +2102,7 @@ retry:
2060 if ((uval & FUTEX_TID_MASK) != vpid) 2102 if ((uval & FUTEX_TID_MASK) != vpid)
2061 return -EPERM; 2103 return -EPERM;
2062 2104
2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 2105 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2064 if (unlikely(ret != 0)) 2106 if (unlikely(ret != 0))
2065 goto out; 2107 goto out;
2066 2108
@@ -2249,7 +2291,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2249 debug_rt_mutex_init_waiter(&rt_waiter); 2291 debug_rt_mutex_init_waiter(&rt_waiter);
2250 rt_waiter.task = NULL; 2292 rt_waiter.task = NULL;
2251 2293
2252 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 2294 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2253 if (unlikely(ret != 0)) 2295 if (unlikely(ret != 0))
2254 goto out; 2296 goto out;
2255 2297
@@ -2697,7 +2739,7 @@ static int __init futex_init(void)
2697 futex_cmpxchg_enabled = 1; 2739 futex_cmpxchg_enabled = 1;
2698 2740
2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2741 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2700 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); 2742 plist_head_init(&futex_queues[i].chain);
2701 spin_lock_init(&futex_queues[i].lock); 2743 spin_lock_init(&futex_queues[i].lock);
2702 } 2744 }
2703 2745
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 5bf924d80b5..824b741925b 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -3,7 +3,7 @@ menu "GCOV-based kernel profiling"
3config GCOV_KERNEL 3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling" 4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS 5 depends on DEBUG_FS
6 select CONSTRUCTORS 6 select CONSTRUCTORS if !UML
7 default n 7 default n
8 ---help--- 8 ---help---
9 This option enables gcov-based code profiling (e.g. for code coverage 9 This option enables gcov-based code profiling (e.g. for code coverage
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE || ARM
39 default n 39 default n
40 ---help--- 40 ---help---
 41 This option activates profiling for the entire kernel. 41 This option activates profiling for the entire kernel.
@@ -46,4 +46,10 @@ config GCOV_PROFILE_ALL
46 larger and run slower. Also be sure to exclude files from profiling 46 larger and run slower. Also be sure to exclude files from profiling
47 which are not linked to the kernel image to prevent linker errors. 47 which are not linked to the kernel image to prevent linker errors.
48 48
49config GCOV_CTORS
50 string
51 depends on CONSTRUCTORS
52 default ".init_array" if ARM && AEABI
53 default ".ctors"
54
49endmenu 55endmenu
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index ae5bb426003..d753d1152b7 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -297,16 +297,30 @@ void gcov_iter_start(struct gcov_iterator *iter)
297} 297}
298 298
299/* Mapping of logical record number to actual file content. */ 299/* Mapping of logical record number to actual file content. */
300#define RECORD_FILE_MAGIC 0 300#define RECORD_FILE_MAGIC 0
301#define RECORD_GCOV_VERSION 1 301#define RECORD_GCOV_VERSION 1
302#define RECORD_TIME_STAMP 2 302#define RECORD_TIME_STAMP 2
303#define RECORD_FUNCTION_TAG 3 303#define RECORD_FUNCTION_TAG 3
304#define RECORD_FUNCTON_TAG_LEN 4 304#define RECORD_FUNCTON_TAG_LEN 4
305#define RECORD_FUNCTION_IDENT 5 305#define RECORD_FUNCTION_IDENT 5
306#define RECORD_FUNCTION_CHECK 6 306#define RECORD_FUNCTION_CHECK_LINE 6
307#define RECORD_COUNT_TAG 7 307#define RECORD_FUNCTION_CHECK_CFG 7
308#define RECORD_COUNT_LEN 8 308#define RECORD_FUNCTION_NAME_LEN 8
309#define RECORD_COUNT 9 309#define RECORD_FUNCTION_NAME 9
310#define RECORD_COUNT_TAG 10
311#define RECORD_COUNT_LEN 11
312#define RECORD_COUNT 12
313
314/* Return length of string encoded in GCOV format. */
315static size_t
316sizeof_str(const char *str)
317{
318 size_t len;
319 len = (str) ? strlen(str) : 0;
320 if (len == 0)
321 return 1;
322 return 1 + ((len + 4) >> 2);
323}
310 324
311/** 325/**
312 * gcov_iter_next - advance file iterator to next logical record 326 * gcov_iter_next - advance file iterator to next logical record
@@ -323,6 +337,9 @@ int gcov_iter_next(struct gcov_iterator *iter)
323 case RECORD_FUNCTON_TAG_LEN: 337 case RECORD_FUNCTON_TAG_LEN:
324 case RECORD_FUNCTION_IDENT: 338 case RECORD_FUNCTION_IDENT:
325 case RECORD_COUNT_TAG: 339 case RECORD_COUNT_TAG:
340 case RECORD_FUNCTION_CHECK_LINE:
341 case RECORD_FUNCTION_CHECK_CFG:
342 case RECORD_FUNCTION_NAME_LEN:
326 /* Advance to next record */ 343 /* Advance to next record */
327 iter->record++; 344 iter->record++;
328 break; 345 break;
@@ -332,7 +349,7 @@ int gcov_iter_next(struct gcov_iterator *iter)
332 /* fall through */ 349 /* fall through */
333 case RECORD_COUNT_LEN: 350 case RECORD_COUNT_LEN:
334 if (iter->count < get_func(iter)->n_ctrs[iter->type]) { 351 if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
335 iter->record = 9; 352 iter->record = 12;
336 break; 353 break;
337 } 354 }
338 /* Advance to next counter type */ 355 /* Advance to next counter type */
@@ -340,9 +357,9 @@ int gcov_iter_next(struct gcov_iterator *iter)
340 iter->count = 0; 357 iter->count = 0;
341 iter->type++; 358 iter->type++;
342 /* fall through */ 359 /* fall through */
343 case RECORD_FUNCTION_CHECK: 360 case RECORD_FUNCTION_NAME:
344 if (iter->type < iter->num_types) { 361 if (iter->type < iter->num_types) {
345 iter->record = 7; 362 iter->record = 10;
346 break; 363 break;
347 } 364 }
348 /* Advance to next function */ 365 /* Advance to next function */
@@ -395,6 +412,34 @@ static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
395 data[1] = (v >> 32); 412 data[1] = (v >> 32);
396 return seq_write(seq, data, sizeof(data)); 413 return seq_write(seq, data, sizeof(data));
397} 414}
415/**
416 * seq_write_gcov_str - write string in gcov format to seq_file
417 * @seq: seq_file handle
418 * @str: string to be stored
419 *
 420 * String format defined by gcc: strings are stored as a sequence of 32 bit
 421 * words in the endianness of the machine generating the file; the string
 422 * bytes are followed by a terminating NUL and zero padding up to the next
 423 * 32 bit boundary.
424 */
425static int seq_write_gcov_str(struct seq_file *seq, const char *str)
426{
427 if (str) {
428 size_t len;
429 int str_off;
430 u32 data;
431 len = strlen(str);
432 for (str_off = 0; str_off < (sizeof_str(str) - 2) ; str_off++) {
433 memcpy(&data, (str + str_off * 4), 4);
434 seq_write(seq, &data, sizeof(data));
435 }
436 data = 0;
437 memcpy(&data, (str + str_off * 4), (len - str_off * 4));
438 return seq_write(seq, &data, sizeof(data));
439 } else {
440 return 0;
441 }
442}
398 443
399/** 444/**
400 * gcov_iter_write - write data for current pos to seq_file 445 * gcov_iter_write - write data for current pos to seq_file
@@ -421,13 +466,24 @@ int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
421 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); 466 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
422 break; 467 break;
423 case RECORD_FUNCTON_TAG_LEN: 468 case RECORD_FUNCTON_TAG_LEN:
424 rc = seq_write_gcov_u32(seq, 2); 469 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION_LENGTH +
470 (sizeof_str(get_func(iter)->name)));
425 break; 471 break;
426 case RECORD_FUNCTION_IDENT: 472 case RECORD_FUNCTION_IDENT:
427 rc = seq_write_gcov_u32(seq, get_func(iter)->ident); 473 rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
428 break; 474 break;
429 case RECORD_FUNCTION_CHECK: 475 case RECORD_FUNCTION_CHECK_LINE:
430 rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); 476 rc = seq_write_gcov_u32(seq, get_func(iter)->lineno_checksum);
477 break;
478 case RECORD_FUNCTION_CHECK_CFG:
479 rc = seq_write_gcov_u32(seq, get_func(iter)->cfg_checksum);
480 break;
481 case RECORD_FUNCTION_NAME_LEN:
482 rc = seq_write_gcov_u32(seq,
483 (sizeof_str(get_func(iter)->name) - 1));
484 break;
485 case RECORD_FUNCTION_NAME:
486 rc = seq_write_gcov_str(seq, get_func(iter)->name);
431 break; 487 break;
432 case RECORD_COUNT_TAG: 488 case RECORD_COUNT_TAG:
433 rc = seq_write_gcov_u32(seq, 489 rc = seq_write_gcov_u32(seq,
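
The new records lean on gcov's word-based string encoding: sizeof_str() counts one 32-bit word for the length plus enough words to hold the string, its terminating NUL and zero padding, and seq_write_gcov_str() emits exactly those payload words. A standalone worked example of the same arithmetic, with no kernel dependencies, is sketched below.

/* Worked example of the gcov string sizing used by sizeof_str() above:
 * 1 length word + ceil((strlen + 1) / 4) payload words, i.e. the string
 * plus its NUL, zero-padded to a 32-bit boundary. */
#include <stdio.h>
#include <string.h>

static size_t sizeof_str_words(const char *str)
{
	size_t len = str ? strlen(str) : 0;

	if (len == 0)
		return 1;			/* empty string: length word only */
	return 1 + ((len + 4) >> 2);		/* same expression as the kernel code */
}

int main(void)
{
	const char *samples[] = { "", "a", "abc", "abcd", "copy_process" };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%-14s len=%2zu -> %zu words (%zu bytes in the record)\n",
		       samples[i], strlen(samples[i]),
		       sizeof_str_words(samples[i]),
		       4 * sizeof_str_words(samples[i]));
	return 0;
}
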
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 060073ebf7a..040c6980df0 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -21,9 +21,10 @@
21 * gcc and need to be kept as close to the original definition as possible to 21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible. 22 * remain compatible.
23 */ 23 */
24#define GCOV_COUNTERS 5 24#define GCOV_COUNTERS 10
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) 25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) 26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_FUNCTION_LENGTH 3
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) 28#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
28#define GCOV_TAG_FOR_COUNTER(count) \ 29#define GCOV_TAG_FOR_COUNTER(count) \
29 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17)) 30 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
@@ -34,10 +35,38 @@ typedef long gcov_type;
34typedef long long gcov_type; 35typedef long long gcov_type;
35#endif 36#endif
36 37
38/*
39 * Source module info. The data structure is used in both runtime and
40 * profile-use phase.
41 */
42struct gcov_module_info {
43 unsigned int ident;
44/*
45 * This is overloaded to mean two things:
46 * (1) means FDO/LIPO in instrumented binary.
47 * (2) means IS_PRIMARY in persistent file or memory copy used in profile-use.
48 */
49 unsigned int is_primary;
50 unsigned int is_exported;
51 unsigned int lang;
52 char *da_filename;
53 char *source_filename;
54 unsigned int num_quote_paths;
55 unsigned int num_bracket_paths;
56 unsigned int num_cpp_defines;
57 unsigned int num_cpp_includes;
58 unsigned int num_cl_args;
59 char *string_array[1];
60};
61
62
37/** 63/**
38 * struct gcov_fn_info - profiling meta data per function 64 * struct gcov_fn_info - profiling meta data per function
39 * @ident: object file-unique function identifier 65 * @ident: object file-unique function identifier
40 * @checksum: function checksum 66 * @lineno_checksum: function lineno checksum
67 * @cfg_checksum: function cfg checksum
68 * @dc_offset: direct call offset
69 * @name: function name
41 * @n_ctrs: number of values per counter type belonging to this function 70 * @n_ctrs: number of values per counter type belonging to this function
42 * 71 *
43 * This data is generated by gcc during compilation and doesn't change 72 * This data is generated by gcc during compilation and doesn't change
@@ -45,7 +74,10 @@ typedef long long gcov_type;
45 */ 74 */
46struct gcov_fn_info { 75struct gcov_fn_info {
47 unsigned int ident; 76 unsigned int ident;
48 unsigned int checksum; 77 unsigned int lineno_checksum;
78 unsigned int cfg_checksum;
79 unsigned int dc_offset;
80 const char *name;
49 unsigned int n_ctrs[0]; 81 unsigned int n_ctrs[0];
50}; 82};
51 83
@@ -67,9 +99,11 @@ struct gcov_ctr_info {
67/** 99/**
68 * struct gcov_info - profiling data per object file 100 * struct gcov_info - profiling data per object file
69 * @version: gcov version magic indicating the gcc version used for compilation 101 * @version: gcov version magic indicating the gcc version used for compilation
102 * @modinfo: additional module information
70 * @next: list head for a singly-linked list 103 * @next: list head for a singly-linked list
71 * @stamp: time stamp 104 * @stamp: time stamp
72 * @filename: name of the associated gcov data file 105 * @filename: name of the associated gcov data file
106 * @eof_pos: end position of profile data
73 * @n_functions: number of instrumented functions 107 * @n_functions: number of instrumented functions
74 * @functions: function data 108 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active 109 * @ctr_mask: mask specifying which counter types are active
@@ -80,9 +114,11 @@ struct gcov_ctr_info {
80 */ 114 */
81struct gcov_info { 115struct gcov_info {
82 unsigned int version; 116 unsigned int version;
117 struct gcov_module_info *mod_info;
83 struct gcov_info *next; 118 struct gcov_info *next;
84 unsigned int stamp; 119 unsigned int stamp;
85 const char *filename; 120 const char *filename;
121 unsigned int eof_pos;
86 unsigned int n_functions; 122 unsigned int n_functions;
87 const struct gcov_fn_info *functions; 123 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask; 124 unsigned int ctr_mask;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 11e89690382..2391745f656 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -887,10 +887,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
887 struct hrtimer_clock_base *base, 887 struct hrtimer_clock_base *base,
888 unsigned long newstate, int reprogram) 888 unsigned long newstate, int reprogram)
889{ 889{
890 struct timerqueue_node *next_timer;
890 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 891 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
891 goto out; 892 goto out;
892 893
893 if (&timer->node == timerqueue_getnext(&base->active)) { 894 next_timer = timerqueue_getnext(&base->active);
895 timerqueue_del(&base->active, &timer->node);
896 if (&timer->node == next_timer) {
894#ifdef CONFIG_HIGH_RES_TIMERS 897#ifdef CONFIG_HIGH_RES_TIMERS
895 /* Reprogram the clock event device. if enabled */ 898 /* Reprogram the clock event device. if enabled */
896 if (reprogram && hrtimer_hres_active()) { 899 if (reprogram && hrtimer_hres_active()) {
@@ -903,7 +906,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
903 } 906 }
904#endif 907#endif
905 } 908 }
906 timerqueue_del(&base->active, &timer->node);
907 if (!timerqueue_getnext(&base->active)) 909 if (!timerqueue_getnext(&base->active))
908 base->cpu_base->active_bases &= ~(1 << base->index); 910 base->cpu_base->active_bases &= ~(1 << base->index);
909out: 911out:
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ea640120ab8..e972276f12f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
74 74
75 /* 75 /*
76 * Ensure the task is not frozen. 76 * Ensure the task is not frozen.
77 * Also, when a freshly created task is scheduled once, changes 77 * Also, skip vfork and any other user process that freezer should skip.
78 * its state to TASK_UNINTERRUPTIBLE without having ever been
79 * switched out once, it musn't be checked.
80 */ 78 */
81 if (unlikely(t->flags & PF_FROZEN || !switch_count)) 79 if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
80 return;
81
82 /*
83 * When a freshly created task is scheduled once, changes its state to
84 * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
 85 * mustn't be checked.
86 */
87 if (unlikely(!switch_count))
82 return; 88 return;
83 89
84 if (switch_count != t->last_switch_count) { 90 if (switch_count != t->last_switch_count) {
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d1d051b38e0..5a38bf4de64 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -52,6 +52,10 @@ config IRQ_EDGE_EOI_HANDLER
52config GENERIC_IRQ_CHIP 52config GENERIC_IRQ_CHIP
53 bool 53 bool
54 54
55# Generic irq_domain hw <--> linux irq number translation
56config IRQ_DOMAIN
57 bool
58
55# Support forced irq threading 59# Support forced irq threading
56config IRQ_FORCED_THREADING 60config IRQ_FORCED_THREADING
57 bool 61 bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 73290056cfb..fff17381f0a 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -2,6 +2,7 @@
2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o 3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
5obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
5obj-$(CONFIG_PROC_FS) += proc.o 6obj-$(CONFIG_PROC_FS) += proc.o
6obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 8obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index d5a3009da71..dc5114b4c16 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -178,7 +178,7 @@ void irq_shutdown(struct irq_desc *desc)
178 desc->depth = 1; 178 desc->depth = 1;
179 if (desc->irq_data.chip->irq_shutdown) 179 if (desc->irq_data.chip->irq_shutdown)
180 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 180 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
181 if (desc->irq_data.chip->irq_disable) 181 else if (desc->irq_data.chip->irq_disable)
182 desc->irq_data.chip->irq_disable(&desc->irq_data); 182 desc->irq_data.chip->irq_disable(&desc->irq_data);
183 else 183 else
184 desc->irq_data.chip->irq_mask(&desc->irq_data); 184 desc->irq_data.chip->irq_mask(&desc->irq_data);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1ef4ffcdfa5..bd8e788d71e 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
88 struct irq_devres match_data = { irq, dev_id }; 88 struct irq_devres match_data = { irq, dev_id };
89 89
90 free_irq(irq, dev_id);
91 WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, 90 WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
92 &match_data)); 91 &match_data));
92 free_irq(irq, dev_id);
93} 93}
94EXPORT_SYMBOL(devm_free_irq); 94EXPORT_SYMBOL(devm_free_irq);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 3a2cab407b9..e38544dddb1 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -246,7 +246,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
246 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); 246 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
247 247
248 for (i = gc->irq_base; msk; msk >>= 1, i++) { 248 for (i = gc->irq_base; msk; msk >>= 1, i++) {
249 if (!msk & 0x01) 249 if (!(msk & 0x01))
250 continue; 250 continue;
251 251
252 if (flags & IRQ_GC_INIT_NESTED_LOCK) 252 if (flags & IRQ_GC_INIT_NESTED_LOCK)
@@ -301,7 +301,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
301 raw_spin_unlock(&gc_lock); 301 raw_spin_unlock(&gc_lock);
302 302
303 for (; msk; msk >>= 1, i++) { 303 for (; msk; msk >>= 1, i++) {
304 if (!msk & 0x01) 304 if (!(msk & 0x01))
305 continue; 305 continue;
306 306
307 /* Remove handler first. That will mask the irq line */ 307 /* Remove handler first. That will mask the irq line */
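
Both hunks in this file fix the same precedence slip: `!msk & 0x01` parses as `(!msk) & 0x01`, and because msk is never zero inside these loops the `continue` never fired, so entries whose mask bit was clear were set up or torn down anyway. The tiny standalone program below just prints the two expressions side by side; recent compilers flag this pattern with -Wlogical-not-parentheses.

/* Demonstrates why the parentheses matter in the generic-chip loops above:
 * `!msk & 0x01' is parsed as `(!msk) & 0x01', not `!(msk & 0x01)'. */
#include <stdio.h>

int main(void)
{
	for (unsigned int msk = 0; msk <= 5; msk++)
		printf("msk=%u  !msk & 0x01 -> %u   !(msk & 0x01) -> %u\n",
		       msk, !msk & 0x01, !(msk & 0x01));
	return 0;
}
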
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4c60a50e66b..039b889ea05 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -70,7 +70,8 @@ static inline void desc_smp_init(struct irq_desc *desc, int node) { }
70static inline int desc_node(struct irq_desc *desc) { return 0; } 70static inline int desc_node(struct irq_desc *desc) { return 0; }
71#endif 71#endif
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
74 struct module *owner)
74{ 75{
75 int cpu; 76 int cpu;
76 77
@@ -86,6 +87,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
86 desc->irq_count = 0; 87 desc->irq_count = 0;
87 desc->irqs_unhandled = 0; 88 desc->irqs_unhandled = 0;
88 desc->name = NULL; 89 desc->name = NULL;
90 desc->owner = owner;
89 for_each_possible_cpu(cpu) 91 for_each_possible_cpu(cpu)
90 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; 92 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
91 desc_smp_init(desc, node); 93 desc_smp_init(desc, node);
@@ -128,7 +130,7 @@ static void free_masks(struct irq_desc *desc)
128static inline void free_masks(struct irq_desc *desc) { } 130static inline void free_masks(struct irq_desc *desc) { }
129#endif 131#endif
130 132
131static struct irq_desc *alloc_desc(int irq, int node) 133static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
132{ 134{
133 struct irq_desc *desc; 135 struct irq_desc *desc;
134 gfp_t gfp = GFP_KERNEL; 136 gfp_t gfp = GFP_KERNEL;
@@ -147,7 +149,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
147 raw_spin_lock_init(&desc->lock); 149 raw_spin_lock_init(&desc->lock);
148 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 150 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
149 151
150 desc_set_defaults(irq, desc, node); 152 desc_set_defaults(irq, desc, node, owner);
151 153
152 return desc; 154 return desc;
153 155
@@ -173,13 +175,14 @@ static void free_desc(unsigned int irq)
173 kfree(desc); 175 kfree(desc);
174} 176}
175 177
176static int alloc_descs(unsigned int start, unsigned int cnt, int node) 178static int alloc_descs(unsigned int start, unsigned int cnt, int node,
179 struct module *owner)
177{ 180{
178 struct irq_desc *desc; 181 struct irq_desc *desc;
179 int i; 182 int i;
180 183
181 for (i = 0; i < cnt; i++) { 184 for (i = 0; i < cnt; i++) {
182 desc = alloc_desc(start + i, node); 185 desc = alloc_desc(start + i, node, owner);
183 if (!desc) 186 if (!desc)
184 goto err; 187 goto err;
185 mutex_lock(&sparse_irq_lock); 188 mutex_lock(&sparse_irq_lock);
@@ -227,7 +230,7 @@ int __init early_irq_init(void)
227 nr_irqs = initcnt; 230 nr_irqs = initcnt;
228 231
229 for (i = 0; i < initcnt; i++) { 232 for (i = 0; i < initcnt; i++) {
230 desc = alloc_desc(i, node); 233 desc = alloc_desc(i, node, NULL);
231 set_bit(i, allocated_irqs); 234 set_bit(i, allocated_irqs);
232 irq_insert_desc(i, desc); 235 irq_insert_desc(i, desc);
233 } 236 }
@@ -261,7 +264,7 @@ int __init early_irq_init(void)
261 alloc_masks(&desc[i], GFP_KERNEL, node); 264 alloc_masks(&desc[i], GFP_KERNEL, node);
262 raw_spin_lock_init(&desc[i].lock); 265 raw_spin_lock_init(&desc[i].lock);
263 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 266 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
264 desc_set_defaults(i, &desc[i], node); 267 desc_set_defaults(i, &desc[i], node, NULL);
265 } 268 }
266 return arch_early_irq_init(); 269 return arch_early_irq_init();
267} 270}
@@ -276,8 +279,16 @@ static void free_desc(unsigned int irq)
276 dynamic_irq_cleanup(irq); 279 dynamic_irq_cleanup(irq);
277} 280}
278 281
279static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 282static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
283 struct module *owner)
280{ 284{
285 u32 i;
286
287 for (i = 0; i < cnt; i++) {
288 struct irq_desc *desc = irq_to_desc(start + i);
289
290 desc->owner = owner;
291 }
281 return start; 292 return start;
282} 293}
283 294
@@ -333,11 +344,13 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
333 * @from: Start the search from this irq number 344 * @from: Start the search from this irq number
334 * @cnt: Number of consecutive irqs to allocate. 345 * @cnt: Number of consecutive irqs to allocate.
335 * @node: Preferred node on which the irq descriptor should be allocated 346 * @node: Preferred node on which the irq descriptor should be allocated
347 * @owner: Owning module (can be NULL)
336 * 348 *
337 * Returns the first irq number or error code 349 * Returns the first irq number or error code
338 */ 350 */
339int __ref 351int __ref
340irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) 352__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
353 struct module *owner)
341{ 354{
342 int start, ret; 355 int start, ret;
343 356
@@ -366,13 +379,13 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
366 379
367 bitmap_set(allocated_irqs, start, cnt); 380 bitmap_set(allocated_irqs, start, cnt);
368 mutex_unlock(&sparse_irq_lock); 381 mutex_unlock(&sparse_irq_lock);
369 return alloc_descs(start, cnt, node); 382 return alloc_descs(start, cnt, node, owner);
370 383
371err: 384err:
372 mutex_unlock(&sparse_irq_lock); 385 mutex_unlock(&sparse_irq_lock);
373 return ret; 386 return ret;
374} 387}
375EXPORT_SYMBOL_GPL(irq_alloc_descs); 388EXPORT_SYMBOL_GPL(__irq_alloc_descs);
376 389
377/** 390/**
378 * irq_reserve_irqs - mark irqs allocated 391 * irq_reserve_irqs - mark irqs allocated
@@ -440,7 +453,7 @@ void dynamic_irq_cleanup(unsigned int irq)
440 unsigned long flags; 453 unsigned long flags;
441 454
442 raw_spin_lock_irqsave(&desc->lock, flags); 455 raw_spin_lock_irqsave(&desc->lock, flags);
443 desc_set_defaults(irq, desc, desc_node(desc)); 456 desc_set_defaults(irq, desc, desc_node(desc), NULL);
444 raw_spin_unlock_irqrestore(&desc->lock, flags); 457 raw_spin_unlock_irqrestore(&desc->lock, flags);
445} 458}
446 459
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
new file mode 100644
index 00000000000..b57a3776de4
--- /dev/null
+++ b/kernel/irq/irqdomain.c
@@ -0,0 +1,184 @@
1#include <linux/irq.h>
2#include <linux/irqdomain.h>
3#include <linux/module.h>
4#include <linux/mutex.h>
5#include <linux/of.h>
6#include <linux/of_address.h>
7#include <linux/slab.h>
8
9static LIST_HEAD(irq_domain_list);
10static DEFINE_MUTEX(irq_domain_mutex);
11
12/**
13 * irq_domain_add() - Register an irq_domain
14 * @domain: ptr to initialized irq_domain structure
15 *
16 * Registers an irq_domain structure. The irq_domain must at a minimum be
17 * initialized with an ops structure pointer, and either a ->to_irq hook or
18 * a valid irq_base value. Everything else is optional.
19 */
20void irq_domain_add(struct irq_domain *domain)
21{
22 struct irq_data *d;
23 int hwirq;
24
25 /*
26 * This assumes that the irq_domain owner has already allocated
27 * the irq_descs. This block will be removed when support for dynamic
28 * allocation of irq_descs is added to irq_domain.
29 */
30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
32 if (!d) {
 33 WARN(1, "error: assigning domain to non-existent irq_desc");
34 return;
35 }
36 if (d->domain) {
37 /* things are broken; just report, don't clean up */
38 WARN(1, "error: irq_desc already assigned to a domain");
39 return;
40 }
41 d->domain = domain;
42 d->hwirq = hwirq;
43 }
44
45 mutex_lock(&irq_domain_mutex);
46 list_add(&domain->list, &irq_domain_list);
47 mutex_unlock(&irq_domain_mutex);
48}
49
50/**
51 * irq_domain_del() - Unregister an irq_domain
52 * @domain: ptr to registered irq_domain.
53 */
54void irq_domain_del(struct irq_domain *domain)
55{
56 struct irq_data *d;
57 int hwirq;
58
59 mutex_lock(&irq_domain_mutex);
60 list_del(&domain->list);
61 mutex_unlock(&irq_domain_mutex);
62
63 /* Clear the irq_domain assignments */
64 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
65 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
66 d->domain = NULL;
67 }
68}
69
70#if defined(CONFIG_OF_IRQ)
71/**
72 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec
73 *
74 * Used by the device tree interrupt mapping code to translate a device tree
75 * interrupt specifier to a valid linux irq number. Returns either a valid
76 * linux IRQ number or 0.
77 *
 78 * When the caller no longer needs the irq number returned by this function it
79 * should arrange to call irq_dispose_mapping().
80 */
81unsigned int irq_create_of_mapping(struct device_node *controller,
82 const u32 *intspec, unsigned int intsize)
83{
84 struct irq_domain *domain;
85 unsigned long hwirq;
86 unsigned int irq, type;
87 int rc = -EINVAL;
88
89 /* Find a domain which can translate the irq spec */
90 mutex_lock(&irq_domain_mutex);
91 list_for_each_entry(domain, &irq_domain_list, list) {
92 if (!domain->ops->dt_translate)
93 continue;
94 rc = domain->ops->dt_translate(domain, controller,
95 intspec, intsize, &hwirq, &type);
96 if (rc == 0)
97 break;
98 }
99 mutex_unlock(&irq_domain_mutex);
100
101 if (rc != 0)
102 return 0;
103
104 irq = irq_domain_to_irq(domain, hwirq);
105 if (type != IRQ_TYPE_NONE)
106 irq_set_irq_type(irq, type);
107 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n",
108 controller->full_name, (int)hwirq, irq, type);
109 return irq;
110}
111EXPORT_SYMBOL_GPL(irq_create_of_mapping);
112
113/**
114 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping()
115 * @irq: linux irq number to be discarded
116 *
117 * Calling this function indicates the caller no longer needs a reference to
118 * the linux irq number returned by a prior call to irq_create_of_mapping().
119 */
120void irq_dispose_mapping(unsigned int irq)
121{
122 /*
123 * nothing yet; will be filled when support for dynamic allocation of
124 * irq_descs is added to irq_domain
125 */
126}
127EXPORT_SYMBOL_GPL(irq_dispose_mapping);
128
129int irq_domain_simple_dt_translate(struct irq_domain *d,
130 struct device_node *controller,
131 const u32 *intspec, unsigned int intsize,
132 unsigned long *out_hwirq, unsigned int *out_type)
133{
134 if (d->of_node != controller)
135 return -EINVAL;
136 if (intsize < 1)
137 return -EINVAL;
138
139 *out_hwirq = intspec[0];
140 *out_type = IRQ_TYPE_NONE;
141 if (intsize > 1)
142 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
143 return 0;
144}
145
146struct irq_domain_ops irq_domain_simple_ops = {
147 .dt_translate = irq_domain_simple_dt_translate,
148};
149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
150
151/**
 152 * irq_domain_add_simple() - Set up a 'simple' translation range
153 */
154void irq_domain_add_simple(struct device_node *controller, int irq_base)
155{
156 struct irq_domain *domain;
157
158 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
159 if (!domain) {
160 WARN_ON(1);
161 return;
162 }
163
164 domain->irq_base = irq_base;
165 domain->of_node = of_node_get(controller);
166 domain->ops = &irq_domain_simple_ops;
167 irq_domain_add(domain);
168}
169EXPORT_SYMBOL_GPL(irq_domain_add_simple);
170
171void irq_domain_generate_simple(const struct of_device_id *match,
172 u64 phys_base, unsigned int irq_start)
173{
174 struct device_node *node;
175 pr_info("looking for phys_base=%llx, irq_start=%i\n",
176 (unsigned long long) phys_base, (int) irq_start);
177 node = of_find_matching_node_by_address(NULL, match, phys_base);
178 if (node)
179 irq_domain_add_simple(node, irq_start);
180 else
181 pr_info("no node found\n");
182}
183EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
184#endif /* CONFIG_OF_IRQ */
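
For a controller whose irq_descs are already allocated, the simple helpers above are intended to be called once at init time so that DT interrupt specifiers resolve through irq_create_of_mapping(). The sketch below shows one plausible shape of such a call; the "acme,intc" compatible, DEMO_PHYS_BASE and DEMO_IRQ_BASE are made-up placeholders, and only the irq_domain_generate_simple() signature comes from this patch.

/* Hypothetical controller init using the helpers added above; names and
 * numbers are placeholders, not real bindings. */
#include <linux/init.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define DEMO_PHYS_BASE	0x10140000ULL	/* placeholder MMIO base from the DT reg */
#define DEMO_IRQ_BASE	32U		/* first of the pre-allocated linux irqs */

static const struct of_device_id demo_intc_match[] = {
	{ .compatible = "acme,intc" },
	{ }
};

static void __init demo_intc_init(void)
{
	/*
	 * Finds the matching DT node by its physical address and attaches a
	 * simple domain whose translations start at DEMO_IRQ_BASE, so DT
	 * interrupt specifiers resolve through irq_create_of_mapping().
	 */
	irq_domain_generate_simple(demo_intc_match, DEMO_PHYS_BASE,
				   DEMO_IRQ_BASE);
}
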
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0a7840aeb0f..d6c4adc2804 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -620,8 +620,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
620 620
621static int irq_wait_for_interrupt(struct irqaction *action) 621static int irq_wait_for_interrupt(struct irqaction *action)
622{ 622{
623 set_current_state(TASK_INTERRUPTIBLE);
624
623 while (!kthread_should_stop()) { 625 while (!kthread_should_stop()) {
624 set_current_state(TASK_INTERRUPTIBLE);
625 626
626 if (test_and_clear_bit(IRQTF_RUNTHREAD, 627 if (test_and_clear_bit(IRQTF_RUNTHREAD,
627 &action->thread_flags)) { 628 &action->thread_flags)) {
@@ -629,7 +630,9 @@ static int irq_wait_for_interrupt(struct irqaction *action)
629 return 0; 630 return 0;
630 } 631 }
631 schedule(); 632 schedule();
633 set_current_state(TASK_INTERRUPTIBLE);
632 } 634 }
635 __set_current_state(TASK_RUNNING);
633 return -1; 636 return -1;
634} 637}
635 638
@@ -883,6 +886,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
883 886
884 if (desc->irq_data.chip == &no_irq_chip) 887 if (desc->irq_data.chip == &no_irq_chip)
885 return -ENOSYS; 888 return -ENOSYS;
889 if (!try_module_get(desc->owner))
890 return -ENODEV;
886 /* 891 /*
887 * Some drivers like serial.c use request_irq() heavily, 892 * Some drivers like serial.c use request_irq() heavily,
888 * so we have to be careful not to interfere with a 893 * so we have to be careful not to interfere with a
@@ -906,8 +911,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
906 */ 911 */
907 nested = irq_settings_is_nested_thread(desc); 912 nested = irq_settings_is_nested_thread(desc);
908 if (nested) { 913 if (nested) {
909 if (!new->thread_fn) 914 if (!new->thread_fn) {
910 return -EINVAL; 915 ret = -EINVAL;
916 goto out_mput;
917 }
911 /* 918 /*
912 * Replace the primary handler which was provided from 919 * Replace the primary handler which was provided from
913 * the driver for non nested interrupt handling by the 920 * the driver for non nested interrupt handling by the
@@ -929,8 +936,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
929 936
930 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 937 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
931 new->name); 938 new->name);
932 if (IS_ERR(t)) 939 if (IS_ERR(t)) {
933 return PTR_ERR(t); 940 ret = PTR_ERR(t);
941 goto out_mput;
942 }
934 /* 943 /*
935 * We keep the reference to the task struct even if 944 * We keep the reference to the task struct even if
936 * the thread dies to avoid that the interrupt code 945 * the thread dies to avoid that the interrupt code
@@ -1095,6 +1104,8 @@ out_thread:
1095 kthread_stop(t); 1104 kthread_stop(t);
1096 put_task_struct(t); 1105 put_task_struct(t);
1097 } 1106 }
1107out_mput:
1108 module_put(desc->owner);
1098 return ret; 1109 return ret;
1099} 1110}
1100 1111
@@ -1203,6 +1214,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1203 put_task_struct(action->thread); 1214 put_task_struct(action->thread);
1204 } 1215 }
1205 1216
1217 module_put(desc->owner);
1206 return action; 1218 return action;
1207} 1219}
1208 1220
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f76fc00c987..fe4b09cf829 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,6 +9,7 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/syscore_ops.h>
12 13
13#include "internals.h" 14#include "internals.h"
14 15
@@ -39,25 +40,58 @@ void suspend_device_irqs(void)
39} 40}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 41EXPORT_SYMBOL_GPL(suspend_device_irqs);
41 42
42/** 43static void resume_irqs(bool want_early)
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQS_SUSPENDED flag set.
47 */
48void resume_device_irqs(void)
49{ 44{
50 struct irq_desc *desc; 45 struct irq_desc *desc;
51 int irq; 46 int irq;
52 47
53 for_each_irq_desc(irq, desc) { 48 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 49 unsigned long flags;
50 bool is_early = desc->action &&
51 desc->action->flags & IRQF_EARLY_RESUME;
52
53 if (is_early != want_early)
54 continue;
55 55
56 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
57 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
58 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
59 } 59 }
60} 60}
61
62/**
63 * irq_pm_syscore_ops - enable interrupt lines early
64 *
65 * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
66 */
67static void irq_pm_syscore_resume(void)
68{
69 resume_irqs(true);
70}
71
72static struct syscore_ops irq_pm_syscore_ops = {
73 .resume = irq_pm_syscore_resume,
74};
75
76static int __init irq_pm_init_ops(void)
77{
78 register_syscore_ops(&irq_pm_syscore_ops);
79 return 0;
80}
81
82device_initcall(irq_pm_init_ops);
83
84/**
85 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
86 *
87 * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
88 * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
89 * set as well as those with %IRQF_FORCE_RESUME.
90 */
91void resume_device_irqs(void)
92{
93 resume_irqs(false);
94}
61EXPORT_SYMBOL_GPL(resume_device_irqs); 95EXPORT_SYMBOL_GPL(resume_device_irqs);
62 96
63/** 97/**
@@ -70,8 +104,13 @@ int check_wakeup_irqs(void)
70 104
71 for_each_irq_desc(irq, desc) { 105 for_each_irq_desc(irq, desc) {
72 if (irqd_is_wakeup_set(&desc->irq_data)) { 106 if (irqd_is_wakeup_set(&desc->irq_data)) {
73 if (desc->istate & IRQS_PENDING) 107 if (desc->istate & IRQS_PENDING) {
108 pr_info("Wakeup IRQ %d %s pending, suspend aborted\n",
109 irq,
110 desc->action && desc->action->name ?
111 desc->action->name : "");
74 return -EBUSY; 112 return -EBUSY;
113 }
75 continue; 114 continue;
76 } 115 }
77 /* 116 /*
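
Lines flagged IRQF_EARLY_RESUME are now re-enabled from the syscore resume callback registered above, which runs before the ordinary resume_device_irqs() pass; everything else keeps resuming at the usual point. From a driver's perspective the only visible change is the extra flag at request time, as in the hypothetical fragment below (handler, name and arguments are placeholders).

/* Hypothetical driver opting into the early resume pass added above. */
#include <linux/interrupt.h>

static irqreturn_t demo_early_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int demo_request(unsigned int irq, void *dev)
{
	/*
	 * IRQF_EARLY_RESUME means this line is re-enabled by
	 * irq_pm_syscore_resume() before the normal resume_device_irqs()
	 * pass runs for everything else.
	 */
	return request_irq(irq, demo_early_handler, IRQF_EARLY_RESUME,
			   "demo-early", dev);
}
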
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 14dd5761e8c..ef60772d2fe 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -55,17 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 /*
59 * We do not resend level type interrupts. Level type
60 * interrupts are resent by hardware when they are still
61 * active.
62 */
63 if (irq_settings_is_level(desc))
64 return;
65 if (desc->istate & IRQS_REPLAY)
66 return;
67 if (desc->istate & IRQS_PENDING) { 58 if (desc->istate & IRQS_PENDING) {
68 desc->istate &= ~IRQS_PENDING; 59 desc->istate &= ~IRQS_PENDING;
60 /*
61 * We do not resend level type interrupts. Level type
62 * interrupts are resent by hardware when they are still
63 * active.
64 */
65 if (irq_settings_is_level(desc))
66 return;
67 if (desc->istate & IRQS_REPLAY)
68 return;
69
69 desc->istate |= IRQS_REPLAY; 70 desc->istate |= IRQS_REPLAY;
70 71
71 if (!desc->irq_data.chip->irq_retrigger || 72 if (!desc->irq_data.chip->irq_retrigger ||
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index aa57d5da18c..dc813a948be 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next) 87 (action->flags & __IRQF_TIMER) ||
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
88 goto out; 90 goto out;
89 91
90 /* Already running on another processor */ 92 /* Already running on another processor */
@@ -115,7 +117,7 @@ static int misrouted_irq(int irq)
115 struct irq_desc *desc; 117 struct irq_desc *desc;
116 int i, ok = 0; 118 int i, ok = 0;
117 119
118 if (atomic_inc_return(&irq_poll_active) == 1) 120 if (atomic_inc_return(&irq_poll_active) != 1)
119 goto out; 121 goto out;
120 122
121 irq_poll_cpu = smp_processor_id(); 123 irq_poll_cpu = smp_processor_id();
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a8ce45097f3..e6f1f24ad57 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -66,8 +66,9 @@ void jump_label_inc(struct jump_label_key *key)
66 return; 66 return;
67 67
68 jump_label_lock(); 68 jump_label_lock();
69 if (atomic_add_return(1, &key->enabled) == 1) 69 if (atomic_read(&key->enabled) == 0)
70 jump_label_update(key, JUMP_LABEL_ENABLE); 70 jump_label_update(key, JUMP_LABEL_ENABLE);
71 atomic_inc(&key->enabled);
71 jump_label_unlock(); 72 jump_label_unlock();
72} 73}
73 74
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8d814cbc810..296fbc84d65 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1095,7 +1095,7 @@ size_t crash_get_memory_size(void)
1095 size_t size = 0; 1095 size_t size = 0;
1096 mutex_lock(&kexec_mutex); 1096 mutex_lock(&kexec_mutex);
1097 if (crashk_res.end != crashk_res.start) 1097 if (crashk_res.end != crashk_res.start)
1098 size = crashk_res.end - crashk_res.start + 1; 1098 size = resource_size(&crashk_res);
1099 mutex_unlock(&kexec_mutex); 1099 mutex_unlock(&kexec_mutex);
1100 return size; 1100 return size;
1101} 1101}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 47613dfb7b2..a4bea97c75b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...)
114 atomic_inc(&kmod_concurrent); 114 atomic_inc(&kmod_concurrent);
115 if (atomic_read(&kmod_concurrent) > max_modprobes) { 115 if (atomic_read(&kmod_concurrent) > max_modprobes) {
116 /* We may be blaming an innocent here, but unlikely */ 116 /* We may be blaming an innocent here, but unlikely */
117 if (kmod_loop_msg++ < 5) 117 if (kmod_loop_msg < 5) {
118 printk(KERN_ERR 118 printk(KERN_ERR
119 "request_module: runaway loop modprobe %s\n", 119 "request_module: runaway loop modprobe %s\n",
120 module_name); 120 module_name);
121 kmod_loop_msg++;
122 }
121 atomic_dec(&kmod_concurrent); 123 atomic_dec(&kmod_concurrent);
122 return -ENOMEM; 124 return -ENOMEM;
123 } 125 }
@@ -274,7 +276,7 @@ static void __call_usermodehelper(struct work_struct *work)
274 * (used for preventing user land processes from being created after the user 276 * (used for preventing user land processes from being created after the user
275 * land has been frozen during a system-wide hibernation or suspend operation). 277 * land has been frozen during a system-wide hibernation or suspend operation).
276 */ 278 */
277static int usermodehelper_disabled; 279static int usermodehelper_disabled = 1;
278 280
279/* Number of helpers running */ 281/* Number of helpers running */
280static atomic_t running_helpers = ATOMIC_INIT(0); 282static atomic_t running_helpers = ATOMIC_INIT(0);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 77981813a1e..b30fd54eb98 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
1255/* 1255/*
1256 * If we have a symbol_name argument, look it up and add the offset field 1256 * If we have a symbol_name argument, look it up and add the offset field
1257 * to it. This way, we can specify a relative address to a symbol. 1257 * to it. This way, we can specify a relative address to a symbol.
1258 * This returns encoded errors if it fails to look up symbol or invalid
1259 * combination of parameters.
1258 */ 1260 */
1259static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) 1261static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1260{ 1262{
1261 kprobe_opcode_t *addr = p->addr; 1263 kprobe_opcode_t *addr = p->addr;
1264
1265 if ((p->symbol_name && p->addr) ||
1266 (!p->symbol_name && !p->addr))
1267 goto invalid;
1268
1262 if (p->symbol_name) { 1269 if (p->symbol_name) {
1263 if (addr)
1264 return NULL;
1265 kprobe_lookup_name(p->symbol_name, addr); 1270 kprobe_lookup_name(p->symbol_name, addr);
1271 if (!addr)
1272 return ERR_PTR(-ENOENT);
1266 } 1273 }
1267 1274
1268 if (!addr) 1275 addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
1269 return NULL; 1276 if (addr)
1270 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 1277 return addr;
1278
1279invalid:
1280 return ERR_PTR(-EINVAL);
1271} 1281}
1272 1282
1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1283/* Check passed kprobe is valid and return kprobe in kprobe_table. */
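
kprobe_addr() now reports failures through the kernel's encoded-pointer convention rather than a bare NULL, so its callers can hand back -EINVAL for a bad symbol/addr combination and -ENOENT for a failed symbol lookup. The snippet below imitates that ERR_PTR/IS_ERR/PTR_ERR pattern in plain userspace C purely for illustration; the real macros live in <linux/err.h>.

/* Userspace imitation of the ERR_PTR convention kprobe_addr() now uses:
 * small negative errno values are smuggled through a pointer so a single
 * return slot can carry either an address or an error code. */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *err_ptr(long error)	{ return (void *)error; }
static inline long ptr_err(const void *ptr)	{ return (long)ptr; }
static inline int is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Mimics the argument checking added to kprobe_addr(): exactly one of
 * symbol_name / addr may be supplied. */
static void *resolve_probe_addr(const char *symbol_name, void *addr)
{
	if ((symbol_name && addr) || (!symbol_name && !addr))
		return err_ptr(-EINVAL);
	if (symbol_name)
		return err_ptr(-ENOENT);	/* no symbol lookup in this sketch */
	return addr;
}

int main(void)
{
	void *p = resolve_probe_addr("do_fork", (void *)0x1234);

	if (is_err(p))
		printf("rejected: %ld\n", ptr_err(p));
	return 0;
}
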
@@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1311 kprobe_opcode_t *addr; 1321 kprobe_opcode_t *addr;
1312 1322
1313 addr = kprobe_addr(p); 1323 addr = kprobe_addr(p);
1314 if (!addr) 1324 if (IS_ERR(addr))
1315 return -EINVAL; 1325 return PTR_ERR(addr);
1316 p->addr = addr; 1326 p->addr = addr;
1317 1327
1318 ret = check_kprobe_rereg(p); 1328 ret = check_kprobe_rereg(p);
@@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1335 */ 1345 */
1336 probed_mod = __module_text_address((unsigned long) p->addr); 1346 probed_mod = __module_text_address((unsigned long) p->addr);
1337 if (probed_mod) { 1347 if (probed_mod) {
1348 /* Return -ENOENT if fail. */
1349 ret = -ENOENT;
1338 /* 1350 /*
1339 * We must hold a refcount of the probed module while updating 1351 * We must hold a refcount of the probed module while updating
1340 * its code to prohibit unexpected unloading. 1352 * its code to prohibit unexpected unloading.
@@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1351 module_put(probed_mod); 1363 module_put(probed_mod);
1352 goto fail_with_jump_label; 1364 goto fail_with_jump_label;
1353 } 1365 }
1366 /* ret will be updated by following code */
1354 } 1367 }
1355 preempt_enable(); 1368 preempt_enable();
1356 jump_label_unlock(); 1369 jump_label_unlock();
@@ -1399,7 +1412,7 @@ out:
1399fail_with_jump_label: 1412fail_with_jump_label:
1400 preempt_enable(); 1413 preempt_enable();
1401 jump_label_unlock(); 1414 jump_label_unlock();
1402 return -EINVAL; 1415 return ret;
1403} 1416}
1404EXPORT_SYMBOL_GPL(register_kprobe); 1417EXPORT_SYMBOL_GPL(register_kprobe);
1405 1418
@@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1686 1699
1687 if (kretprobe_blacklist_size) { 1700 if (kretprobe_blacklist_size) {
1688 addr = kprobe_addr(&rp->kp); 1701 addr = kprobe_addr(&rp->kp);
1689 if (!addr) 1702 if (IS_ERR(addr))
1690 return -EINVAL; 1703 return PTR_ERR(addr);
1691 1704
1692 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 1705 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
1693 if (kretprobe_blacklist[i].addr == addr) 1706 if (kretprobe_blacklist[i].addr == addr)
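For reference, a minimal sketch of a caller consuming the new kprobe_addr() error encoding: register_kprobe()/register_kretprobe() now propagate -ENOENT (symbol lookup failed) or -EINVAL (both or neither of symbol_name/addr supplied) instead of a bare -EINVAL. Hypothetical module, not part of this patch:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %p\n", p->addr);
	return 0;	/* let execution continue normally */
}

static struct kprobe example_kp = {
	.symbol_name	= "do_fork",	/* symbol_name XOR addr, as kprobe_addr() now enforces */
	.pre_handler	= example_pre,
};

static int __init example_init(void)
{
	int ret = register_kprobe(&example_kp);

	/* -ENOENT: symbol not found; -EINVAL: invalid symbol_name/addr combination */
	if (ret < 0)
		pr_err("register_kprobe failed: %d\n", ret);
	return ret;
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");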
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 298c9276dfd..447960603fb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -44,6 +44,7 @@
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h> 46#include <linux/gfp.h>
47#include <linux/kmemcheck.h>
47 48
48#include <asm/sections.h> 49#include <asm/sections.h>
49 50
@@ -2468,6 +2469,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2468 2469
2469 BUG_ON(usage_bit >= LOCK_USAGE_STATES); 2470 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2470 2471
2472 if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys)
2473 continue;
2474
2471 if (!mark_lock(curr, hlock, usage_bit)) 2475 if (!mark_lock(curr, hlock, usage_bit))
2472 return 0; 2476 return 0;
2473 } 2477 }
@@ -2478,34 +2482,13 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2478/* 2482/*
2479 * Hardirqs will be enabled: 2483 * Hardirqs will be enabled:
2480 */ 2484 */
2481void trace_hardirqs_on_caller(unsigned long ip) 2485static void __trace_hardirqs_on_caller(unsigned long ip)
2482{ 2486{
2483 struct task_struct *curr = current; 2487 struct task_struct *curr = current;
2484 2488
2485 time_hardirqs_on(CALLER_ADDR0, ip);
2486
2487 if (unlikely(!debug_locks || current->lockdep_recursion))
2488 return;
2489
2490 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2491 return;
2492
2493 if (unlikely(curr->hardirqs_enabled)) {
2494 /*
2495 * Neither irq nor preemption are disabled here
2496 * so this is racy by nature but losing one hit
2497 * in a stat is not a big deal.
2498 */
2499 __debug_atomic_inc(redundant_hardirqs_on);
2500 return;
2501 }
2502 /* we'll do an OFF -> ON transition: */ 2489 /* we'll do an OFF -> ON transition: */
2503 curr->hardirqs_enabled = 1; 2490 curr->hardirqs_enabled = 1;
2504 2491
2505 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2506 return;
2507 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2508 return;
2509 /* 2492 /*
2510 * We are going to turn hardirqs on, so set the 2493 * We are going to turn hardirqs on, so set the
2511 * usage bit for all held locks: 2494 * usage bit for all held locks:
@@ -2525,6 +2508,37 @@ void trace_hardirqs_on_caller(unsigned long ip)
2525 curr->hardirq_enable_event = ++curr->irq_events; 2508 curr->hardirq_enable_event = ++curr->irq_events;
2526 debug_atomic_inc(hardirqs_on_events); 2509 debug_atomic_inc(hardirqs_on_events);
2527} 2510}
2511
2512void trace_hardirqs_on_caller(unsigned long ip)
2513{
2514 time_hardirqs_on(CALLER_ADDR0, ip);
2515
2516 if (unlikely(!debug_locks || current->lockdep_recursion))
2517 return;
2518
2519 if (unlikely(current->hardirqs_enabled)) {
2520 /*
2521 * Neither irq nor preemption are disabled here
2522 * so this is racy by nature but losing one hit
2523 * in a stat is not a big deal.
2524 */
2525 __debug_atomic_inc(redundant_hardirqs_on);
2526 return;
2527 }
2528
2529 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2530 return;
2531
2532 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2533 return;
2534
2535 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2536 return;
2537
2538 current->lockdep_recursion = 1;
2539 __trace_hardirqs_on_caller(ip);
2540 current->lockdep_recursion = 0;
2541}
2528EXPORT_SYMBOL(trace_hardirqs_on_caller); 2542EXPORT_SYMBOL(trace_hardirqs_on_caller);
2529 2543
2530void trace_hardirqs_on(void) 2544void trace_hardirqs_on(void)
@@ -2574,7 +2588,7 @@ void trace_softirqs_on(unsigned long ip)
2574{ 2588{
2575 struct task_struct *curr = current; 2589 struct task_struct *curr = current;
2576 2590
2577 if (unlikely(!debug_locks)) 2591 if (unlikely(!debug_locks || current->lockdep_recursion))
2578 return; 2592 return;
2579 2593
2580 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2594 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2585,6 +2599,7 @@ void trace_softirqs_on(unsigned long ip)
2585 return; 2599 return;
2586 } 2600 }
2587 2601
2602 current->lockdep_recursion = 1;
2588 /* 2603 /*
2589 * We'll do an OFF -> ON transition: 2604 * We'll do an OFF -> ON transition:
2590 */ 2605 */
@@ -2599,6 +2614,7 @@ void trace_softirqs_on(unsigned long ip)
2599 */ 2614 */
2600 if (curr->hardirqs_enabled) 2615 if (curr->hardirqs_enabled)
2601 mark_held_locks(curr, SOFTIRQ); 2616 mark_held_locks(curr, SOFTIRQ);
2617 current->lockdep_recursion = 0;
2602} 2618}
2603 2619
2604/* 2620/*
@@ -2608,7 +2624,7 @@ void trace_softirqs_off(unsigned long ip)
2608{ 2624{
2609 struct task_struct *curr = current; 2625 struct task_struct *curr = current;
2610 2626
2611 if (unlikely(!debug_locks)) 2627 if (unlikely(!debug_locks || current->lockdep_recursion))
2612 return; 2628 return;
2613 2629
2614 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2630 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2861,6 +2877,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2861{ 2877{
2862 int i; 2878 int i;
2863 2879
2880 kmemcheck_mark_initialized(lock, sizeof(*lock));
2881
2864 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) 2882 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2865 lock->class_cache[i] = NULL; 2883 lock->class_cache[i] = NULL;
2866 2884
@@ -3099,7 +3117,13 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3099 if (!class) 3117 if (!class)
3100 class = look_up_lock_class(lock, 0); 3118 class = look_up_lock_class(lock, 0);
3101 3119
3102 if (DEBUG_LOCKS_WARN_ON(!class)) 3120 /*
3121 * If look_up_lock_class() failed to find a class, we're trying
3122 * to test if we hold a lock that has never yet been acquired.
3123 * Clearly if the lock hasn't been acquired _ever_, we're not
3124 * holding it either, so report failure.
3125 */
3126 if (!class)
3103 return 0; 3127 return 0;
3104 3128
3105 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) 3129 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
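The new check in mark_held_locks() skips lock classes keyed by __lockdep_no_validate__.subkeys. A class gets that key through lockdep_set_novalidate_class(), which the driver core uses for dev->mutex; a rough sketch of the pattern (hypothetical structure, not from this patch):

#include <linux/mutex.h>
#include <linux/lockdep.h>

struct my_dev {				/* hypothetical */
	struct mutex lock;
};

static void my_dev_init(struct my_dev *d)
{
	mutex_init(&d->lock);
	/* opt this lock out of lockdep validation; with the hunk above,
	 * mark_held_locks() also skips it when marking IRQ usage bits */
	lockdep_set_novalidate_class(&d->lock);
}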
diff --git a/kernel/module.c b/kernel/module.c
index 795bdc7f5c3..e0ddcece2be 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -545,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
545 mod->field = kstrdup(s, GFP_KERNEL); \ 545 mod->field = kstrdup(s, GFP_KERNEL); \
546} \ 546} \
547static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ 547static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
548 struct module *mod, char *buffer) \ 548 struct module_kobject *mk, char *buffer) \
549{ \ 549{ \
550 return sprintf(buffer, "%s\n", mod->field); \ 550 return sprintf(buffer, "%s\n", mk->mod->field); \
551} \ 551} \
552static int modinfo_##field##_exists(struct module *mod) \ 552static int modinfo_##field##_exists(struct module *mod) \
553{ \ 553{ \
@@ -902,9 +902,9 @@ void symbol_put_addr(void *addr)
902EXPORT_SYMBOL_GPL(symbol_put_addr); 902EXPORT_SYMBOL_GPL(symbol_put_addr);
903 903
904static ssize_t show_refcnt(struct module_attribute *mattr, 904static ssize_t show_refcnt(struct module_attribute *mattr,
905 struct module *mod, char *buffer) 905 struct module_kobject *mk, char *buffer)
906{ 906{
907 return sprintf(buffer, "%u\n", module_refcount(mod)); 907 return sprintf(buffer, "%u\n", module_refcount(mk->mod));
908} 908}
909 909
910static struct module_attribute refcnt = { 910static struct module_attribute refcnt = {
@@ -952,11 +952,11 @@ static inline int module_unload_init(struct module *mod)
952#endif /* CONFIG_MODULE_UNLOAD */ 952#endif /* CONFIG_MODULE_UNLOAD */
953 953
954static ssize_t show_initstate(struct module_attribute *mattr, 954static ssize_t show_initstate(struct module_attribute *mattr,
955 struct module *mod, char *buffer) 955 struct module_kobject *mk, char *buffer)
956{ 956{
957 const char *state = "unknown"; 957 const char *state = "unknown";
958 958
959 switch (mod->state) { 959 switch (mk->mod->state) {
960 case MODULE_STATE_LIVE: 960 case MODULE_STATE_LIVE:
961 state = "live"; 961 state = "live";
962 break; 962 break;
@@ -975,10 +975,27 @@ static struct module_attribute initstate = {
975 .show = show_initstate, 975 .show = show_initstate,
976}; 976};
977 977
978static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk,
980 const char *buffer, size_t count)
981{
982 enum kobject_action action;
983
984 if (kobject_action_type(buffer, count, &action) == 0)
985 kobject_uevent(&mk->kobj, action);
986 return count;
987}
988
989struct module_attribute module_uevent = {
990 .attr = { .name = "uevent", .mode = 0200 },
991 .store = store_uevent,
992};
993
978static struct module_attribute *modinfo_attrs[] = { 994static struct module_attribute *modinfo_attrs[] = {
979 &modinfo_version, 995 &modinfo_version,
980 &modinfo_srcversion, 996 &modinfo_srcversion,
981 &initstate, 997 &initstate,
998 &module_uevent,
982#ifdef CONFIG_MODULE_UNLOAD 999#ifdef CONFIG_MODULE_UNLOAD
983 &refcnt, 1000 &refcnt,
984#endif 1001#endif
@@ -1187,7 +1204,7 @@ struct module_sect_attrs
1187}; 1204};
1188 1205
1189static ssize_t module_sect_show(struct module_attribute *mattr, 1206static ssize_t module_sect_show(struct module_attribute *mattr,
1190 struct module *mod, char *buf) 1207 struct module_kobject *mk, char *buf)
1191{ 1208{
1192 struct module_sect_attr *sattr = 1209 struct module_sect_attr *sattr =
1193 container_of(mattr, struct module_sect_attr, mattr); 1210 container_of(mattr, struct module_sect_attr, mattr);
@@ -1697,6 +1714,15 @@ static void unset_module_core_ro_nx(struct module *mod) { }
1697static void unset_module_init_ro_nx(struct module *mod) { } 1714static void unset_module_init_ro_nx(struct module *mod) { }
1698#endif 1715#endif
1699 1716
1717void __weak module_free(struct module *mod, void *module_region)
1718{
1719 vfree(module_region);
1720}
1721
1722void __weak module_arch_cleanup(struct module *mod)
1723{
1724}
1725
1700/* Free a module, remove from lists, etc. */ 1726/* Free a module, remove from lists, etc. */
1701static void free_module(struct module *mod) 1727static void free_module(struct module *mod)
1702{ 1728{
@@ -1851,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1851 return ret; 1877 return ret;
1852} 1878}
1853 1879
1880int __weak apply_relocate(Elf_Shdr *sechdrs,
1881 const char *strtab,
1882 unsigned int symindex,
1883 unsigned int relsec,
1884 struct module *me)
1885{
1886 pr_err("module %s: REL relocation unsupported\n", me->name);
1887 return -ENOEXEC;
1888}
1889
1890int __weak apply_relocate_add(Elf_Shdr *sechdrs,
1891 const char *strtab,
1892 unsigned int symindex,
1893 unsigned int relsec,
1894 struct module *me)
1895{
1896 pr_err("module %s: RELA relocation unsupported\n", me->name);
1897 return -ENOEXEC;
1898}
1899
1854static int apply_relocations(struct module *mod, const struct load_info *info) 1900static int apply_relocations(struct module *mod, const struct load_info *info)
1855{ 1901{
1856 unsigned int i; 1902 unsigned int i;
@@ -2235,6 +2281,11 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2235 ddebug_remove_module(debug->modname); 2281 ddebug_remove_module(debug->modname);
2236} 2282}
2237 2283
2284void * __weak module_alloc(unsigned long size)
2285{
2286 return size == 0 ? NULL : vmalloc_exec(size);
2287}
2288
2238static void *module_alloc_update_bounds(unsigned long size) 2289static void *module_alloc_update_bounds(unsigned long size)
2239{ 2290{
2240 void *ret = module_alloc(size); 2291 void *ret = module_alloc(size);
@@ -2477,7 +2528,7 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2477 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); 2528 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
2478#endif 2529#endif
2479#ifdef CONFIG_CONSTRUCTORS 2530#ifdef CONFIG_CONSTRUCTORS
2480 mod->ctors = section_objs(info, ".ctors", 2531 mod->ctors = section_objs(info, CONFIG_GCOV_CTORS,
2481 sizeof(*mod->ctors), &mod->num_ctors); 2532 sizeof(*mod->ctors), &mod->num_ctors);
2482#endif 2533#endif
2483 2534
@@ -2645,6 +2696,14 @@ static void flush_module_icache(const struct module *mod)
2645 set_fs(old_fs); 2696 set_fs(old_fs);
2646} 2697}
2647 2698
2699int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2700 Elf_Shdr *sechdrs,
2701 char *secstrings,
2702 struct module *mod)
2703{
2704 return 0;
2705}
2706
2648static struct module *layout_and_allocate(struct load_info *info) 2707static struct module *layout_and_allocate(struct load_info *info)
2649{ 2708{
2650 /* Module within temporary copy. */ 2709 /* Module within temporary copy. */
@@ -2716,6 +2775,13 @@ static void module_deallocate(struct module *mod, struct load_info *info)
2716 module_free(mod, mod->module_core); 2775 module_free(mod, mod->module_core);
2717} 2776}
2718 2777
2778int __weak module_finalize(const Elf_Ehdr *hdr,
2779 const Elf_Shdr *sechdrs,
2780 struct module *me)
2781{
2782 return 0;
2783}
2784
2719static int post_relocation(struct module *mod, const struct load_info *info) 2785static int post_relocation(struct module *mod, const struct load_info *info)
2720{ 2786{
2721 /* Sort exception table now relocations are done. */ 2787 /* Sort exception table now relocations are done. */
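The __weak definitions added above (module_alloc(), module_free(), apply_relocate(), apply_relocate_add(), module_frob_arch_sections(), module_finalize()) are generic fallbacks; an architecture overrides one simply by providing a non-weak definition. A hedged sketch, loosely modeled on architectures that confine module text to a dedicated VA window — MODULES_VADDR/MODULES_END are assumed arch-provided constants and the __vmalloc_node_range() signature is assumed from this kernel generation:

#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/moduleloader.h>

void *module_alloc(unsigned long size)
{
	if (PAGE_ALIGN(size) > MODULES_END - MODULES_VADDR)
		return NULL;
	/* keep module text inside the arch module area so relative
	 * branches to/from the kernel image stay in range */
	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
				    GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
				    __builtin_return_address(0));
}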
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2488ba7eb56..8d7b435806c 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -525,37 +525,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
525} 525}
526EXPORT_SYMBOL_GPL(srcu_init_notifier_head); 526EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
527 527
528/**
529 * register_reboot_notifier - Register function to be called at reboot time
530 * @nb: Info about notifier function to be called
531 *
532 * Registers a function with the list of functions
533 * to be called at reboot time.
534 *
535 * Currently always returns zero, as blocking_notifier_chain_register()
536 * always returns zero.
537 */
538int register_reboot_notifier(struct notifier_block *nb)
539{
540 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
541}
542EXPORT_SYMBOL(register_reboot_notifier);
543
544/**
545 * unregister_reboot_notifier - Unregister previously registered reboot notifier
546 * @nb: Hook to be unregistered
547 *
548 * Unregisters a previously registered reboot
549 * notifier function.
550 *
551 * Returns zero on success, or %-ENOENT on failure.
552 */
553int unregister_reboot_notifier(struct notifier_block *nb)
554{
555 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
556}
557EXPORT_SYMBOL(unregister_reboot_notifier);
558
559static ATOMIC_NOTIFIER_HEAD(die_chain); 528static ATOMIC_NOTIFIER_HEAD(die_chain);
560 529
561int notrace __kprobes notify_die(enum die_val val, const char *str, 530int notrace __kprobes notify_die(enum die_val val, const char *str,
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index d6a00f3de15..9aeab4b98c6 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -271,10 +271,8 @@ out:
271 return err; 271 return err;
272} 272}
273 273
274static int __init nsproxy_cache_init(void) 274int __init nsproxy_cache_init(void)
275{ 275{
276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
277 return 0; 277 return 0;
278} 278}
279
280module_init(nsproxy_cache_init);
diff --git a/kernel/panic.c b/kernel/panic.c
index 69231670eb9..41fc78ea3db 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,13 +27,19 @@
27#define PANIC_TIMER_STEP 100 27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 28#define PANIC_BLINK_SPD 18
29 29
30/* Machine specific panic information string */
31char *mach_panic_string;
32
30int panic_on_oops; 33int panic_on_oops;
31static unsigned long tainted_mask; 34static unsigned long tainted_mask;
32static int pause_on_oops; 35static int pause_on_oops;
33static int pause_on_oops_flag; 36static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 37static DEFINE_SPINLOCK(pause_on_oops_lock);
35 38
36int panic_timeout; 39#ifndef CONFIG_PANIC_TIMEOUT
40#define CONFIG_PANIC_TIMEOUT 0
41#endif
42int panic_timeout = CONFIG_PANIC_TIMEOUT;
37EXPORT_SYMBOL_GPL(panic_timeout); 43EXPORT_SYMBOL_GPL(panic_timeout);
38 44
39ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 45ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -119,6 +125,8 @@ NORET_TYPE void panic(const char * fmt, ...)
119 } 125 }
120 mdelay(PANIC_TIMER_STEP); 126 mdelay(PANIC_TIMER_STEP);
121 } 127 }
128 }
129 if (panic_timeout != 0) {
122 /* 130 /*
123 * This will not be a clean reboot, with everything 131 * This will not be a clean reboot, with everything
124 * shutting down. But if there is a chance of 132 * shutting down. But if there is a chance of
@@ -342,6 +350,11 @@ late_initcall(init_oops_id);
342void print_oops_end_marker(void) 350void print_oops_end_marker(void)
343{ 351{
344 init_oops_id(); 352 init_oops_id();
353
354 if (mach_panic_string)
355 printk(KERN_WARNING "Board Information: %s\n",
356 mach_panic_string);
357
345 printk(KERN_WARNING "---[ end trace %016llx ]---\n", 358 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
346 (unsigned long long)oops_id); 359 (unsigned long long)oops_id);
347} 360}
diff --git a/kernel/params.c b/kernel/params.c
index ed72e133086..22df3e0d142 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -225,8 +225,8 @@ int parse_args(const char *name,
225 int ret; \ 225 int ret; \
226 \ 226 \
227 ret = strtolfn(val, 0, &l); \ 227 ret = strtolfn(val, 0, &l); \
228 if (ret == -EINVAL || ((type)l != l)) \ 228 if (ret < 0 || ((type)l != l)) \
229 return -EINVAL; \ 229 return ret < 0 ? ret : -EINVAL; \
230 *((type *)kp->arg) = l; \ 230 *((type *)kp->arg) = l; \
231 return 0; \ 231 return 0; \
232 } \ 232 } \
@@ -511,7 +511,7 @@ struct module_param_attrs
511#define to_param_attr(n) container_of(n, struct param_attribute, mattr) 511#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
512 512
513static ssize_t param_attr_show(struct module_attribute *mattr, 513static ssize_t param_attr_show(struct module_attribute *mattr,
514 struct module *mod, char *buf) 514 struct module_kobject *mk, char *buf)
515{ 515{
516 int count; 516 int count;
517 struct param_attribute *attribute = to_param_attr(mattr); 517 struct param_attribute *attribute = to_param_attr(mattr);
@@ -531,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
531 531
532/* sysfs always hands a nul-terminated string in buf. We rely on that. */ 532/* sysfs always hands a nul-terminated string in buf. We rely on that. */
533static ssize_t param_attr_store(struct module_attribute *mattr, 533static ssize_t param_attr_store(struct module_attribute *mattr,
534 struct module *owner, 534 struct module_kobject *km,
535 const char *buf, size_t len) 535 const char *buf, size_t len)
536{ 536{
537 int err; 537 int err;
@@ -730,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
730 mk->kobj.kset = module_kset; 730 mk->kobj.kset = module_kset;
731 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, 731 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
732 "%s", name); 732 "%s", name);
733#ifdef CONFIG_MODULES
734 if (!err)
735 err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
736#endif
733 if (err) { 737 if (err) {
734 kobject_put(&mk->kobj); 738 kobject_put(&mk->kobj);
735 printk(KERN_ERR 739 printk(KERN_ERR
@@ -807,7 +811,7 @@ static void __init param_sysfs_builtin(void)
807} 811}
808 812
809ssize_t __modver_version_show(struct module_attribute *mattr, 813ssize_t __modver_version_show(struct module_attribute *mattr,
810 struct module *mod, char *buf) 814 struct module_kobject *mk, char *buf)
811{ 815{
812 struct module_version_attribute *vattr = 816 struct module_version_attribute *vattr =
813 container_of(mattr, struct module_version_attribute, mattr); 817 container_of(mattr, struct module_version_attribute, mattr);
@@ -852,7 +856,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
852 if (!attribute->show) 856 if (!attribute->show)
853 return -EIO; 857 return -EIO;
854 858
855 ret = attribute->show(attribute, mk->mod, buf); 859 ret = attribute->show(attribute, mk, buf);
856 860
857 return ret; 861 return ret;
858} 862}
@@ -871,7 +875,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
871 if (!attribute->store) 875 if (!attribute->store)
872 return -EIO; 876 return -EIO;
873 877
874 ret = attribute->store(attribute, mk->mod, buf, len); 878 ret = attribute->store(attribute, mk, buf, len);
875 879
876 return ret; 880 return ret;
877} 881}
diff --git a/kernel/pid.c b/kernel/pid.c
index 57a8346a270..e432057f3b2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -405,7 +405,6 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
405 if (pid) { 405 if (pid) {
406 struct hlist_node *first; 406 struct hlist_node *first;
407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), 407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
408 rcu_read_lock_held() ||
409 lockdep_tasklist_lock_is_held()); 408 lockdep_tasklist_lock_is_held());
410 if (first) 409 if (first)
411 result = hlist_entry(first, struct task_struct, pids[(type)].node); 410 result = hlist_entry(first, struct task_struct, pids[(type)].node);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 6824ca7d4d0..82da7ac3b1f 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -74,7 +74,7 @@ static DEFINE_SPINLOCK(pm_qos_lock);
74static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), 77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
78 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
@@ -84,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
84 84
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), 87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
88 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
89 .name = "network_latency", 89 .name = "network_latency",
90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
@@ -95,7 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
95 95
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), 98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
99 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 100 .name = "network_throughput",
101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
@@ -104,11 +104,59 @@ static struct pm_qos_object network_throughput_pm_qos = {
104}; 104};
105 105
106 106
107static BLOCKING_NOTIFIER_HEAD(min_online_cpus_notifier);
108static struct pm_qos_object min_online_cpus_pm_qos = {
109 .requests = PLIST_HEAD_INIT(min_online_cpus_pm_qos.requests),
110 .notifiers = &min_online_cpus_notifier,
111 .name = "min_online_cpus",
112 .target_value = PM_QOS_MIN_ONLINE_CPUS_DEFAULT_VALUE,
113 .default_value = PM_QOS_MIN_ONLINE_CPUS_DEFAULT_VALUE,
114 .type = PM_QOS_MAX,
115};
116
117
118static BLOCKING_NOTIFIER_HEAD(max_online_cpus_notifier);
119static struct pm_qos_object max_online_cpus_pm_qos = {
120 .requests = PLIST_HEAD_INIT(max_online_cpus_pm_qos.requests),
121 .notifiers = &max_online_cpus_notifier,
122 .name = "max_online_cpus",
123 .target_value = PM_QOS_MAX_ONLINE_CPUS_DEFAULT_VALUE,
124 .default_value = PM_QOS_MAX_ONLINE_CPUS_DEFAULT_VALUE,
125 .type = PM_QOS_MIN,
126};
127
128
129static BLOCKING_NOTIFIER_HEAD(cpu_freq_min_notifier);
130static struct pm_qos_object cpu_freq_min_pm_qos = {
131 .requests = PLIST_HEAD_INIT(cpu_freq_min_pm_qos.requests),
132 .notifiers = &cpu_freq_min_notifier,
133 .name = "cpu_freq_min",
134 .target_value = PM_QOS_CPU_FREQ_MIN_DEFAULT_VALUE,
135 .default_value = PM_QOS_CPU_FREQ_MIN_DEFAULT_VALUE,
136 .type = PM_QOS_MAX,
137};
138
139
140static BLOCKING_NOTIFIER_HEAD(cpu_freq_max_notifier);
141static struct pm_qos_object cpu_freq_max_pm_qos = {
142 .requests = PLIST_HEAD_INIT(cpu_freq_max_pm_qos.requests),
143 .notifiers = &cpu_freq_max_notifier,
144 .name = "cpu_freq_max",
145 .target_value = PM_QOS_CPU_FREQ_MAX_DEFAULT_VALUE,
146 .default_value = PM_QOS_CPU_FREQ_MAX_DEFAULT_VALUE,
147 .type = PM_QOS_MIN,
148};
149
150
107static struct pm_qos_object *pm_qos_array[] = { 151static struct pm_qos_object *pm_qos_array[] = {
108 &null_pm_qos, 152 &null_pm_qos,
109 &cpu_dma_pm_qos, 153 &cpu_dma_pm_qos,
110 &network_lat_pm_qos, 154 &network_lat_pm_qos,
111 &network_throughput_pm_qos 155 &network_throughput_pm_qos,
156 &min_online_cpus_pm_qos,
157 &max_online_cpus_pm_qos,
158 &cpu_freq_min_pm_qos,
159 &cpu_freq_max_pm_qos
112}; 160};
113 161
114static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 162static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
@@ -459,21 +507,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
459static int __init pm_qos_power_init(void) 507static int __init pm_qos_power_init(void)
460{ 508{
461 int ret = 0; 509 int ret = 0;
510 int i;
462 511
463 ret = register_pm_qos_misc(&cpu_dma_pm_qos); 512 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
464 if (ret < 0) { 513
465 printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); 514 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
466 return ret; 515 ret = register_pm_qos_misc(pm_qos_array[i]);
467 } 516 if (ret < 0) {
468 ret = register_pm_qos_misc(&network_lat_pm_qos); 517 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
469 if (ret < 0) { 518 pm_qos_array[i]->name);
470 printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); 519 return ret;
471 return ret; 520 }
472 } 521 }
473 ret = register_pm_qos_misc(&network_throughput_pm_qos);
474 if (ret < 0)
475 printk(KERN_ERR
476 "pm_qos_param: network_throughput setup failed\n");
477 522
478 return ret; 523 return ret;
479} 524}
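The four new classes register through the same misc-device path as the existing ones, and kernel code constrains them with the usual pm_qos request calls. A sketch, assuming the companion pm_qos_params.h change defines PM_QOS_CPU_FREQ_MIN and that the request type keeps its 3.0-era name (the value units, e.g. kHz, are defined by that header):

#include <linux/pm_qos_params.h>

static struct pm_qos_request_list freq_req;

static void start_latency_sensitive_work(void)
{
	/* ask for at least 500000 (500 MHz if the class is expressed in kHz) */
	pm_qos_add_request(&freq_req, PM_QOS_CPU_FREQ_MIN, 500000);
}

static void stop_latency_sensitive_work(void)
{
	pm_qos_remove_request(&freq_req);
}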
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 58f405b581e..640ded8f5c4 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
250 do { 250 do {
251 times->utime = cputime_add(times->utime, t->utime); 251 times->utime = cputime_add(times->utime, t->utime);
252 times->stime = cputime_add(times->stime, t->stime); 252 times->stime = cputime_add(times->stime, t->stime);
253 times->sum_exec_runtime += t->se.sum_exec_runtime; 253 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t); 254 } while_each_thread(tsk, t);
255out: 255out:
256 rcu_read_unlock(); 256 rcu_read_unlock();
@@ -274,9 +274,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
274 struct task_cputime sum; 274 struct task_cputime sum;
275 unsigned long flags; 275 unsigned long flags;
276 276
277 spin_lock_irqsave(&cputimer->lock, flags);
278 if (!cputimer->running) { 277 if (!cputimer->running) {
279 cputimer->running = 1;
280 /* 278 /*
281 * The POSIX timer interface allows for absolute time expiry 279 * The POSIX timer interface allows for absolute time expiry
282 * values through the TIMER_ABSTIME flag, therefore we have 280 * values through the TIMER_ABSTIME flag, therefore we have
@@ -284,8 +282,11 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
284 * it. 282 * it.
285 */ 283 */
286 thread_group_cputime(tsk, &sum); 284 thread_group_cputime(tsk, &sum);
285 spin_lock_irqsave(&cputimer->lock, flags);
286 cputimer->running = 1;
287 update_gt_cputime(&cputimer->cputime, &sum); 287 update_gt_cputime(&cputimer->cputime, &sum);
288 } 288 } else
289 spin_lock_irqsave(&cputimer->lock, flags);
289 *times = cputimer->cputime; 290 *times = cputimer->cputime;
290 spin_unlock_irqrestore(&cputimer->lock, flags); 291 spin_unlock_irqrestore(&cputimer->lock, flags);
291} 292}
@@ -312,7 +313,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
312 cpu->cpu = cputime.utime; 313 cpu->cpu = cputime.utime;
313 break; 314 break;
314 case CPUCLOCK_SCHED: 315 case CPUCLOCK_SCHED:
315 cpu->sched = thread_group_sched_runtime(p); 316 thread_group_cputime(p, &cputime);
317 cpu->sched = cputime.sum_exec_runtime;
316 break; 318 break;
317 } 319 }
318 return 0; 320 return 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 87f4d24b55b..fcf5a834c4e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,6 +18,73 @@ config SUSPEND_FREEZER
18 18
19 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
20 20
21config HAS_WAKELOCK
22 bool
23
24config HAS_EARLYSUSPEND
25 bool
26
27config WAKELOCK
28 bool "Wake lock"
29 depends on PM && RTC_CLASS
30 default n
31 select HAS_WAKELOCK
32 ---help---
 33 Enable wakelocks. When user space requests a sleep state, the
34 sleep request will be delayed until no wake locks are held.
35
36config WAKELOCK_STAT
37 bool "Wake lock stats"
38 depends on WAKELOCK
39 default y
40 ---help---
41 Report wake lock stats in /proc/wakelocks
42
43config USER_WAKELOCK
44 bool "Userspace wake locks"
45 depends on WAKELOCK
46 default y
47 ---help---
 48 User-space wake lock API. Write "lockname" or "lockname timeout"
 49 to /sys/power/wake_lock to take (and, if needed, create) a wake lock.
50 Write "lockname" to /sys/power/wake_unlock to unlock a user wake
51 lock.
52
53config EARLYSUSPEND
54 bool "Early suspend"
55 depends on WAKELOCK
56 default y
57 select HAS_EARLYSUSPEND
58 ---help---
59 Call early suspend handlers when the user requested sleep state
60 changes.
61
62choice
63 prompt "User-space screen access"
64 default FB_EARLYSUSPEND if !FRAMEBUFFER_CONSOLE
65 default CONSOLE_EARLYSUSPEND
66 depends on HAS_EARLYSUSPEND
67
68 config NO_USER_SPACE_SCREEN_ACCESS_CONTROL
69 bool "None"
70
71 config CONSOLE_EARLYSUSPEND
72 bool "Console switch on early-suspend"
73 depends on HAS_EARLYSUSPEND && VT
74 ---help---
 75 Register an early suspend handler that performs a console switch
 76 when user-space should stop drawing to the screen and a switch
77 back when it should resume.
78
79 config FB_EARLYSUSPEND
80 bool "Sysfs interface"
81 depends on HAS_EARLYSUSPEND
82 ---help---
 83 Register an early suspend handler that notifies and waits for
84 user-space through sysfs when user-space should stop drawing
85 to the screen and notifies user-space when it should resume.
86endchoice
87
21config HIBERNATE_CALLBACKS 88config HIBERNATE_CALLBACKS
22 bool 89 bool
23 90
@@ -193,8 +260,8 @@ config APM_EMULATION
193 notification of APM "events" (e.g. battery status change). 260 notification of APM "events" (e.g. battery status change).
194 261
195 In order to use APM, you will need supporting software. For location 262 In order to use APM, you will need supporting software. For location
196 and more information, read <file:Documentation/power/pm.txt> and the 263 and more information, read <file:Documentation/power/apm-acpi.txt>
197 Battery Powered Linux mini-HOWTO, available from 264 and the Battery Powered Linux mini-HOWTO, available from
198 <http://www.tldp.org/docs.html#howto>. 265 <http://www.tldp.org/docs.html#howto>.
199 266
200 This driver does not spin down disk drives (see the hdparm(8) 267 This driver does not spin down disk drives (see the hdparm(8)
@@ -224,6 +291,21 @@ config PM_OPP
224 implementations a ready to use framework to manage OPPs. 291 implementations a ready to use framework to manage OPPs.
225 For more information, read <file:Documentation/power/opp.txt> 292 For more information, read <file:Documentation/power/opp.txt>
226 293
227config PM_RUNTIME_CLK 294config PM_CLK
295 def_bool y
296 depends on PM && HAVE_CLK
297
298config PM_GENERIC_DOMAINS
299 bool
300 depends on PM
301
302config PM_GENERIC_DOMAINS_RUNTIME
228 def_bool y 303 def_bool y
229 depends on PM_RUNTIME && HAVE_CLK 304 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
305
306config SUSPEND_TIME
307 bool "Log time spent in suspend"
308 ---help---
309 Prints the time spent in suspend in the kernel log, and
310 keeps statistics on the time spent in suspend in
311 /sys/kernel/debug/suspend_time
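With USER_WAKELOCK enabled, the interface described in the help text is a pair of sysfs files under /sys/power. A user-space sketch (the optional timeout field and its units are defined by the wakelock code, not shown here):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* write "name" to wake_lock (take/create) or wake_unlock (release) */
static void user_wake_lock(const char *name, int lock)
{
	const char *path = lock ? "/sys/power/wake_lock"
				: "/sys/power/wake_unlock";
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return;
	if (write(fd, name, strlen(name)) < 0)
		perror(path);
	close(fd);
}

/* usage: user_wake_lock("net-rx", 1); ... do work ...; user_wake_lock("net-rx", 0); */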
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c5ebc6a9064..9b224e16b19 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,5 +8,11 @@ obj-$(CONFIG_SUSPEND) += suspend.o
8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
9obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 9obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
10 block_io.o 10 block_io.o
11obj-$(CONFIG_WAKELOCK) += wakelock.o
12obj-$(CONFIG_USER_WAKELOCK) += userwakelock.o
13obj-$(CONFIG_EARLYSUSPEND) += earlysuspend.o
14obj-$(CONFIG_CONSOLE_EARLYSUSPEND) += consoleearlysuspend.o
15obj-$(CONFIG_FB_EARLYSUSPEND) += fbearlysuspend.o
16obj-$(CONFIG_SUSPEND_TIME) += suspend_time.o
11 17
12obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 18obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/consoleearlysuspend.c b/kernel/power/consoleearlysuspend.c
new file mode 100644
index 00000000000..a3edcb26738
--- /dev/null
+++ b/kernel/power/consoleearlysuspend.c
@@ -0,0 +1,78 @@
1/* kernel/power/consoleearlysuspend.c
2 *
3 * Copyright (C) 2005-2008 Google, Inc.
4 *
5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 */
15
16#include <linux/console.h>
17#include <linux/earlysuspend.h>
18#include <linux/kbd_kern.h>
19#include <linux/module.h>
20#include <linux/vt_kern.h>
21#include <linux/wait.h>
22
23#define EARLY_SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
24
25static int orig_fgconsole;
26static void console_early_suspend(struct early_suspend *h)
27{
28 acquire_console_sem();
29 orig_fgconsole = fg_console;
30 if (vc_allocate(EARLY_SUSPEND_CONSOLE))
31 goto err;
32 if (set_console(EARLY_SUSPEND_CONSOLE))
33 goto err;
34 release_console_sem();
35
36 if (vt_waitactive(EARLY_SUSPEND_CONSOLE + 1))
37 pr_warning("console_early_suspend: Can't switch VCs.\n");
38 return;
39err:
40 pr_warning("console_early_suspend: Can't set console\n");
41 release_console_sem();
42}
43
44static void console_late_resume(struct early_suspend *h)
45{
46 int ret;
47 acquire_console_sem();
48 ret = set_console(orig_fgconsole);
49 release_console_sem();
50 if (ret) {
51 pr_warning("console_late_resume: Can't set console.\n");
52 return;
53 }
54
55 if (vt_waitactive(orig_fgconsole + 1))
56 pr_warning("console_late_resume: Can't switch VCs.\n");
57}
58
59static struct early_suspend console_early_suspend_desc = {
60 .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING,
61 .suspend = console_early_suspend,
62 .resume = console_late_resume,
63};
64
65static int __init console_early_suspend_init(void)
66{
67 register_early_suspend(&console_early_suspend_desc);
68 return 0;
69}
70
71static void __exit console_early_suspend_exit(void)
72{
73 unregister_early_suspend(&console_early_suspend_desc);
74}
75
76module_init(console_early_suspend_init);
77module_exit(console_early_suspend_exit);
78
diff --git a/kernel/power/earlysuspend.c b/kernel/power/earlysuspend.c
new file mode 100644
index 00000000000..b15f02eba45
--- /dev/null
+++ b/kernel/power/earlysuspend.c
@@ -0,0 +1,187 @@
1/* kernel/power/earlysuspend.c
2 *
3 * Copyright (C) 2005-2008 Google, Inc.
4 *
5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 */
15
16#include <linux/earlysuspend.h>
17#include <linux/module.h>
18#include <linux/mutex.h>
19#include <linux/rtc.h>
20#include <linux/syscalls.h> /* sys_sync */
21#include <linux/wakelock.h>
22#include <linux/workqueue.h>
23
24#include "power.h"
25
26enum {
27 DEBUG_USER_STATE = 1U << 0,
28 DEBUG_SUSPEND = 1U << 2,
29 DEBUG_VERBOSE = 1U << 3,
30};
31static int debug_mask = DEBUG_USER_STATE;
32module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
33
34static DEFINE_MUTEX(early_suspend_lock);
35static LIST_HEAD(early_suspend_handlers);
36static void early_suspend(struct work_struct *work);
37static void late_resume(struct work_struct *work);
38static DECLARE_WORK(early_suspend_work, early_suspend);
39static DECLARE_WORK(late_resume_work, late_resume);
40static DEFINE_SPINLOCK(state_lock);
41enum {
42 SUSPEND_REQUESTED = 0x1,
43 SUSPENDED = 0x2,
44 SUSPEND_REQUESTED_AND_SUSPENDED = SUSPEND_REQUESTED | SUSPENDED,
45};
46static int state;
47
48void register_early_suspend(struct early_suspend *handler)
49{
50 struct list_head *pos;
51
52 mutex_lock(&early_suspend_lock);
53 list_for_each(pos, &early_suspend_handlers) {
54 struct early_suspend *e;
55 e = list_entry(pos, struct early_suspend, link);
56 if (e->level > handler->level)
57 break;
58 }
59 list_add_tail(&handler->link, pos);
60 if ((state & SUSPENDED) && handler->suspend)
61 handler->suspend(handler);
62 mutex_unlock(&early_suspend_lock);
63}
64EXPORT_SYMBOL(register_early_suspend);
65
66void unregister_early_suspend(struct early_suspend *handler)
67{
68 mutex_lock(&early_suspend_lock);
69 list_del(&handler->link);
70 mutex_unlock(&early_suspend_lock);
71}
72EXPORT_SYMBOL(unregister_early_suspend);
73
74static void early_suspend(struct work_struct *work)
75{
76 struct early_suspend *pos;
77 unsigned long irqflags;
78 int abort = 0;
79
80 mutex_lock(&early_suspend_lock);
81 spin_lock_irqsave(&state_lock, irqflags);
82 if (state == SUSPEND_REQUESTED)
83 state |= SUSPENDED;
84 else
85 abort = 1;
86 spin_unlock_irqrestore(&state_lock, irqflags);
87
88 if (abort) {
89 if (debug_mask & DEBUG_SUSPEND)
90 pr_info("early_suspend: abort, state %d\n", state);
91 mutex_unlock(&early_suspend_lock);
92 goto abort;
93 }
94
95 if (debug_mask & DEBUG_SUSPEND)
96 pr_info("early_suspend: call handlers\n");
97 list_for_each_entry(pos, &early_suspend_handlers, link) {
98 if (pos->suspend != NULL) {
99 if (debug_mask & DEBUG_VERBOSE)
100 pr_info("early_suspend: calling %pf\n", pos->suspend);
101 pos->suspend(pos);
102 }
103 }
104 mutex_unlock(&early_suspend_lock);
105
106 if (debug_mask & DEBUG_SUSPEND)
107 pr_info("early_suspend: sync\n");
108
109 sys_sync();
110abort:
111 spin_lock_irqsave(&state_lock, irqflags);
112 if (state == SUSPEND_REQUESTED_AND_SUSPENDED)
113 wake_unlock(&main_wake_lock);
114 spin_unlock_irqrestore(&state_lock, irqflags);
115}
116
117static void late_resume(struct work_struct *work)
118{
119 struct early_suspend *pos;
120 unsigned long irqflags;
121 int abort = 0;
122
123 mutex_lock(&early_suspend_lock);
124 spin_lock_irqsave(&state_lock, irqflags);
125 if (state == SUSPENDED)
126 state &= ~SUSPENDED;
127 else
128 abort = 1;
129 spin_unlock_irqrestore(&state_lock, irqflags);
130
131 if (abort) {
132 if (debug_mask & DEBUG_SUSPEND)
133 pr_info("late_resume: abort, state %d\n", state);
134 goto abort;
135 }
136 if (debug_mask & DEBUG_SUSPEND)
137 pr_info("late_resume: call handlers\n");
138 list_for_each_entry_reverse(pos, &early_suspend_handlers, link) {
139 if (pos->resume != NULL) {
140 if (debug_mask & DEBUG_VERBOSE)
141 pr_info("late_resume: calling %pf\n", pos->resume);
142
143 pos->resume(pos);
144 }
145 }
146 if (debug_mask & DEBUG_SUSPEND)
147 pr_info("late_resume: done\n");
148abort:
149 mutex_unlock(&early_suspend_lock);
150}
151
152void request_suspend_state(suspend_state_t new_state)
153{
154 unsigned long irqflags;
155 int old_sleep;
156
157 spin_lock_irqsave(&state_lock, irqflags);
158 old_sleep = state & SUSPEND_REQUESTED;
159 if (debug_mask & DEBUG_USER_STATE) {
160 struct timespec ts;
161 struct rtc_time tm;
162 getnstimeofday(&ts);
163 rtc_time_to_tm(ts.tv_sec, &tm);
164 pr_info("request_suspend_state: %s (%d->%d) at %lld "
165 "(%d-%02d-%02d %02d:%02d:%02d.%09lu UTC)\n",
166 new_state != PM_SUSPEND_ON ? "sleep" : "wakeup",
167 requested_suspend_state, new_state,
168 ktime_to_ns(ktime_get()),
169 tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
170 tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec);
171 }
172 if (!old_sleep && new_state != PM_SUSPEND_ON) {
173 state |= SUSPEND_REQUESTED;
174 queue_work(suspend_work_queue, &early_suspend_work);
175 } else if (old_sleep && new_state == PM_SUSPEND_ON) {
176 state &= ~SUSPEND_REQUESTED;
177 wake_lock(&main_wake_lock);
178 queue_work(suspend_work_queue, &late_resume_work);
179 }
180 requested_suspend_state = new_state;
181 spin_unlock_irqrestore(&state_lock, irqflags);
182}
183
184suspend_state_t get_suspend_state(void)
185{
186 return requested_suspend_state;
187}
diff --git a/kernel/power/fbearlysuspend.c b/kernel/power/fbearlysuspend.c
new file mode 100644
index 00000000000..15137650149
--- /dev/null
+++ b/kernel/power/fbearlysuspend.c
@@ -0,0 +1,153 @@
1/* kernel/power/fbearlysuspend.c
2 *
3 * Copyright (C) 2005-2008 Google, Inc.
4 *
5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 */
15
16#include <linux/earlysuspend.h>
17#include <linux/module.h>
18#include <linux/wait.h>
19
20#include "power.h"
21
22static wait_queue_head_t fb_state_wq;
23static DEFINE_SPINLOCK(fb_state_lock);
24static enum {
25 FB_STATE_STOPPED_DRAWING,
26 FB_STATE_REQUEST_STOP_DRAWING,
27 FB_STATE_DRAWING_OK,
28} fb_state;
29
30/* tell userspace to stop drawing, wait for it to stop */
31static void stop_drawing_early_suspend(struct early_suspend *h)
32{
33 int ret;
34 unsigned long irq_flags;
35
36 spin_lock_irqsave(&fb_state_lock, irq_flags);
37 fb_state = FB_STATE_REQUEST_STOP_DRAWING;
38 spin_unlock_irqrestore(&fb_state_lock, irq_flags);
39
40 wake_up_all(&fb_state_wq);
41 ret = wait_event_timeout(fb_state_wq,
42 fb_state == FB_STATE_STOPPED_DRAWING,
43 HZ);
44 if (unlikely(fb_state != FB_STATE_STOPPED_DRAWING))
45 pr_warning("stop_drawing_early_suspend: timeout waiting for "
46 "userspace to stop drawing\n");
47}
48
49/* tell userspace to start drawing */
50static void start_drawing_late_resume(struct early_suspend *h)
51{
52 unsigned long irq_flags;
53
54 spin_lock_irqsave(&fb_state_lock, irq_flags);
55 fb_state = FB_STATE_DRAWING_OK;
56 spin_unlock_irqrestore(&fb_state_lock, irq_flags);
57 wake_up(&fb_state_wq);
58}
59
60static struct early_suspend stop_drawing_early_suspend_desc = {
61 .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING,
62 .suspend = stop_drawing_early_suspend,
63 .resume = start_drawing_late_resume,
64};
65
66static ssize_t wait_for_fb_sleep_show(struct kobject *kobj,
67 struct kobj_attribute *attr, char *buf)
68{
69 char *s = buf;
70 int ret;
71
72 ret = wait_event_interruptible(fb_state_wq,
73 fb_state != FB_STATE_DRAWING_OK);
74 if (ret && fb_state == FB_STATE_DRAWING_OK)
75 return ret;
76 else
77 s += sprintf(buf, "sleeping");
78 return s - buf;
79}
80
81static ssize_t wait_for_fb_wake_show(struct kobject *kobj,
82 struct kobj_attribute *attr, char *buf)
83{
84 char *s = buf;
85 int ret;
86 unsigned long irq_flags;
87
88 spin_lock_irqsave(&fb_state_lock, irq_flags);
89 if (fb_state == FB_STATE_REQUEST_STOP_DRAWING) {
90 fb_state = FB_STATE_STOPPED_DRAWING;
91 wake_up(&fb_state_wq);
92 }
93 spin_unlock_irqrestore(&fb_state_lock, irq_flags);
94
95 ret = wait_event_interruptible(fb_state_wq,
96 fb_state == FB_STATE_DRAWING_OK);
97 if (ret && fb_state != FB_STATE_DRAWING_OK)
98 return ret;
99 else
100 s += sprintf(buf, "awake");
101
102 return s - buf;
103}
104
105#define power_ro_attr(_name) \
106static struct kobj_attribute _name##_attr = { \
107 .attr = { \
108 .name = __stringify(_name), \
109 .mode = 0444, \
110 }, \
111 .show = _name##_show, \
112 .store = NULL, \
113}
114
115power_ro_attr(wait_for_fb_sleep);
116power_ro_attr(wait_for_fb_wake);
117
118static struct attribute *g[] = {
119 &wait_for_fb_sleep_attr.attr,
120 &wait_for_fb_wake_attr.attr,
121 NULL,
122};
123
124static struct attribute_group attr_group = {
125 .attrs = g,
126};
127
128static int __init android_power_init(void)
129{
130 int ret;
131
132 init_waitqueue_head(&fb_state_wq);
133 fb_state = FB_STATE_DRAWING_OK;
134
135 ret = sysfs_create_group(power_kobj, &attr_group);
136 if (ret) {
137 pr_err("android_power_init: sysfs_create_group failed\n");
138 return ret;
139 }
140
141 register_early_suspend(&stop_drawing_early_suspend_desc);
142 return 0;
143}
144
145static void __exit android_power_exit(void)
146{
147 unregister_early_suspend(&stop_drawing_early_suspend_desc);
148 sysfs_remove_group(power_kobj, &attr_group);
149}
150
151module_init(android_power_init);
152module_exit(android_power_exit);
153
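The two sysfs files implement a handshake with the display server: reading wait_for_fb_sleep blocks until early suspend begins, and the following read of wait_for_fb_wake acknowledges that drawing has stopped and then blocks until late resume. A user-space consumer might loop like this (sketch):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[16];

	for (;;) {
		/* blocks until the kernel asks user space to stop drawing */
		int fd = open("/sys/power/wait_for_fb_sleep", O_RDONLY);
		if (fd < 0 || read(fd, buf, sizeof(buf)) < 0)
			return 1;
		close(fd);
		/* ... stop drawing, release the framebuffer ... */

		/* acknowledges the stop, then blocks until resume */
		fd = open("/sys/power/wait_for_fb_wake", O_RDONLY);
		if (fd < 0 || read(fd, buf, sizeof(buf)) < 0)
			return 1;
		close(fd);
		/* ... resume drawing ... */
	}
}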
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 2981af4ce7c..3304594553c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier);
37 37
38int pm_notifier_call_chain(unsigned long val) 38int pm_notifier_call_chain(unsigned long val)
39{ 39{
40 return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) 40 int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
41 == NOTIFY_BAD) ? -EINVAL : 0; 41
42 return notifier_to_errno(ret);
42} 43}
43 44
44/* If set, devices may be suspended and resumed asynchronously. */ 45/* If set, devices may be suspended and resumed asynchronously. */
@@ -170,7 +171,11 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
170 const char *buf, size_t n) 171 const char *buf, size_t n)
171{ 172{
172#ifdef CONFIG_SUSPEND 173#ifdef CONFIG_SUSPEND
174#ifdef CONFIG_EARLYSUSPEND
175 suspend_state_t state = PM_SUSPEND_ON;
176#else
173 suspend_state_t state = PM_SUSPEND_STANDBY; 177 suspend_state_t state = PM_SUSPEND_STANDBY;
178#endif
174 const char * const *s; 179 const char * const *s;
175#endif 180#endif
176 char *p; 181 char *p;
@@ -192,8 +197,15 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
192 break; 197 break;
193 } 198 }
194 if (state < PM_SUSPEND_MAX && *s) 199 if (state < PM_SUSPEND_MAX && *s)
200#ifdef CONFIG_EARLYSUSPEND
201 if (state == PM_SUSPEND_ON || valid_state(state)) {
202 error = 0;
203 request_suspend_state(state);
204 }
205#else
195 error = enter_state(state); 206 error = enter_state(state);
196#endif 207#endif
208#endif
197 209
198 Exit: 210 Exit:
199 return error ? error : n; 211 return error ? error : n;
@@ -297,6 +309,11 @@ power_attr(pm_trace_dev_match);
297 309
298#endif /* CONFIG_PM_TRACE */ 310#endif /* CONFIG_PM_TRACE */
299 311
312#ifdef CONFIG_USER_WAKELOCK
313power_attr(wake_lock);
314power_attr(wake_unlock);
315#endif
316
300static struct attribute * g[] = { 317static struct attribute * g[] = {
301 &state_attr.attr, 318 &state_attr.attr,
302#ifdef CONFIG_PM_TRACE 319#ifdef CONFIG_PM_TRACE
@@ -309,6 +326,10 @@ static struct attribute * g[] = {
309#ifdef CONFIG_PM_DEBUG 326#ifdef CONFIG_PM_DEBUG
310 &pm_test_attr.attr, 327 &pm_test_attr.attr,
311#endif 328#endif
329#ifdef CONFIG_USER_WAKELOCK
330 &wake_lock_attr.attr,
331 &wake_unlock_attr.attr,
332#endif
312#endif 333#endif
313 NULL, 334 NULL,
314}; 335};
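Since pm_notifier_call_chain() now goes through notifier_to_errno(), a PM notifier can veto a transition with a specific error code instead of everything collapsing to -EINVAL. A hypothetical notifier:

#include <linux/suspend.h>
#include <linux/notifier.h>

extern bool my_device_busy(void);	/* hypothetical driver state check */

static int my_pm_callback(struct notifier_block *nb,
			  unsigned long event, void *unused)
{
	if (event == PM_SUSPEND_PREPARE && my_device_busy())
		return notifier_from_errno(-EBUSY);	/* surfaces as -EBUSY to the caller */
	return NOTIFY_OK;
}

static struct notifier_block my_pm_nb = {
	.notifier_call = my_pm_callback,
};

/* in driver init: register_pm_notifier(&my_pm_nb); */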
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9a00a0a2628..b6b9006480f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -245,3 +245,27 @@ static inline void suspend_thaw_processes(void)
245{ 245{
246} 246}
247#endif 247#endif
248
249#ifdef CONFIG_WAKELOCK
250/* kernel/power/wakelock.c */
251extern struct workqueue_struct *suspend_work_queue;
252extern struct wake_lock main_wake_lock;
253extern suspend_state_t requested_suspend_state;
254#endif
255
256#ifdef CONFIG_USER_WAKELOCK
257ssize_t wake_lock_show(struct kobject *kobj, struct kobj_attribute *attr,
258 char *buf);
259ssize_t wake_lock_store(struct kobject *kobj, struct kobj_attribute *attr,
260 const char *buf, size_t n);
261ssize_t wake_unlock_show(struct kobject *kobj, struct kobj_attribute *attr,
262 char *buf);
263ssize_t wake_unlock_store(struct kobject *kobj, struct kobj_attribute *attr,
264 const char *buf, size_t n);
265#endif
266
267#ifdef CONFIG_EARLYSUSPEND
268/* kernel/power/earlysuspend.c */
269void request_suspend_state(suspend_state_t state);
270suspend_state_t get_suspend_state(void);
271#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0cf3a27a6c9..31338cdeafc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -16,6 +16,7 @@
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h> 18#include <linux/workqueue.h>
19#include <linux/wakelock.h>
19 20
20/* 21/*
21 * Timeout for stopping processes 22 * Timeout for stopping processes
@@ -82,6 +83,10 @@ static int try_to_freeze_tasks(bool sig_only)
82 todo += wq_busy; 83 todo += wq_busy;
83 } 84 }
84 85
86 if (todo && has_wake_lock(WAKE_LOCK_SUSPEND)) {
87 wakeup = 1;
88 break;
89 }
85 if (!todo || time_after(jiffies, end_time)) 90 if (!todo || time_after(jiffies, end_time))
86 break; 91 break;
87 92
@@ -108,19 +113,25 @@ static int try_to_freeze_tasks(bool sig_only)
108 * and caller must call thaw_processes() if something fails), 113 * and caller must call thaw_processes() if something fails),
109 * but it cleans up leftover PF_FREEZE requests. 114 * but it cleans up leftover PF_FREEZE requests.
110 */ 115 */
 111 printk("\n"); 116 if (wakeup) {
112 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 117 printk("\n");
113 "(%d tasks refusing to freeze, wq_busy=%d):\n", 118 printk(KERN_ERR "Freezing of %s aborted\n",
114 wakeup ? "aborted" : "failed", 119 sig_only ? "user space " : "tasks ");
115 elapsed_csecs / 100, elapsed_csecs % 100, 120 }
116 todo - wq_busy, wq_busy); 121 else {
117 122 printk("\n");
123 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
124 "(%d tasks refusing to freeze, wq_busy=%d):\n",
125 elapsed_csecs / 100, elapsed_csecs % 100,
126 todo - wq_busy, wq_busy);
127 }
118 thaw_workqueues(); 128 thaw_workqueues();
119 129
120 read_lock(&tasklist_lock); 130 read_lock(&tasklist_lock);
121 do_each_thread(g, p) { 131 do_each_thread(g, p) {
122 task_lock(p); 132 task_lock(p);
123 if (!wakeup && freezing(p) && !freezer_should_skip(p)) 133 if (freezing(p) && !freezer_should_skip(p) &&
134 elapsed_csecs > 100)
124 sched_show_task(p); 135 sched_show_task(p);
125 cancel_freezing(p); 136 cancel_freezing(p);
126 task_unlock(p); 137 task_unlock(p);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1c41ba21541..a6f6e3114a2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -28,6 +28,9 @@
28#include "power.h" 28#include "power.h"
29 29
30const char *const pm_states[PM_SUSPEND_MAX] = { 30const char *const pm_states[PM_SUSPEND_MAX] = {
31#ifdef CONFIG_EARLYSUSPEND
32 [PM_SUSPEND_ON] = "on",
33#endif
31 [PM_SUSPEND_STANDBY] = "standby", 34 [PM_SUSPEND_STANDBY] = "standby",
32 [PM_SUSPEND_MEM] = "mem", 35 [PM_SUSPEND_MEM] = "mem",
33}; 36};
@@ -44,6 +47,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
44 suspend_ops = ops; 47 suspend_ops = ops;
45 mutex_unlock(&pm_mutex); 48 mutex_unlock(&pm_mutex);
46} 49}
50EXPORT_SYMBOL_GPL(suspend_set_ops);
47 51
48bool valid_state(suspend_state_t state) 52bool valid_state(suspend_state_t state)
49{ 53{
@@ -65,6 +69,7 @@ int suspend_valid_only_mem(suspend_state_t state)
65{ 69{
66 return state == PM_SUSPEND_MEM; 70 return state == PM_SUSPEND_MEM;
67} 71}
72EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
68 73
69static int suspend_test(int level) 74static int suspend_test(int level)
70{ 75{
@@ -126,12 +131,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
126} 131}
127 132
128/** 133/**
129 * suspend_enter - enter the desired system sleep state. 134 * suspend_enter - enter the desired system sleep state.
130 * @state: state to enter 135 * @state: State to enter
 136 * @wakeup: Set when a wakeup event is pending; suspend should not be entered again.
131 * 137 *
132 * This function should be called after devices have been suspended. 138 * This function should be called after devices have been suspended.
133 */ 139 */
134static int suspend_enter(suspend_state_t state) 140static int suspend_enter(suspend_state_t state, bool *wakeup)
135{ 141{
136 int error; 142 int error;
137 143
@@ -165,7 +171,8 @@ static int suspend_enter(suspend_state_t state)
165 171
166 error = syscore_suspend(); 172 error = syscore_suspend();
167 if (!error) { 173 if (!error) {
168 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 174 *wakeup = pm_wakeup_pending();
175 if (!(suspend_test(TEST_CORE) || *wakeup)) {
169 error = suspend_ops->enter(state); 176 error = suspend_ops->enter(state);
170 events_check_enabled = false; 177 events_check_enabled = false;
171 } 178 }
@@ -199,6 +206,7 @@ static int suspend_enter(suspend_state_t state)
199int suspend_devices_and_enter(suspend_state_t state) 206int suspend_devices_and_enter(suspend_state_t state)
200{ 207{
201 int error; 208 int error;
209 bool wakeup = false;
202 210
203 if (!suspend_ops) 211 if (!suspend_ops)
204 return -ENOSYS; 212 return -ENOSYS;
@@ -220,7 +228,10 @@ int suspend_devices_and_enter(suspend_state_t state)
220 if (suspend_test(TEST_DEVICES)) 228 if (suspend_test(TEST_DEVICES))
221 goto Recover_platform; 229 goto Recover_platform;
222 230
223 error = suspend_enter(state); 231 do {
232 error = suspend_enter(state, &wakeup);
233 } while (!error && !wakeup
234 && suspend_ops->suspend_again && suspend_ops->suspend_again());
224 235
225 Resume_devices: 236 Resume_devices:
226 suspend_test_start(); 237 suspend_test_start();
@@ -307,7 +318,7 @@ int enter_state(suspend_state_t state)
307 */ 318 */
308int pm_suspend(suspend_state_t state) 319int pm_suspend(suspend_state_t state)
309{ 320{
310 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) 321 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
311 return enter_state(state); 322 return enter_state(state);
312 return -EINVAL; 323 return -EINVAL;
313} 324}
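The new loop stays in suspend for as long as the platform's optional suspend_again() callback asks for it and no wakeup event is pending, e.g. to poll a charger without fully resuming. A hedged sketch of suspend_ops using it (the board helpers are hypothetical):

#include <linux/suspend.h>

extern int my_board_enter_sleep(void);		/* hypothetical */
extern bool my_board_charger_active(void);	/* hypothetical */

static int my_suspend_enter(suspend_state_t state)
{
	return my_board_enter_sleep();
}

static bool my_suspend_again(void)
{
	/* true: run suspend_enter() again without waking user space */
	return my_board_charger_active();
}

static const struct platform_suspend_ops my_suspend_ops = {
	.valid		= suspend_valid_only_mem,
	.enter		= my_suspend_enter,
	.suspend_again	= my_suspend_again,
};

/* board init: suspend_set_ops(&my_suspend_ops); */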
diff --git a/kernel/power/suspend_time.c b/kernel/power/suspend_time.c
new file mode 100644
index 00000000000..d2a65da9f22
--- /dev/null
+++ b/kernel/power/suspend_time.c
@@ -0,0 +1,111 @@
1/*
2 * debugfs file to track time spent in suspend
3 *
4 * Copyright (c) 2011, Google, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/debugfs.h>
18#include <linux/err.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/seq_file.h>
22#include <linux/syscore_ops.h>
23#include <linux/time.h>
24
25static struct timespec suspend_time_before;
26static unsigned int time_in_suspend_bins[32];
27
28#ifdef CONFIG_DEBUG_FS
29static int suspend_time_debug_show(struct seq_file *s, void *data)
30{
31 int bin;
32 seq_printf(s, "time (secs) count\n");
33 seq_printf(s, "------------------\n");
34 for (bin = 0; bin < 32; bin++) {
35 if (time_in_suspend_bins[bin] == 0)
36 continue;
37 seq_printf(s, "%4d - %4d %4u\n",
38 bin ? 1 << (bin - 1) : 0, 1 << bin,
39 time_in_suspend_bins[bin]);
40 }
41 return 0;
42}
43
44static int suspend_time_debug_open(struct inode *inode, struct file *file)
45{
46 return single_open(file, suspend_time_debug_show, NULL);
47}
48
49static const struct file_operations suspend_time_debug_fops = {
50 .open = suspend_time_debug_open,
51 .read = seq_read,
52 .llseek = seq_lseek,
53 .release = single_release,
54};
55
56static int __init suspend_time_debug_init(void)
57{
58 struct dentry *d;
59
60 d = debugfs_create_file("suspend_time", 0755, NULL, NULL,
61 &suspend_time_debug_fops);
62 if (!d) {
63 pr_err("Failed to create suspend_time debug file\n");
64 return -ENOMEM;
65 }
66
67 return 0;
68}
69
70late_initcall(suspend_time_debug_init);
71#endif
72
73static int suspend_time_syscore_suspend(void)
74{
75 read_persistent_clock(&suspend_time_before);
76
77 return 0;
78}
79
80static void suspend_time_syscore_resume(void)
81{
82 struct timespec after;
83
84 read_persistent_clock(&after);
85
86 after = timespec_sub(after, suspend_time_before);
87
88 time_in_suspend_bins[fls(after.tv_sec)]++;
89
90 pr_info("Suspended for %lu.%03lu seconds\n", after.tv_sec,
91 after.tv_nsec / NSEC_PER_MSEC);
92}
93
94static struct syscore_ops suspend_time_syscore_ops = {
95 .suspend = suspend_time_syscore_suspend,
96 .resume = suspend_time_syscore_resume,
97};
98
99static int suspend_time_syscore_init(void)
100{
101 register_syscore_ops(&suspend_time_syscore_ops);
102
103 return 0;
104}
105
106static void suspend_time_syscore_exit(void)
107{
108 unregister_syscore_ops(&suspend_time_syscore_ops);
109}
110module_init(suspend_time_syscore_init);
111module_exit(suspend_time_syscore_exit);
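
suspend_time.c buckets each suspend duration into power-of-two bins via fls(after.tv_sec), so bin N covers [2^(N-1), 2^N) seconds and bin 0 holds sub-second suspends. A small user-space check of that mapping, mirroring the kernel's fls() with a compiler builtin (illustration only, not kernel code):

#include <stdio.h>

static int fls_local(unsigned int x)    /* mirrors the kernel's fls() */
{
        return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
        unsigned int secs[] = { 0, 1, 2, 3, 7, 8, 100, 4096 };
        unsigned int i;

        for (i = 0; i < sizeof(secs) / sizeof(secs[0]); i++) {
                int bin = fls_local(secs[i]);

                printf("%5u s -> bin %2d (%d - %d)\n", secs[i], bin,
                       bin ? 1 << (bin - 1) : 0, 1 << bin);
        }
        return 0;
}
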
diff --git a/kernel/power/userwakelock.c b/kernel/power/userwakelock.c
new file mode 100644
index 00000000000..a28a8db4146
--- /dev/null
+++ b/kernel/power/userwakelock.c
@@ -0,0 +1,219 @@
1/* kernel/power/userwakelock.c
2 *
3 * Copyright (C) 2005-2008 Google, Inc.
4 *
5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 */
15
16#include <linux/ctype.h>
17#include <linux/module.h>
18#include <linux/wakelock.h>
19#include <linux/slab.h>
20
21#include "power.h"
22
23enum {
24 DEBUG_FAILURE = BIT(0),
25 DEBUG_ERROR = BIT(1),
26 DEBUG_NEW = BIT(2),
27 DEBUG_ACCESS = BIT(3),
28 DEBUG_LOOKUP = BIT(4),
29};
30static int debug_mask = DEBUG_FAILURE;
31module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
32
33static DEFINE_MUTEX(tree_lock);
34
35struct user_wake_lock {
36 struct rb_node node;
37 struct wake_lock wake_lock;
38 char name[0];
39};
40struct rb_root user_wake_locks;
41
42static struct user_wake_lock *lookup_wake_lock_name(
43 const char *buf, int allocate, long *timeoutptr)
44{
45 struct rb_node **p = &user_wake_locks.rb_node;
46 struct rb_node *parent = NULL;
47 struct user_wake_lock *l;
48 int diff;
49 u64 timeout;
50 int name_len;
51 const char *arg;
52
53 /* Find length of lock name and start of optional timeout string */
54 arg = buf;
55 while (*arg && !isspace(*arg))
56 arg++;
57 name_len = arg - buf;
58 if (!name_len)
59 goto bad_arg;
60 while (isspace(*arg))
61 arg++;
62
63 /* Process timeout string */
64 if (timeoutptr && *arg) {
65 timeout = simple_strtoull(arg, (char **)&arg, 0);
66 while (isspace(*arg))
67 arg++;
68 if (*arg)
69 goto bad_arg;
70 /* convert timeout from nanoseconds to jiffies > 0 */
71 timeout += (NSEC_PER_SEC / HZ) - 1;
72 do_div(timeout, (NSEC_PER_SEC / HZ));
73 if (timeout <= 0)
74 timeout = 1;
75 *timeoutptr = timeout;
76 } else if (*arg)
77 goto bad_arg;
78 else if (timeoutptr)
79 *timeoutptr = 0;
80
81 /* Lookup wake lock in rbtree */
82 while (*p) {
83 parent = *p;
84 l = rb_entry(parent, struct user_wake_lock, node);
85 diff = strncmp(buf, l->name, name_len);
86 if (!diff && l->name[name_len])
87 diff = -1;
88 if (debug_mask & DEBUG_ERROR)
89 pr_info("lookup_wake_lock_name: compare %.*s %s %d\n",
90 name_len, buf, l->name, diff);
91
92 if (diff < 0)
93 p = &(*p)->rb_left;
94 else if (diff > 0)
95 p = &(*p)->rb_right;
96 else
97 return l;
98 }
99
100 /* Allocate and add new wakelock to rbtree */
101 if (!allocate) {
102 if (debug_mask & DEBUG_ERROR)
103 pr_info("lookup_wake_lock_name: %.*s not found\n",
104 name_len, buf);
105 return ERR_PTR(-EINVAL);
106 }
107 l = kzalloc(sizeof(*l) + name_len + 1, GFP_KERNEL);
108 if (l == NULL) {
109 if (debug_mask & DEBUG_FAILURE)
110 pr_err("lookup_wake_lock_name: failed to allocate "
111 "memory for %.*s\n", name_len, buf);
112 return ERR_PTR(-ENOMEM);
113 }
114 memcpy(l->name, buf, name_len);
115 if (debug_mask & DEBUG_NEW)
116 pr_info("lookup_wake_lock_name: new wake lock %s\n", l->name);
117 wake_lock_init(&l->wake_lock, WAKE_LOCK_SUSPEND, l->name);
118 rb_link_node(&l->node, parent, p);
119 rb_insert_color(&l->node, &user_wake_locks);
120 return l;
121
122bad_arg:
123 if (debug_mask & DEBUG_ERROR)
124 pr_info("lookup_wake_lock_name: wake lock, %.*s, bad arg, %s\n",
125 name_len, buf, arg);
126 return ERR_PTR(-EINVAL);
127}
128
129ssize_t wake_lock_show(
130 struct kobject *kobj, struct kobj_attribute *attr, char *buf)
131{
132 char *s = buf;
133 char *end = buf + PAGE_SIZE;
134 struct rb_node *n;
135 struct user_wake_lock *l;
136
137 mutex_lock(&tree_lock);
138
139 for (n = rb_first(&user_wake_locks); n != NULL; n = rb_next(n)) {
140 l = rb_entry(n, struct user_wake_lock, node);
141 if (wake_lock_active(&l->wake_lock))
142 s += scnprintf(s, end - s, "%s ", l->name);
143 }
144 s += scnprintf(s, end - s, "\n");
145
146 mutex_unlock(&tree_lock);
147 return (s - buf);
148}
149
150ssize_t wake_lock_store(
151 struct kobject *kobj, struct kobj_attribute *attr,
152 const char *buf, size_t n)
153{
154 long timeout;
155 struct user_wake_lock *l;
156
157 mutex_lock(&tree_lock);
158 l = lookup_wake_lock_name(buf, 1, &timeout);
159 if (IS_ERR(l)) {
160 n = PTR_ERR(l);
161 goto bad_name;
162 }
163
164 if (debug_mask & DEBUG_ACCESS)
165 pr_info("wake_lock_store: %s, timeout %ld\n", l->name, timeout);
166
167 if (timeout)
168 wake_lock_timeout(&l->wake_lock, timeout);
169 else
170 wake_lock(&l->wake_lock);
171bad_name:
172 mutex_unlock(&tree_lock);
173 return n;
174}
175
176
177ssize_t wake_unlock_show(
178 struct kobject *kobj, struct kobj_attribute *attr, char *buf)
179{
180 char *s = buf;
181 char *end = buf + PAGE_SIZE;
182 struct rb_node *n;
183 struct user_wake_lock *l;
184
185 mutex_lock(&tree_lock);
186
187 for (n = rb_first(&user_wake_locks); n != NULL; n = rb_next(n)) {
188 l = rb_entry(n, struct user_wake_lock, node);
189 if (!wake_lock_active(&l->wake_lock))
190 s += scnprintf(s, end - s, "%s ", l->name);
191 }
192 s += scnprintf(s, end - s, "\n");
193
194 mutex_unlock(&tree_lock);
195 return (s - buf);
196}
197
198ssize_t wake_unlock_store(
199 struct kobject *kobj, struct kobj_attribute *attr,
200 const char *buf, size_t n)
201{
202 struct user_wake_lock *l;
203
204 mutex_lock(&tree_lock);
205 l = lookup_wake_lock_name(buf, 0, NULL);
206 if (IS_ERR(l)) {
207 n = PTR_ERR(l);
208 goto not_found;
209 }
210
211 if (debug_mask & DEBUG_ACCESS)
212 pr_info("wake_unlock_store: %s\n", l->name);
213
214 wake_unlock(&l->wake_lock);
215not_found:
216 mutex_unlock(&tree_lock);
217 return n;
218}
219
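
wake_lock_store() parses a lock name plus an optional timeout in nanoseconds, and wake_unlock_store() releases by name. Assuming the /sys/power/wake_lock and /sys/power/wake_unlock attributes that the rest of this patch set binds these handlers to, a user-space client could look like the following sketch (the paths and the 5 s timeout are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static int write_str(const char *path, const char *s)
{
        int fd = open(path, O_WRONLY);
        ssize_t n;

        if (fd < 0)
                return -1;
        n = write(fd, s, strlen(s));
        close(fd);
        return n < 0 ? -1 : 0;
}

int main(void)
{
        /* Hold "myapp" for 5 seconds (5e9 ns, converted to jiffies in-kernel). */
        if (write_str("/sys/power/wake_lock", "myapp 5000000000"))
                perror("wake_lock");

        /* ... latency-sensitive work ... */

        /* Drop the lock early instead of waiting for the timeout. */
        if (write_str("/sys/power/wake_unlock", "myapp"))
                perror("wake_unlock");
        return 0;
}
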
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
new file mode 100644
index 00000000000..81e1b7c65ca
--- /dev/null
+++ b/kernel/power/wakelock.c
@@ -0,0 +1,634 @@
1/* kernel/power/wakelock.c
2 *
3 * Copyright (C) 2005-2008 Google, Inc.
4 *
5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 */
15
16#include <linux/module.h>
17#include <linux/platform_device.h>
18#include <linux/rtc.h>
19#include <linux/suspend.h>
20#include <linux/syscalls.h> /* sys_sync */
21#include <linux/wakelock.h>
22#ifdef CONFIG_WAKELOCK_STAT
23#include <linux/proc_fs.h>
24#endif
25#include "power.h"
26
27enum {
28 DEBUG_EXIT_SUSPEND = 1U << 0,
29 DEBUG_WAKEUP = 1U << 1,
30 DEBUG_SUSPEND = 1U << 2,
31 DEBUG_EXPIRE = 1U << 3,
32 DEBUG_WAKE_LOCK = 1U << 4,
33};
34static int debug_mask = DEBUG_EXIT_SUSPEND | DEBUG_WAKEUP;
35module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
36
37#define WAKE_LOCK_TYPE_MASK (0x0f)
38#define WAKE_LOCK_INITIALIZED (1U << 8)
39#define WAKE_LOCK_ACTIVE (1U << 9)
40#define WAKE_LOCK_AUTO_EXPIRE (1U << 10)
41#define WAKE_LOCK_PREVENTING_SUSPEND (1U << 11)
42
43static DEFINE_SPINLOCK(list_lock);
44static LIST_HEAD(inactive_locks);
45static struct list_head active_wake_locks[WAKE_LOCK_TYPE_COUNT];
46static int current_event_num;
47struct workqueue_struct *suspend_work_queue;
48struct wake_lock main_wake_lock;
49suspend_state_t requested_suspend_state = PM_SUSPEND_MEM;
50static struct wake_lock unknown_wakeup;
51static struct wake_lock suspend_backoff_lock;
52
53#define SUSPEND_BACKOFF_THRESHOLD 10
54#define SUSPEND_BACKOFF_INTERVAL 10000
55
56static unsigned suspend_short_count;
57
58#ifdef CONFIG_WAKELOCK_STAT
59static struct wake_lock deleted_wake_locks;
60static ktime_t last_sleep_time_update;
61static int wait_for_wakeup;
62
63int get_expired_time(struct wake_lock *lock, ktime_t *expire_time)
64{
65 struct timespec ts;
66 struct timespec kt;
67 struct timespec tomono;
68 struct timespec delta;
69 struct timespec sleep;
70 long timeout;
71
72 if (!(lock->flags & WAKE_LOCK_AUTO_EXPIRE))
73 return 0;
74 get_xtime_and_monotonic_and_sleep_offset(&kt, &tomono, &sleep);
75 timeout = lock->expires - jiffies;
76 if (timeout > 0)
77 return 0;
78 jiffies_to_timespec(-timeout, &delta);
79 set_normalized_timespec(&ts, kt.tv_sec + tomono.tv_sec - delta.tv_sec,
80 kt.tv_nsec + tomono.tv_nsec - delta.tv_nsec);
81 *expire_time = timespec_to_ktime(ts);
82 return 1;
83}
84
85
86static int print_lock_stat(struct seq_file *m, struct wake_lock *lock)
87{
88 int lock_count = lock->stat.count;
89 int expire_count = lock->stat.expire_count;
90 ktime_t active_time = ktime_set(0, 0);
91 ktime_t total_time = lock->stat.total_time;
92 ktime_t max_time = lock->stat.max_time;
93
94 ktime_t prevent_suspend_time = lock->stat.prevent_suspend_time;
95 if (lock->flags & WAKE_LOCK_ACTIVE) {
96 ktime_t now, add_time;
97 int expired = get_expired_time(lock, &now);
98 if (!expired)
99 now = ktime_get();
100 add_time = ktime_sub(now, lock->stat.last_time);
101 lock_count++;
102 if (!expired)
103 active_time = add_time;
104 else
105 expire_count++;
106 total_time = ktime_add(total_time, add_time);
107 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND)
108 prevent_suspend_time = ktime_add(prevent_suspend_time,
109 ktime_sub(now, last_sleep_time_update));
110 if (add_time.tv64 > max_time.tv64)
111 max_time = add_time;
112 }
113
114 return seq_printf(m,
115 "\"%s\"\t%d\t%d\t%d\t%lld\t%lld\t%lld\t%lld\t%lld\n",
116 lock->name, lock_count, expire_count,
117 lock->stat.wakeup_count, ktime_to_ns(active_time),
118 ktime_to_ns(total_time),
119 ktime_to_ns(prevent_suspend_time), ktime_to_ns(max_time),
120 ktime_to_ns(lock->stat.last_time));
121}
122
123static int wakelock_stats_show(struct seq_file *m, void *unused)
124{
125 unsigned long irqflags;
126 struct wake_lock *lock;
127 int ret;
128 int type;
129
130 spin_lock_irqsave(&list_lock, irqflags);
131
132 ret = seq_puts(m, "name\tcount\texpire_count\twake_count\tactive_since"
133 "\ttotal_time\tsleep_time\tmax_time\tlast_change\n");
134 list_for_each_entry(lock, &inactive_locks, link)
135 ret = print_lock_stat(m, lock);
136 for (type = 0; type < WAKE_LOCK_TYPE_COUNT; type++) {
137 list_for_each_entry(lock, &active_wake_locks[type], link)
138 ret = print_lock_stat(m, lock);
139 }
140 spin_unlock_irqrestore(&list_lock, irqflags);
141 return 0;
142}
143
144static void wake_unlock_stat_locked(struct wake_lock *lock, int expired)
145{
146 ktime_t duration;
147 ktime_t now;
148 if (!(lock->flags & WAKE_LOCK_ACTIVE))
149 return;
150 if (get_expired_time(lock, &now))
151 expired = 1;
152 else
153 now = ktime_get();
154 lock->stat.count++;
155 if (expired)
156 lock->stat.expire_count++;
157 duration = ktime_sub(now, lock->stat.last_time);
158 lock->stat.total_time = ktime_add(lock->stat.total_time, duration);
159 if (ktime_to_ns(duration) > ktime_to_ns(lock->stat.max_time))
160 lock->stat.max_time = duration;
161 lock->stat.last_time = ktime_get();
162 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) {
163 duration = ktime_sub(now, last_sleep_time_update);
164 lock->stat.prevent_suspend_time = ktime_add(
165 lock->stat.prevent_suspend_time, duration);
166 lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND;
167 }
168}
169
170static void update_sleep_wait_stats_locked(int done)
171{
172 struct wake_lock *lock;
173 ktime_t now, etime, elapsed, add;
174 int expired;
175
176 now = ktime_get();
177 elapsed = ktime_sub(now, last_sleep_time_update);
178 list_for_each_entry(lock, &active_wake_locks[WAKE_LOCK_SUSPEND], link) {
179 expired = get_expired_time(lock, &etime);
180 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) {
181 if (expired)
182 add = ktime_sub(etime, last_sleep_time_update);
183 else
184 add = elapsed;
185 lock->stat.prevent_suspend_time = ktime_add(
186 lock->stat.prevent_suspend_time, add);
187 }
188 if (done || expired)
189 lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND;
190 else
191 lock->flags |= WAKE_LOCK_PREVENTING_SUSPEND;
192 }
193 last_sleep_time_update = now;
194}
195#endif
196
197
198static void expire_wake_lock(struct wake_lock *lock)
199{
200#ifdef CONFIG_WAKELOCK_STAT
201 wake_unlock_stat_locked(lock, 1);
202#endif
203 lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE);
204 list_del(&lock->link);
205 list_add(&lock->link, &inactive_locks);
206 if (debug_mask & (DEBUG_WAKE_LOCK | DEBUG_EXPIRE))
207 pr_info("expired wake lock %s\n", lock->name);
208}
209
210/* Caller must acquire the list_lock spinlock */
211static void print_active_locks(int type)
212{
213 struct wake_lock *lock;
214 bool print_expired = true;
215
216 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
217 list_for_each_entry(lock, &active_wake_locks[type], link) {
218 if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) {
219 long timeout = lock->expires - jiffies;
220 if (timeout > 0)
221 pr_info("active wake lock %s, time left %ld\n",
222 lock->name, timeout);
223 else if (print_expired)
224 pr_info("wake lock %s, expired\n", lock->name);
225 } else {
226 pr_info("active wake lock %s\n", lock->name);
227 if (!(debug_mask & DEBUG_EXPIRE))
228 print_expired = false;
229 }
230 }
231}
232
233static long has_wake_lock_locked(int type)
234{
235 struct wake_lock *lock, *n;
236 long max_timeout = 0;
237
238 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
239 list_for_each_entry_safe(lock, n, &active_wake_locks[type], link) {
240 if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) {
241 long timeout = lock->expires - jiffies;
242 if (timeout <= 0)
243 expire_wake_lock(lock);
244 else if (timeout > max_timeout)
245 max_timeout = timeout;
246 } else
247 return -1;
248 }
249 return max_timeout;
250}
251
252long has_wake_lock(int type)
253{
254 long ret;
255 unsigned long irqflags;
256 spin_lock_irqsave(&list_lock, irqflags);
257 ret = has_wake_lock_locked(type);
258 if (ret && (debug_mask & DEBUG_WAKEUP) && type == WAKE_LOCK_SUSPEND)
259 print_active_locks(type);
260 spin_unlock_irqrestore(&list_lock, irqflags);
261 return ret;
262}
263
264static void suspend_backoff(void)
265{
266 pr_info("suspend: too many immediate wakeups, back off\n");
267 wake_lock_timeout(&suspend_backoff_lock,
268 msecs_to_jiffies(SUSPEND_BACKOFF_INTERVAL));
269}
270
271static void suspend(struct work_struct *work)
272{
273 int ret;
274 int entry_event_num;
275 struct timespec ts_entry, ts_exit;
276
277 if (has_wake_lock(WAKE_LOCK_SUSPEND)) {
278 if (debug_mask & DEBUG_SUSPEND)
279 pr_info("suspend: abort suspend\n");
280 return;
281 }
282
283 entry_event_num = current_event_num;
284 sys_sync();
285 if (debug_mask & DEBUG_SUSPEND)
286 pr_info("suspend: enter suspend\n");
287 getnstimeofday(&ts_entry);
288 ret = pm_suspend(requested_suspend_state);
289 getnstimeofday(&ts_exit);
290
291 if (debug_mask & DEBUG_EXIT_SUSPEND) {
292 struct rtc_time tm;
293 rtc_time_to_tm(ts_exit.tv_sec, &tm);
294 pr_info("suspend: exit suspend, ret = %d "
295 "(%d-%02d-%02d %02d:%02d:%02d.%09lu UTC)\n", ret,
296 tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
297 tm.tm_hour, tm.tm_min, tm.tm_sec, ts_exit.tv_nsec);
298 }
299
300 if (ts_exit.tv_sec - ts_entry.tv_sec <= 1) {
301 ++suspend_short_count;
302
303 if (suspend_short_count == SUSPEND_BACKOFF_THRESHOLD) {
304 suspend_backoff();
305 suspend_short_count = 0;
306 }
307 } else {
308 suspend_short_count = 0;
309 }
310
311 if (current_event_num == entry_event_num) {
312 if (debug_mask & DEBUG_SUSPEND)
313 pr_info("suspend: pm_suspend returned with no event\n");
314 wake_lock_timeout(&unknown_wakeup, HZ / 2);
315 }
316}
317static DECLARE_WORK(suspend_work, suspend);
318
319static void expire_wake_locks(unsigned long data)
320{
321 long has_lock;
322 unsigned long irqflags;
323 if (debug_mask & DEBUG_EXPIRE)
324 pr_info("expire_wake_locks: start\n");
325 spin_lock_irqsave(&list_lock, irqflags);
326 if (debug_mask & DEBUG_SUSPEND)
327 print_active_locks(WAKE_LOCK_SUSPEND);
328 has_lock = has_wake_lock_locked(WAKE_LOCK_SUSPEND);
329 if (debug_mask & DEBUG_EXPIRE)
330 pr_info("expire_wake_locks: done, has_lock %ld\n", has_lock);
331 if (has_lock == 0)
332 queue_work(suspend_work_queue, &suspend_work);
333 spin_unlock_irqrestore(&list_lock, irqflags);
334}
335static DEFINE_TIMER(expire_timer, expire_wake_locks, 0, 0);
336
337static int power_suspend_late(struct device *dev)
338{
339 int ret = has_wake_lock(WAKE_LOCK_SUSPEND) ? -EAGAIN : 0;
340#ifdef CONFIG_WAKELOCK_STAT
341 wait_for_wakeup = !ret;
342#endif
343 if (debug_mask & DEBUG_SUSPEND)
344 pr_info("power_suspend_late return %d\n", ret);
345 return ret;
346}
347
348static struct dev_pm_ops power_driver_pm_ops = {
349 .suspend_noirq = power_suspend_late,
350};
351
352static struct platform_driver power_driver = {
353 .driver.name = "power",
354 .driver.pm = &power_driver_pm_ops,
355};
356static struct platform_device power_device = {
357 .name = "power",
358};
359
360void wake_lock_init(struct wake_lock *lock, int type, const char *name)
361{
362 unsigned long irqflags = 0;
363
364 if (name)
365 lock->name = name;
366 BUG_ON(!lock->name);
367
368 if (debug_mask & DEBUG_WAKE_LOCK)
369 pr_info("wake_lock_init name=%s\n", lock->name);
370#ifdef CONFIG_WAKELOCK_STAT
371 lock->stat.count = 0;
372 lock->stat.expire_count = 0;
373 lock->stat.wakeup_count = 0;
374 lock->stat.total_time = ktime_set(0, 0);
375 lock->stat.prevent_suspend_time = ktime_set(0, 0);
376 lock->stat.max_time = ktime_set(0, 0);
377 lock->stat.last_time = ktime_set(0, 0);
378#endif
379 lock->flags = (type & WAKE_LOCK_TYPE_MASK) | WAKE_LOCK_INITIALIZED;
380
381 INIT_LIST_HEAD(&lock->link);
382 spin_lock_irqsave(&list_lock, irqflags);
383 list_add(&lock->link, &inactive_locks);
384 spin_unlock_irqrestore(&list_lock, irqflags);
385}
386EXPORT_SYMBOL(wake_lock_init);
387
388void wake_lock_destroy(struct wake_lock *lock)
389{
390 unsigned long irqflags;
391 if (debug_mask & DEBUG_WAKE_LOCK)
392 pr_info("wake_lock_destroy name=%s\n", lock->name);
393 spin_lock_irqsave(&list_lock, irqflags);
394 lock->flags &= ~WAKE_LOCK_INITIALIZED;
395#ifdef CONFIG_WAKELOCK_STAT
396 if (lock->stat.count) {
397 deleted_wake_locks.stat.count += lock->stat.count;
398 deleted_wake_locks.stat.expire_count += lock->stat.expire_count;
399 deleted_wake_locks.stat.total_time =
400 ktime_add(deleted_wake_locks.stat.total_time,
401 lock->stat.total_time);
402 deleted_wake_locks.stat.prevent_suspend_time =
403 ktime_add(deleted_wake_locks.stat.prevent_suspend_time,
404 lock->stat.prevent_suspend_time);
405 deleted_wake_locks.stat.max_time =
406 ktime_add(deleted_wake_locks.stat.max_time,
407 lock->stat.max_time);
408 }
409#endif
410 list_del(&lock->link);
411 spin_unlock_irqrestore(&list_lock, irqflags);
412}
413EXPORT_SYMBOL(wake_lock_destroy);
414
415static void wake_lock_internal(
416 struct wake_lock *lock, long timeout, int has_timeout)
417{
418 int type;
419 unsigned long irqflags;
420 long expire_in;
421
422 spin_lock_irqsave(&list_lock, irqflags);
423 type = lock->flags & WAKE_LOCK_TYPE_MASK;
424 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
425 BUG_ON(!(lock->flags & WAKE_LOCK_INITIALIZED));
426#ifdef CONFIG_WAKELOCK_STAT
427 if (type == WAKE_LOCK_SUSPEND && wait_for_wakeup) {
428 if (debug_mask & DEBUG_WAKEUP)
429 pr_info("wakeup wake lock: %s\n", lock->name);
430 wait_for_wakeup = 0;
431 lock->stat.wakeup_count++;
432 }
433 if ((lock->flags & WAKE_LOCK_AUTO_EXPIRE) &&
434 (long)(lock->expires - jiffies) <= 0) {
435 wake_unlock_stat_locked(lock, 0);
436 lock->stat.last_time = ktime_get();
437 }
438#endif
439 if (!(lock->flags & WAKE_LOCK_ACTIVE)) {
440 lock->flags |= WAKE_LOCK_ACTIVE;
441#ifdef CONFIG_WAKELOCK_STAT
442 lock->stat.last_time = ktime_get();
443#endif
444 }
445 list_del(&lock->link);
446 if (has_timeout) {
447 if (debug_mask & DEBUG_WAKE_LOCK)
448 pr_info("wake_lock: %s, type %d, timeout %ld.%03lu\n",
449 lock->name, type, timeout / HZ,
450 (timeout % HZ) * MSEC_PER_SEC / HZ);
451 lock->expires = jiffies + timeout;
452 lock->flags |= WAKE_LOCK_AUTO_EXPIRE;
453 list_add_tail(&lock->link, &active_wake_locks[type]);
454 } else {
455 if (debug_mask & DEBUG_WAKE_LOCK)
456 pr_info("wake_lock: %s, type %d\n", lock->name, type);
457 lock->expires = LONG_MAX;
458 lock->flags &= ~WAKE_LOCK_AUTO_EXPIRE;
459 list_add(&lock->link, &active_wake_locks[type]);
460 }
461 if (type == WAKE_LOCK_SUSPEND) {
462 current_event_num++;
463#ifdef CONFIG_WAKELOCK_STAT
464 if (lock == &main_wake_lock)
465 update_sleep_wait_stats_locked(1);
466 else if (!wake_lock_active(&main_wake_lock))
467 update_sleep_wait_stats_locked(0);
468#endif
469 if (has_timeout)
470 expire_in = has_wake_lock_locked(type);
471 else
472 expire_in = -1;
473 if (expire_in > 0) {
474 if (debug_mask & DEBUG_EXPIRE)
475 pr_info("wake_lock: %s, start expire timer, "
476 "%ld\n", lock->name, expire_in);
477 mod_timer(&expire_timer, jiffies + expire_in);
478 } else {
479 if (del_timer(&expire_timer))
480 if (debug_mask & DEBUG_EXPIRE)
481 pr_info("wake_lock: %s, stop expire timer\n",
482 lock->name);
483 if (expire_in == 0)
484 queue_work(suspend_work_queue, &suspend_work);
485 }
486 }
487 spin_unlock_irqrestore(&list_lock, irqflags);
488}
489
490void wake_lock(struct wake_lock *lock)
491{
492 wake_lock_internal(lock, 0, 0);
493}
494EXPORT_SYMBOL(wake_lock);
495
496void wake_lock_timeout(struct wake_lock *lock, long timeout)
497{
498 wake_lock_internal(lock, timeout, 1);
499}
500EXPORT_SYMBOL(wake_lock_timeout);
501
502void wake_unlock(struct wake_lock *lock)
503{
504 int type;
505 unsigned long irqflags;
506 spin_lock_irqsave(&list_lock, irqflags);
507 type = lock->flags & WAKE_LOCK_TYPE_MASK;
508#ifdef CONFIG_WAKELOCK_STAT
509 wake_unlock_stat_locked(lock, 0);
510#endif
511 if (debug_mask & DEBUG_WAKE_LOCK)
512 pr_info("wake_unlock: %s\n", lock->name);
513 lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE);
514 list_del(&lock->link);
515 list_add(&lock->link, &inactive_locks);
516 if (type == WAKE_LOCK_SUSPEND) {
517 long has_lock = has_wake_lock_locked(type);
518 if (has_lock > 0) {
519 if (debug_mask & DEBUG_EXPIRE)
520 pr_info("wake_unlock: %s, start expire timer, "
521 "%ld\n", lock->name, has_lock);
522 mod_timer(&expire_timer, jiffies + has_lock);
523 } else {
524 if (del_timer(&expire_timer))
525 if (debug_mask & DEBUG_EXPIRE)
526 pr_info("wake_unlock: %s, stop expire "
527 "timer\n", lock->name);
528 if (has_lock == 0)
529 queue_work(suspend_work_queue, &suspend_work);
530 }
531 if (lock == &main_wake_lock) {
532 if (debug_mask & DEBUG_SUSPEND)
533 print_active_locks(WAKE_LOCK_SUSPEND);
534#ifdef CONFIG_WAKELOCK_STAT
535 update_sleep_wait_stats_locked(0);
536#endif
537 }
538 }
539 spin_unlock_irqrestore(&list_lock, irqflags);
540}
541EXPORT_SYMBOL(wake_unlock);
542
543int wake_lock_active(struct wake_lock *lock)
544{
545 return !!(lock->flags & WAKE_LOCK_ACTIVE);
546}
547EXPORT_SYMBOL(wake_lock_active);
548
549static int wakelock_stats_open(struct inode *inode, struct file *file)
550{
551 return single_open(file, wakelock_stats_show, NULL);
552}
553
554static const struct file_operations wakelock_stats_fops = {
555 .owner = THIS_MODULE,
556 .open = wakelock_stats_open,
557 .read = seq_read,
558 .llseek = seq_lseek,
559 .release = single_release,
560};
561
562static int __init wakelocks_init(void)
563{
564 int ret;
565 int i;
566
567 for (i = 0; i < ARRAY_SIZE(active_wake_locks); i++)
568 INIT_LIST_HEAD(&active_wake_locks[i]);
569
570#ifdef CONFIG_WAKELOCK_STAT
571 wake_lock_init(&deleted_wake_locks, WAKE_LOCK_SUSPEND,
572 "deleted_wake_locks");
573#endif
574 wake_lock_init(&main_wake_lock, WAKE_LOCK_SUSPEND, "main");
575 wake_lock(&main_wake_lock);
576 wake_lock_init(&unknown_wakeup, WAKE_LOCK_SUSPEND, "unknown_wakeups");
577 wake_lock_init(&suspend_backoff_lock, WAKE_LOCK_SUSPEND,
578 "suspend_backoff");
579
580 ret = platform_device_register(&power_device);
581 if (ret) {
582 pr_err("wakelocks_init: platform_device_register failed\n");
583 goto err_platform_device_register;
584 }
585 ret = platform_driver_register(&power_driver);
586 if (ret) {
587 pr_err("wakelocks_init: platform_driver_register failed\n");
588 goto err_platform_driver_register;
589 }
590
591 suspend_work_queue = create_singlethread_workqueue("suspend");
592 if (suspend_work_queue == NULL) {
593 ret = -ENOMEM;
594 goto err_suspend_work_queue;
595 }
596
597#ifdef CONFIG_WAKELOCK_STAT
598 proc_create("wakelocks", S_IRUGO, NULL, &wakelock_stats_fops);
599#endif
600
601 return 0;
602
603err_suspend_work_queue:
604 platform_driver_unregister(&power_driver);
605err_platform_driver_register:
606 platform_device_unregister(&power_device);
607err_platform_device_register:
608 wake_lock_destroy(&suspend_backoff_lock);
609 wake_lock_destroy(&unknown_wakeup);
610 wake_lock_destroy(&main_wake_lock);
611#ifdef CONFIG_WAKELOCK_STAT
612 wake_lock_destroy(&deleted_wake_locks);
613#endif
614 return ret;
615}
616
617static void __exit wakelocks_exit(void)
618{
619#ifdef CONFIG_WAKELOCK_STAT
620 remove_proc_entry("wakelocks", NULL);
621#endif
622 destroy_workqueue(suspend_work_queue);
623 platform_driver_unregister(&power_driver);
624 platform_device_unregister(&power_device);
625 wake_lock_destroy(&suspend_backoff_lock);
626 wake_lock_destroy(&unknown_wakeup);
627 wake_lock_destroy(&main_wake_lock);
628#ifdef CONFIG_WAKELOCK_STAT
629 wake_lock_destroy(&deleted_wake_locks);
630#endif
631}
632
633core_initcall(wakelocks_init);
634module_exit(wakelocks_exit);
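
For in-kernel users, the API added above is wake_lock_init()/wake_lock()/wake_lock_timeout()/wake_unlock(). A hypothetical driver that holds the system awake for about two seconds after a wakeup interrupt so user space can consume the event; DEMO_IRQ and the handler are placeholders, not taken from this patch:

#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/wakelock.h>

#define DEMO_IRQ 42     /* placeholder wakeup interrupt number */

static struct wake_lock demo_wake_lock;

static irqreturn_t demo_irq_handler(int irq, void *dev_id)
{
        /* Keep the system awake for ~2 s so user space can drain the event. */
        wake_lock_timeout(&demo_wake_lock, 2 * HZ);
        return IRQ_HANDLED;
}

static int __init demo_init(void)
{
        wake_lock_init(&demo_wake_lock, WAKE_LOCK_SUSPEND, "demo_irq");
        return request_irq(DEMO_IRQ, demo_irq_handler, IRQF_TRIGGER_RISING,
                           "demo_wakeup", NULL);
}
module_init(demo_init);

static void __exit demo_exit(void)
{
        free_irq(DEMO_IRQ, NULL);
        wake_lock_destroy(&demo_wake_lock);
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");
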
diff --git a/kernel/printk.c b/kernel/printk.c
index b799a2ee96e..cbebc142be1 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,6 +53,10 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
53 53
54#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 54#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
55 55
56#ifdef CONFIG_DEBUG_LL
57extern void printascii(char *);
58#endif
59
56/* printk's without a loglevel use this.. */ 60/* printk's without a loglevel use this.. */
57#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 61#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
58 62
@@ -297,6 +301,53 @@ static inline void boot_delay_msec(void)
297} 301}
298#endif 302#endif
299 303
304/*
305 * Return the number of unread characters in the log buffer.
306 */
307static int log_buf_get_len(void)
308{
309 return logged_chars;
310}
311
312/*
313 * Clears the ring-buffer
314 */
315void log_buf_clear(void)
316{
317 logged_chars = 0;
318}
319
320/*
321 * Copy a range of characters from the log buffer.
322 */
323int log_buf_copy(char *dest, int idx, int len)
324{
325 int ret, max;
326 bool took_lock = false;
327
328 if (!oops_in_progress) {
329 spin_lock_irq(&logbuf_lock);
330 took_lock = true;
331 }
332
333 max = log_buf_get_len();
334 if (idx < 0 || idx >= max) {
335 ret = -1;
336 } else {
337 if (len > max - idx)
338 len = max - idx;
339 ret = len;
340 idx += (log_end - max);
341 while (len-- > 0)
342 dest[len] = LOG_BUF(idx + len);
343 }
344
345 if (took_lock)
346 spin_unlock_irq(&logbuf_lock);
347
348 return ret;
349}
350
300#ifdef CONFIG_SECURITY_DMESG_RESTRICT 351#ifdef CONFIG_SECURITY_DMESG_RESTRICT
301int dmesg_restrict = 1; 352int dmesg_restrict = 1;
302#else 353#else
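
log_buf_copy() returns up to len unread characters starting at idx (or -1 if idx is out of range) and does not NUL-terminate, while log_buf_clear() resets logged_chars. A sketch of the kind of in-kernel consumer these helpers were added for (e.g. an Android ram_console/last-kmsg style dumper); the consumer itself is assumed, not shown in this patch:

/* Declarations match the helpers added above. */
extern int log_buf_copy(char *dest, int idx, int len);
extern void log_buf_clear(void);

static char snapshot[4096];

static void snapshot_kernel_log(void)
{
        int copied = log_buf_copy(snapshot, 0, sizeof(snapshot) - 1);

        if (copied < 0)
                return;                 /* nothing logged yet, or bad index */
        snapshot[copied] = '\0';        /* log_buf_copy() does not terminate */
        log_buf_clear();                /* start the next snapshot from scratch */
}
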
@@ -325,8 +376,10 @@ static int check_syslog_permissions(int type, bool from_file)
325 return 0; 376 return 0;
326 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ 377 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
327 if (capable(CAP_SYS_ADMIN)) { 378 if (capable(CAP_SYS_ADMIN)) {
328 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " 379 printk_once(KERN_WARNING "%s (%d): "
329 "but no CAP_SYSLOG (deprecated).\n"); 380 "Attempt to access syslog with CAP_SYS_ADMIN "
381 "but no CAP_SYSLOG (deprecated).\n",
382 current->comm, task_pid_nr(current));
330 return 0; 383 return 0;
331 } 384 }
332 return -EPERM; 385 return -EPERM;
@@ -789,7 +842,7 @@ static inline int can_use_console(unsigned int cpu)
789static int console_trylock_for_printk(unsigned int cpu) 842static int console_trylock_for_printk(unsigned int cpu)
790 __releases(&logbuf_lock) 843 __releases(&logbuf_lock)
791{ 844{
792 int retval = 0; 845 int retval = 0, wake = 0;
793 846
794 if (console_trylock()) { 847 if (console_trylock()) {
795 retval = 1; 848 retval = 1;
@@ -802,12 +855,14 @@ static int console_trylock_for_printk(unsigned int cpu)
802 */ 855 */
803 if (!can_use_console(cpu)) { 856 if (!can_use_console(cpu)) {
804 console_locked = 0; 857 console_locked = 0;
805 up(&console_sem); 858 wake = 1;
806 retval = 0; 859 retval = 0;
807 } 860 }
808 } 861 }
809 printk_cpu = UINT_MAX; 862 printk_cpu = UINT_MAX;
810 spin_unlock(&logbuf_lock); 863 spin_unlock(&logbuf_lock);
864 if (wake)
865 up(&console_sem);
811 return retval; 866 return retval;
812} 867}
813static const char recursion_bug_msg [] = 868static const char recursion_bug_msg [] =
@@ -882,6 +937,10 @@ asmlinkage int vprintk(const char *fmt, va_list args)
882 if (trace_override && !trace_recurse) 937 if (trace_override && !trace_recurse)
883 TRACE("%s", printk_buf); 938 TRACE("%s", printk_buf);
884 939
940#ifdef CONFIG_DEBUG_LL
941 printascii(printk_buf);
942#endif
943
885 p = printk_buf; 944 p = printk_buf;
886 945
887 /* Read log level and handle special printk prefix */ 946 /* Read log level and handle special printk prefix */
@@ -1156,7 +1215,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1156 switch (action) { 1215 switch (action) {
1157 case CPU_ONLINE: 1216 case CPU_ONLINE:
1158 case CPU_DEAD: 1217 case CPU_DEAD:
1159 case CPU_DYING:
1160 case CPU_DOWN_FAILED: 1218 case CPU_DOWN_FAILED:
1161 case CPU_UP_CANCELED: 1219 case CPU_UP_CANCELED:
1162 console_lock(); 1220 console_lock();
@@ -1252,7 +1310,7 @@ void console_unlock(void)
1252{ 1310{
1253 unsigned long flags; 1311 unsigned long flags;
1254 unsigned _con_start, _log_end; 1312 unsigned _con_start, _log_end;
1255 unsigned wake_klogd = 0; 1313 unsigned wake_klogd = 0, retry = 0;
1256 1314
1257 if (console_suspended) { 1315 if (console_suspended) {
1258 up(&console_sem); 1316 up(&console_sem);
@@ -1261,6 +1319,7 @@ void console_unlock(void)
1261 1319
1262 console_may_schedule = 0; 1320 console_may_schedule = 0;
1263 1321
1322again:
1264 for ( ; ; ) { 1323 for ( ; ; ) {
1265 spin_lock_irqsave(&logbuf_lock, flags); 1324 spin_lock_irqsave(&logbuf_lock, flags);
1266 wake_klogd |= log_start - log_end; 1325 wake_klogd |= log_start - log_end;
@@ -1281,8 +1340,23 @@ void console_unlock(void)
1281 if (unlikely(exclusive_console)) 1340 if (unlikely(exclusive_console))
1282 exclusive_console = NULL; 1341 exclusive_console = NULL;
1283 1342
1343 spin_unlock(&logbuf_lock);
1344
1284 up(&console_sem); 1345 up(&console_sem);
1346
1347 /*
1348 * Someone could have filled up the buffer again, so re-check if there's
1349 * something to flush. In case we cannot trylock the console_sem again,
1350 * there's a new owner and the console_unlock() from them will do the
1351 * flush, no worries.
1352 */
1353 spin_lock(&logbuf_lock);
1354 if (con_start != log_end)
1355 retry = 1;
1285 spin_unlock_irqrestore(&logbuf_lock, flags); 1356 spin_unlock_irqrestore(&logbuf_lock, flags);
1357 if (retry && console_trylock())
1358 goto again;
1359
1286 if (wake_klogd) 1360 if (wake_klogd)
1287 wake_up_klogd(); 1361 wake_up_klogd();
1288} 1362}
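
The retry added to console_unlock() closes the window where another CPU appends to the log buffer after the final flush but before console_sem is released. The same unlock/re-check/trylock idiom in isolation, with placeholder names standing in for the printk internals:

#include <linux/semaphore.h>
#include <linux/spinlock.h>

static DEFINE_SEMAPHORE(demo_sem);      /* stands in for console_sem */
static DEFINE_SPINLOCK(demo_buf_lock);  /* stands in for logbuf_lock */
static int demo_head, demo_tail;

static void demo_flush(void)
{
        /* Placeholder: drain demo_tail..demo_head under demo_buf_lock. */
        spin_lock(&demo_buf_lock);
        demo_tail = demo_head;
        spin_unlock(&demo_buf_lock);
}

static void demo_unlock(void)
{
        int retry;

again:
        demo_flush();
        up(&demo_sem);

        /*
         * New data may have been queued after the final flush but before
         * the semaphore was released; if so, and nobody else has taken the
         * lock in the meantime, loop back and flush it ourselves.
         */
        spin_lock(&demo_buf_lock);
        retry = (demo_tail != demo_head);
        spin_unlock(&demo_buf_lock);

        if (retry && !down_trylock(&demo_sem))
                goto again;
}
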
@@ -1594,7 +1668,7 @@ static int __init printk_late_init(void)
1594 struct console *con; 1668 struct console *con;
1595 1669
1596 for_each_console(con) { 1670 for_each_console(con) {
1597 if (con->flags & CON_BOOT) { 1671 if (!keep_bootcon && con->flags & CON_BOOT) {
1598 printk(KERN_INFO "turn off boot console %s%d\n", 1672 printk(KERN_INFO "turn off boot console %s%d\n",
1599 con->name, con->index); 1673 con->name, con->index);
1600 unregister_console(con); 1674 unregister_console(con);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2df115790cd..67d1fdd3c55 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -23,8 +23,15 @@
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h> 25#include <linux/hw_breakpoint.h>
26#include <linux/cn_proc.h>
26 27
27 28
29static int ptrace_trapping_sleep_fn(void *flags)
30{
31 schedule();
32 return 0;
33}
34
28/* 35/*
29 * ptrace a task: make the debugger its new parent and 36 * ptrace a task: make the debugger its new parent and
30 * move it to the ptrace list. 37 * move it to the ptrace list.
@@ -77,13 +84,31 @@ void __ptrace_unlink(struct task_struct *child)
77 spin_lock(&child->sighand->siglock); 84 spin_lock(&child->sighand->siglock);
78 85
79 /* 86 /*
80 * Reinstate GROUP_STOP_PENDING if group stop is in effect and 87 * Clear all pending traps and TRAPPING. TRAPPING should be
88 * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
89 */
90 task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
91 task_clear_jobctl_trapping(child);
92
93 /*
94 * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
81 * @child isn't dead. 95 * @child isn't dead.
82 */ 96 */
83 if (!(child->flags & PF_EXITING) && 97 if (!(child->flags & PF_EXITING) &&
84 (child->signal->flags & SIGNAL_STOP_STOPPED || 98 (child->signal->flags & SIGNAL_STOP_STOPPED ||
85 child->signal->group_stop_count)) 99 child->signal->group_stop_count)) {
86 child->group_stop |= GROUP_STOP_PENDING; 100 child->jobctl |= JOBCTL_STOP_PENDING;
101
102 /*
103 * This is only possible if this thread was cloned by the
104 * traced task while the group stop was in effect; set the
105 * signal to be used for future stop reports.
106 * FIXME: we should change ptrace_init_task() to handle this
107 * case.
108 */
109 if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
110 child->jobctl |= SIGSTOP;
111 }
87 112
88 /* 113 /*
89 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick 114 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
@@ -91,16 +116,30 @@ void __ptrace_unlink(struct task_struct *child)
91 * is in TASK_TRACED; otherwise, we might unduly disrupt 116 * is in TASK_TRACED; otherwise, we might unduly disrupt
92 * TASK_KILLABLE sleeps. 117 * TASK_KILLABLE sleeps.
93 */ 118 */
94 if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) 119 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
95 signal_wake_up(child, task_is_traced(child)); 120 signal_wake_up(child, task_is_traced(child));
96 121
97 spin_unlock(&child->sighand->siglock); 122 spin_unlock(&child->sighand->siglock);
98} 123}
99 124
100/* 125/**
101 * Check that we have indeed attached to the thing.. 126 * ptrace_check_attach - check whether ptracee is ready for ptrace operation
127 * @child: ptracee to check for
128 * @ignore_state: don't check whether @child is currently %TASK_TRACED
129 *
130 * Check whether @child is being ptraced by %current and ready for further
131 * ptrace operations. If @ignore_state is %false, @child also should be in
132 * %TASK_TRACED state and on return the child is guaranteed to be traced
133 * and not executing. If @ignore_state is %true, @child can be in any
134 * state.
135 *
136 * CONTEXT:
137 * Grabs and releases tasklist_lock and @child->sighand->siglock.
138 *
139 * RETURNS:
140 * 0 on success, -ESRCH if %child is not ready.
102 */ 141 */
103int ptrace_check_attach(struct task_struct *child, int kill) 142int ptrace_check_attach(struct task_struct *child, bool ignore_state)
104{ 143{
105 int ret = -ESRCH; 144 int ret = -ESRCH;
106 145
@@ -119,13 +158,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
119 */ 158 */
120 spin_lock_irq(&child->sighand->siglock); 159 spin_lock_irq(&child->sighand->siglock);
121 WARN_ON_ONCE(task_is_stopped(child)); 160 WARN_ON_ONCE(task_is_stopped(child));
122 if (task_is_traced(child) || kill) 161 if (ignore_state || (task_is_traced(child) &&
162 !(child->jobctl & JOBCTL_LISTENING)))
123 ret = 0; 163 ret = 0;
124 spin_unlock_irq(&child->sighand->siglock); 164 spin_unlock_irq(&child->sighand->siglock);
125 } 165 }
126 read_unlock(&tasklist_lock); 166 read_unlock(&tasklist_lock);
127 167
128 if (!ret && !kill) 168 if (!ret && !ignore_state)
129 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; 169 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
130 170
131 /* All systems go.. */ 171 /* All systems go.. */
@@ -182,11 +222,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
182 return !err; 222 return !err;
183} 223}
184 224
185static int ptrace_attach(struct task_struct *task) 225static int ptrace_attach(struct task_struct *task, long request,
226 unsigned long flags)
186{ 227{
187 bool wait_trap = false; 228 bool seize = (request == PTRACE_SEIZE);
188 int retval; 229 int retval;
189 230
231 /*
232 * SEIZE will enable new ptrace behaviors which will be implemented
233 * gradually. SEIZE_DEVEL is used to prevent applications
234 * expecting full SEIZE behaviors trapping on kernel commits which
235 * are still in the process of implementing them.
236 *
237 * Only test programs for new ptrace behaviors being implemented
238 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
239 *
240 * Once SEIZE behaviors are completely implemented, this flag and
241 * the following test will be removed.
242 */
243 retval = -EIO;
244 if (seize && !(flags & PTRACE_SEIZE_DEVEL))
245 goto out;
246
190 audit_ptrace(task); 247 audit_ptrace(task);
191 248
192 retval = -EPERM; 249 retval = -EPERM;
@@ -218,16 +275,21 @@ static int ptrace_attach(struct task_struct *task)
218 goto unlock_tasklist; 275 goto unlock_tasklist;
219 276
220 task->ptrace = PT_PTRACED; 277 task->ptrace = PT_PTRACED;
278 if (seize)
279 task->ptrace |= PT_SEIZED;
221 if (task_ns_capable(task, CAP_SYS_PTRACE)) 280 if (task_ns_capable(task, CAP_SYS_PTRACE))
222 task->ptrace |= PT_PTRACE_CAP; 281 task->ptrace |= PT_PTRACE_CAP;
223 282
224 __ptrace_link(task, current); 283 __ptrace_link(task, current);
225 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 284
285 /* SEIZE doesn't trap tracee on attach */
286 if (!seize)
287 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
226 288
227 spin_lock(&task->sighand->siglock); 289 spin_lock(&task->sighand->siglock);
228 290
229 /* 291 /*
230 * If the task is already STOPPED, set GROUP_STOP_PENDING and 292 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
231 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING 293 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
232 * will be cleared if the child completes the transition or any 294 * will be cleared if the child completes the transition or any
233 * event which clears the group stop states happens. We'll wait 295 * event which clears the group stop states happens. We'll wait
@@ -243,11 +305,9 @@ static int ptrace_attach(struct task_struct *task)
243 * The following task_is_stopped() test is safe as both transitions 305 * The following task_is_stopped() test is safe as both transitions
244 * in and out of STOPPED are protected by siglock. 306 * in and out of STOPPED are protected by siglock.
245 */ 307 */
246 if (task_is_stopped(task)) { 308 if (task_is_stopped(task) &&
247 task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; 309 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
248 signal_wake_up(task, 1); 310 signal_wake_up(task, 1);
249 wait_trap = true;
250 }
251 311
252 spin_unlock(&task->sighand->siglock); 312 spin_unlock(&task->sighand->siglock);
253 313
@@ -257,9 +317,12 @@ unlock_tasklist:
257unlock_creds: 317unlock_creds:
258 mutex_unlock(&task->signal->cred_guard_mutex); 318 mutex_unlock(&task->signal->cred_guard_mutex);
259out: 319out:
260 if (wait_trap) 320 if (!retval) {
261 wait_event(current->signal->wait_chldexit, 321 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
262 !(task->group_stop & GROUP_STOP_TRAPPING)); 322 ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
323 proc_ptrace_connector(task, PTRACE_ATTACH);
324 }
325
263 return retval; 326 return retval;
264} 327}
265 328
@@ -322,25 +385,27 @@ static int ignoring_children(struct sighand_struct *sigh)
322 */ 385 */
323static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) 386static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
324{ 387{
388 bool dead;
389
325 __ptrace_unlink(p); 390 __ptrace_unlink(p);
326 391
327 if (p->exit_state == EXIT_ZOMBIE) { 392 if (p->exit_state != EXIT_ZOMBIE)
328 if (!task_detached(p) && thread_group_empty(p)) { 393 return false;
329 if (!same_thread_group(p->real_parent, tracer)) 394
330 do_notify_parent(p, p->exit_signal); 395 dead = !thread_group_leader(p);
331 else if (ignoring_children(tracer->sighand)) { 396
332 __wake_up_parent(p, tracer); 397 if (!dead && thread_group_empty(p)) {
333 p->exit_signal = -1; 398 if (!same_thread_group(p->real_parent, tracer))
334 } 399 dead = do_notify_parent(p, p->exit_signal);
335 } 400 else if (ignoring_children(tracer->sighand)) {
336 if (task_detached(p)) { 401 __wake_up_parent(p, tracer);
337 /* Mark it as in the process of being reaped. */ 402 dead = true;
338 p->exit_state = EXIT_DEAD;
339 return true;
340 } 403 }
341 } 404 }
342 405 /* Mark it as in the process of being reaped. */
343 return false; 406 if (dead)
407 p->exit_state = EXIT_DEAD;
408 return dead;
344} 409}
345 410
346static int ptrace_detach(struct task_struct *child, unsigned int data) 411static int ptrace_detach(struct task_struct *child, unsigned int data)
@@ -365,6 +430,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
365 } 430 }
366 write_unlock_irq(&tasklist_lock); 431 write_unlock_irq(&tasklist_lock);
367 432
433 proc_ptrace_connector(child, PTRACE_DETACH);
368 if (unlikely(dead)) 434 if (unlikely(dead))
369 release_task(child); 435 release_task(child);
370 436
@@ -611,10 +677,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
611int ptrace_request(struct task_struct *child, long request, 677int ptrace_request(struct task_struct *child, long request,
612 unsigned long addr, unsigned long data) 678 unsigned long addr, unsigned long data)
613{ 679{
680 bool seized = child->ptrace & PT_SEIZED;
614 int ret = -EIO; 681 int ret = -EIO;
615 siginfo_t siginfo; 682 siginfo_t siginfo, *si;
616 void __user *datavp = (void __user *) data; 683 void __user *datavp = (void __user *) data;
617 unsigned long __user *datalp = datavp; 684 unsigned long __user *datalp = datavp;
685 unsigned long flags;
618 686
619 switch (request) { 687 switch (request) {
620 case PTRACE_PEEKTEXT: 688 case PTRACE_PEEKTEXT:
@@ -647,6 +715,59 @@ int ptrace_request(struct task_struct *child, long request,
647 ret = ptrace_setsiginfo(child, &siginfo); 715 ret = ptrace_setsiginfo(child, &siginfo);
648 break; 716 break;
649 717
718 case PTRACE_INTERRUPT:
719 /*
720 * Stop tracee without any side-effect on signal or job
721 * control. At least one trap is guaranteed to happen
722 * after this request. If @child is already trapped, the
723 * current trap is not disturbed and another trap will
724 * happen after the current trap is ended with PTRACE_CONT.
725 *
726 * The actual trap might not be PTRACE_EVENT_STOP trap but
727 * the pending condition is cleared regardless.
728 */
729 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
730 break;
731
732 /*
733 * INTERRUPT doesn't disturb existing trap sans one
734 * exception. If ptracer issued LISTEN for the current
735 * STOP, this INTERRUPT should clear LISTEN and re-trap
736 * tracee into STOP.
737 */
738 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
739 signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
740
741 unlock_task_sighand(child, &flags);
742 ret = 0;
743 break;
744
745 case PTRACE_LISTEN:
746 /*
747 * Listen for events. Tracee must be in STOP. It's not
748 * resumed per-se but is not considered to be in TRACED by
749 * wait(2) or ptrace(2). If an async event (e.g. group
750 * stop state change) happens, tracee will enter STOP trap
751 * again. Alternatively, ptracer can issue INTERRUPT to
752 * finish listening and re-trap tracee into STOP.
753 */
754 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
755 break;
756
757 si = child->last_siginfo;
758 if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) {
759 child->jobctl |= JOBCTL_LISTENING;
760 /*
761 * If NOTIFY is set, it means event happened between
762 * start of this trap and now. Trigger re-trap.
763 */
764 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
765 signal_wake_up(child, true);
766 ret = 0;
767 }
768 unlock_task_sighand(child, &flags);
769 break;
770
650 case PTRACE_DETACH: /* detach a process that was attached. */ 771 case PTRACE_DETACH: /* detach a process that was attached. */
651 ret = ptrace_detach(child, data); 772 ret = ptrace_detach(child, data);
652 break; 773 break;
@@ -761,8 +882,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
761 goto out; 882 goto out;
762 } 883 }
763 884
764 if (request == PTRACE_ATTACH) { 885 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
765 ret = ptrace_attach(child); 886 ret = ptrace_attach(child, request, data);
766 /* 887 /*
767 * Some architectures need to do book-keeping after 888 * Some architectures need to do book-keeping after
768 * a ptrace attach. 889 * a ptrace attach.
@@ -772,7 +893,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
772 goto out_put_task_struct; 893 goto out_put_task_struct;
773 } 894 }
774 895
775 ret = ptrace_check_attach(child, request == PTRACE_KILL); 896 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
897 request == PTRACE_INTERRUPT);
776 if (ret < 0) 898 if (ret < 0)
777 goto out_put_task_struct; 899 goto out_put_task_struct;
778 900
@@ -903,8 +1025,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
903 goto out; 1025 goto out;
904 } 1026 }
905 1027
906 if (request == PTRACE_ATTACH) { 1028 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
907 ret = ptrace_attach(child); 1029 ret = ptrace_attach(child, request, data);
908 /* 1030 /*
909 * Some architectures need to do book-keeping after 1031 * Some architectures need to do book-keeping after
910 * a ptrace attach. 1032 * a ptrace attach.
@@ -914,7 +1036,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
914 goto out_put_task_struct; 1036 goto out_put_task_struct;
915 } 1037 }
916 1038
917 ret = ptrace_check_attach(child, request == PTRACE_KILL); 1039 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
1040 request == PTRACE_INTERRUPT);
918 if (!ret) 1041 if (!ret)
919 ret = compat_arch_ptrace(child, request, addr, data); 1042 ret = compat_arch_ptrace(child, request, addr, data);
920 1043
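
Taken together, PTRACE_SEIZE attaches without stopping the tracee, PTRACE_INTERRUPT traps it on demand without touching signals or job control, and PTRACE_LISTEN waits for async events from a PTRACE_EVENT_STOP trap without resuming it. A user-space sketch of that flow; the fallback constants, including the interim PTRACE_SEIZE_DEVEL gate checked in ptrace_attach() above, are assumptions for headers of this vintage:

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef PTRACE_SEIZE
#define PTRACE_SEIZE      0x4206
#define PTRACE_INTERRUPT  0x4207
#define PTRACE_LISTEN     0x4208
#endif
#ifndef PTRACE_SEIZE_DEVEL
#define PTRACE_SEIZE_DEVEL 0x80000000UL /* required while SEIZE is -EIO-gated */
#endif

int main(int argc, char **argv)
{
        pid_t pid;
        int status;

        if (argc != 2 || sscanf(argv[1], "%d", &pid) != 1) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }

        /* Attach without sending SIGSTOP; the tracee keeps running. */
        if (ptrace(PTRACE_SEIZE, pid, 0, PTRACE_SEIZE_DEVEL) < 0) {
                perror("PTRACE_SEIZE");
                return 1;
        }

        /* Ask for a trap without disturbing signals or job control. */
        ptrace(PTRACE_INTERRUPT, pid, 0, 0);
        waitpid(pid, &status, 0);

        /* Wait for async events (e.g. group stop changes) without resuming. */
        ptrace(PTRACE_LISTEN, pid, 0, 0);
        waitpid(pid, &status, 0);

        ptrace(PTRACE_DETACH, pid, 0, 0);
        return 0;
}
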
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 7784bd216b6..ddddb320be6 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -37,7 +37,7 @@
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/interrupt.h> 38#include <linux/interrupt.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <asm/atomic.h> 40#include <linux/atomic.h>
41#include <linux/bitops.h> 41#include <linux/bitops.h>
42#include <linux/percpu.h> 42#include <linux/percpu.h>
43#include <linux/notifier.h> 43#include <linux/notifier.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e138db0338..98f51b13bb7 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -33,7 +33,7 @@
33#include <linux/rcupdate.h> 33#include <linux/rcupdate.h>
34#include <linux/interrupt.h> 34#include <linux/interrupt.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <asm/atomic.h> 36#include <linux/atomic.h>
37#include <linux/bitops.h> 37#include <linux/bitops.h>
38#include <linux/completion.h> 38#include <linux/completion.h>
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
@@ -941,7 +941,6 @@ static void rcu_torture_timer(unsigned long unused)
941 idx = cur_ops->readlock(); 941 idx = cur_ops->readlock();
942 completed = cur_ops->completed(); 942 completed = cur_ops->completed();
943 p = rcu_dereference_check(rcu_torture_current, 943 p = rcu_dereference_check(rcu_torture_current,
944 rcu_read_lock_held() ||
945 rcu_read_lock_bh_held() || 944 rcu_read_lock_bh_held() ||
946 rcu_read_lock_sched_held() || 945 rcu_read_lock_sched_held() ||
947 srcu_read_lock_held(&srcu_ctl)); 946 srcu_read_lock_held(&srcu_ctl));
@@ -1002,7 +1001,6 @@ rcu_torture_reader(void *arg)
1002 idx = cur_ops->readlock(); 1001 idx = cur_ops->readlock();
1003 completed = cur_ops->completed(); 1002 completed = cur_ops->completed();
1004 p = rcu_dereference_check(rcu_torture_current, 1003 p = rcu_dereference_check(rcu_torture_current,
1005 rcu_read_lock_held() ||
1006 rcu_read_lock_bh_held() || 1004 rcu_read_lock_bh_held() ||
1007 rcu_read_lock_sched_held() || 1005 rcu_read_lock_sched_held() ||
1008 srcu_read_lock_held(&srcu_ctl)); 1006 srcu_read_lock_held(&srcu_ctl));
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4e144876dc6..3b0c0986afc 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -31,7 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/interrupt.h> 32#include <linux/interrupt.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <asm/atomic.h> 34#include <linux/atomic.h>
35#include <linux/bitops.h> 35#include <linux/bitops.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/completion.h> 37#include <linux/completion.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 3ff40178dce..c8dc249da5c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -419,6 +419,9 @@ static int __find_resource(struct resource *root, struct resource *old,
419 else 419 else
420 tmp.end = root->end; 420 tmp.end = root->end;
421 421
422 if (tmp.end < tmp.start)
423 goto next;
424
422 resource_clip(&tmp, constraint->min, constraint->max); 425 resource_clip(&tmp, constraint->min, constraint->max);
423 arch_remove_reservations(&tmp); 426 arch_remove_reservations(&tmp);
424 427
@@ -436,8 +439,10 @@ static int __find_resource(struct resource *root, struct resource *old,
436 return 0; 439 return 0;
437 } 440 }
438 } 441 }
439 if (!this) 442
443next: if (!this || this->end == root->end)
440 break; 444 break;
445
441 if (this != old) 446 if (this != old)
442 tmp.start = this->end + 1; 447 tmp.start = this->end + 1;
443 this = this->sibling; 448 this = this->sibling;
@@ -553,6 +558,27 @@ int allocate_resource(struct resource *root, struct resource *new,
553 558
554EXPORT_SYMBOL(allocate_resource); 559EXPORT_SYMBOL(allocate_resource);
555 560
561/**
562 * lookup_resource - find an existing resource by a resource start address
563 * @root: root resource descriptor
564 * @start: resource start address
565 *
566 * Returns a pointer to the resource if found, NULL otherwise
567 */
568struct resource *lookup_resource(struct resource *root, resource_size_t start)
569{
570 struct resource *res;
571
572 read_lock(&resource_lock);
573 for (res = root->child; res; res = res->sibling) {
574 if (res->start == start)
575 break;
576 }
577 read_unlock(&resource_lock);
578
579 return res;
580}
581
556/* 582/*
557 * Insert a resource into the resource tree. If successful, return NULL, 583 * Insert a resource into the resource tree. If successful, return NULL,
558 * otherwise return the conflicting resource (compare to __request_resource()) 584 * otherwise return the conflicting resource (compare to __request_resource())
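
lookup_resource() walks root->child under resource_lock and returns the first entry whose start address matches exactly, or NULL. A hypothetical caller releasing an I/O memory region known only by its start address (the surrounding usage is illustrative, not from this patch):

#include <linux/ioport.h>
#include <linux/kernel.h>

static void demo_release_by_start(resource_size_t start)
{
        struct resource *res;

        res = lookup_resource(&iomem_resource, start);
        if (!res) {
                pr_warn("no resource registered at %#llx\n",
                        (unsigned long long)start);
                return;
        }

        /* e.g. drop a region that an earlier driver registered by address */
        release_resource(res);
}
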
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index ab449117aaf..255e1662acd 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
890{ 890{
891 lock->owner = NULL; 891 lock->owner = NULL;
892 raw_spin_lock_init(&lock->wait_lock); 892 raw_spin_lock_init(&lock->wait_lock);
893 plist_head_init_raw(&lock->wait_list, &lock->wait_lock); 893 plist_head_init(&lock->wait_list);
894 894
895 debug_rt_mutex_init(lock, name); 895 debug_rt_mutex_init(lock, name);
896} 896}
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index cae050b05f5..9f48f3d82e9 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -11,7 +11,7 @@
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h> 13#include <asm/system.h>
14#include <asm/atomic.h> 14#include <linux/atomic.h>
15 15
16/* 16/*
17 * lock for reading 17 * lock for reading
@@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
117 117
118EXPORT_SYMBOL(down_read_nested); 118EXPORT_SYMBOL(down_read_nested);
119 119
120void down_read_non_owner(struct rw_semaphore *sem)
121{
122 might_sleep();
123
124 __down_read(sem);
125}
126
127EXPORT_SYMBOL(down_read_non_owner);
128
129void down_write_nested(struct rw_semaphore *sem, int subclass) 120void down_write_nested(struct rw_semaphore *sem, int subclass)
130{ 121{
131 might_sleep(); 122 might_sleep();
@@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
136 127
137EXPORT_SYMBOL(down_write_nested); 128EXPORT_SYMBOL(down_write_nested);
138 129
139void up_read_non_owner(struct rw_semaphore *sem)
140{
141 __up_read(sem);
142}
143
144EXPORT_SYMBOL(up_read_non_owner);
145
146#endif 130#endif
147 131
148 132
diff --git a/kernel/sched.c b/kernel/sched.c
index c4b6bd5151f..a1bf2646d12 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,10 +71,14 @@
71#include <linux/ctype.h> 71#include <linux/ctype.h>
72#include <linux/ftrace.h> 72#include <linux/ftrace.h>
73#include <linux/slab.h> 73#include <linux/slab.h>
74#include <linux/cpuacct.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
77#include <asm/mutex.h> 78#include <asm/mutex.h>
79#ifdef CONFIG_PARAVIRT
80#include <asm/paravirt.h>
81#endif
78 82
79#include "sched_cpupri.h" 83#include "sched_cpupri.h"
80#include "workqueue_sched.h" 84#include "workqueue_sched.h"
@@ -129,7 +133,7 @@ static void litmus_tick(struct rq*, struct task_struct*);
129 133
130static inline int rt_policy(int policy) 134static inline int rt_policy(int policy)
131{ 135{
132 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 136 if (policy == SCHED_FIFO || policy == SCHED_RR)
133 return 1; 137 return 1;
134 return 0; 138 return 0;
135} 139}
@@ -433,6 +437,7 @@ struct litmus_rq {
433 */ 437 */
434struct root_domain { 438struct root_domain {
435 atomic_t refcount; 439 atomic_t refcount;
440 atomic_t rto_count;
436 struct rcu_head rcu; 441 struct rcu_head rcu;
437 cpumask_var_t span; 442 cpumask_var_t span;
438 cpumask_var_t online; 443 cpumask_var_t online;
@@ -442,7 +447,6 @@ struct root_domain {
442 * one runnable RT task. 447 * one runnable RT task.
443 */ 448 */
444 cpumask_var_t rto_mask; 449 cpumask_var_t rto_mask;
445 atomic_t rto_count;
446 struct cpupri cpupri; 450 struct cpupri cpupri;
447}; 451};
448 452
@@ -540,6 +544,12 @@ struct rq {
540#ifdef CONFIG_IRQ_TIME_ACCOUNTING 544#ifdef CONFIG_IRQ_TIME_ACCOUNTING
541 u64 prev_irq_time; 545 u64 prev_irq_time;
542#endif 546#endif
547#ifdef CONFIG_PARAVIRT
548 u64 prev_steal_time;
549#endif
550#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
551 u64 prev_steal_time_rq;
552#endif
543 553
544 /* calc_load related fields */ 554 /* calc_load related fields */
545 unsigned long calc_load_update; 555 unsigned long calc_load_update;
@@ -593,7 +603,6 @@ static inline int cpu_of(struct rq *rq)
593 603
594#define rcu_dereference_check_sched_domain(p) \ 604#define rcu_dereference_check_sched_domain(p) \
595 rcu_dereference_check((p), \ 605 rcu_dereference_check((p), \
596 rcu_read_lock_held() || \
597 lockdep_is_held(&sched_domains_mutex)) 606 lockdep_is_held(&sched_domains_mutex))
598 607
599/* 608/*
@@ -1581,38 +1590,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1581 return rq->avg_load_per_task; 1590 return rq->avg_load_per_task;
1582} 1591}
1583 1592
1584#ifdef CONFIG_FAIR_GROUP_SCHED
1585
1586/*
1587 * Compute the cpu's hierarchical load factor for each task group.
1588 * This needs to be done in a top-down fashion because the load of a child
1589 * group is a fraction of its parents load.
1590 */
1591static int tg_load_down(struct task_group *tg, void *data)
1592{
1593 unsigned long load;
1594 long cpu = (long)data;
1595
1596 if (!tg->parent) {
1597 load = cpu_rq(cpu)->load.weight;
1598 } else {
1599 load = tg->parent->cfs_rq[cpu]->h_load;
1600 load *= tg->se[cpu]->load.weight;
1601 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1602 }
1603
1604 tg->cfs_rq[cpu]->h_load = load;
1605
1606 return 0;
1607}
1608
1609static void update_h_load(long cpu)
1610{
1611 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1612}
1613
1614#endif
1615
1616#ifdef CONFIG_PREEMPT 1593#ifdef CONFIG_PREEMPT
1617 1594
1618static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1595static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1966,10 +1943,28 @@ void account_system_vtime(struct task_struct *curr)
1966} 1943}
1967EXPORT_SYMBOL_GPL(account_system_vtime); 1944EXPORT_SYMBOL_GPL(account_system_vtime);
1968 1945
1969static void update_rq_clock_task(struct rq *rq, s64 delta) 1946#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1947
1948#ifdef CONFIG_PARAVIRT
1949static inline u64 steal_ticks(u64 steal)
1970{ 1950{
1971 s64 irq_delta; 1951 if (unlikely(steal > NSEC_PER_SEC))
1952 return div_u64(steal, TICK_NSEC);
1953
1954 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
1955}
1956#endif
1972 1957
1958static void update_rq_clock_task(struct rq *rq, s64 delta)
1959{
1960/*
 1961 * In theory, the compiler should just see 0 here, and optimize out the call
1962 * to sched_rt_avg_update. But I don't trust it...
1963 */
1964#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1965 s64 steal = 0, irq_delta = 0;
1966#endif
1967#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1973 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 1968 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1974 1969
1975 /* 1970 /*
@@ -1992,12 +1987,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1992 1987
1993 rq->prev_irq_time += irq_delta; 1988 rq->prev_irq_time += irq_delta;
1994 delta -= irq_delta; 1989 delta -= irq_delta;
1990#endif
1991#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1992 if (static_branch((&paravirt_steal_rq_enabled))) {
1993 u64 st;
1994
1995 steal = paravirt_steal_clock(cpu_of(rq));
1996 steal -= rq->prev_steal_time_rq;
1997
1998 if (unlikely(steal > delta))
1999 steal = delta;
2000
2001 st = steal_ticks(steal);
2002 steal = st * TICK_NSEC;
2003
2004 rq->prev_steal_time_rq += steal;
2005
2006 delta -= steal;
2007 }
2008#endif
2009
1995 rq->clock_task += delta; 2010 rq->clock_task += delta;
1996 2011
1997 if (irq_delta && sched_feat(NONIRQ_POWER)) 2012#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1998 sched_rt_avg_update(rq, irq_delta); 2013 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
2014 sched_rt_avg_update(rq, irq_delta + steal);
2015#endif
1999} 2016}
2000 2017
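To make the steal-time arithmetic above concrete, here is a small standalone model (plain C, not kernel code) of how only whole steal ticks are folded out of clock_task while the sub-tick remainder carries over via prev_steal_time_rq; the HZ=1000 tick length and the sample values are assumptions.

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL			/* assumes HZ=1000: one tick = 1 ms */

int main(void)
{
	/* cumulative steal clock readings (ns), as a hypervisor might report them */
	uint64_t steal_clock[] = { 1500000ULL, 2200000ULL, 3100000ULL };
	uint64_t prev_steal_time_rq = 0;

	for (int i = 0; i < 3; i++) {
		uint64_t steal = steal_clock[i] - prev_steal_time_rq;
		uint64_t ticks = steal / TICK_NSEC;	/* what steal_ticks() computes */

		/* only whole ticks are removed from clock_task; the rest carries over */
		prev_steal_time_rq += ticks * TICK_NSEC;
		printf("update %d: %llu tick(s) of steal accounted, %llu ns carried over\n",
		       i, (unsigned long long)ticks,
		       (unsigned long long)(steal - ticks * TICK_NSEC));
	}
	return 0;
}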
2018#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2001static int irqtime_account_hi_update(void) 2019static int irqtime_account_hi_update(void)
2002{ 2020{
2003 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2021 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2032,12 +2050,7 @@ static int irqtime_account_si_update(void)
2032 2050
2033#define sched_clock_irqtime (0) 2051#define sched_clock_irqtime (0)
2034 2052
2035static void update_rq_clock_task(struct rq *rq, s64 delta) 2053#endif
2036{
2037 rq->clock_task += delta;
2038}
2039
2040#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2041 2054
2042#include "sched_idletask.c" 2055#include "sched_idletask.c"
2043#include "sched_fair.c" 2056#include "sched_fair.c"
@@ -2238,7 +2251,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2238 2251
2239 if (task_cpu(p) != new_cpu) { 2252 if (task_cpu(p) != new_cpu) {
2240 p->se.nr_migrations++; 2253 p->se.nr_migrations++;
2241 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2254 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
2242 } 2255 }
2243 2256
2244 __set_task_cpu(p, new_cpu); 2257 __set_task_cpu(p, new_cpu);
@@ -2515,7 +2528,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2515 if (p->sched_class->task_woken) 2528 if (p->sched_class->task_woken)
2516 p->sched_class->task_woken(rq, p); 2529 p->sched_class->task_woken(rq, p);
2517 2530
2518 if (unlikely(rq->idle_stamp)) { 2531 if (rq->idle_stamp) {
2519 u64 delta = rq->clock - rq->idle_stamp; 2532 u64 delta = rq->clock - rq->idle_stamp;
2520 u64 max = 2*sysctl_sched_migration_cost; 2533 u64 max = 2*sysctl_sched_migration_cost;
2521 2534
@@ -2927,7 +2940,7 @@ void sched_fork(struct task_struct *p)
2927#if defined(CONFIG_SMP) 2940#if defined(CONFIG_SMP)
2928 p->on_cpu = 0; 2941 p->on_cpu = 0;
2929#endif 2942#endif
2930#ifdef CONFIG_PREEMPT 2943#ifdef CONFIG_PREEMPT_COUNT
2931 /* Want to start with kernel preemption disabled. */ 2944 /* Want to start with kernel preemption disabled. */
2932 task_thread_info(p)->preempt_count = 1; 2945 task_thread_info(p)->preempt_count = 1;
2933#endif 2946#endif
@@ -3096,7 +3109,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3096#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3109#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3097 local_irq_disable(); 3110 local_irq_disable();
3098#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3111#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3099 perf_event_task_sched_in(current); 3112 perf_event_task_sched_in(prev, current);
3100#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3113#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3101 local_irq_enable(); 3114 local_irq_enable();
3102#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3115#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
@@ -3775,30 +3788,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3775} 3788}
3776 3789
3777/* 3790/*
3778 * Return sum_exec_runtime for the thread group.
3779 * In case the task is currently running, return the sum plus current's
3780 * pending runtime that have not been accounted yet.
3781 *
3782 * Note that the thread group might have other running tasks as well,
3783 * so the return value not includes other pending runtime that other
3784 * running tasks might have.
3785 */
3786unsigned long long thread_group_sched_runtime(struct task_struct *p)
3787{
3788 struct task_cputime totals;
3789 unsigned long flags;
3790 struct rq *rq;
3791 u64 ns;
3792
3793 rq = task_rq_lock(p, &flags);
3794 thread_group_cputime(p, &totals);
3795 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3796 task_rq_unlock(rq, p, &flags);
3797
3798 return ns;
3799}
3800
3801/*
3802 * Account user cpu time to a process. 3791 * Account user cpu time to a process.
3803 * @p: the process that the cpu time gets accounted to 3792 * @p: the process that the cpu time gets accounted to
3804 * @cputime: the cpu time spent in user space since the last update 3793 * @cputime: the cpu time spent in user space since the last update
@@ -3939,6 +3928,25 @@ void account_idle_time(cputime_t cputime)
3939 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3928 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3940} 3929}
3941 3930
3931static __always_inline bool steal_account_process_tick(void)
3932{
3933#ifdef CONFIG_PARAVIRT
3934 if (static_branch(&paravirt_steal_enabled)) {
3935 u64 steal, st = 0;
3936
3937 steal = paravirt_steal_clock(smp_processor_id());
3938 steal -= this_rq()->prev_steal_time;
3939
3940 st = steal_ticks(steal);
3941 this_rq()->prev_steal_time += st * TICK_NSEC;
3942
3943 account_steal_time(st);
3944 return st;
3945 }
3946#endif
3947 return false;
3948}
3949
3942#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3950#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3943 3951
3944#ifdef CONFIG_IRQ_TIME_ACCOUNTING 3952#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3970,6 +3978,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3970 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 3978 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3971 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3979 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3972 3980
3981 if (steal_account_process_tick())
3982 return;
3983
3973 if (irqtime_account_hi_update()) { 3984 if (irqtime_account_hi_update()) {
3974 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3985 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3975 } else if (irqtime_account_si_update()) { 3986 } else if (irqtime_account_si_update()) {
@@ -4023,6 +4034,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
4023 return; 4034 return;
4024 } 4035 }
4025 4036
4037 if (steal_account_process_tick())
4038 return;
4039
4026 if (user_tick) 4040 if (user_tick)
4027 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 4041 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4028 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 4042 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -4320,9 +4334,9 @@ pick_next_task(struct rq *rq)
4320} 4334}
4321 4335
4322/* 4336/*
4323 * schedule() is the main scheduler function. 4337 * __schedule() is the main scheduler function.
4324 */ 4338 */
4325asmlinkage void __sched schedule(void) 4339static void __sched __schedule(void)
4326{ 4340{
4327 struct task_struct *prev, *next; 4341 struct task_struct *prev, *next;
4328 unsigned long *switch_count; 4342 unsigned long *switch_count;
@@ -4371,16 +4385,6 @@ litmus_need_resched_nonpreemptible:
4371 if (to_wakeup) 4385 if (to_wakeup)
4372 try_to_wake_up_local(to_wakeup); 4386 try_to_wake_up_local(to_wakeup);
4373 } 4387 }
4374
4375 /*
4376 * If we are going to sleep and we have plugged IO
4377 * queued, make sure to submit it to avoid deadlocks.
4378 */
4379 if (blk_needs_flush_plug(prev)) {
4380 raw_spin_unlock(&rq->lock);
4381 blk_schedule_flush_plug(prev);
4382 raw_spin_lock(&rq->lock);
4383 }
4384 } 4388 }
4385 switch_count = &prev->nvcsw; 4389 switch_count = &prev->nvcsw;
4386 } 4390 }
@@ -4436,17 +4440,34 @@ litmus_need_resched_nonpreemptible:
4436 4440
4437 srp_ceiling_block(); 4441 srp_ceiling_block();
4438} 4442}
4443
4444static inline void sched_submit_work(struct task_struct *tsk)
4445{
4446 if (!tsk->state)
4447 return;
4448 /*
4449 * If we are going to sleep and we have plugged IO queued,
4450 * make sure to submit it to avoid deadlocks.
4451 */
4452 if (blk_needs_flush_plug(tsk))
4453 blk_schedule_flush_plug(tsk);
4454}
4455
4456asmlinkage void __sched schedule(void)
4457{
4458 struct task_struct *tsk = current;
4459
4460 sched_submit_work(tsk);
4461 __schedule();
4462}
4439EXPORT_SYMBOL(schedule); 4463EXPORT_SYMBOL(schedule);
4440 4464
4441#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4465#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4442 4466
4443static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 4467static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4444{ 4468{
4445 bool ret = false;
4446
4447 rcu_read_lock();
4448 if (lock->owner != owner) 4469 if (lock->owner != owner)
4449 goto fail; 4470 return false;
4450 4471
4451 /* 4472 /*
4452 * Ensure we emit the owner->on_cpu, dereference _after_ checking 4473 * Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4456,11 +4477,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4456 */ 4477 */
4457 barrier(); 4478 barrier();
4458 4479
4459 ret = owner->on_cpu; 4480 return owner->on_cpu;
4460fail:
4461 rcu_read_unlock();
4462
4463 return ret;
4464} 4481}
4465 4482
4466/* 4483/*
@@ -4472,21 +4489,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4472 if (!sched_feat(OWNER_SPIN)) 4489 if (!sched_feat(OWNER_SPIN))
4473 return 0; 4490 return 0;
4474 4491
4492 rcu_read_lock();
4475 while (owner_running(lock, owner)) { 4493 while (owner_running(lock, owner)) {
4476 if (need_resched()) 4494 if (need_resched())
4477 return 0; 4495 break;
4478 4496
4479 arch_mutex_cpu_relax(); 4497 arch_mutex_cpu_relax();
4480 } 4498 }
4499 rcu_read_unlock();
4481 4500
4482 /* 4501 /*
4483 * If the owner changed to another task there is likely 4502 * We break out the loop above on need_resched() and when the
4484 * heavy contention, stop spinning. 4503 * owner changed, which is a sign for heavy contention. Return
4504 * success only when lock->owner is NULL.
4485 */ 4505 */
4486 if (lock->owner) 4506 return lock->owner == NULL;
4487 return 0;
4488
4489 return 1;
4490} 4507}
4491#endif 4508#endif
4492 4509
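As a rough model of the reworked spin loop (assumptions only, using C11 atomics in place of the kernel's primitives): spin while the recorded owner is still on a CPU, stop when rescheduling is needed or the owner changes, and report success only if the lock ended up free.

#include <stdbool.h>
#include <stdatomic.h>
#include <stddef.h>

struct fake_task {
	atomic_bool on_cpu;		/* is this task currently running? */
	atomic_bool need_resched;	/* does the spinner need to reschedule? */
};

struct fake_mutex {
	_Atomic(struct fake_task *) owner;
};

static bool owner_running(struct fake_mutex *lock, struct fake_task *owner)
{
	if (atomic_load(&lock->owner) != owner)
		return false;			/* owner changed: heavy contention */
	return atomic_load(&owner->on_cpu);
}

static bool spin_on_owner(struct fake_mutex *lock, struct fake_task *owner,
			  struct fake_task *self)
{
	while (owner_running(lock, owner)) {
		if (atomic_load(&self->need_resched))
			break;			/* give up and go schedule */
	}
	/* success only when the lock is actually free now */
	return atomic_load(&lock->owner) == NULL;
}

int main(void)
{
	struct fake_task owner = { 0 }, self = { 0 };
	struct fake_mutex lock = { .owner = NULL };	/* lock already released */

	return spin_on_owner(&lock, &owner, &self) ? 0 : 1;
}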
@@ -4509,7 +4526,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
4509 4526
4510 do { 4527 do {
4511 add_preempt_count_notrace(PREEMPT_ACTIVE); 4528 add_preempt_count_notrace(PREEMPT_ACTIVE);
4512 schedule(); 4529 __schedule();
4513 sub_preempt_count_notrace(PREEMPT_ACTIVE); 4530 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4514 4531
4515 /* 4532 /*
@@ -4537,7 +4554,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
4537 do { 4554 do {
4538 add_preempt_count(PREEMPT_ACTIVE); 4555 add_preempt_count(PREEMPT_ACTIVE);
4539 local_irq_enable(); 4556 local_irq_enable();
4540 schedule(); 4557 __schedule();
4541 local_irq_disable(); 4558 local_irq_disable();
4542 sub_preempt_count(PREEMPT_ACTIVE); 4559 sub_preempt_count(PREEMPT_ACTIVE);
4543 4560
@@ -5682,7 +5699,7 @@ static inline int should_resched(void)
5682static void __cond_resched(void) 5699static void __cond_resched(void)
5683{ 5700{
5684 add_preempt_count(PREEMPT_ACTIVE); 5701 add_preempt_count(PREEMPT_ACTIVE);
5685 schedule(); 5702 __schedule();
5686 sub_preempt_count(PREEMPT_ACTIVE); 5703 sub_preempt_count(PREEMPT_ACTIVE);
5687} 5704}
5688 5705
@@ -6618,7 +6635,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6618 unsigned long action, void *hcpu) 6635 unsigned long action, void *hcpu)
6619{ 6636{
6620 switch (action & ~CPU_TASKS_FROZEN) { 6637 switch (action & ~CPU_TASKS_FROZEN) {
6621 case CPU_ONLINE: 6638 case CPU_STARTING:
6622 case CPU_DOWN_FAILED: 6639 case CPU_DOWN_FAILED:
6623 set_cpu_active((long)hcpu, true); 6640 set_cpu_active((long)hcpu, true);
6624 return NOTIFY_OK; 6641 return NOTIFY_OK;
@@ -7537,6 +7554,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
7537 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 7554 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7538 if (sd && (sd->flags & SD_OVERLAP)) 7555 if (sd && (sd->flags & SD_OVERLAP))
7539 free_sched_groups(sd->groups, 0); 7556 free_sched_groups(sd->groups, 0);
7557 kfree(*per_cpu_ptr(sdd->sd, j));
7540 kfree(*per_cpu_ptr(sdd->sg, j)); 7558 kfree(*per_cpu_ptr(sdd->sg, j));
7541 kfree(*per_cpu_ptr(sdd->sgp, j)); 7559 kfree(*per_cpu_ptr(sdd->sgp, j));
7542 } 7560 }
@@ -8022,17 +8040,10 @@ int in_sched_functions(unsigned long addr)
8022 && addr < (unsigned long)__sched_text_end); 8040 && addr < (unsigned long)__sched_text_end);
8023} 8041}
8024 8042
8025static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 8043static void init_cfs_rq(struct cfs_rq *cfs_rq)
8026{ 8044{
8027 cfs_rq->tasks_timeline = RB_ROOT; 8045 cfs_rq->tasks_timeline = RB_ROOT;
8028 INIT_LIST_HEAD(&cfs_rq->tasks); 8046 INIT_LIST_HEAD(&cfs_rq->tasks);
8029#ifdef CONFIG_FAIR_GROUP_SCHED
8030 cfs_rq->rq = rq;
8031 /* allow initial update_cfs_load() to truncate */
8032#ifdef CONFIG_SMP
8033 cfs_rq->load_stamp = 1;
8034#endif
8035#endif
8036 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8047 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8037#ifndef CONFIG_64BIT 8048#ifndef CONFIG_64BIT
8038 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 8049 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -8052,27 +8063,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8052 /* delimiter for bitsearch: */ 8063 /* delimiter for bitsearch: */
8053 __set_bit(MAX_RT_PRIO, array->bitmap); 8064 __set_bit(MAX_RT_PRIO, array->bitmap);
8054 8065
8055#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8066#if defined CONFIG_SMP
8056 rt_rq->highest_prio.curr = MAX_RT_PRIO; 8067 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8057#ifdef CONFIG_SMP
8058 rt_rq->highest_prio.next = MAX_RT_PRIO; 8068 rt_rq->highest_prio.next = MAX_RT_PRIO;
8059#endif
8060#endif
8061#ifdef CONFIG_SMP
8062 rt_rq->rt_nr_migratory = 0; 8069 rt_rq->rt_nr_migratory = 0;
8063 rt_rq->overloaded = 0; 8070 rt_rq->overloaded = 0;
8064 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 8071 plist_head_init(&rt_rq->pushable_tasks);
8065#endif 8072#endif
8066 8073
8067 rt_rq->rt_time = 0; 8074 rt_rq->rt_time = 0;
8068 rt_rq->rt_throttled = 0; 8075 rt_rq->rt_throttled = 0;
8069 rt_rq->rt_runtime = 0; 8076 rt_rq->rt_runtime = 0;
8070 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 8077 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8071
8072#ifdef CONFIG_RT_GROUP_SCHED
8073 rt_rq->rt_nr_boosted = 0;
8074 rt_rq->rq = rq;
8075#endif
8076} 8078}
8077 8079
8078#ifdef CONFIG_FAIR_GROUP_SCHED 8080#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8081,11 +8083,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8081 struct sched_entity *parent) 8083 struct sched_entity *parent)
8082{ 8084{
8083 struct rq *rq = cpu_rq(cpu); 8085 struct rq *rq = cpu_rq(cpu);
8084 tg->cfs_rq[cpu] = cfs_rq; 8086
8085 init_cfs_rq(cfs_rq, rq);
8086 cfs_rq->tg = tg; 8087 cfs_rq->tg = tg;
8088 cfs_rq->rq = rq;
8089#ifdef CONFIG_SMP
8090 /* allow initial update_cfs_load() to truncate */
8091 cfs_rq->load_stamp = 1;
8092#endif
8087 8093
8094 tg->cfs_rq[cpu] = cfs_rq;
8088 tg->se[cpu] = se; 8095 tg->se[cpu] = se;
8096
8089 /* se could be NULL for root_task_group */ 8097 /* se could be NULL for root_task_group */
8090 if (!se) 8098 if (!se)
8091 return; 8099 return;
@@ -8108,12 +8116,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8108{ 8116{
8109 struct rq *rq = cpu_rq(cpu); 8117 struct rq *rq = cpu_rq(cpu);
8110 8118
8111 tg->rt_rq[cpu] = rt_rq; 8119 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8112 init_rt_rq(rt_rq, rq); 8120 rt_rq->rt_nr_boosted = 0;
8121 rt_rq->rq = rq;
8113 rt_rq->tg = tg; 8122 rt_rq->tg = tg;
8114 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8115 8123
8124 tg->rt_rq[cpu] = rt_rq;
8116 tg->rt_se[cpu] = rt_se; 8125 tg->rt_se[cpu] = rt_se;
8126
8117 if (!rt_se) 8127 if (!rt_se)
8118 return; 8128 return;
8119 8129
@@ -8195,7 +8205,7 @@ void __init sched_init(void)
8195 rq->nr_running = 0; 8205 rq->nr_running = 0;
8196 rq->calc_load_active = 0; 8206 rq->calc_load_active = 0;
8197 rq->calc_load_update = jiffies + LOAD_FREQ; 8207 rq->calc_load_update = jiffies + LOAD_FREQ;
8198 init_cfs_rq(&rq->cfs, rq); 8208 init_cfs_rq(&rq->cfs);
8199 init_rt_rq(&rq->rt, rq); 8209 init_rt_rq(&rq->rt, rq);
8200#ifdef CONFIG_FAIR_GROUP_SCHED 8210#ifdef CONFIG_FAIR_GROUP_SCHED
8201 root_task_group.shares = root_task_group_load; 8211 root_task_group.shares = root_task_group_load;
@@ -8266,7 +8276,7 @@ void __init sched_init(void)
8266#endif 8276#endif
8267 8277
8268#ifdef CONFIG_RT_MUTEXES 8278#ifdef CONFIG_RT_MUTEXES
8269 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 8279 plist_head_init(&init_task.pi_waiters);
8270#endif 8280#endif
8271 8281
8272 /* 8282 /*
@@ -8300,6 +8310,7 @@ void __init sched_init(void)
8300 atomic_set(&nohz.load_balancer, nr_cpu_ids); 8310 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8301 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); 8311 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8302 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); 8312 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8313 nohz.next_balance = jiffies;
8303#endif 8314#endif
8304 /* May be allocated at isolcpus cmdline parse time */ 8315 /* May be allocated at isolcpus cmdline parse time */
8305 if (cpu_isolated_map == NULL) 8316 if (cpu_isolated_map == NULL)
@@ -8309,7 +8320,7 @@ void __init sched_init(void)
8309 scheduler_running = 1; 8320 scheduler_running = 1;
8310} 8321}
8311 8322
8312#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8323#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8313static inline int preempt_count_equals(int preempt_offset) 8324static inline int preempt_count_equals(int preempt_offset)
8314{ 8325{
8315 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8326 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8317,13 +8328,23 @@ static inline int preempt_count_equals(int preempt_offset)
8317 return (nested == preempt_offset); 8328 return (nested == preempt_offset);
8318} 8329}
8319 8330
8331static int __might_sleep_init_called;
8332int __init __might_sleep_init(void)
8333{
8334 __might_sleep_init_called = 1;
8335 return 0;
8336}
8337early_initcall(__might_sleep_init);
8338
8320void __might_sleep(const char *file, int line, int preempt_offset) 8339void __might_sleep(const char *file, int line, int preempt_offset)
8321{ 8340{
8322#ifdef in_atomic
8323 static unsigned long prev_jiffy; /* ratelimiting */ 8341 static unsigned long prev_jiffy; /* ratelimiting */
8324 8342
8325 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8343 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8326 system_state != SYSTEM_RUNNING || oops_in_progress) 8344 oops_in_progress)
8345 return;
8346 if (system_state != SYSTEM_RUNNING &&
8347 (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
8327 return; 8348 return;
8328 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8349 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8329 return; 8350 return;
@@ -8341,7 +8362,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
8341 if (irqs_disabled()) 8362 if (irqs_disabled())
8342 print_irqtrace_events(current); 8363 print_irqtrace_events(current);
8343 dump_stack(); 8364 dump_stack();
8344#endif
8345} 8365}
8346EXPORT_SYMBOL(__might_sleep); 8366EXPORT_SYMBOL(__might_sleep);
8347#endif 8367#endif
@@ -8500,6 +8520,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8500 if (!se) 8520 if (!se)
8501 goto err_free_rq; 8521 goto err_free_rq;
8502 8522
8523 init_cfs_rq(cfs_rq);
8503 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8524 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8504 } 8525 }
8505 8526
@@ -8527,7 +8548,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8527 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8548 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8528 raw_spin_unlock_irqrestore(&rq->lock, flags); 8549 raw_spin_unlock_irqrestore(&rq->lock, flags);
8529} 8550}
8530#else /* !CONFG_FAIR_GROUP_SCHED */ 8551#else /* !CONFIG_FAIR_GROUP_SCHED */
8531static inline void free_fair_sched_group(struct task_group *tg) 8552static inline void free_fair_sched_group(struct task_group *tg)
8532{ 8553{
8533} 8554}
@@ -8548,7 +8569,8 @@ static void free_rt_sched_group(struct task_group *tg)
8548{ 8569{
8549 int i; 8570 int i;
8550 8571
8551 destroy_rt_bandwidth(&tg->rt_bandwidth); 8572 if (tg->rt_se)
8573 destroy_rt_bandwidth(&tg->rt_bandwidth);
8552 8574
8553 for_each_possible_cpu(i) { 8575 for_each_possible_cpu(i) {
8554 if (tg->rt_rq) 8576 if (tg->rt_rq)
@@ -8589,6 +8611,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8589 if (!rt_se) 8611 if (!rt_se)
8590 goto err_free_rq; 8612 goto err_free_rq;
8591 8613
8614 init_rt_rq(rt_rq, cpu_rq(i));
8615 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8592 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 8616 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8593 } 8617 }
8594 8618
@@ -9067,6 +9091,20 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9067} 9091}
9068 9092
9069static int 9093static int
9094cpu_cgroup_allow_attach(struct cgroup *cgrp, struct task_struct *tsk)
9095{
9096 const struct cred *cred = current_cred(), *tcred;
9097
9098 tcred = __task_cred(tsk);
9099
9100 if ((current != tsk) && !capable(CAP_SYS_NICE) &&
9101 cred->euid != tcred->uid && cred->euid != tcred->suid)
9102 return -EACCES;
9103
9104 return 0;
9105}
9106
9107static int
9070cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 9108cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9071{ 9109{
9072#ifdef CONFIG_RT_GROUP_SCHED 9110#ifdef CONFIG_RT_GROUP_SCHED
@@ -9171,6 +9209,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9171 .name = "cpu", 9209 .name = "cpu",
9172 .create = cpu_cgroup_create, 9210 .create = cpu_cgroup_create,
9173 .destroy = cpu_cgroup_destroy, 9211 .destroy = cpu_cgroup_destroy,
9212 .allow_attach = cpu_cgroup_allow_attach,
9174 .can_attach_task = cpu_cgroup_can_attach_task, 9213 .can_attach_task = cpu_cgroup_can_attach_task,
9175 .attach_task = cpu_cgroup_attach_task, 9214 .attach_task = cpu_cgroup_attach_task,
9176 .exit = cpu_cgroup_exit, 9215 .exit = cpu_cgroup_exit,
@@ -9197,8 +9236,30 @@ struct cpuacct {
9197 u64 __percpu *cpuusage; 9236 u64 __percpu *cpuusage;
9198 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 9237 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9199 struct cpuacct *parent; 9238 struct cpuacct *parent;
9239 struct cpuacct_charge_calls *cpufreq_fn;
9240 void *cpuacct_data;
9200}; 9241};
9201 9242
9243static struct cpuacct *cpuacct_root;
9244
9245/* Default calls for cpufreq accounting */
9246static struct cpuacct_charge_calls *cpuacct_cpufreq;
9247int cpuacct_register_cpufreq(struct cpuacct_charge_calls *fn)
9248{
9249 cpuacct_cpufreq = fn;
9250
9251 /*
9252 * Root node is created before platform can register callbacks,
9253 * initialize here.
9254 */
9255 if (cpuacct_root && fn) {
9256 cpuacct_root->cpufreq_fn = fn;
9257 if (fn->init)
9258 fn->init(&cpuacct_root->cpuacct_data);
9259 }
9260 return 0;
9261}
9262
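A hypothetical platform hook-up, for illustration: how board code might register its accounting callbacks through the new cpuacct_register_cpufreq() interface. The layout of struct cpuacct_charge_calls (init/charge plus the optional show/power hooks) is inferred from the calls above and presumably declared in <linux/cpuacct.h>, which this patch starts including from sched.c; the my_* names and GFP usage are made up.

#include <linux/cpuacct.h>
#include <linux/slab.h>
#include <linux/init.h>

static void my_acct_init(void **data)
{
	*data = kzalloc(sizeof(u64), GFP_KERNEL);	/* per-cgroup scratch space */
}

static void my_acct_charge(void *data, u64 cputime, unsigned int cpu)
{
	if (data)
		*(u64 *)data += cputime;	/* a real hook would weight this by the current cpufreq */
}

static struct cpuacct_charge_calls my_acct_calls = {
	.init	= my_acct_init,
	.charge	= my_acct_charge,
	/* .cpufreq_show and .power_usage left NULL; the callers above treat them as optional */
};

static int __init my_acct_setup(void)
{
	return cpuacct_register_cpufreq(&my_acct_calls);
}
late_initcall(my_acct_setup);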
9202struct cgroup_subsys cpuacct_subsys; 9263struct cgroup_subsys cpuacct_subsys;
9203 9264
9204/* return cpu accounting group corresponding to this container */ 9265/* return cpu accounting group corresponding to this container */
@@ -9233,8 +9294,16 @@ static struct cgroup_subsys_state *cpuacct_create(
9233 if (percpu_counter_init(&ca->cpustat[i], 0)) 9294 if (percpu_counter_init(&ca->cpustat[i], 0))
9234 goto out_free_counters; 9295 goto out_free_counters;
9235 9296
9297 ca->cpufreq_fn = cpuacct_cpufreq;
9298
9299 /* If available, have platform code initialize cpu frequency table */
9300 if (ca->cpufreq_fn && ca->cpufreq_fn->init)
9301 ca->cpufreq_fn->init(&ca->cpuacct_data);
9302
9236 if (cgrp->parent) 9303 if (cgrp->parent)
9237 ca->parent = cgroup_ca(cgrp->parent); 9304 ca->parent = cgroup_ca(cgrp->parent);
9305 else
9306 cpuacct_root = ca;
9238 9307
9239 return &ca->css; 9308 return &ca->css;
9240 9309
@@ -9362,6 +9431,32 @@ static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9362 return 0; 9431 return 0;
9363} 9432}
9364 9433
9434static int cpuacct_cpufreq_show(struct cgroup *cgrp, struct cftype *cft,
9435 struct cgroup_map_cb *cb)
9436{
9437 struct cpuacct *ca = cgroup_ca(cgrp);
9438 if (ca->cpufreq_fn && ca->cpufreq_fn->cpufreq_show)
9439 ca->cpufreq_fn->cpufreq_show(ca->cpuacct_data, cb);
9440
9441 return 0;
9442}
9443
9444/* return total cpu power usage (milliWatt second) of a group */
9445static u64 cpuacct_powerusage_read(struct cgroup *cgrp, struct cftype *cft)
9446{
9447 int i;
9448 struct cpuacct *ca = cgroup_ca(cgrp);
9449 u64 totalpower = 0;
9450
9451 if (ca->cpufreq_fn && ca->cpufreq_fn->power_usage)
9452 for_each_present_cpu(i) {
9453 totalpower += ca->cpufreq_fn->power_usage(
9454 ca->cpuacct_data);
9455 }
9456
9457 return totalpower;
9458}
9459
9365static struct cftype files[] = { 9460static struct cftype files[] = {
9366 { 9461 {
9367 .name = "usage", 9462 .name = "usage",
@@ -9376,6 +9471,14 @@ static struct cftype files[] = {
9376 .name = "stat", 9471 .name = "stat",
9377 .read_map = cpuacct_stats_show, 9472 .read_map = cpuacct_stats_show,
9378 }, 9473 },
9474 {
9475 .name = "cpufreq",
9476 .read_map = cpuacct_cpufreq_show,
9477 },
9478 {
9479 .name = "power",
9480 .read_u64 = cpuacct_powerusage_read
9481 },
9379}; 9482};
9380 9483
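For illustration, a small userspace reader for the new per-cgroup "cpuacct.power" file exported above (the cgroup mount point is an assumption; it depends on where the cpuacct controller is mounted):

#include <stdio.h>

int main(void)
{
	/* assumed mount point for the cpuacct controller */
	FILE *f = fopen("/sys/fs/cgroup/cpuacct/cpuacct.power", "r");
	unsigned long long mws;

	if (!f)
		return 1;
	if (fscanf(f, "%llu", &mws) == 1)
		printf("total power usage: %llu mW*s\n", mws);
	fclose(f);
	return 0;
}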
9381static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9484static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9405,6 +9508,10 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9405 for (; ca; ca = ca->parent) { 9508 for (; ca; ca = ca->parent) {
9406 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 9509 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9407 *cpuusage += cputime; 9510 *cpuusage += cputime;
9511
9512 /* Call back into platform code to account for CPU speeds */
9513 if (ca->cpufreq_fn && ca->cpufreq_fn->charge)
9514 ca->cpufreq_fn->charge(ca->cpuacct_data, cputime, cpu);
9408 } 9515 }
9409 9516
9410 rcu_read_unlock(); 9517 rcu_read_unlock();
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 05577055cfc..c2f0e7248dc 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -13,6 +13,7 @@ struct autogroup {
13 int nice; 13 int nice;
14}; 14};
15 15
16static inline bool task_group_is_autogroup(struct task_group *tg);
16static inline struct task_group * 17static inline struct task_group *
17autogroup_task_group(struct task_struct *p, struct task_group *tg); 18autogroup_task_group(struct task_struct *p, struct task_group *tg);
18 19
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 334eb474af9..22999b257ad 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
135 return grp->my_q; 135 return grp->my_q;
136} 136}
137 137
138/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
139 * another cpu ('this_cpu')
140 */
141static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
142{
143 return cfs_rq->tg->cfs_rq[this_cpu];
144}
145
146static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 138static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
147{ 139{
148 if (!cfs_rq->on_list) { 140 if (!cfs_rq->on_list) {
@@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
271 return NULL; 263 return NULL;
272} 264}
273 265
274static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
275{
276 return &cpu_rq(this_cpu)->cfs;
277}
278
279static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 266static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
280{ 267{
281} 268}
@@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a,
334 return (s64)(a->vruntime - b->vruntime) < 0; 321 return (s64)(a->vruntime - b->vruntime) < 0;
335} 322}
336 323
337static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
338{
339 return se->vruntime - cfs_rq->min_vruntime;
340}
341
342static void update_min_vruntime(struct cfs_rq *cfs_rq) 324static void update_min_vruntime(struct cfs_rq *cfs_rq)
343{ 325{
344 u64 vruntime = cfs_rq->min_vruntime; 326 u64 vruntime = cfs_rq->min_vruntime;
@@ -372,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
372 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 354 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
373 struct rb_node *parent = NULL; 355 struct rb_node *parent = NULL;
374 struct sched_entity *entry; 356 struct sched_entity *entry;
375 s64 key = entity_key(cfs_rq, se);
376 int leftmost = 1; 357 int leftmost = 1;
377 358
378 /* 359 /*
@@ -385,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
385 * We dont care about collisions. Nodes with 366 * We dont care about collisions. Nodes with
386 * the same key stay together. 367 * the same key stay together.
387 */ 368 */
388 if (key < entity_key(cfs_rq, entry)) { 369 if (entity_before(se, entry)) {
389 link = &parent->rb_left; 370 link = &parent->rb_left;
390 } else { 371 } else {
391 link = &parent->rb_right; 372 link = &parent->rb_right;
@@ -1336,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1336 } 1317 }
1337 1318
1338 for_each_sched_entity(se) { 1319 for_each_sched_entity(se) {
1339 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1320 cfs_rq = cfs_rq_of(se);
1340 1321
1341 update_cfs_load(cfs_rq, 0); 1322 update_cfs_load(cfs_rq, 0);
1342 update_cfs_shares(cfs_rq); 1323 update_cfs_shares(cfs_rq);
@@ -1370,13 +1351,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1370 */ 1351 */
1371 if (task_sleep && parent_entity(se)) 1352 if (task_sleep && parent_entity(se))
1372 set_next_buddy(parent_entity(se)); 1353 set_next_buddy(parent_entity(se));
1354
1355 /* avoid re-evaluating load for this entity */
1356 se = parent_entity(se);
1373 break; 1357 break;
1374 } 1358 }
1375 flags |= DEQUEUE_SLEEP; 1359 flags |= DEQUEUE_SLEEP;
1376 } 1360 }
1377 1361
1378 for_each_sched_entity(se) { 1362 for_each_sched_entity(se) {
1379 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1363 cfs_rq = cfs_rq_of(se);
1380 1364
1381 update_cfs_load(cfs_rq, 0); 1365 update_cfs_load(cfs_rq, 0);
1382 update_cfs_shares(cfs_rq); 1366 update_cfs_shares(cfs_rq);
@@ -1481,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1481 * effect of the currently running task from the load 1465 * effect of the currently running task from the load
1482 * of the current CPU: 1466 * of the current CPU:
1483 */ 1467 */
1484 rcu_read_lock();
1485 if (sync) { 1468 if (sync) {
1486 tg = task_group(current); 1469 tg = task_group(current);
1487 weight = current->se.load.weight; 1470 weight = current->se.load.weight;
@@ -1517,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1517 balanced = this_eff_load <= prev_eff_load; 1500 balanced = this_eff_load <= prev_eff_load;
1518 } else 1501 } else
1519 balanced = true; 1502 balanced = true;
1520 rcu_read_unlock();
1521 1503
1522 /* 1504 /*
1523 * If the currently running task will sleep within 1505 * If the currently running task will sleep within
@@ -1924,8 +1906,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1924 if (!sched_feat(WAKEUP_PREEMPT)) 1906 if (!sched_feat(WAKEUP_PREEMPT))
1925 return; 1907 return;
1926 1908
1927 update_curr(cfs_rq);
1928 find_matching_se(&se, &pse); 1909 find_matching_se(&se, &pse);
1910 update_curr(cfs_rq_of(se));
1929 BUG_ON(!pse); 1911 BUG_ON(!pse);
1930 if (wakeup_preempt_entity(se, pse) == 1) { 1912 if (wakeup_preempt_entity(se, pse) == 1) {
1931 /* 1913 /*
@@ -2234,11 +2216,43 @@ static void update_shares(int cpu)
2234 struct rq *rq = cpu_rq(cpu); 2216 struct rq *rq = cpu_rq(cpu);
2235 2217
2236 rcu_read_lock(); 2218 rcu_read_lock();
2219 /*
2220 * Iterates the task_group tree in a bottom up fashion, see
2221 * list_add_leaf_cfs_rq() for details.
2222 */
2237 for_each_leaf_cfs_rq(rq, cfs_rq) 2223 for_each_leaf_cfs_rq(rq, cfs_rq)
2238 update_shares_cpu(cfs_rq->tg, cpu); 2224 update_shares_cpu(cfs_rq->tg, cpu);
2239 rcu_read_unlock(); 2225 rcu_read_unlock();
2240} 2226}
2241 2227
2228/*
2229 * Compute the cpu's hierarchical load factor for each task group.
2230 * This needs to be done in a top-down fashion because the load of a child
2232 * group is a fraction of its parent's load.
2232 */
2233static int tg_load_down(struct task_group *tg, void *data)
2234{
2235 unsigned long load;
2236 long cpu = (long)data;
2237
2238 if (!tg->parent) {
2239 load = cpu_rq(cpu)->load.weight;
2240 } else {
2241 load = tg->parent->cfs_rq[cpu]->h_load;
2242 load *= tg->se[cpu]->load.weight;
2243 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
2244 }
2245
2246 tg->cfs_rq[cpu]->h_load = load;
2247
2248 return 0;
2249}
2250
2251static void update_h_load(long cpu)
2252{
2253 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
2254}
2255
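For a concrete feel of the top-down factor computed by tg_load_down(), here is a small standalone calculation (plain C, not kernel code) with made-up weights: a child group's h_load is the parent's h_load scaled by the group entity's share of the parent runqueue's weight, with the "+ 1" guarding against a zero divisor.

#include <stdio.h>

int main(void)
{
	unsigned long root_rq_weight = 3072;	/* cpu_rq(cpu)->load.weight (assumed) */
	unsigned long tg_se_weight = 1024;	/* tg->se[cpu]->load.weight (assumed) */

	unsigned long root_h_load = root_rq_weight;	/* the !tg->parent case */
	unsigned long child_h_load =
		root_h_load * tg_se_weight / (root_rq_weight + 1);

	printf("root h_load = %lu, child h_load = %lu\n",
	       root_h_load, child_h_load);
	return 0;
}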
2242static unsigned long 2256static unsigned long
2243load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2257load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2244 unsigned long max_load_move, 2258 unsigned long max_load_move,
@@ -2246,14 +2260,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2246 int *all_pinned) 2260 int *all_pinned)
2247{ 2261{
2248 long rem_load_move = max_load_move; 2262 long rem_load_move = max_load_move;
2249 int busiest_cpu = cpu_of(busiest); 2263 struct cfs_rq *busiest_cfs_rq;
2250 struct task_group *tg;
2251 2264
2252 rcu_read_lock(); 2265 rcu_read_lock();
2253 update_h_load(busiest_cpu); 2266 update_h_load(cpu_of(busiest));
2254 2267
2255 list_for_each_entry_rcu(tg, &task_groups, list) { 2268 for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
2256 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
2257 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 2269 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
2258 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 2270 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
2259 u64 rem_load, moved_load; 2271 u64 rem_load, moved_load;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1e7066d76c2..2e74677cb04 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,9 +61,9 @@ SCHED_FEAT(LB_BIAS, 1)
61SCHED_FEAT(OWNER_SPIN, 1) 61SCHED_FEAT(OWNER_SPIN, 1)
62 62
63/* 63/*
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on time not spent running tasks
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONTASK_POWER, 1)
67 67
68/* 68/*
69 * Queue remote wakeups on the target CPU and process them 69 * Queue remote wakeups on the target CPU and process them
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index db04161fe37..b827550a0d0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -187,11 +187,23 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
187 187
188typedef struct task_group *rt_rq_iter_t; 188typedef struct task_group *rt_rq_iter_t;
189 189
190#define for_each_rt_rq(rt_rq, iter, rq) \ 190static inline struct task_group *next_task_group(struct task_group *tg)
191 for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ 191{
192 (&iter->list != &task_groups) && \ 192 do {
193 (rt_rq = iter->rt_rq[cpu_of(rq)]); \ 193 tg = list_entry_rcu(tg->list.next,
194 iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) 194 typeof(struct task_group), list);
195 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
196
197 if (&tg->list == &task_groups)
198 tg = NULL;
199
200 return tg;
201}
202
203#define for_each_rt_rq(rt_rq, iter, rq) \
204 for (iter = container_of(&task_groups, typeof(*iter), list); \
205 (iter = next_task_group(iter)) && \
206 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
195 207
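A compact userspace analogue (assumptions only) of the reworked iterator: start from the list head, advance with a "next" helper that skips entries failing a predicate (autogroups in the kernel case), and stop once the walk wraps back around to the head.

#include <stdio.h>
#include <stddef.h>

struct node {
	struct node *next;
	int is_autogroup;	/* stands in for task_group_is_autogroup() */
	int id;
};

/* advance to the next node that is not an autogroup, or NULL at the end */
static struct node *next_node(struct node *head, struct node *n)
{
	do {
		n = n->next;
	} while (n != head && n->is_autogroup);

	return n == head ? NULL : n;
}

#define for_each_node(n, head) \
	for ((n) = (head); ((n) = next_node((head), (n)));)

int main(void)
{
	struct node head = { 0 };
	struct node a = { .id = 1 }, b = { .id = 2, .is_autogroup = 1 }, c = { .id = 3 };
	struct node *n;

	head.next = &a; a.next = &b; b.next = &c; c.next = &head;	/* circular list */

	for_each_node(n, &head)
		printf("visit %d\n", n->id);	/* prints 1 then 3, skipping the autogroup */
	return 0;
}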
196static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 208static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
197{ 209{
@@ -1045,7 +1057,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1045 */ 1057 */
1046 if (curr && unlikely(rt_task(curr)) && 1058 if (curr && unlikely(rt_task(curr)) &&
1047 (curr->rt.nr_cpus_allowed < 2 || 1059 (curr->rt.nr_cpus_allowed < 2 ||
1048 curr->prio < p->prio) && 1060 curr->prio <= p->prio) &&
1049 (p->rt.nr_cpus_allowed > 1)) { 1061 (p->rt.nr_cpus_allowed > 1)) {
1050 int target = find_lowest_rq(p); 1062 int target = find_lowest_rq(p);
1051 1063
@@ -1133,7 +1145,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1133 1145
1134 rt_rq = &rq->rt; 1146 rt_rq = &rq->rt;
1135 1147
1136 if (unlikely(!rt_rq->rt_nr_running)) 1148 if (!rt_rq->rt_nr_running)
1137 return NULL; 1149 return NULL;
1138 1150
1139 if (rt_rq_throttled(rt_rq)) 1151 if (rt_rq_throttled(rt_rq))
@@ -1555,7 +1567,7 @@ skip:
1555static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) 1567static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1556{ 1568{
1557 /* Try to pull RT tasks here if we lower this rq's prio */ 1569 /* Try to pull RT tasks here if we lower this rq's prio */
1558 if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) 1570 if (rq->rt.highest_prio.curr > prev->prio)
1559 pull_rt_task(rq); 1571 pull_rt_task(rq);
1560} 1572}
1561 1573
@@ -1576,7 +1588,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1576 p->rt.nr_cpus_allowed > 1 && 1588 p->rt.nr_cpus_allowed > 1 &&
1577 rt_task(rq->curr) && 1589 rt_task(rq->curr) &&
1578 (rq->curr->rt.nr_cpus_allowed < 2 || 1590 (rq->curr->rt.nr_cpus_allowed < 2 ||
1579 rq->curr->prio < p->prio)) 1591 rq->curr->prio <= p->prio))
1580 push_rt_tasks(rq); 1592 push_rt_tasks(rq);
1581} 1593}
1582 1594
diff --git a/kernel/signal.c b/kernel/signal.c
index 415d85d6f6c..195331c56ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
87 /* 87 /*
88 * Tracers may want to know about even ignored signals. 88 * Tracers may want to know about even ignored signals.
89 */ 89 */
90 return !tracehook_consider_ignored_signal(t, sig); 90 return !t->ptrace;
91} 91}
92 92
93/* 93/*
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
124 124
125static int recalc_sigpending_tsk(struct task_struct *t) 125static int recalc_sigpending_tsk(struct task_struct *t)
126{ 126{
127 if ((t->group_stop & GROUP_STOP_PENDING) || 127 if ((t->jobctl & JOBCTL_PENDING_MASK) ||
128 PENDING(&t->pending, &t->blocked) || 128 PENDING(&t->pending, &t->blocked) ||
129 PENDING(&t->signal->shared_pending, &t->blocked)) { 129 PENDING(&t->signal->shared_pending, &t->blocked)) {
130 set_tsk_thread_flag(t, TIF_SIGPENDING); 130 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
150 150
151void recalc_sigpending(void) 151void recalc_sigpending(void)
152{ 152{
153 if (unlikely(tracehook_force_sigpending())) 153 if (!recalc_sigpending_tsk(current) && !freezing(current))
154 set_thread_flag(TIF_SIGPENDING);
155 else if (!recalc_sigpending_tsk(current) && !freezing(current))
156 clear_thread_flag(TIF_SIGPENDING); 154 clear_thread_flag(TIF_SIGPENDING);
157 155
158} 156}
@@ -224,47 +222,93 @@ static inline void print_dropped_signal(int sig)
224} 222}
225 223
226/** 224/**
227 * task_clear_group_stop_trapping - clear group stop trapping bit 225 * task_set_jobctl_pending - set jobctl pending bits
228 * @task: target task 226 * @task: target task
227 * @mask: pending bits to set
229 * 228 *
230 * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it 229 * Set @mask bits in @task->jobctl. @mask must be subset of
231 * and wake up the ptracer. Note that we don't need any further locking. 230 * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
232 * @task->siglock guarantees that @task->parent points to the ptracer. 231 * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
232 * cleared. If @task is already being killed or exiting, this function
233 * becomes noop.
234 *
235 * CONTEXT:
236 * Must be called with @task->sighand->siglock held.
237 *
238 * RETURNS:
239 * %true if @mask is set, %false if made noop because @task was dying.
240 */
241bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
242{
243 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
244 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
245 BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
246
247 if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
248 return false;
249
250 if (mask & JOBCTL_STOP_SIGMASK)
251 task->jobctl &= ~JOBCTL_STOP_SIGMASK;
252
253 task->jobctl |= mask;
254 return true;
255}
256
257/**
258 * task_clear_jobctl_trapping - clear jobctl trapping bit
259 * @task: target task
260 *
261 * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
262 * Clear it and wake up the ptracer. Note that we don't need any further
263 * locking. @task->siglock guarantees that @task->parent points to the
264 * ptracer.
233 * 265 *
234 * CONTEXT: 266 * CONTEXT:
235 * Must be called with @task->sighand->siglock held. 267 * Must be called with @task->sighand->siglock held.
236 */ 268 */
237static void task_clear_group_stop_trapping(struct task_struct *task) 269void task_clear_jobctl_trapping(struct task_struct *task)
238{ 270{
239 if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { 271 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
240 task->group_stop &= ~GROUP_STOP_TRAPPING; 272 task->jobctl &= ~JOBCTL_TRAPPING;
241 __wake_up_sync_key(&task->parent->signal->wait_chldexit, 273 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
242 TASK_UNINTERRUPTIBLE, 1, task);
243 } 274 }
244} 275}
245 276
246/** 277/**
247 * task_clear_group_stop_pending - clear pending group stop 278 * task_clear_jobctl_pending - clear jobctl pending bits
248 * @task: target task 279 * @task: target task
280 * @mask: pending bits to clear
249 * 281 *
250 * Clear group stop states for @task. 282 * Clear @mask from @task->jobctl. @mask must be subset of
283 * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
284 * STOP bits are cleared together.
285 *
286 * If clearing of @mask leaves no stop or trap pending, this function calls
287 * task_clear_jobctl_trapping().
251 * 288 *
252 * CONTEXT: 289 * CONTEXT:
253 * Must be called with @task->sighand->siglock held. 290 * Must be called with @task->sighand->siglock held.
254 */ 291 */
255void task_clear_group_stop_pending(struct task_struct *task) 292void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
256{ 293{
257 task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | 294 BUG_ON(mask & ~JOBCTL_PENDING_MASK);
258 GROUP_STOP_DEQUEUED); 295
296 if (mask & JOBCTL_STOP_PENDING)
297 mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
298
299 task->jobctl &= ~mask;
300
301 if (!(task->jobctl & JOBCTL_PENDING_MASK))
302 task_clear_jobctl_trapping(task);
259} 303}
260 304
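A toy userspace model of the signo-replacement behaviour documented above (bit values assumed; the real JOBCTL_* constants live in include/linux/sched.h): the low bits of ->jobctl carry the pending stop signal number, so setting a new one clears the old signo first.

#include <stdio.h>

#define JOBCTL_STOP_SIGMASK	0xffffU		/* assumed: signr of the pending stop */
#define JOBCTL_STOP_PENDING	(1U << 17)	/* assumed flag bit */

static unsigned int set_jobctl_pending(unsigned int jobctl, unsigned int mask)
{
	if (mask & JOBCTL_STOP_SIGMASK)
		jobctl &= ~JOBCTL_STOP_SIGMASK;	/* drop any previously recorded signo */
	return jobctl | mask;
}

int main(void)
{
	unsigned int jobctl = 0;

	jobctl = set_jobctl_pending(jobctl, 20 | JOBCTL_STOP_PENDING);	/* SIGTSTP queued */
	jobctl = set_jobctl_pending(jobctl, 19 | JOBCTL_STOP_PENDING);	/* SIGSTOP replaces it */

	printf("recorded signo=%u, stop pending=%u\n",
	       jobctl & JOBCTL_STOP_SIGMASK, !!(jobctl & JOBCTL_STOP_PENDING));
	return 0;
}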
261/** 305/**
262 * task_participate_group_stop - participate in a group stop 306 * task_participate_group_stop - participate in a group stop
263 * @task: task participating in a group stop 307 * @task: task participating in a group stop
264 * 308 *
265 * @task has GROUP_STOP_PENDING set and is participating in a group stop. 309 * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
266 * Group stop states are cleared and the group stop count is consumed if 310 * Group stop states are cleared and the group stop count is consumed if
267 * %GROUP_STOP_CONSUME was set. If the consumption completes the group 311 * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
268 * stop, the appropriate %SIGNAL_* flags are set. 312 * stop, the appropriate %SIGNAL_* flags are set.
269 * 313 *
270 * CONTEXT: 314 * CONTEXT:
@@ -277,11 +321,11 @@ void task_clear_group_stop_pending(struct task_struct *task)
277static bool task_participate_group_stop(struct task_struct *task) 321static bool task_participate_group_stop(struct task_struct *task)
278{ 322{
279 struct signal_struct *sig = task->signal; 323 struct signal_struct *sig = task->signal;
280 bool consume = task->group_stop & GROUP_STOP_CONSUME; 324 bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
281 325
282 WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); 326 WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
283 327
284 task_clear_group_stop_pending(task); 328 task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
285 329
286 if (!consume) 330 if (!consume)
287 return false; 331 return false;
@@ -449,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
449 return 1; 493 return 1;
450 if (handler != SIG_IGN && handler != SIG_DFL) 494 if (handler != SIG_IGN && handler != SIG_DFL)
451 return 0; 495 return 0;
452 return !tracehook_consider_fatal_signal(tsk, sig); 496 /* if ptraced, let the tracer determine */
497 return !tsk->ptrace;
453} 498}
454 499
455/* 500/*
@@ -604,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
604 * is to alert stop-signal processing code when another 649 * is to alert stop-signal processing code when another
605 * processor has come along and cleared the flag. 650 * processor has come along and cleared the flag.
606 */ 651 */
607 current->group_stop |= GROUP_STOP_DEQUEUED; 652 current->jobctl |= JOBCTL_STOP_DEQUEUED;
608 } 653 }
609 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 654 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
610 /* 655 /*
@@ -773,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info,
773 return security_task_kill(t, info, sig, 0); 818 return security_task_kill(t, info, sig, 0);
774} 819}
775 820
821/**
822 * ptrace_trap_notify - schedule trap to notify ptracer
823 * @t: tracee wanting to notify tracer
824 *
825 * This function schedules sticky ptrace trap which is cleared on the next
826 * TRAP_STOP to notify ptracer of an event. @t must have been seized by
827 * ptracer.
828 *
829 * If @t is running, STOP trap will be taken. If trapped for STOP and
830 * ptracer is listening for events, tracee is woken up so that it can
831 * re-trap for the new event. If trapped otherwise, STOP trap will be
832 * eventually taken without returning to userland after the existing traps
833 * are finished by PTRACE_CONT.
834 *
835 * CONTEXT:
836 * Must be called with @task->sighand->siglock held.
837 */
838static void ptrace_trap_notify(struct task_struct *t)
839{
840 WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
841 assert_spin_locked(&t->sighand->siglock);
842
843 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
844 signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
845}
846
776/* 847/*
777 * Handle magic process-wide effects of stop/continue signals. Unlike 848 * Handle magic process-wide effects of stop/continue signals. Unlike
778 * the signal actions, these happen immediately at signal-generation 849 * the signal actions, these happen immediately at signal-generation
@@ -809,9 +880,12 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
809 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 880 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
810 t = p; 881 t = p;
811 do { 882 do {
812 task_clear_group_stop_pending(t); 883 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
813 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 884 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
814 wake_up_state(t, __TASK_STOPPED); 885 if (likely(!(t->ptrace & PT_SEIZED)))
886 wake_up_state(t, __TASK_STOPPED);
887 else
888 ptrace_trap_notify(t);
815 } while_each_thread(p, t); 889 } while_each_thread(p, t);
816 890
817 /* 891 /*
@@ -908,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
908 if (sig_fatal(p, sig) && 982 if (sig_fatal(p, sig) &&
909 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 983 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
910 !sigismember(&t->real_blocked, sig) && 984 !sigismember(&t->real_blocked, sig) &&
911 (sig == SIGKILL || 985 (sig == SIGKILL || !t->ptrace)) {
912 !tracehook_consider_fatal_signal(t, sig))) {
913 /* 986 /*
914 * This signal will be fatal to the whole group. 987 * This signal will be fatal to the whole group.
915 */ 988 */
@@ -925,7 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
925 signal->group_stop_count = 0; 998 signal->group_stop_count = 0;
926 t = p; 999 t = p;
927 do { 1000 do {
928 task_clear_group_stop_pending(t); 1001 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
929 sigaddset(&t->pending.signal, SIGKILL); 1002 sigaddset(&t->pending.signal, SIGKILL);
930 signal_wake_up(t, 1); 1003 signal_wake_up(t, 1);
931 } while_each_thread(p, t); 1004 } while_each_thread(p, t);
@@ -1160,7 +1233,7 @@ int zap_other_threads(struct task_struct *p)
1160 p->signal->group_stop_count = 0; 1233 p->signal->group_stop_count = 0;
1161 1234
1162 while_each_thread(p, t) { 1235 while_each_thread(p, t) {
1163 task_clear_group_stop_pending(t); 1236 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
1164 count++; 1237 count++;
1165 1238
1166 /* Don't bother with already dead threads */ 1239 /* Don't bother with already dead threads */
@@ -1511,22 +1584,22 @@ ret:
1511 * Let a parent know about the death of a child. 1584 * Let a parent know about the death of a child.
1512 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1585 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1513 * 1586 *
1514 * Returns -1 if our parent ignored us and so we've switched to 1587 * Returns true if our parent ignored us and so we've switched to
1515 * self-reaping, or else @sig. 1588 * self-reaping.
1516 */ 1589 */
1517int do_notify_parent(struct task_struct *tsk, int sig) 1590bool do_notify_parent(struct task_struct *tsk, int sig)
1518{ 1591{
1519 struct siginfo info; 1592 struct siginfo info;
1520 unsigned long flags; 1593 unsigned long flags;
1521 struct sighand_struct *psig; 1594 struct sighand_struct *psig;
1522 int ret = sig; 1595 bool autoreap = false;
1523 1596
1524 BUG_ON(sig == -1); 1597 BUG_ON(sig == -1);
1525 1598
1526 /* do_notify_parent_cldstop should have been called instead. */ 1599 /* do_notify_parent_cldstop should have been called instead. */
1527 BUG_ON(task_is_stopped_or_traced(tsk)); 1600 BUG_ON(task_is_stopped_or_traced(tsk));
1528 1601
1529 BUG_ON(!task_ptrace(tsk) && 1602 BUG_ON(!tsk->ptrace &&
1530 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1603 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1531 1604
1532 info.si_signo = sig; 1605 info.si_signo = sig;
@@ -1565,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1565 1638
1566 psig = tsk->parent->sighand; 1639 psig = tsk->parent->sighand;
1567 spin_lock_irqsave(&psig->siglock, flags); 1640 spin_lock_irqsave(&psig->siglock, flags);
1568 if (!task_ptrace(tsk) && sig == SIGCHLD && 1641 if (!tsk->ptrace && sig == SIGCHLD &&
1569 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1642 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1570 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1643 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1571 /* 1644 /*
@@ -1583,16 +1656,16 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1583 * is implementation-defined: we do (if you don't want 1656 * is implementation-defined: we do (if you don't want
1584 * it, just use SIG_IGN instead). 1657 * it, just use SIG_IGN instead).
1585 */ 1658 */
1586 ret = tsk->exit_signal = -1; 1659 autoreap = true;
1587 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) 1660 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1588 sig = -1; 1661 sig = 0;
1589 } 1662 }
1590 if (valid_signal(sig) && sig > 0) 1663 if (valid_signal(sig) && sig)
1591 __group_send_sig_info(sig, &info, tsk->parent); 1664 __group_send_sig_info(sig, &info, tsk->parent);
1592 __wake_up_parent(tsk, tsk->parent); 1665 __wake_up_parent(tsk, tsk->parent);
1593 spin_unlock_irqrestore(&psig->siglock, flags); 1666 spin_unlock_irqrestore(&psig->siglock, flags);
1594 1667
1595 return ret; 1668 return autoreap;
1596} 1669}
1597 1670
1598/** 1671/**
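The conversion from an int return to a bool changes what callers of do_notify_parent() act on. A minimal caller sketch under the new convention (illustrative of the exit path, not quoted from this diff):

	/* Decide between self-reaping and leaving a zombie based on the
	 * boolean result; release_task() immediately when nobody waits. */
	bool autoreap = do_notify_parent(tsk, tsk->exit_signal);

	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
	if (autoreap)
		release_task(tsk);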
@@ -1665,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1665 1738
1666static inline int may_ptrace_stop(void) 1739static inline int may_ptrace_stop(void)
1667{ 1740{
1668 if (!likely(task_ptrace(current))) 1741 if (!likely(current->ptrace))
1669 return 0; 1742 return 0;
1670 /* 1743 /*
1671 * Are we in the middle of do_coredump? 1744 * Are we in the middle of do_coredump?
@@ -1694,15 +1767,6 @@ static int sigkill_pending(struct task_struct *tsk)
1694} 1767}
1695 1768
1696/* 1769/*
1697 * Test whether the target task of the usual cldstop notification - the
1698 * real_parent of @child - is in the same group as the ptracer.
1699 */
1700static bool real_parent_is_ptracer(struct task_struct *child)
1701{
1702 return same_thread_group(child->parent, child->real_parent);
1703}
1704
1705/*
1706 * This must be called with current->sighand->siglock held. 1770 * This must be called with current->sighand->siglock held.
1707 * 1771 *
1708 * This should be the path for all ptrace stops. 1772 * This should be the path for all ptrace stops.
@@ -1739,31 +1803,34 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1739 } 1803 }
1740 1804
1741 /* 1805 /*
1742 * If @why is CLD_STOPPED, we're trapping to participate in a group 1806 * We're committing to trapping. TRACED should be visible before
 1743 * stop. Do the bookkeeping. Note that if SIGCONT was delivered 1807 * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
1744 * while siglock was released for the arch hook, PENDING could be 1808 * Also, transition to TRACED and updates to ->jobctl should be
1745 * clear now. We act as if SIGCONT is received after TASK_TRACED 1809 * atomic with respect to siglock and should be done after the arch
1746 * is entered - ignore it. 1810 * hook as siglock is released and regrabbed across it.
1747 */ 1811 */
1748 if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) 1812 set_current_state(TASK_TRACED);
1749 gstop_done = task_participate_group_stop(current);
1750 1813
1751 current->last_siginfo = info; 1814 current->last_siginfo = info;
1752 current->exit_code = exit_code; 1815 current->exit_code = exit_code;
1753 1816
1754 /* 1817 /*
1755 * TRACED should be visible before TRAPPING is cleared; otherwise, 1818 * If @why is CLD_STOPPED, we're trapping to participate in a group
1756 * the tracer might fail do_wait(). 1819 * stop. Do the bookkeeping. Note that if SIGCONT was delievered
1820 * across siglock relocks since INTERRUPT was scheduled, PENDING
1821 * could be clear now. We act as if SIGCONT is received after
1822 * TASK_TRACED is entered - ignore it.
1757 */ 1823 */
1758 set_current_state(TASK_TRACED); 1824 if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
1825 gstop_done = task_participate_group_stop(current);
1759 1826
1760 /* 1827 /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
1761 * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and 1828 task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
1762 * transition to TASK_TRACED should be atomic with respect to 1829 if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
 1763 * siglock. This should be done after the arch hook as siglock is 1830 task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
1764 * released and regrabbed across it. 1831
1765 */ 1832 /* entering a trap, clear TRAPPING */
1766 task_clear_group_stop_trapping(current); 1833 task_clear_jobctl_trapping(current);
1767 1834
1768 spin_unlock_irq(&current->sighand->siglock); 1835 spin_unlock_irq(&current->sighand->siglock);
1769 read_lock(&tasklist_lock); 1836 read_lock(&tasklist_lock);
@@ -1779,7 +1846,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1779 * separately unless they're gonna be duplicates. 1846 * separately unless they're gonna be duplicates.
1780 */ 1847 */
1781 do_notify_parent_cldstop(current, true, why); 1848 do_notify_parent_cldstop(current, true, why);
1782 if (gstop_done && !real_parent_is_ptracer(current)) 1849 if (gstop_done && ptrace_reparented(current))
1783 do_notify_parent_cldstop(current, false, why); 1850 do_notify_parent_cldstop(current, false, why);
1784 1851
1785 /* 1852 /*
@@ -1799,9 +1866,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1799 * 1866 *
1800 * If @gstop_done, the ptracer went away between group stop 1867 * If @gstop_done, the ptracer went away between group stop
1801 * completion and here. During detach, it would have set 1868 * completion and here. During detach, it would have set
1802 * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED 1869 * JOBCTL_STOP_PENDING on us and we'll re-enter
1803 * in do_signal_stop() on return, so notifying the real 1870 * TASK_STOPPED in do_signal_stop() on return, so notifying
1804 * parent of the group stop completion is enough. 1871 * the real parent of the group stop completion is enough.
1805 */ 1872 */
1806 if (gstop_done) 1873 if (gstop_done)
1807 do_notify_parent_cldstop(current, false, why); 1874 do_notify_parent_cldstop(current, false, why);
@@ -1827,6 +1894,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1827 spin_lock_irq(&current->sighand->siglock); 1894 spin_lock_irq(&current->sighand->siglock);
1828 current->last_siginfo = NULL; 1895 current->last_siginfo = NULL;
1829 1896
1897 /* LISTENING can be set only during STOP traps, clear it */
1898 current->jobctl &= ~JOBCTL_LISTENING;
1899
1830 /* 1900 /*
1831 * Queued signals ignored us while we were stopped for tracing. 1901 * Queued signals ignored us while we were stopped for tracing.
1832 * So check for any that we should take before resuming user mode. 1902 * So check for any that we should take before resuming user mode.
@@ -1835,44 +1905,66 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1835 recalc_sigpending_tsk(current); 1905 recalc_sigpending_tsk(current);
1836} 1906}
1837 1907
1838void ptrace_notify(int exit_code) 1908static void ptrace_do_notify(int signr, int exit_code, int why)
1839{ 1909{
1840 siginfo_t info; 1910 siginfo_t info;
1841 1911
1842 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1843
1844 memset(&info, 0, sizeof info); 1912 memset(&info, 0, sizeof info);
1845 info.si_signo = SIGTRAP; 1913 info.si_signo = signr;
1846 info.si_code = exit_code; 1914 info.si_code = exit_code;
1847 info.si_pid = task_pid_vnr(current); 1915 info.si_pid = task_pid_vnr(current);
1848 info.si_uid = current_uid(); 1916 info.si_uid = current_uid();
1849 1917
1850 /* Let the debugger run. */ 1918 /* Let the debugger run. */
1919 ptrace_stop(exit_code, why, 1, &info);
1920}
1921
1922void ptrace_notify(int exit_code)
1923{
1924 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1925
1851 spin_lock_irq(&current->sighand->siglock); 1926 spin_lock_irq(&current->sighand->siglock);
1852 ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); 1927 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
1853 spin_unlock_irq(&current->sighand->siglock); 1928 spin_unlock_irq(&current->sighand->siglock);
1854} 1929}
1855 1930
1856/* 1931/**
1857 * This performs the stopping for SIGSTOP and other stop signals. 1932 * do_signal_stop - handle group stop for SIGSTOP and other stop signals
1858 * We have to stop all threads in the thread group. 1933 * @signr: signr causing group stop if initiating
1859 * Returns non-zero if we've actually stopped and released the siglock. 1934 *
1860 * Returns zero if we didn't stop and still hold the siglock. 1935 * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
1936 * and participate in it. If already set, participate in the existing
1937 * group stop. If participated in a group stop (and thus slept), %true is
1938 * returned with siglock released.
1939 *
1940 * If ptraced, this function doesn't handle stop itself. Instead,
1941 * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
1942 * untouched. The caller must ensure that INTERRUPT trap handling takes
 1943 * place afterwards.
1944 *
1945 * CONTEXT:
1946 * Must be called with @current->sighand->siglock held, which is released
1947 * on %true return.
1948 *
1949 * RETURNS:
1950 * %false if group stop is already cancelled or ptrace trap is scheduled.
1951 * %true if participated in group stop.
1861 */ 1952 */
1862static int do_signal_stop(int signr) 1953static bool do_signal_stop(int signr)
1954 __releases(&current->sighand->siglock)
1863{ 1955{
1864 struct signal_struct *sig = current->signal; 1956 struct signal_struct *sig = current->signal;
1865 1957
1866 if (!(current->group_stop & GROUP_STOP_PENDING)) { 1958 if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
1867 unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; 1959 unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
1868 struct task_struct *t; 1960 struct task_struct *t;
1869 1961
1870 /* signr will be recorded in task->group_stop for retries */ 1962 /* signr will be recorded in task->jobctl for retries */
1871 WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); 1963 WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
1872 1964
1873 if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || 1965 if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
1874 unlikely(signal_group_exit(sig))) 1966 unlikely(signal_group_exit(sig)))
1875 return 0; 1967 return false;
1876 /* 1968 /*
1877 * There is no group stop already in progress. We must 1969 * There is no group stop already in progress. We must
1878 * initiate one now. 1970 * initiate one now.
@@ -1894,29 +1986,31 @@ static int do_signal_stop(int signr)
1894 */ 1986 */
1895 if (!(sig->flags & SIGNAL_STOP_STOPPED)) 1987 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1896 sig->group_exit_code = signr; 1988 sig->group_exit_code = signr;
1897 else
1898 WARN_ON_ONCE(!task_ptrace(current));
1899 1989
1900 current->group_stop &= ~GROUP_STOP_SIGMASK; 1990 sig->group_stop_count = 0;
1901 current->group_stop |= signr | gstop; 1991
1902 sig->group_stop_count = 1; 1992 if (task_set_jobctl_pending(current, signr | gstop))
1993 sig->group_stop_count++;
1994
1903 for (t = next_thread(current); t != current; 1995 for (t = next_thread(current); t != current;
1904 t = next_thread(t)) { 1996 t = next_thread(t)) {
1905 t->group_stop &= ~GROUP_STOP_SIGMASK;
1906 /* 1997 /*
1907 * Setting state to TASK_STOPPED for a group 1998 * Setting state to TASK_STOPPED for a group
1908 * stop is always done with the siglock held, 1999 * stop is always done with the siglock held,
1909 * so this check has no races. 2000 * so this check has no races.
1910 */ 2001 */
1911 if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { 2002 if (!task_is_stopped(t) &&
1912 t->group_stop |= signr | gstop; 2003 task_set_jobctl_pending(t, signr | gstop)) {
1913 sig->group_stop_count++; 2004 sig->group_stop_count++;
1914 signal_wake_up(t, 0); 2005 if (likely(!(t->ptrace & PT_SEIZED)))
2006 signal_wake_up(t, 0);
2007 else
2008 ptrace_trap_notify(t);
1915 } 2009 }
1916 } 2010 }
1917 } 2011 }
1918retry: 2012
1919 if (likely(!task_ptrace(current))) { 2013 if (likely(!current->ptrace)) {
1920 int notify = 0; 2014 int notify = 0;
1921 2015
1922 /* 2016 /*
@@ -1947,43 +2041,65 @@ retry:
1947 2041
1948 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2042 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1949 schedule(); 2043 schedule();
1950 2044 return true;
1951 spin_lock_irq(&current->sighand->siglock);
1952 } else { 2045 } else {
1953 ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, 2046 /*
1954 CLD_STOPPED, 0, NULL); 2047 * While ptraced, group stop is handled by STOP trap.
1955 current->exit_code = 0; 2048 * Schedule it and let the caller deal with it.
2049 */
2050 task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
2051 return false;
1956 } 2052 }
2053}
1957 2054
1958 /* 2055/**
1959 * GROUP_STOP_PENDING could be set if another group stop has 2056 * do_jobctl_trap - take care of ptrace jobctl traps
1960 * started since being woken up or ptrace wants us to transit 2057 *
1961 * between TASK_STOPPED and TRACED. Retry group stop. 2058 * When PT_SEIZED, it's used for both group stop and explicit
1962 */ 2059 * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with
1963 if (current->group_stop & GROUP_STOP_PENDING) { 2060 * accompanying siginfo. If stopped, lower eight bits of exit_code contain
1964 WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); 2061 * the stop signal; otherwise, %SIGTRAP.
1965 goto retry; 2062 *
2063 * When !PT_SEIZED, it's used only for group stop trap with stop signal
2064 * number as exit_code and no siginfo.
2065 *
2066 * CONTEXT:
2067 * Must be called with @current->sighand->siglock held, which may be
2068 * released and re-acquired before returning with intervening sleep.
2069 */
2070static void do_jobctl_trap(void)
2071{
2072 struct signal_struct *signal = current->signal;
2073 int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
2074
2075 if (current->ptrace & PT_SEIZED) {
2076 if (!signal->group_stop_count &&
2077 !(signal->flags & SIGNAL_STOP_STOPPED))
2078 signr = SIGTRAP;
2079 WARN_ON_ONCE(!signr);
2080 ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
2081 CLD_STOPPED);
2082 } else {
2083 WARN_ON_ONCE(!signr);
2084 ptrace_stop(signr, CLD_STOPPED, 0, NULL);
2085 current->exit_code = 0;
1966 } 2086 }
1967
1968 /* PTRACE_ATTACH might have raced with task killing, clear trapping */
1969 task_clear_group_stop_trapping(current);
1970
1971 spin_unlock_irq(&current->sighand->siglock);
1972
1973 tracehook_finish_jctl();
1974
1975 return 1;
1976} 2087}
1977 2088
1978static int ptrace_signal(int signr, siginfo_t *info, 2089static int ptrace_signal(int signr, siginfo_t *info,
1979 struct pt_regs *regs, void *cookie) 2090 struct pt_regs *regs, void *cookie)
1980{ 2091{
1981 if (!task_ptrace(current))
1982 return signr;
1983
1984 ptrace_signal_deliver(regs, cookie); 2092 ptrace_signal_deliver(regs, cookie);
1985 2093 /*
1986 /* Let the debugger run. */ 2094 * We do not check sig_kernel_stop(signr) but set this marker
2095 * unconditionally because we do not know whether debugger will
2096 * change signr. This flag has no meaning unless we are going
2097 * to stop after return from ptrace_stop(). In this case it will
2098 * be checked in do_signal_stop(), we should only stop if it was
2099 * not cleared by SIGCONT while we were sleeping. See also the
2100 * comment in dequeue_signal().
2101 */
2102 current->jobctl |= JOBCTL_STOP_DEQUEUED;
1987 ptrace_stop(signr, CLD_TRAPPED, 0, info); 2103 ptrace_stop(signr, CLD_TRAPPED, 0, info);
1988 2104
1989 /* We're back. Did the debugger cancel the sig? */ 2105 /* We're back. Did the debugger cancel the sig? */
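The GROUP_STOP_* bits in ->group_stop are replaced throughout this hunk by JOBCTL_* bits in ->jobctl. For orientation, the corresponding <linux/sched.h> layout looks roughly like the sketch below (reconstructed from memory; an approximation rather than the authoritative header):

	#define JOBCTL_STOP_SIGMASK	0xffff		/* signr of the last group stop */
	#define JOBCTL_STOP_DEQUEUED	(1 << 16)	/* stop signal dequeued */
	#define JOBCTL_STOP_PENDING	(1 << 17)	/* task should stop for group stop */
	#define JOBCTL_STOP_CONSUME	(1 << 18)	/* consume group stop count */
	#define JOBCTL_TRAP_STOP	(1 << 19)	/* trap for STOP */
	#define JOBCTL_TRAP_NOTIFY	(1 << 20)	/* trap for NOTIFY */
	#define JOBCTL_TRAPPING		(1 << 21)	/* switching to TRACED */
	#define JOBCTL_LISTENING	(1 << 22)	/* ptracer is listening for events */

	#define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
	#define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)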
@@ -2039,7 +2155,6 @@ relock:
2039 * the CLD_ si_code into SIGNAL_CLD_MASK bits. 2155 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
2040 */ 2156 */
2041 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { 2157 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
2042 struct task_struct *leader;
2043 int why; 2158 int why;
2044 2159
2045 if (signal->flags & SIGNAL_CLD_CONTINUED) 2160 if (signal->flags & SIGNAL_CLD_CONTINUED)
@@ -2060,13 +2175,11 @@ relock:
2060 * a duplicate. 2175 * a duplicate.
2061 */ 2176 */
2062 read_lock(&tasklist_lock); 2177 read_lock(&tasklist_lock);
2063
2064 do_notify_parent_cldstop(current, false, why); 2178 do_notify_parent_cldstop(current, false, why);
2065 2179
2066 leader = current->group_leader; 2180 if (ptrace_reparented(current->group_leader))
2067 if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) 2181 do_notify_parent_cldstop(current->group_leader,
2068 do_notify_parent_cldstop(leader, true, why); 2182 true, why);
2069
2070 read_unlock(&tasklist_lock); 2183 read_unlock(&tasklist_lock);
2071 2184
2072 goto relock; 2185 goto relock;
@@ -2074,37 +2187,31 @@ relock:
2074 2187
2075 for (;;) { 2188 for (;;) {
2076 struct k_sigaction *ka; 2189 struct k_sigaction *ka;
2077 /* 2190
2078 * Tracing can induce an artificial signal and choose sigaction. 2191 if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
2079 * The return value in @signr determines the default action, 2192 do_signal_stop(0))
2080 * but @info->si_signo is the signal number we will report.
2081 */
2082 signr = tracehook_get_signal(current, regs, info, return_ka);
2083 if (unlikely(signr < 0))
2084 goto relock; 2193 goto relock;
2085 if (unlikely(signr != 0))
2086 ka = return_ka;
2087 else {
2088 if (unlikely(current->group_stop &
2089 GROUP_STOP_PENDING) && do_signal_stop(0))
2090 goto relock;
2091 2194
2092 signr = dequeue_signal(current, &current->blocked, 2195 if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
2093 info); 2196 do_jobctl_trap();
2197 spin_unlock_irq(&sighand->siglock);
2198 goto relock;
2199 }
2094 2200
2095 if (!signr) 2201 signr = dequeue_signal(current, &current->blocked, info);
2096 break; /* will return 0 */
2097 2202
2098 if (signr != SIGKILL) { 2203 if (!signr)
2099 signr = ptrace_signal(signr, info, 2204 break; /* will return 0 */
2100 regs, cookie);
2101 if (!signr)
2102 continue;
2103 }
2104 2205
2105 ka = &sighand->action[signr-1]; 2206 if (unlikely(current->ptrace) && signr != SIGKILL) {
2207 signr = ptrace_signal(signr, info,
2208 regs, cookie);
2209 if (!signr)
2210 continue;
2106 } 2211 }
2107 2212
2213 ka = &sighand->action[signr-1];
2214
2108 /* Trace actually delivered signals. */ 2215 /* Trace actually delivered signals. */
2109 trace_signal_deliver(signr, info, ka); 2216 trace_signal_deliver(signr, info, ka);
2110 2217
@@ -2260,7 +2367,7 @@ void exit_signals(struct task_struct *tsk)
2260 signotset(&unblocked); 2367 signotset(&unblocked);
2261 retarget_shared_pending(tsk, &unblocked); 2368 retarget_shared_pending(tsk, &unblocked);
2262 2369
2263 if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && 2370 if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
2264 task_participate_group_stop(tsk)) 2371 task_participate_group_stop(tsk))
2265 group_stop = CLD_STOPPED; 2372 group_stop = CLD_STOPPED;
2266out: 2373out:
@@ -2993,15 +3100,11 @@ SYSCALL_DEFINE0(sgetmask)
2993 3100
2994SYSCALL_DEFINE1(ssetmask, int, newmask) 3101SYSCALL_DEFINE1(ssetmask, int, newmask)
2995{ 3102{
2996 int old; 3103 int old = current->blocked.sig[0];
2997 3104 sigset_t newset;
2998 spin_lock_irq(&current->sighand->siglock);
2999 old = current->blocked.sig[0];
3000 3105
3001 siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)| 3106 siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
3002 sigmask(SIGSTOP))); 3107 set_current_blocked(&newset);
3003 recalc_sigpending();
3004 spin_unlock_irq(&current->sighand->siglock);
3005 3108
3006 return old; 3109 return old;
3007} 3110}
@@ -3058,11 +3161,8 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3058 return -EFAULT; 3161 return -EFAULT;
3059 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3162 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3060 3163
3061 spin_lock_irq(&current->sighand->siglock);
3062 current->saved_sigmask = current->blocked; 3164 current->saved_sigmask = current->blocked;
3063 current->blocked = newset; 3165 set_current_blocked(&newset);
3064 recalc_sigpending();
3065 spin_unlock_irq(&current->sighand->siglock);
3066 3166
3067 current->state = TASK_INTERRUPTIBLE; 3167 current->state = TASK_INTERRUPTIBLE;
3068 schedule(); 3168 schedule();
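sys_ssetmask() and sys_rt_sigsuspend() now route through set_current_blocked() instead of open-coding the siglock sequence. A rough sketch of what that helper centralizes (paraphrased from memory; treat the exact body as an assumption):

	void set_current_blocked(const sigset_t *newset)
	{
		struct task_struct *tsk = current;

		spin_lock_irq(&tsk->sighand->siglock);
		if (signal_pending(tsk) && !thread_group_empty(tsk)) {
			sigset_t newblocked;

			/* signals that become blocked here must be retargeted
			 * to a thread that can still take them */
			sigandnsets(&newblocked, newset, &tsk->blocked);
			retarget_shared_pending(tsk, &newblocked);
		}
		tsk->blocked = *newset;
		recalc_sigpending();
		spin_unlock_irq(&tsk->sighand->siglock);
	}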
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index eb212f8f8bc..d20c6983aad 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28/* 28/*
29 * Architectures that do not implement save_stack_trace_tsk get this 29 * Architectures that do not implement save_stack_trace_tsk or
30 * weak alias and a once-per-bootup warning (whenever this facility 30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
31 * is utilized - for example by procfs): 31 * (whenever this facility is utilized - for example by procfs):
32 */ 32 */
33__weak void 33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{ 35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); 36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37} 37}
38
39__weak void
40save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
41{
42 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
43}
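These weak stubs exist only so generic code links everywhere; an architecture opts in simply by providing a strong definition with the same signature. A hypothetical override might look like:

	void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
	{
		/* arch-specific: unwind starting from regs and record return
		 * addresses into trace->entries, honouring trace->max_entries
		 * and trace->skip */
	}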
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e3516b29076..ba5070ce576 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -19,7 +19,7 @@
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21 21
22#include <asm/atomic.h> 22#include <linux/atomic.h>
23 23
24/* 24/*
25 * Structure to determine completion condition and record errors. May 25 * Structure to determine completion condition and record errors. May
@@ -136,10 +136,11 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
136static DEFINE_MUTEX(stop_cpus_mutex); 136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); 137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138 138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) 139static void queue_stop_cpus_work(const struct cpumask *cpumask,
140 cpu_stop_fn_t fn, void *arg,
141 struct cpu_stop_done *done)
140{ 142{
141 struct cpu_stop_work *work; 143 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu; 144 unsigned int cpu;
144 145
145 /* initialize works and done */ 146 /* initialize works and done */
@@ -147,9 +148,8 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
147 work = &per_cpu(stop_cpus_work, cpu); 148 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn; 149 work->fn = fn;
149 work->arg = arg; 150 work->arg = arg;
150 work->done = &done; 151 work->done = done;
151 } 152 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153 153
154 /* 154 /*
155 * Disable preemption while queueing to avoid getting 155 * Disable preemption while queueing to avoid getting
@@ -161,7 +161,15 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), 161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu)); 162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable(); 163 preempt_enable();
164}
164 165
166static int __stop_cpus(const struct cpumask *cpumask,
167 cpu_stop_fn_t fn, void *arg)
168{
169 struct cpu_stop_done done;
170
171 cpu_stop_init_done(&done, cpumask_weight(cpumask));
172 queue_stop_cpus_work(cpumask, fn, arg, &done);
165 wait_for_completion(&done.completion); 173 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT; 174 return done.executed ? done.ret : -ENOENT;
167} 175}
@@ -431,8 +439,15 @@ static int stop_machine_cpu_stop(void *data)
431 struct stop_machine_data *smdata = data; 439 struct stop_machine_data *smdata = data;
432 enum stopmachine_state curstate = STOPMACHINE_NONE; 440 enum stopmachine_state curstate = STOPMACHINE_NONE;
433 int cpu = smp_processor_id(), err = 0; 441 int cpu = smp_processor_id(), err = 0;
442 unsigned long flags;
434 bool is_active; 443 bool is_active;
435 444
445 /*
446 * When called from stop_machine_from_inactive_cpu(), irq might
447 * already be disabled. Save the state and restore it on exit.
448 */
449 local_save_flags(flags);
450
436 if (!smdata->active_cpus) 451 if (!smdata->active_cpus)
437 is_active = cpu == cpumask_first(cpu_online_mask); 452 is_active = cpu == cpumask_first(cpu_online_mask);
438 else 453 else
@@ -460,7 +475,7 @@ static int stop_machine_cpu_stop(void *data)
460 } 475 }
461 } while (curstate != STOPMACHINE_EXIT); 476 } while (curstate != STOPMACHINE_EXIT);
462 477
463 local_irq_enable(); 478 local_irq_restore(flags);
464 return err; 479 return err;
465} 480}
466 481
@@ -487,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
487} 502}
488EXPORT_SYMBOL_GPL(stop_machine); 503EXPORT_SYMBOL_GPL(stop_machine);
489 504
505/**
506 * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
507 * @fn: the function to run
508 * @data: the data ptr for the @fn()
509 * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
510 *
511 * This is identical to stop_machine() but can be called from a CPU which
512 * is not active. The local CPU is in the process of hotplug (so no other
513 * CPU hotplug can start) and not marked active and doesn't have enough
514 * context to sleep.
515 *
516 * This function provides stop_machine() functionality for such state by
517 * using busy-wait for synchronization and executing @fn directly for local
518 * CPU.
519 *
520 * CONTEXT:
521 * Local CPU is inactive. Temporarily stops all active CPUs.
522 *
523 * RETURNS:
524 * 0 if all executions of @fn returned 0, any non zero return value if any
525 * returned non zero.
526 */
527int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
528 const struct cpumask *cpus)
529{
530 struct stop_machine_data smdata = { .fn = fn, .data = data,
531 .active_cpus = cpus };
532 struct cpu_stop_done done;
533 int ret;
534
535 /* Local CPU must be inactive and CPU hotplug in progress. */
536 BUG_ON(cpu_active(raw_smp_processor_id()));
537 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */
538
539 /* No proper task established and can't sleep - busy wait for lock. */
540 while (!mutex_trylock(&stop_cpus_mutex))
541 cpu_relax();
542
543 /* Schedule work on other CPUs and execute directly for local CPU */
544 set_state(&smdata, STOPMACHINE_PREPARE);
545 cpu_stop_init_done(&done, num_active_cpus());
546 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
547 &done);
548 ret = stop_machine_cpu_stop(&smdata);
549
550 /* Busy wait for completion. */
551 while (!completion_done(&done.completion))
552 cpu_relax();
553
554 mutex_unlock(&stop_cpus_mutex);
555 return ret ?: done.ret;
556}
557
490#endif /* CONFIG_STOP_MACHINE */ 558#endif /* CONFIG_STOP_MACHINE */
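A hypothetical caller sketch for the new helper (the callback and call site are illustrative, not taken from this diff): a CPU that is being brought online, and is therefore not yet marked active, needs every active CPU parked while it updates globally visible state.

	static int apply_shared_state(void *data)
	{
		/* runs on each CPU while all active CPUs spin in stop_machine
		 * context; must not sleep */
		return 0;
	}

	/* on the inactive (onlining) CPU: */
	int err = stop_machine_from_inactive_cpu(apply_shared_state, NULL, NULL);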
diff --git a/kernel/sys.c b/kernel/sys.c
index e4128b278f2..1dbbe695a5e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/notifier.h>
12#include <linux/reboot.h> 11#include <linux/reboot.h>
13#include <linux/prctl.h> 12#include <linux/prctl.h>
14#include <linux/highuid.h> 13#include <linux/highuid.h>
@@ -38,6 +37,8 @@
38#include <linux/fs_struct.h> 37#include <linux/fs_struct.h>
39#include <linux/gfp.h> 38#include <linux/gfp.h>
40#include <linux/syscore_ops.h> 39#include <linux/syscore_ops.h>
40#include <linux/version.h>
41#include <linux/ctype.h>
41 42
42#include <linux/compat.h> 43#include <linux/compat.h>
43#include <linux/syscalls.h> 44#include <linux/syscalls.h>
@@ -45,6 +46,8 @@
45#include <linux/user_namespace.h> 46#include <linux/user_namespace.h>
46 47
47#include <linux/kmsg_dump.h> 48#include <linux/kmsg_dump.h>
49/* Move somewhere else to avoid recompiling? */
50#include <generated/utsrelease.h>
48 51
49#include <asm/uaccess.h> 52#include <asm/uaccess.h>
50#include <asm/io.h> 53#include <asm/io.h>
@@ -320,6 +323,37 @@ void kernel_restart_prepare(char *cmd)
320} 323}
321 324
322/** 325/**
326 * register_reboot_notifier - Register function to be called at reboot time
327 * @nb: Info about notifier function to be called
328 *
329 * Registers a function with the list of functions
330 * to be called at reboot time.
331 *
332 * Currently always returns zero, as blocking_notifier_chain_register()
333 * always returns zero.
334 */
335int register_reboot_notifier(struct notifier_block *nb)
336{
337 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
338}
339EXPORT_SYMBOL(register_reboot_notifier);
340
341/**
342 * unregister_reboot_notifier - Unregister previously registered reboot notifier
343 * @nb: Hook to be unregistered
344 *
345 * Unregisters a previously registered reboot
346 * notifier function.
347 *
348 * Returns zero on success, or %-ENOENT on failure.
349 */
350int unregister_reboot_notifier(struct notifier_block *nb)
351{
352 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
353}
354EXPORT_SYMBOL(unregister_reboot_notifier);
355
356/**
323 * kernel_restart - reboot the system 357 * kernel_restart - reboot the system
324 * @cmd: pointer to buffer containing command to execute for restart 358 * @cmd: pointer to buffer containing command to execute for restart
325 * or %NULL 359 * or %NULL
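The register_reboot_notifier()/unregister_reboot_notifier() wrappers added above keep the usual notifier calling convention. A typical, illustrative user looks like:

	static int my_reboot_event(struct notifier_block *nb,
				   unsigned long action, void *data)
	{
		/* action is e.g. SYS_RESTART, SYS_HALT or SYS_POWER_OFF */
		return NOTIFY_DONE;
	}

	static struct notifier_block my_reboot_nb = {
		.notifier_call = my_reboot_event,
	};

	/* at init / teardown: */
	register_reboot_notifier(&my_reboot_nb);
	unregister_reboot_notifier(&my_reboot_nb);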
@@ -591,11 +625,18 @@ static int set_user(struct cred *new)
591 if (!new_user) 625 if (!new_user)
592 return -EAGAIN; 626 return -EAGAIN;
593 627
628 /*
629 * We don't fail in case of NPROC limit excess here because too many
630 * poorly written programs don't check set*uid() return code, assuming
631 * it never fails if called by root. We may still enforce NPROC limit
632 * for programs doing set*uid()+execve() by harmlessly deferring the
633 * failure to the execve() stage.
634 */
594 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && 635 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
595 new_user != INIT_USER) { 636 new_user != INIT_USER)
596 free_uid(new_user); 637 current->flags |= PF_NPROC_EXCEEDED;
597 return -EAGAIN; 638 else
598 } 639 current->flags &= ~PF_NPROC_EXCEEDED;
599 640
600 free_uid(new->user); 641 free_uid(new->user);
601 new->user = new_user; 642 new->user = new_user;
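The deferred enforcement behind PF_NPROC_EXCEEDED happens on the execve() side. A sketch of the corresponding check (an assumption about the companion fs/exec.c change, which is not part of this hunk):

	/* early in do_execve_common(): */
	if ((current->flags & PF_NPROC_EXCEEDED) &&
	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
		retval = -EAGAIN;
		goto out_ret;
	}

	/* below the limit again, so later execve() calls should not fail */
	current->flags &= ~PF_NPROC_EXCEEDED;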
@@ -1124,6 +1165,34 @@ DECLARE_RWSEM(uts_sem);
1124#define override_architecture(name) 0 1165#define override_architecture(name) 0
1125#endif 1166#endif
1126 1167
1168/*
1169 * Work around broken programs that cannot handle "Linux 3.0".
1170 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1171 */
1172static int override_release(char __user *release, int len)
1173{
1174 int ret = 0;
1175 char buf[65];
1176
1177 if (current->personality & UNAME26) {
1178 char *rest = UTS_RELEASE;
1179 int ndots = 0;
1180 unsigned v;
1181
1182 while (*rest) {
1183 if (*rest == '.' && ++ndots >= 3)
1184 break;
1185 if (!isdigit(*rest) && *rest != '.')
1186 break;
1187 rest++;
1188 }
1189 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
1190 snprintf(buf, len, "2.6.%u%s", v, rest);
1191 ret = copy_to_user(release, buf, len);
1192 }
1193 return ret;
1194}
1195
1127SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1196SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1128{ 1197{
1129 int errno = 0; 1198 int errno = 0;
@@ -1133,6 +1202,8 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1133 errno = -EFAULT; 1202 errno = -EFAULT;
1134 up_read(&uts_sem); 1203 up_read(&uts_sem);
1135 1204
1205 if (!errno && override_release(name->release, sizeof(name->release)))
1206 errno = -EFAULT;
1136 if (!errno && override_architecture(name)) 1207 if (!errno && override_architecture(name))
1137 errno = -EFAULT; 1208 errno = -EFAULT;
1138 return errno; 1209 return errno;
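A worked example of the mapping, using made-up values:

	UTS_RELEASE        = "3.1.0-foo"
	LINUX_VERSION_CODE = KERNEL_VERSION(3, 1, 0)
	v    = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40 = 1 + 40 = 41
	rest = "-foo"        (the scan stops at the first non-digit, non-dot char)
	release reported to a UNAME26 task: "2.6.41-foo"

Note that any sublevel digits after the second dot are consumed by the scan and replaced by the 2.6.4x scheme.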
@@ -1154,6 +1225,8 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1154 error = -EFAULT; 1225 error = -EFAULT;
1155 up_read(&uts_sem); 1226 up_read(&uts_sem);
1156 1227
1228 if (!error && override_release(name->release, sizeof(name->release)))
1229 error = -EFAULT;
1157 if (!error && override_architecture(name)) 1230 if (!error && override_architecture(name))
1158 error = -EFAULT; 1231 error = -EFAULT;
1159 return error; 1232 return error;
@@ -1188,6 +1261,8 @@ SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1188 1261
1189 if (!error && override_architecture(name)) 1262 if (!error && override_architecture(name))
1190 error = -EFAULT; 1263 error = -EFAULT;
1264 if (!error && override_release(name->release, sizeof(name->release)))
1265 error = -EFAULT;
1191 return error ? -EFAULT : 0; 1266 return error ? -EFAULT : 0;
1192} 1267}
1193#endif 1268#endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 62cbc8877fe..a9a5de07c4f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -16,7 +16,6 @@ asmlinkage long sys_ni_syscall(void)
16 return -ENOSYS; 16 return -ENOSYS;
17} 17}
18 18
19cond_syscall(sys_nfsservctl);
20cond_syscall(sys_quotactl); 19cond_syscall(sys_quotactl);
21cond_syscall(sys32_quotactl); 20cond_syscall(sys32_quotactl);
22cond_syscall(sys_acct); 21cond_syscall(sys_acct);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f175d98bd35..fd15163f360 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,6 +96,7 @@ extern char core_pattern[];
96extern unsigned int core_pipe_limit; 96extern unsigned int core_pipe_limit;
97extern int pid_max; 97extern int pid_max;
98extern int min_free_kbytes; 98extern int min_free_kbytes;
99extern int min_free_order_shift;
99extern int pid_max_min, pid_max_max; 100extern int pid_max_min, pid_max_max;
100extern int sysctl_drop_caches; 101extern int sysctl_drop_caches;
101extern int percpu_pagelist_fraction; 102extern int percpu_pagelist_fraction;
@@ -1189,6 +1190,13 @@ static struct ctl_table vm_table[] = {
1189 .extra1 = &zero, 1190 .extra1 = &zero,
1190 }, 1191 },
1191 { 1192 {
1193 .procname = "min_free_order_shift",
1194 .data = &min_free_order_shift,
1195 .maxlen = sizeof(min_free_order_shift),
1196 .mode = 0644,
1197 .proc_handler = &proc_dointvec
1198 },
1199 {
1192 .procname = "percpu_pagelist_fraction", 1200 .procname = "percpu_pagelist_fraction",
1193 .data = &percpu_pagelist_fraction, 1201 .data = &percpu_pagelist_fraction,
1194 .maxlen = sizeof(percpu_pagelist_fraction), 1202 .maxlen = sizeof(percpu_pagelist_fraction),
@@ -1590,16 +1598,11 @@ void sysctl_head_get(struct ctl_table_header *head)
1590 spin_unlock(&sysctl_lock); 1598 spin_unlock(&sysctl_lock);
1591} 1599}
1592 1600
1593static void free_head(struct rcu_head *rcu)
1594{
1595 kfree(container_of(rcu, struct ctl_table_header, rcu));
1596}
1597
1598void sysctl_head_put(struct ctl_table_header *head) 1601void sysctl_head_put(struct ctl_table_header *head)
1599{ 1602{
1600 spin_lock(&sysctl_lock); 1603 spin_lock(&sysctl_lock);
1601 if (!--head->count) 1604 if (!--head->count)
1602 call_rcu(&head->rcu, free_head); 1605 kfree_rcu(head, rcu);
1603 spin_unlock(&sysctl_lock); 1606 spin_unlock(&sysctl_lock);
1604} 1607}
1605 1608
@@ -1971,10 +1974,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1971 start_unregistering(header); 1974 start_unregistering(header);
1972 if (!--header->parent->count) { 1975 if (!--header->parent->count) {
1973 WARN_ON(1); 1976 WARN_ON(1);
1974 call_rcu(&header->parent->rcu, free_head); 1977 kfree_rcu(header->parent, rcu);
1975 } 1978 }
1976 if (!--header->count) 1979 if (!--header->count)
1977 call_rcu(&header->rcu, free_head); 1980 kfree_rcu(header, rcu);
1978 spin_unlock(&sysctl_lock); 1981 spin_unlock(&sysctl_lock);
1979} 1982}
1980 1983
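kfree_rcu() removes the need for the dedicated free_head() callback: it frees the enclosing object after a grace period, deriving it from the offset of the named rcu_head member. Roughly (illustrative, assuming ctl_table_header embeds its rcu_head as 'rcu'):

	struct ctl_table_header {
		/* ... */
		struct rcu_head rcu;	/* must be embedded in the object */
	};

	/* equivalent to the removed call_rcu(&head->rcu, free_head) pair */
	kfree_rcu(head, rcu);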
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 3b8e028b960..2ce1b308672 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1,6 +1,6 @@
1#include <linux/stat.h> 1#include <linux/stat.h>
2#include <linux/sysctl.h> 2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
@@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1354 1354
1355 fput(file); 1355 fput(file);
1356out_putname: 1356out_putname:
1357 putname(pathname); 1357 __putname(pathname);
1358out: 1358out:
1359 return result; 1359 return result;
1360} 1360}
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 4e4932a7b36..362da653813 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1,6 +1,6 @@
1#include <linux/stat.h> 1#include <linux/stat.h>
2#include <linux/sysctl.h> 2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index fc0f2200541..e66046456f4 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -28,7 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <net/genetlink.h> 30#include <net/genetlink.h>
31#include <asm/atomic.h> 31#include <linux/atomic.h>
32 32
33/* 33/*
34 * Maximum length of a cpumask that can be specified in 34 * Maximum length of a cpumask that can be specified in
@@ -291,30 +291,28 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
291 if (!cpumask_subset(mask, cpu_possible_mask)) 291 if (!cpumask_subset(mask, cpu_possible_mask))
292 return -EINVAL; 292 return -EINVAL;
293 293
294 s = NULL;
295 if (isadd == REGISTER) { 294 if (isadd == REGISTER) {
296 for_each_cpu(cpu, mask) { 295 for_each_cpu(cpu, mask) {
297 if (!s) 296 s = kmalloc_node(sizeof(struct listener),
298 s = kmalloc_node(sizeof(struct listener), 297 GFP_KERNEL, cpu_to_node(cpu));
299 GFP_KERNEL, cpu_to_node(cpu));
300 if (!s) 298 if (!s)
301 goto cleanup; 299 goto cleanup;
300
302 s->pid = pid; 301 s->pid = pid;
303 INIT_LIST_HEAD(&s->list);
304 s->valid = 1; 302 s->valid = 1;
305 303
306 listeners = &per_cpu(listener_array, cpu); 304 listeners = &per_cpu(listener_array, cpu);
307 down_write(&listeners->sem); 305 down_write(&listeners->sem);
308 list_for_each_entry_safe(s2, tmp, &listeners->list, list) { 306 list_for_each_entry(s2, &listeners->list, list) {
309 if (s2->pid == pid) 307 if (s2->pid == pid && s2->valid)
310 goto next_cpu; 308 goto exists;
311 } 309 }
312 list_add(&s->list, &listeners->list); 310 list_add(&s->list, &listeners->list);
313 s = NULL; 311 s = NULL;
314next_cpu: 312exists:
315 up_write(&listeners->sem); 313 up_write(&listeners->sem);
314 kfree(s); /* nop if NULL */
316 } 315 }
317 kfree(s);
318 return 0; 316 return 0;
319 } 317 }
320 318
@@ -657,6 +655,7 @@ static struct genl_ops taskstats_ops = {
657 .cmd = TASKSTATS_CMD_GET, 655 .cmd = TASKSTATS_CMD_GET,
658 .doit = taskstats_user_cmd, 656 .doit = taskstats_user_cmd,
659 .policy = taskstats_cmd_get_policy, 657 .policy = taskstats_cmd_get_policy,
658 .flags = GENL_ADMIN_PERM,
660}; 659};
661 660
662static struct genl_ops cgroupstats_ops = { 661static struct genl_ops cgroupstats_ops = {
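The reworked REGISTER loop leans on kfree(NULL) being a no-op: a listener is allocated for every CPU in the mask, ownership passes to the list (and the pointer is cleared) when no live duplicate exists, and the pointer is freed unconditionally afterwards. A stripped-down sketch of the pattern (already_listed() is a hypothetical helper, not the exact taskstats code):

	for_each_cpu(cpu, mask) {
		s = kmalloc_node(sizeof(*s), GFP_KERNEL, cpu_to_node(cpu));
		if (!s)
			goto cleanup;
		/* ... initialize s ... */
		down_write(&listeners->sem);
		if (!already_listed(listeners, pid)) {
			list_add(&s->list, &listeners->list);
			s = NULL;		/* ownership transferred */
		}
		up_write(&listeners->sem);
		kfree(s);			/* no-op when s == NULL */
	}

Separately, GENL_ADMIN_PERM on TASKSTATS_CMD_GET makes generic netlink reject the command for callers lacking CAP_NET_ADMIN.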
diff --git a/kernel/time.c b/kernel/time.c
index 8e8dc6d705c..d7760621452 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval);
575/* 575/*
576 * Convert jiffies/jiffies_64 to clock_t and back. 576 * Convert jiffies/jiffies_64 to clock_t and back.
577 */ 577 */
578clock_t jiffies_to_clock_t(long x) 578clock_t jiffies_to_clock_t(unsigned long x)
579{ 579{
580#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 580#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
581# if HZ < USER_HZ 581# if HZ < USER_HZ
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e2fd74b8e8c..cae2ad7491b 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,5 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 2obj-y += timeconv.o posix-clock.o #alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 59f369f98a0..8b70c76910a 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -181,7 +181,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
181 struct alarm *alarm; 181 struct alarm *alarm;
182 ktime_t expired = next->expires; 182 ktime_t expired = next->expires;
183 183
184 if (expired.tv64 >= now.tv64) 184 if (expired.tv64 > now.tv64)
185 break; 185 break;
186 186
187 alarm = container_of(next, struct alarm, node); 187 alarm = container_of(next, struct alarm, node);
@@ -441,6 +441,8 @@ static int alarm_timer_create(struct k_itimer *new_timer)
441static void alarm_timer_get(struct k_itimer *timr, 441static void alarm_timer_get(struct k_itimer *timr,
442 struct itimerspec *cur_setting) 442 struct itimerspec *cur_setting)
443{ 443{
444 memset(cur_setting, 0, sizeof(struct itimerspec));
445
444 cur_setting->it_interval = 446 cur_setting->it_interval =
445 ktime_to_timespec(timr->it.alarmtimer.period); 447 ktime_to_timespec(timr->it.alarmtimer.period);
446 cur_setting->it_value = 448 cur_setting->it_value =
@@ -479,11 +481,17 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
479 if (!rtcdev) 481 if (!rtcdev)
480 return -ENOTSUPP; 482 return -ENOTSUPP;
481 483
482 /* Save old values */ 484 /*
483 old_setting->it_interval = 485 * XXX HACK! Currently we can DOS a system if the interval
484 ktime_to_timespec(timr->it.alarmtimer.period); 486 * period on alarmtimers is too small. Cap the interval here
485 old_setting->it_value = 487 * to 100us and solve this properly in a future patch! -jstultz
486 ktime_to_timespec(timr->it.alarmtimer.node.expires); 488 */
489 if ((new_setting->it_interval.tv_sec == 0) &&
490 (new_setting->it_interval.tv_nsec < 100000))
491 new_setting->it_interval.tv_nsec = 100000;
492
493 if (old_setting)
494 alarm_timer_get(timr, old_setting);
487 495
488 /* If the timer was already set, cancel it */ 496 /* If the timer was already set, cancel it */
489 alarm_cancel(&timr->it.alarmtimer); 497 alarm_cancel(&timr->it.alarmtimer);
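A quick worked example of the cap, with made-up request values: an it_interval of { .tv_sec = 0, .tv_nsec = 50000 } (50us) is silently bumped to 100000ns (100us), while { .tv_sec = 0, .tv_nsec = 250000 } or any interval with a non-zero tv_sec passes through unchanged.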
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e0980f0d9a0..8f77da18fef 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -494,6 +494,22 @@ void clocksource_touch_watchdog(void)
494} 494}
495 495
496/** 496/**
497 * clocksource_max_adjustment- Returns max adjustment amount
498 * @cs: Pointer to clocksource
499 *
500 */
501static u32 clocksource_max_adjustment(struct clocksource *cs)
502{
503 u64 ret;
504 /*
 505 * We won't try to correct for more than 11% adjustments (110,000 ppm).
506 */
507 ret = (u64)cs->mult * 11;
508 do_div(ret,100);
509 return (u32)ret;
510}
511
512/**
497 * clocksource_max_deferment - Returns max time the clocksource can be deferred 513 * clocksource_max_deferment - Returns max time the clocksource can be deferred
498 * @cs: Pointer to clocksource 514 * @cs: Pointer to clocksource
499 * 515 *
@@ -505,25 +521,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
505 /* 521 /*
506 * Calculate the maximum number of cycles that we can pass to the 522 * Calculate the maximum number of cycles that we can pass to the
507 * cyc2ns function without overflowing a 64-bit signed result. The 523 * cyc2ns function without overflowing a 64-bit signed result. The
508 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which 524 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
509 * is equivalent to the below. 525 * which is equivalent to the below.
510 * max_cycles < (2^63)/cs->mult 526 * max_cycles < (2^63)/(cs->mult + cs->maxadj)
511 * max_cycles < 2^(log2((2^63)/cs->mult)) 527 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
512 * max_cycles < 2^(log2(2^63) - log2(cs->mult)) 528 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
513 * max_cycles < 2^(63 - log2(cs->mult)) 529 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
514 * max_cycles < 1 << (63 - log2(cs->mult)) 530 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
515 * Please note that we add 1 to the result of the log2 to account for 531 * Please note that we add 1 to the result of the log2 to account for
516 * any rounding errors, ensure the above inequality is satisfied and 532 * any rounding errors, ensure the above inequality is satisfied and
517 * no overflow will occur. 533 * no overflow will occur.
518 */ 534 */
519 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); 535 max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
520 536
521 /* 537 /*
522 * The actual maximum number of cycles we can defer the clocksource is 538 * The actual maximum number of cycles we can defer the clocksource is
523 * determined by the minimum of max_cycles and cs->mask. 539 * determined by the minimum of max_cycles and cs->mask.
540 * Note: Here we subtract the maxadj to make sure we don't sleep for
541 * too long if there's a large negative adjustment.
524 */ 542 */
525 max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 543 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
526 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); 544 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
545 cs->shift);
527 546
528 /* 547 /*
529 * To ensure that the clocksource does not wrap whilst we are idle, 548 * To ensure that the clocksource does not wrap whilst we are idle,
@@ -531,7 +550,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
531 * note a margin of 12.5% is used because this can be computed with 550 * note a margin of 12.5% is used because this can be computed with
532 * a shift, versus say 10% which would require division. 551 * a shift, versus say 10% which would require division.
533 */ 552 */
534 return max_nsecs - (max_nsecs >> 5); 553 return max_nsecs - (max_nsecs >> 3);
535} 554}
536 555
537#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET 556#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
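A worked example with round numbers (illustrative, not a real clocksource):

	mult               = 16777216 (2^24)
	maxadj             = 16777216 * 11 / 100 = 1845493
	ilog2(mult+maxadj) = 24
	max_cycles         = 1ULL << (63 - (24 + 1)) = 2^38
	max_nsecs          = cyc2ns(min(max_cycles, mask), mult - maxadj, shift)
	returned value     = max_nsecs - max_nsecs/8	(the 12.5% margin)

Using mult + maxadj for the overflow bound and mult - maxadj for the conversion keeps the deferment safe at both extremes of the allowed NTP adjustment.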
@@ -642,7 +661,6 @@ static void clocksource_enqueue(struct clocksource *cs)
642void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 661void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
643{ 662{
644 u64 sec; 663 u64 sec;
645
646 /* 664 /*
647 * Calc the maximum number of seconds which we can run before 665 * Calc the maximum number of seconds which we can run before
648 * wrapping around. For clocksources which have a mask > 32bit 666 * wrapping around. For clocksources which have a mask > 32bit
@@ -653,7 +671,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653 * ~ 0.06ppm granularity for NTP. We apply the same 12.5% 671 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
654 * margin as we do in clocksource_max_deferment() 672 * margin as we do in clocksource_max_deferment()
655 */ 673 */
656 sec = (cs->mask - (cs->mask >> 5)); 674 sec = (cs->mask - (cs->mask >> 3));
657 do_div(sec, freq); 675 do_div(sec, freq);
658 do_div(sec, scale); 676 do_div(sec, scale);
659 if (!sec) 677 if (!sec)
@@ -663,6 +681,20 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
663 681
664 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 682 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
665 NSEC_PER_SEC / scale, sec * scale); 683 NSEC_PER_SEC / scale, sec * scale);
684
685 /*
 686 * For clocksources with large mults, reduce mult to avoid overflow.
 687 * Since mult may be adjusted by ntp, add an extra safety margin.
688 *
689 */
690 cs->maxadj = clocksource_max_adjustment(cs);
691 while ((cs->mult + cs->maxadj < cs->mult)
692 || (cs->mult - cs->maxadj > cs->mult)) {
693 cs->mult >>= 1;
694 cs->shift--;
695 cs->maxadj = clocksource_max_adjustment(cs);
696 }
697
666 cs->max_idle_ns = clocksource_max_deferment(cs); 698 cs->max_idle_ns = clocksource_max_deferment(cs);
667} 699}
668EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 700EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
@@ -703,6 +735,12 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale);
703 */ 735 */
704int clocksource_register(struct clocksource *cs) 736int clocksource_register(struct clocksource *cs)
705{ 737{
738 /* calculate max adjustment for given mult/shift */
739 cs->maxadj = clocksource_max_adjustment(cs);
740 WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
741 "Clocksource %s might overflow on 11%% adjustment\n",
742 cs->name);
743
706 /* calculate max idle time permitted for this clocksource */ 744 /* calculate max idle time permitted for this clocksource */
707 cs->max_idle_ns = clocksource_max_deferment(cs); 745 cs->max_idle_ns = clocksource_max_deferment(cs);
708 746
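The overflow check in both hunks relies on unsigned wrap-around being well defined: for a u32, cs->mult + cs->maxadj < cs->mult holds exactly when the addition wrapped past 2^32. A minimal standalone illustration of the idiom:

	#include <stdbool.h>
	#include <stdint.h>

	/* true when a + b does not fit in 32 bits */
	static bool u32_add_overflows(uint32_t a, uint32_t b)
	{
		return a + b < a;	/* wrap-around implies overflow */
	}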
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c7218d13273..7a90d021b79 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
71 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 71 (dev->features & CLOCK_EVT_FEAT_C3STOP))
72 return 0; 72 return 0;
73 73
74 clockevents_exchange_device(NULL, dev); 74 clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
75 tick_broadcast_device.evtdev = dev; 75 tick_broadcast_device.evtdev = dev;
76 if (!cpumask_empty(tick_get_broadcast_mask())) 76 if (!cpumask_empty(tick_get_broadcast_mask()))
77 tick_broadcast_start_periodic(dev); 77 tick_broadcast_start_periodic(dev);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 342408cf68d..6f9798bf240 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -249,6 +249,8 @@ ktime_t ktime_get(void)
249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec; 249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; 250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
251 nsecs += timekeeping_get_ns(); 251 nsecs += timekeeping_get_ns();
252 /* If arch requires, add in gettimeoffset() */
253 nsecs += arch_gettimeoffset();
252 254
253 } while (read_seqretry(&xtime_lock, seq)); 255 } while (read_seqretry(&xtime_lock, seq));
254 /* 256 /*
@@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts)
280 *ts = xtime; 282 *ts = xtime;
281 tomono = wall_to_monotonic; 283 tomono = wall_to_monotonic;
282 nsecs = timekeeping_get_ns(); 284 nsecs = timekeeping_get_ns();
285 /* If arch requires, add in gettimeoffset() */
286 nsecs += arch_gettimeoffset();
283 287
284 } while (read_seqretry(&xtime_lock, seq)); 288 } while (read_seqretry(&xtime_lock, seq));
285 289
@@ -604,6 +608,12 @@ static struct timespec timekeeping_suspend_time;
604 */ 608 */
605static void __timekeeping_inject_sleeptime(struct timespec *delta) 609static void __timekeeping_inject_sleeptime(struct timespec *delta)
606{ 610{
611 if (!timespec_valid(delta)) {
612 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
613 "sleep delta value!\n");
614 return;
615 }
616
607 xtime = timespec_add(xtime, *delta); 617 xtime = timespec_add(xtime, *delta);
608 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); 618 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
609 total_sleep_time = timespec_add(total_sleep_time, *delta); 619 total_sleep_time = timespec_add(total_sleep_time, *delta);
@@ -686,12 +696,34 @@ static void timekeeping_resume(void)
686static int timekeeping_suspend(void) 696static int timekeeping_suspend(void)
687{ 697{
688 unsigned long flags; 698 unsigned long flags;
699 struct timespec delta, delta_delta;
700 static struct timespec old_delta;
689 701
690 read_persistent_clock(&timekeeping_suspend_time); 702 read_persistent_clock(&timekeeping_suspend_time);
691 703
692 write_seqlock_irqsave(&xtime_lock, flags); 704 write_seqlock_irqsave(&xtime_lock, flags);
693 timekeeping_forward_now(); 705 timekeeping_forward_now();
694 timekeeping_suspended = 1; 706 timekeeping_suspended = 1;
707
708 /*
709 * To avoid drift caused by repeated suspend/resumes,
710 * which each can add ~1 second drift error,
711 * try to compensate so the difference in system time
712 * and persistent_clock time stays close to constant.
713 */
714 delta = timespec_sub(xtime, timekeeping_suspend_time);
715 delta_delta = timespec_sub(delta, old_delta);
716 if (abs(delta_delta.tv_sec) >= 2) {
717 /*
718 * if delta_delta is too large, assume time correction
719 * has occured and set old_delta to the current delta.
720 */
721 old_delta = delta;
722 } else {
723 /* Otherwise try to adjust old_system to compensate */
724 timekeeping_suspend_time =
725 timespec_add(timekeeping_suspend_time, delta_delta);
726 }
695 write_sequnlock_irqrestore(&xtime_lock, flags); 727 write_sequnlock_irqrestore(&xtime_lock, flags);
696 728
697 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 729 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
@@ -792,6 +824,13 @@ static void timekeeping_adjust(s64 offset)
792 } else 824 } else
793 return; 825 return;
794 826
827 WARN_ONCE(timekeeper.clock->maxadj &&
828 (timekeeper.mult + adj > timekeeper.clock->mult +
829 timekeeper.clock->maxadj),
830 "Adjusting %s more then 11%% (%ld vs %ld)\n",
831 timekeeper.clock->name, (long)timekeeper.mult + adj,
832 (long)timekeeper.clock->mult +
833 timekeeper.clock->maxadj);
795 timekeeper.mult += adj; 834 timekeeper.mult += adj;
796 timekeeper.xtime_interval += interval; 835 timekeeper.xtime_interval += interval;
797 timekeeper.xtime_nsec -= offset; 836 timekeeper.xtime_nsec -= offset;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2ad39e556cb..93168c0f991 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -82,7 +82,7 @@ config EVENT_POWER_TRACING_DEPRECATED
82 power:power_frequency 82 power:power_frequency
83 This is for userspace compatibility 83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations, 84 and will vanish after 5 kernel iterations,
85 namely 2.6.41. 85 namely 3.1.
86 86
87config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
88 bool 88 bool
@@ -487,6 +487,39 @@ config RING_BUFFER_BENCHMARK
487 487
488 If unsure, say N. 488 If unsure, say N.
489 489
490config TRACELEVEL
491 bool "Add capability to prioritize traces"
492 depends on EVENT_TRACING
493 help
494 This option allows subsystem programmers to add priorities to trace
 495 events by calling tracelevel_register. Traces of high priority
 496 will automatically be enabled on kernel boot, and users can change
 497 the trace level via a kernel parameter.
498
499config TRACEDUMP
500 bool "Dumping functionality for ftrace"
501 depends on FUNCTION_TRACER
502 help
 503 This option adds functionality to dump tracing data in several forms.
 504 Data can be dumped in ASCII form or as raw pages from the tracing
505 ring buffers, along with the saved cmdlines. This is specified by
506 the module parameter tracedump_ascii. Data will be compressed
507 using zlib.
508
509config TRACEDUMP_PANIC
510 bool "Tracedump to console on panic"
511 depends on TRACEDUMP
512 help
513 With this option, tracedump will automatically dump to the console
514 on a kernel panic.
515
516config TRACEDUMP_PROCFS
517 bool "Tracedump via proc file"
518 depends on TRACEDUMP
519 help
520 With this option, tracedump can be dumped from user space by reading
521 from /proc/tracedump.
522
490endif # FTRACE 523endif # FTRACE
491 524
492endif # TRACING_SUPPORT 525endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c..1360a1a90d5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -56,5 +56,7 @@ obj-$(CONFIG_TRACEPOINTS) += power-traces.o
56ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 58endif
59obj-$(CONFIG_TRACELEVEL) += tracelevel.o
60obj-$(CONFIG_TRACEDUMP) += tracedump.o
59 61
60libftrace-y := ftrace.o 62libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 6957aa298df..7c910a5593a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -206,6 +206,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
206 what |= MASK_TC_BIT(rw, RAHEAD); 206 what |= MASK_TC_BIT(rw, RAHEAD);
207 what |= MASK_TC_BIT(rw, META); 207 what |= MASK_TC_BIT(rw, META);
208 what |= MASK_TC_BIT(rw, DISCARD); 208 what |= MASK_TC_BIT(rw, DISCARD);
209 what |= MASK_TC_BIT(rw, FLUSH);
210 what |= MASK_TC_BIT(rw, FUA);
209 211
210 pid = tsk->pid; 212 pid = tsk->pid;
211 if (act_log_check(bt, what, sector, pid)) 213 if (act_log_check(bt, what, sector, pid))
@@ -1054,6 +1056,9 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
1054 goto out; 1056 goto out;
1055 } 1057 }
1056 1058
1059 if (tc & BLK_TC_FLUSH)
1060 rwbs[i++] = 'F';
1061
1057 if (tc & BLK_TC_DISCARD) 1062 if (tc & BLK_TC_DISCARD)
1058 rwbs[i++] = 'D'; 1063 rwbs[i++] = 'D';
1059 else if (tc & BLK_TC_WRITE) 1064 else if (tc & BLK_TC_WRITE)
@@ -1063,10 +1068,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
1063 else 1068 else
1064 rwbs[i++] = 'N'; 1069 rwbs[i++] = 'N';
1065 1070
1071 if (tc & BLK_TC_FUA)
1072 rwbs[i++] = 'F';
1066 if (tc & BLK_TC_AHEAD) 1073 if (tc & BLK_TC_AHEAD)
1067 rwbs[i++] = 'A'; 1074 rwbs[i++] = 'A';
1068 if (tc & BLK_TC_BARRIER)
1069 rwbs[i++] = 'B';
1070 if (tc & BLK_TC_SYNC) 1075 if (tc & BLK_TC_SYNC)
1071 rwbs[i++] = 'S'; 1076 rwbs[i++] = 'S';
1072 if (tc & BLK_TC_META) 1077 if (tc & BLK_TC_META)
@@ -1132,7 +1137,7 @@ typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1132 1137
1133static int blk_log_action_classic(struct trace_iterator *iter, const char *act) 1138static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1134{ 1139{
1135 char rwbs[6]; 1140 char rwbs[RWBS_LEN];
1136 unsigned long long ts = iter->ts; 1141 unsigned long long ts = iter->ts;
1137 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); 1142 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1138 unsigned secs = (unsigned long)ts; 1143 unsigned secs = (unsigned long)ts;
@@ -1148,7 +1153,7 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1148 1153
1149static int blk_log_action(struct trace_iterator *iter, const char *act) 1154static int blk_log_action(struct trace_iterator *iter, const char *act)
1150{ 1155{
1151 char rwbs[6]; 1156 char rwbs[RWBS_LEN];
1152 const struct blk_io_trace *t = te_blk_io_trace(iter->ent); 1157 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1153 1158
1154 fill_rwbs(rwbs, t); 1159 fill_rwbs(rwbs, t);
@@ -1561,7 +1566,7 @@ static const struct {
1561} mask_maps[] = { 1566} mask_maps[] = {
1562 { BLK_TC_READ, "read" }, 1567 { BLK_TC_READ, "read" },
1563 { BLK_TC_WRITE, "write" }, 1568 { BLK_TC_WRITE, "write" },
1564 { BLK_TC_BARRIER, "barrier" }, 1569 { BLK_TC_FLUSH, "flush" },
1565 { BLK_TC_SYNC, "sync" }, 1570 { BLK_TC_SYNC, "sync" },
1566 { BLK_TC_QUEUE, "queue" }, 1571 { BLK_TC_QUEUE, "queue" },
1567 { BLK_TC_REQUEUE, "requeue" }, 1572 { BLK_TC_REQUEUE, "requeue" },
@@ -1573,6 +1578,7 @@ static const struct {
1573 { BLK_TC_META, "meta" }, 1578 { BLK_TC_META, "meta" },
1574 { BLK_TC_DISCARD, "discard" }, 1579 { BLK_TC_DISCARD, "discard" },
1575 { BLK_TC_DRV_DATA, "drv_data" }, 1580 { BLK_TC_DRV_DATA, "drv_data" },
1581 { BLK_TC_FUA, "fua" },
1576}; 1582};
1577 1583
1578static int blk_trace_str2mask(const char *str) 1584static int blk_trace_str2mask(const char *str)
@@ -1788,6 +1794,9 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1788{ 1794{
1789 int i = 0; 1795 int i = 0;
1790 1796
1797 if (rw & REQ_FLUSH)
1798 rwbs[i++] = 'F';
1799
1791 if (rw & WRITE) 1800 if (rw & WRITE)
1792 rwbs[i++] = 'W'; 1801 rwbs[i++] = 'W';
1793 else if (rw & REQ_DISCARD) 1802 else if (rw & REQ_DISCARD)
@@ -1797,6 +1806,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1797 else 1806 else
1798 rwbs[i++] = 'N'; 1807 rwbs[i++] = 'N';
1799 1808
1809 if (rw & REQ_FUA)
1810 rwbs[i++] = 'F';
1800 if (rw & REQ_RAHEAD) 1811 if (rw & REQ_RAHEAD)
1801 rwbs[i++] = 'A'; 1812 rwbs[i++] = 'A';
1802 if (rw & REQ_SYNC) 1813 if (rw & REQ_SYNC)
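
The blktrace changes above replace the old barrier bit with FLUSH and FUA decoding: a leading 'F' for REQ_FLUSH and a second 'F' after the direction letter for REQ_FUA. A minimal sketch of what blk_fill_rwbs() now produces for a flush+FUA synchronous write; RWBS_LEN comes from the matching blktrace_api.h change (not shown in this hunk), and the REQ_* names are the standard block-layer request flags.

#include <linux/blkdev.h>
#include <linux/blktrace_api.h>

static void rwbs_example(void)
{
	char rwbs[RWBS_LEN];

	/* FLUSH + WRITE + FUA + SYNC decodes as "FWFS" with the ordering above. */
	blk_fill_rwbs(rwbs, REQ_FLUSH | WRITE | REQ_FUA | REQ_SYNC, 4096);
	pr_info("rwbs=%s\n", rwbs);
}
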
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 908038f5744..798b16cd40f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,7 +32,6 @@
32 32
33#include <trace/events/sched.h> 33#include <trace/events/sched.h>
34 34
35#include <asm/ftrace.h>
36#include <asm/setup.h> 35#include <asm/setup.h>
37 36
38#include "trace_output.h" 37#include "trace_output.h"
@@ -82,14 +81,14 @@ static int ftrace_disabled __read_mostly;
82 81
83static DEFINE_MUTEX(ftrace_lock); 82static DEFINE_MUTEX(ftrace_lock);
84 83
85static struct ftrace_ops ftrace_list_end __read_mostly = 84static struct ftrace_ops ftrace_list_end __read_mostly = {
86{
87 .func = ftrace_stub, 85 .func = ftrace_stub,
88}; 86};
89 87
90static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 88static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
91static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 89static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
92ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 90ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
91static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
93ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
94ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 93ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
95static struct ftrace_ops global_ops; 94static struct ftrace_ops global_ops;
@@ -148,6 +147,7 @@ void clear_ftrace_function(void)
148{ 147{
149 ftrace_trace_function = ftrace_stub; 148 ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function = ftrace_stub; 149 __ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function_delay = ftrace_stub;
151 ftrace_pid_function = ftrace_stub; 151 ftrace_pid_function = ftrace_stub;
152} 152}
153 153
@@ -210,7 +210,12 @@ static void update_ftrace_function(void)
210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
211 ftrace_trace_function = func; 211 ftrace_trace_function = func;
212#else 212#else
213#ifdef CONFIG_DYNAMIC_FTRACE
214 /* do not update till all functions have been modified */
215 __ftrace_trace_function_delay = func;
216#else
213 __ftrace_trace_function = func; 217 __ftrace_trace_function = func;
218#endif
214 ftrace_trace_function = ftrace_test_stop_func; 219 ftrace_trace_function = ftrace_test_stop_func;
215#endif 220#endif
216} 221}
@@ -785,8 +790,7 @@ static void unregister_ftrace_profiler(void)
785 unregister_ftrace_graph(); 790 unregister_ftrace_graph();
786} 791}
787#else 792#else
788static struct ftrace_ops ftrace_profile_ops __read_mostly = 793static struct ftrace_ops ftrace_profile_ops __read_mostly = {
789{
790 .func = function_profile_call, 794 .func = function_profile_call,
791}; 795};
792 796
@@ -806,19 +810,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
806 size_t cnt, loff_t *ppos) 810 size_t cnt, loff_t *ppos)
807{ 811{
808 unsigned long val; 812 unsigned long val;
809 char buf[64]; /* big enough to hold a number */
810 int ret; 813 int ret;
811 814
812 if (cnt >= sizeof(buf)) 815 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
813 return -EINVAL; 816 if (ret)
814
815 if (copy_from_user(&buf, ubuf, cnt))
816 return -EFAULT;
817
818 buf[cnt] = 0;
819
820 ret = strict_strtoul(buf, 10, &val);
821 if (ret < 0)
822 return ret; 817 return ret;
823 818
824 val = !!val; 819 val = !!val;
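
This hunk (and several more below in ring_buffer.c, trace.c and trace_events.c) replaces the open-coded copy_from_user()/strict_strtoul() sequence with kstrtoul_from_user(), which bounds-checks, copies and parses the user buffer in one call and returns 0 on success or a negative error. A minimal sketch of the resulting write-handler pattern, with placeholder names:

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>

static ssize_t
example_write(struct file *filp, const char __user *ubuf,
	      size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret;

	/* Copies at most cnt bytes from user space and parses them as base 10. */
	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
	if (ret)
		return ret;

	/* ... act on val ... */

	*ppos += cnt;
	return cnt;
}
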
@@ -952,7 +947,7 @@ struct ftrace_func_probe {
952}; 947};
953 948
954enum { 949enum {
955 FTRACE_ENABLE_CALLS = (1 << 0), 950 FTRACE_UPDATE_CALLS = (1 << 0),
956 FTRACE_DISABLE_CALLS = (1 << 1), 951 FTRACE_DISABLE_CALLS = (1 << 1),
957 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 952 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
958 FTRACE_START_FUNC_RET = (1 << 3), 953 FTRACE_START_FUNC_RET = (1 << 3),
@@ -1182,8 +1177,14 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1182 return NULL; 1177 return NULL;
1183} 1178}
1184 1179
1180static void
1181ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
1182static void
1183ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
1184
1185static int 1185static int
1186ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) 1186ftrace_hash_move(struct ftrace_ops *ops, int enable,
1187 struct ftrace_hash **dst, struct ftrace_hash *src)
1187{ 1188{
1188 struct ftrace_func_entry *entry; 1189 struct ftrace_func_entry *entry;
1189 struct hlist_node *tp, *tn; 1190 struct hlist_node *tp, *tn;
@@ -1193,9 +1194,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1193 unsigned long key; 1194 unsigned long key;
1194 int size = src->count; 1195 int size = src->count;
1195 int bits = 0; 1196 int bits = 0;
1197 int ret;
1196 int i; 1198 int i;
1197 1199
1198 /* 1200 /*
1201 * Remove the current set, update the hash and add
1202 * them back.
1203 */
1204 ftrace_hash_rec_disable(ops, enable);
1205
1206 /*
1199 * If the new source is empty, just free dst and assign it 1207 * If the new source is empty, just free dst and assign it
1200 * the empty_hash. 1208 * the empty_hash.
1201 */ 1209 */
@@ -1215,9 +1223,10 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1215 if (bits > FTRACE_HASH_MAX_BITS) 1223 if (bits > FTRACE_HASH_MAX_BITS)
1216 bits = FTRACE_HASH_MAX_BITS; 1224 bits = FTRACE_HASH_MAX_BITS;
1217 1225
1226 ret = -ENOMEM;
1218 new_hash = alloc_ftrace_hash(bits); 1227 new_hash = alloc_ftrace_hash(bits);
1219 if (!new_hash) 1228 if (!new_hash)
1220 return -ENOMEM; 1229 goto out;
1221 1230
1222 size = 1 << src->size_bits; 1231 size = 1 << src->size_bits;
1223 for (i = 0; i < size; i++) { 1232 for (i = 0; i < size; i++) {
@@ -1236,7 +1245,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1236 rcu_assign_pointer(*dst, new_hash); 1245 rcu_assign_pointer(*dst, new_hash);
1237 free_ftrace_hash_rcu(old_hash); 1246 free_ftrace_hash_rcu(old_hash);
1238 1247
1239 return 0; 1248 ret = 0;
1249 out:
1250 /*
1251 * Enable regardless of ret:
1252 * On success, we enable the new hash.
1253 * On failure, we re-enable the original hash.
1254 */
1255 ftrace_hash_rec_enable(ops, enable);
1256
1257 return ret;
1240} 1258}
1241 1259
1242/* 1260/*
@@ -1498,7 +1516,7 @@ int ftrace_text_reserved(void *start, void *end)
1498 1516
1499 1517
1500static int 1518static int
1501__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1519__ftrace_replace_code(struct dyn_ftrace *rec, int update)
1502{ 1520{
1503 unsigned long ftrace_addr; 1521 unsigned long ftrace_addr;
1504 unsigned long flag = 0UL; 1522 unsigned long flag = 0UL;
@@ -1506,17 +1524,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1506 ftrace_addr = (unsigned long)FTRACE_ADDR; 1524 ftrace_addr = (unsigned long)FTRACE_ADDR;
1507 1525
1508 /* 1526 /*
1509 * If we are enabling tracing: 1527 * If we are updating calls:
1510 * 1528 *
1511 * If the record has a ref count, then we need to enable it 1529 * If the record has a ref count, then we need to enable it
1512 * because someone is using it. 1530 * because someone is using it.
1513 * 1531 *
1514 * Otherwise we make sure its disabled. 1532 * Otherwise we make sure its disabled.
1515 * 1533 *
1516 * If we are disabling tracing, then disable all records that 1534 * If we are disabling calls, then disable all records that
1517 * are enabled. 1535 * are enabled.
1518 */ 1536 */
1519 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1537 if (update && (rec->flags & ~FTRACE_FL_MASK))
1520 flag = FTRACE_FL_ENABLED; 1538 flag = FTRACE_FL_ENABLED;
1521 1539
1522 /* If the state of this record hasn't changed, then do nothing */ 1540 /* If the state of this record hasn't changed, then do nothing */
@@ -1532,7 +1550,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1532 return ftrace_make_nop(NULL, rec, ftrace_addr); 1550 return ftrace_make_nop(NULL, rec, ftrace_addr);
1533} 1551}
1534 1552
1535static void ftrace_replace_code(int enable) 1553static void ftrace_replace_code(int update)
1536{ 1554{
1537 struct dyn_ftrace *rec; 1555 struct dyn_ftrace *rec;
1538 struct ftrace_page *pg; 1556 struct ftrace_page *pg;
@@ -1546,7 +1564,7 @@ static void ftrace_replace_code(int enable)
1546 if (rec->flags & FTRACE_FL_FREE) 1564 if (rec->flags & FTRACE_FL_FREE)
1547 continue; 1565 continue;
1548 1566
1549 failed = __ftrace_replace_code(rec, enable); 1567 failed = __ftrace_replace_code(rec, update);
1550 if (failed) { 1568 if (failed) {
1551 ftrace_bug(failed, rec->ip); 1569 ftrace_bug(failed, rec->ip);
1552 /* Stop processing */ 1570 /* Stop processing */
@@ -1596,7 +1614,13 @@ static int __ftrace_modify_code(void *data)
1596{ 1614{
1597 int *command = data; 1615 int *command = data;
1598 1616
1599 if (*command & FTRACE_ENABLE_CALLS) 1617 /*
1618 * Do not call function tracer while we update the code.
1619 * We are in stop machine, no worrying about races.
1620 */
1621 function_trace_stop++;
1622
1623 if (*command & FTRACE_UPDATE_CALLS)
1600 ftrace_replace_code(1); 1624 ftrace_replace_code(1);
1601 else if (*command & FTRACE_DISABLE_CALLS) 1625 else if (*command & FTRACE_DISABLE_CALLS)
1602 ftrace_replace_code(0); 1626 ftrace_replace_code(0);
@@ -1609,6 +1633,18 @@ static int __ftrace_modify_code(void *data)
1609 else if (*command & FTRACE_STOP_FUNC_RET) 1633 else if (*command & FTRACE_STOP_FUNC_RET)
1610 ftrace_disable_ftrace_graph_caller(); 1634 ftrace_disable_ftrace_graph_caller();
1611 1635
1636#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1637 /*
1638 * For archs that call ftrace_test_stop_func(), we must
1639 * wait till after we update all the function callers
1640 * before we update the callback. This keeps different
1641 * ops that record different functions from corrupting
1642 * each other.
1643 */
1644 __ftrace_trace_function = __ftrace_trace_function_delay;
1645#endif
1646 function_trace_stop--;
1647
1612 return 0; 1648 return 0;
1613} 1649}
1614 1650
@@ -1652,7 +1688,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
1652 return -ENODEV; 1688 return -ENODEV;
1653 1689
1654 ftrace_start_up++; 1690 ftrace_start_up++;
1655 command |= FTRACE_ENABLE_CALLS; 1691 command |= FTRACE_UPDATE_CALLS;
1656 1692
1657 /* ops marked global share the filter hashes */ 1693 /* ops marked global share the filter hashes */
1658 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 1694 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
@@ -1704,8 +1740,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1704 if (ops != &global_ops || !global_start_up) 1740 if (ops != &global_ops || !global_start_up)
1705 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 1741 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1706 1742
1707 if (!ftrace_start_up) 1743 command |= FTRACE_UPDATE_CALLS;
1708 command |= FTRACE_DISABLE_CALLS;
1709 1744
1710 if (saved_ftrace_func != ftrace_trace_function) { 1745 if (saved_ftrace_func != ftrace_trace_function) {
1711 saved_ftrace_func = ftrace_trace_function; 1746 saved_ftrace_func = ftrace_trace_function;
@@ -1727,7 +1762,7 @@ static void ftrace_startup_sysctl(void)
1727 saved_ftrace_func = NULL; 1762 saved_ftrace_func = NULL;
1728 /* ftrace_start_up is true if we want ftrace running */ 1763 /* ftrace_start_up is true if we want ftrace running */
1729 if (ftrace_start_up) 1764 if (ftrace_start_up)
1730 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1765 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
1731} 1766}
1732 1767
1733static void ftrace_shutdown_sysctl(void) 1768static void ftrace_shutdown_sysctl(void)
@@ -1744,10 +1779,36 @@ static cycle_t ftrace_update_time;
1744static unsigned long ftrace_update_cnt; 1779static unsigned long ftrace_update_cnt;
1745unsigned long ftrace_update_tot_cnt; 1780unsigned long ftrace_update_tot_cnt;
1746 1781
1782static int ops_traces_mod(struct ftrace_ops *ops)
1783{
1784 struct ftrace_hash *hash;
1785
1786 hash = ops->filter_hash;
1787 return !!(!hash || !hash->count);
1788}
1789
1747static int ftrace_update_code(struct module *mod) 1790static int ftrace_update_code(struct module *mod)
1748{ 1791{
1749 struct dyn_ftrace *p; 1792 struct dyn_ftrace *p;
1750 cycle_t start, stop; 1793 cycle_t start, stop;
1794 unsigned long ref = 0;
1795
1796 /*
1797 * When adding a module, we need to check if tracers are
1798 * currently enabled and if they are set to trace all functions.
1799 * If they are, we need to enable the module functions as well
1800 * as update the reference counts for those function records.
1801 */
1802 if (mod) {
1803 struct ftrace_ops *ops;
1804
1805 for (ops = ftrace_ops_list;
1806 ops != &ftrace_list_end; ops = ops->next) {
1807 if (ops->flags & FTRACE_OPS_FL_ENABLED &&
1808 ops_traces_mod(ops))
1809 ref++;
1810 }
1811 }
1751 1812
1752 start = ftrace_now(raw_smp_processor_id()); 1813 start = ftrace_now(raw_smp_processor_id());
1753 ftrace_update_cnt = 0; 1814 ftrace_update_cnt = 0;
@@ -1760,7 +1821,7 @@ static int ftrace_update_code(struct module *mod)
1760 1821
1761 p = ftrace_new_addrs; 1822 p = ftrace_new_addrs;
1762 ftrace_new_addrs = p->newlist; 1823 ftrace_new_addrs = p->newlist;
1763 p->flags = 0L; 1824 p->flags = ref;
1764 1825
1765 /* 1826 /*
1766 * Do the initial record conversion from mcount jump 1827 * Do the initial record conversion from mcount jump
@@ -1783,7 +1844,7 @@ static int ftrace_update_code(struct module *mod)
1783 * conversion puts the module to the correct state, thus 1844 * conversion puts the module to the correct state, thus
1784 * passing the ftrace_make_call check. 1845 * passing the ftrace_make_call check.
1785 */ 1846 */
1786 if (ftrace_start_up) { 1847 if (ftrace_start_up && ref) {
1787 int failed = __ftrace_replace_code(p, 1); 1848 int failed = __ftrace_replace_code(p, 1);
1788 if (failed) { 1849 if (failed) {
1789 ftrace_bug(failed, p->ip); 1850 ftrace_bug(failed, p->ip);
@@ -2407,10 +2468,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
2407 */ 2468 */
2408 2469
2409static int 2470static int
2410ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2471ftrace_mod_callback(struct ftrace_hash *hash,
2472 char *func, char *cmd, char *param, int enable)
2411{ 2473{
2412 struct ftrace_ops *ops = &global_ops;
2413 struct ftrace_hash *hash;
2414 char *mod; 2474 char *mod;
2415 int ret = -EINVAL; 2475 int ret = -EINVAL;
2416 2476
@@ -2430,11 +2490,6 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
2430 if (!strlen(mod)) 2490 if (!strlen(mod))
2431 return ret; 2491 return ret;
2432 2492
2433 if (enable)
2434 hash = ops->filter_hash;
2435 else
2436 hash = ops->notrace_hash;
2437
2438 ret = ftrace_match_module_records(hash, func, mod); 2493 ret = ftrace_match_module_records(hash, func, mod);
2439 if (!ret) 2494 if (!ret)
2440 ret = -EINVAL; 2495 ret = -EINVAL;
@@ -2760,7 +2815,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash,
2760 mutex_lock(&ftrace_cmd_mutex); 2815 mutex_lock(&ftrace_cmd_mutex);
2761 list_for_each_entry(p, &ftrace_commands, list) { 2816 list_for_each_entry(p, &ftrace_commands, list) {
2762 if (strcmp(p->name, command) == 0) { 2817 if (strcmp(p->name, command) == 0) {
2763 ret = p->func(func, command, next, enable); 2818 ret = p->func(hash, func, command, next, enable);
2764 goto out_unlock; 2819 goto out_unlock;
2765 } 2820 }
2766 } 2821 }
@@ -2857,7 +2912,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2857 ftrace_match_records(hash, buf, len); 2912 ftrace_match_records(hash, buf, len);
2858 2913
2859 mutex_lock(&ftrace_lock); 2914 mutex_lock(&ftrace_lock);
2860 ret = ftrace_hash_move(orig_hash, hash); 2915 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
2916 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
2917 && ftrace_enabled)
2918 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
2919
2861 mutex_unlock(&ftrace_lock); 2920 mutex_unlock(&ftrace_lock);
2862 2921
2863 mutex_unlock(&ftrace_regex_lock); 2922 mutex_unlock(&ftrace_regex_lock);
@@ -3040,18 +3099,12 @@ ftrace_regex_release(struct inode *inode, struct file *file)
3040 orig_hash = &iter->ops->notrace_hash; 3099 orig_hash = &iter->ops->notrace_hash;
3041 3100
3042 mutex_lock(&ftrace_lock); 3101 mutex_lock(&ftrace_lock);
3043 /* 3102 ret = ftrace_hash_move(iter->ops, filter_hash,
3044 * Remove the current set, update the hash and add 3103 orig_hash, iter->hash);
3045 * them back. 3104 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3046 */ 3105 && ftrace_enabled)
3047 ftrace_hash_rec_disable(iter->ops, filter_hash); 3106 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3048 ret = ftrace_hash_move(orig_hash, iter->hash); 3107
3049 if (!ret) {
3050 ftrace_hash_rec_enable(iter->ops, filter_hash);
3051 if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
3052 && ftrace_enabled)
3053 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3054 }
3055 mutex_unlock(&ftrace_lock); 3108 mutex_unlock(&ftrace_lock);
3056 } 3109 }
3057 free_ftrace_hash(iter->hash); 3110 free_ftrace_hash(iter->hash);
@@ -3330,7 +3383,7 @@ static int ftrace_process_locs(struct module *mod,
3330{ 3383{
3331 unsigned long *p; 3384 unsigned long *p;
3332 unsigned long addr; 3385 unsigned long addr;
3333 unsigned long flags; 3386 unsigned long flags = 0; /* Shut up gcc */
3334 3387
3335 mutex_lock(&ftrace_lock); 3388 mutex_lock(&ftrace_lock);
3336 p = start; 3389 p = start;
@@ -3348,12 +3401,18 @@ static int ftrace_process_locs(struct module *mod,
3348 } 3401 }
3349 3402
3350 /* 3403 /*
3351 * Disable interrupts to prevent interrupts from executing 3404 * We only need to disable interrupts on start up
3352 * code that is being modified. 3405 * because we are modifying code that an interrupt
3406 * may execute, and the modification is not atomic.
3407 * But for modules, nothing runs the code we modify
3408 * until we are finished with it, and there's no
3409 * reason to cause large interrupt latencies while we do it.
3353 */ 3410 */
3354 local_irq_save(flags); 3411 if (!mod)
3412 local_irq_save(flags);
3355 ftrace_update_code(mod); 3413 ftrace_update_code(mod);
3356 local_irq_restore(flags); 3414 if (!mod)
3415 local_irq_restore(flags);
3357 mutex_unlock(&ftrace_lock); 3416 mutex_unlock(&ftrace_lock);
3358 3417
3359 return 0; 3418 return 0;
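
The ftrace_process_locs() change above restricts the irq-off window to the boot-time case: module text cannot run while its mcount sites are being converted, so there is no reason to take the interrupt-latency hit there, and flags is pre-initialised only to silence gcc's may-be-used-uninitialized warning. A minimal sketch of that conditional irq-disable idiom, with placeholder names:

#include <linux/irqflags.h>
#include <linux/types.h>

static void convert_call_sites(bool code_is_live)
{
	unsigned long flags = 0;	/* shut up gcc: only used if code_is_live */

	if (code_is_live)
		local_irq_save(flags);

	/* ... patch instructions an interrupt might otherwise execute ... */

	if (code_is_live)
		local_irq_restore(flags);
}
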
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b0c7aa40794..731201bf4ac 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -997,15 +997,21 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 unsigned nr_pages) 997 unsigned nr_pages)
998{ 998{
999 struct buffer_page *bpage, *tmp; 999 struct buffer_page *bpage, *tmp;
1000 unsigned long addr;
1001 LIST_HEAD(pages); 1000 LIST_HEAD(pages);
1002 unsigned i; 1001 unsigned i;
1003 1002
1004 WARN_ON(!nr_pages); 1003 WARN_ON(!nr_pages);
1005 1004
1006 for (i = 0; i < nr_pages; i++) { 1005 for (i = 0; i < nr_pages; i++) {
1006 struct page *page;
1007 /*
1008 * __GFP_NORETRY flag makes sure that the allocation fails
1009 * gracefully without invoking oom-killer and the system is
1010 * not destabilized.
1011 */
1007 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1012 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1008 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 1013 GFP_KERNEL | __GFP_NORETRY,
1014 cpu_to_node(cpu_buffer->cpu));
1009 if (!bpage) 1015 if (!bpage)
1010 goto free_pages; 1016 goto free_pages;
1011 1017
@@ -1013,10 +1019,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1013 1019
1014 list_add(&bpage->list, &pages); 1020 list_add(&bpage->list, &pages);
1015 1021
1016 addr = __get_free_page(GFP_KERNEL); 1022 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
1017 if (!addr) 1023 GFP_KERNEL | __GFP_NORETRY, 0);
1024 if (!page)
1018 goto free_pages; 1025 goto free_pages;
1019 bpage->page = (void *)addr; 1026 bpage->page = page_address(page);
1020 rb_init_page(bpage->page); 1027 rb_init_page(bpage->page);
1021 } 1028 }
1022 1029
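
Two things change in the allocation above: __GFP_NORETRY lets a failed buffer grow fail cleanly instead of pushing the system toward the OOM killer, and alloc_pages_node() replaces __get_free_page() so each data page is allocated on the node of the CPU that owns the buffer. A minimal sketch of that allocation pattern in isolation:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/topology.h>

static void *alloc_node_local_page(int cpu)
{
	struct page *page;

	/*
	 * Node-local, order-0 allocation; __GFP_NORETRY means a failure is
	 * returned to the caller instead of invoking the OOM killer.
	 */
	page = alloc_pages_node(cpu_to_node(cpu),
				GFP_KERNEL | __GFP_NORETRY, 0);
	if (!page)
		return NULL;

	return page_address(page);	/* GFP_KERNEL page is direct-mapped */
}
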
@@ -1045,7 +1052,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1045{ 1052{
1046 struct ring_buffer_per_cpu *cpu_buffer; 1053 struct ring_buffer_per_cpu *cpu_buffer;
1047 struct buffer_page *bpage; 1054 struct buffer_page *bpage;
1048 unsigned long addr; 1055 struct page *page;
1049 int ret; 1056 int ret;
1050 1057
1051 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), 1058 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
@@ -1067,10 +1074,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1067 rb_check_bpage(cpu_buffer, bpage); 1074 rb_check_bpage(cpu_buffer, bpage);
1068 1075
1069 cpu_buffer->reader_page = bpage; 1076 cpu_buffer->reader_page = bpage;
1070 addr = __get_free_page(GFP_KERNEL); 1077 page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
1071 if (!addr) 1078 if (!page)
1072 goto fail_free_reader; 1079 goto fail_free_reader;
1073 bpage->page = (void *)addr; 1080 bpage->page = page_address(page);
1074 rb_init_page(bpage->page); 1081 rb_init_page(bpage->page);
1075 1082
1076 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1083 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -1314,7 +1321,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1314 unsigned nr_pages, rm_pages, new_pages; 1321 unsigned nr_pages, rm_pages, new_pages;
1315 struct buffer_page *bpage, *tmp; 1322 struct buffer_page *bpage, *tmp;
1316 unsigned long buffer_size; 1323 unsigned long buffer_size;
1317 unsigned long addr;
1318 LIST_HEAD(pages); 1324 LIST_HEAD(pages);
1319 int i, cpu; 1325 int i, cpu;
1320 1326
@@ -1375,16 +1381,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1375 1381
1376 for_each_buffer_cpu(buffer, cpu) { 1382 for_each_buffer_cpu(buffer, cpu) {
1377 for (i = 0; i < new_pages; i++) { 1383 for (i = 0; i < new_pages; i++) {
1384 struct page *page;
1385 /*
1386 * __GFP_NORETRY flag makes sure that the allocation
1387 * fails gracefully without invoking oom-killer and
1388 * the system is not destabilized.
1389 */
1378 bpage = kzalloc_node(ALIGN(sizeof(*bpage), 1390 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
1379 cache_line_size()), 1391 cache_line_size()),
1380 GFP_KERNEL, cpu_to_node(cpu)); 1392 GFP_KERNEL | __GFP_NORETRY,
1393 cpu_to_node(cpu));
1381 if (!bpage) 1394 if (!bpage)
1382 goto free_pages; 1395 goto free_pages;
1383 list_add(&bpage->list, &pages); 1396 list_add(&bpage->list, &pages);
1384 addr = __get_free_page(GFP_KERNEL); 1397 page = alloc_pages_node(cpu_to_node(cpu),
1385 if (!addr) 1398 GFP_KERNEL | __GFP_NORETRY, 0);
1399 if (!page)
1386 goto free_pages; 1400 goto free_pages;
1387 bpage->page = (void *)addr; 1401 bpage->page = page_address(page);
1388 rb_init_page(bpage->page); 1402 rb_init_page(bpage->page);
1389 } 1403 }
1390 } 1404 }
@@ -3730,16 +3744,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3730 * Returns: 3744 * Returns:
3731 * The page allocated, or NULL on error. 3745 * The page allocated, or NULL on error.
3732 */ 3746 */
3733void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) 3747void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
3734{ 3748{
3735 struct buffer_data_page *bpage; 3749 struct buffer_data_page *bpage;
3736 unsigned long addr; 3750 struct page *page;
3737 3751
3738 addr = __get_free_page(GFP_KERNEL); 3752 page = alloc_pages_node(cpu_to_node(cpu),
3739 if (!addr) 3753 GFP_KERNEL | __GFP_NORETRY, 0);
3754 if (!page)
3740 return NULL; 3755 return NULL;
3741 3756
3742 bpage = (void *)addr; 3757 bpage = page_address(page);
3743 3758
3744 rb_init_page(bpage); 3759 rb_init_page(bpage);
3745 3760
@@ -3978,20 +3993,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
3978 size_t cnt, loff_t *ppos) 3993 size_t cnt, loff_t *ppos)
3979{ 3994{
3980 unsigned long *p = filp->private_data; 3995 unsigned long *p = filp->private_data;
3981 char buf[64];
3982 unsigned long val; 3996 unsigned long val;
3983 int ret; 3997 int ret;
3984 3998
3985 if (cnt >= sizeof(buf)) 3999 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3986 return -EINVAL; 4000 if (ret)
3987
3988 if (copy_from_user(&buf, ubuf, cnt))
3989 return -EFAULT;
3990
3991 buf[cnt] = 0;
3992
3993 ret = strict_strtoul(buf, 10, &val);
3994 if (ret < 0)
3995 return ret; 4001 return ret;
3996 4002
3997 if (val) 4003 if (val)
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 302f8a61463..a5457d577b9 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -106,7 +106,7 @@ static enum event_status read_page(int cpu)
106 int inc; 106 int inc;
107 int i; 107 int i;
108 108
109 bpage = ring_buffer_alloc_read_page(buffer); 109 bpage = ring_buffer_alloc_read_page(buffer, cpu);
110 if (!bpage) 110 if (!bpage)
111 return EVENT_DROPPED; 111 return EVENT_DROPPED;
112 112
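
ring_buffer_alloc_read_page() now takes the target CPU so the spare page can be allocated on that CPU's node; the benchmark above and the trace.c callers below simply pass the cpu they already track. A minimal sketch of the read-page round trip, using only the signatures visible in this diff:

#include <linux/mm.h>
#include <linux/ring_buffer.h>

static void read_one_page(struct ring_buffer *buffer, int cpu)
{
	void *page;
	int ret;

	page = ring_buffer_alloc_read_page(buffer, cpu);
	if (!page)
		return;

	/* len = PAGE_SIZE, full = 1: only swap out a completely filled page. */
	ret = ring_buffer_read_page(buffer, &page, PAGE_SIZE, cpu, 1);
	if (ret >= 0) {
		/* ... consume the events copied into page ... */
	}

	ring_buffer_free_read_page(buffer, page);
}
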
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ee9c921d7f2..17a2d44e1af 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
343static int trace_stop_count; 343static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
345 345
346static void wakeup_work_handler(struct work_struct *work)
347{
348 wake_up(&trace_wait);
349}
350
351static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
352
346/** 353/**
347 * trace_wake_up - wake up tasks waiting for trace input 354 * trace_wake_up - wake up tasks waiting for trace input
348 * 355 *
349 * Simply wakes up any task that is blocked on the trace_wait 356 * Schedules a delayed work to wake up any task that is blocked on the
 350 * queue. This is used with trace_poll for tasks polling the trace. 357 * trace_wait queue. This is used with trace_poll for tasks polling the
358 * trace.
351 */ 359 */
352void trace_wake_up(void) 360void trace_wake_up(void)
353{ 361{
354 int cpu; 362 const unsigned long delay = msecs_to_jiffies(2);
355 363
356 if (trace_flags & TRACE_ITER_BLOCK) 364 if (trace_flags & TRACE_ITER_BLOCK)
357 return; 365 return;
358 /* 366 schedule_delayed_work(&wakeup_work, delay);
359 * The runqueue_is_locked() can fail, but this is the best we
360 * have for now:
361 */
362 cpu = get_cpu();
363 if (!runqueue_is_locked(cpu))
364 wake_up(&trace_wait);
365 put_cpu();
366} 367}
367 368
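
The rewrite above drops the fragile runqueue_is_locked() check: instead of calling wake_up() directly from whatever context the tracer fires in, trace_wake_up() now schedules a short delayed work item so the wake-up always runs from safe workqueue context. A minimal sketch of that pattern with placeholder names:

#include <linux/workqueue.h>
#include <linux/wait.h>
#include <linux/jiffies.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wait);

static void example_wakeup_fn(struct work_struct *work)
{
	wake_up(&example_wait);
}
static DECLARE_DELAYED_WORK(example_wakeup_work, example_wakeup_fn);

static void example_poke_readers(void)
{
	/* Defer the wake-up by roughly 2ms so it runs from process context. */
	schedule_delayed_work(&example_wakeup_work, msecs_to_jiffies(2));
}
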
368static int __init set_buf_size(char *str) 369static int __init set_buf_size(char *str)
@@ -424,6 +425,7 @@ static const char *trace_options[] = {
424 "graph-time", 425 "graph-time",
425 "record-cmd", 426 "record-cmd",
426 "overwrite", 427 "overwrite",
428 "disable_on_free",
427 NULL 429 NULL
428}; 430};
429 431
@@ -1191,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
1191} 1193}
1192EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 1194EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1193 1195
1196void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event,
1198 unsigned long flags, int pc,
1199 struct pt_regs *regs)
1200{
1201 ring_buffer_unlock_commit(buffer, event);
1202
1203 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1204 ftrace_trace_userstack(buffer, flags, pc);
1205}
1206EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
1207
1194void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1208void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1195 struct ring_buffer_event *event) 1209 struct ring_buffer_event *event)
1196{ 1210{
@@ -1234,30 +1248,103 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1234} 1248}
1235 1249
1236#ifdef CONFIG_STACKTRACE 1250#ifdef CONFIG_STACKTRACE
1251
1252#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
1253struct ftrace_stack {
1254 unsigned long calls[FTRACE_STACK_MAX_ENTRIES];
1255};
1256
1257static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
1258static DEFINE_PER_CPU(int, ftrace_stack_reserve);
1259
1237static void __ftrace_trace_stack(struct ring_buffer *buffer, 1260static void __ftrace_trace_stack(struct ring_buffer *buffer,
1238 unsigned long flags, 1261 unsigned long flags,
1239 int skip, int pc) 1262 int skip, int pc, struct pt_regs *regs)
1240{ 1263{
1241 struct ftrace_event_call *call = &event_kernel_stack; 1264 struct ftrace_event_call *call = &event_kernel_stack;
1242 struct ring_buffer_event *event; 1265 struct ring_buffer_event *event;
1243 struct stack_entry *entry; 1266 struct stack_entry *entry;
1244 struct stack_trace trace; 1267 struct stack_trace trace;
1268 int use_stack;
1269 int size = FTRACE_STACK_ENTRIES;
1270
1271 trace.nr_entries = 0;
1272 trace.skip = skip;
1273
1274 /*
1275 * Since events can happen in NMIs there's no safe way to
1276 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
1277 * or NMI comes in, it will just have to use the default
1278 * FTRACE_STACK_SIZE.
1279 */
1280 preempt_disable_notrace();
1281
1282 use_stack = ++__get_cpu_var(ftrace_stack_reserve);
1283 /*
1284 * We don't need any atomic variables, just a barrier.
1285 * If an interrupt comes in, we don't care, because it would
1286 * have exited and put the counter back to what we want.
1287 * We just need a barrier to keep gcc from moving things
1288 * around.
1289 */
1290 barrier();
1291 if (use_stack == 1) {
1292 trace.entries = &__get_cpu_var(ftrace_stack).calls[0];
1293 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
1294
1295 if (regs)
1296 save_stack_trace_regs(regs, &trace);
1297 else
1298 save_stack_trace(&trace);
1299
1300 if (trace.nr_entries > size)
1301 size = trace.nr_entries;
1302 } else
1303 /* From now on, use_stack is a boolean */
1304 use_stack = 0;
1305
1306 size *= sizeof(unsigned long);
1245 1307
1246 event = trace_buffer_lock_reserve(buffer, TRACE_STACK, 1308 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1247 sizeof(*entry), flags, pc); 1309 sizeof(*entry) + size, flags, pc);
1248 if (!event) 1310 if (!event)
1249 return; 1311 goto out;
1250 entry = ring_buffer_event_data(event); 1312 entry = ring_buffer_event_data(event);
1251 memset(&entry->caller, 0, sizeof(entry->caller));
1252 1313
1253 trace.nr_entries = 0; 1314 memset(&entry->caller, 0, size);
1254 trace.max_entries = FTRACE_STACK_ENTRIES; 1315
1255 trace.skip = skip; 1316 if (use_stack)
1256 trace.entries = entry->caller; 1317 memcpy(&entry->caller, trace.entries,
1318 trace.nr_entries * sizeof(unsigned long));
1319 else {
1320 trace.max_entries = FTRACE_STACK_ENTRIES;
1321 trace.entries = entry->caller;
1322 if (regs)
1323 save_stack_trace_regs(regs, &trace);
1324 else
1325 save_stack_trace(&trace);
1326 }
1327
1328 entry->size = trace.nr_entries;
1257 1329
1258 save_stack_trace(&trace);
1259 if (!filter_check_discard(call, entry, buffer, event)) 1330 if (!filter_check_discard(call, entry, buffer, event))
1260 ring_buffer_unlock_commit(buffer, event); 1331 ring_buffer_unlock_commit(buffer, event);
1332
1333 out:
1334 /* Again, don't let gcc optimize things here */
1335 barrier();
1336 __get_cpu_var(ftrace_stack_reserve)--;
1337 preempt_enable_notrace();
1338
1339}
1340
1341void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
1342 int skip, int pc, struct pt_regs *regs)
1343{
1344 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1345 return;
1346
1347 __ftrace_trace_stack(buffer, flags, skip, pc, regs);
1261} 1348}
1262 1349
1263void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 1350void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
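
The stack-trace rewrite above avoids a large on-stack array by keeping one preallocated buffer per CPU and "reserving" it with a plain per-cpu counter: only the outermost user on a CPU gets the big buffer, a nested interrupt or NMI falls back to the small inline array, and barrier() keeps the compiler from reordering around the window. A stripped-down sketch of just that reservation idiom (names are placeholders; __get_cpu_var() matches the accessor used above):

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, example_reserve);

static void example_use_percpu_buffer(void)
{
	int first;

	preempt_disable_notrace();		/* stay on this CPU */
	first = ++__get_cpu_var(example_reserve);
	barrier();				/* order against a nested IRQ/NMI */

	if (first == 1) {
		/* ... outermost user: safe to use the big per-cpu buffer ... */
	} else {
		/* ... nested context: fall back to a small local buffer ... */
	}

	barrier();
	__get_cpu_var(example_reserve)--;
	preempt_enable_notrace();
}
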
@@ -1266,13 +1353,13 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1266 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1353 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1267 return; 1354 return;
1268 1355
1269 __ftrace_trace_stack(buffer, flags, skip, pc); 1356 __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
1270} 1357}
1271 1358
1272void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1359void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1273 int pc) 1360 int pc)
1274{ 1361{
1275 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1362 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
1276} 1363}
1277 1364
1278/** 1365/**
@@ -1288,7 +1375,7 @@ void trace_dump_stack(void)
1288 local_save_flags(flags); 1375 local_save_flags(flags);
1289 1376
1290 /* skipping 3 traces, seems to get us at the caller of this function */ 1377 /* skipping 3 traces, seems to get us at the caller of this function */
1291 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1378 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
1292} 1379}
1293 1380
1294static DEFINE_PER_CPU(int, user_stack_count); 1381static DEFINE_PER_CPU(int, user_stack_count);
@@ -1536,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1536 1623
1537 ftrace_enable_cpu(); 1624 ftrace_enable_cpu();
1538 1625
1539 return event ? ring_buffer_event_data(event) : NULL; 1626 if (event) {
1627 iter->ent_size = ring_buffer_event_length(event);
1628 return ring_buffer_event_data(event);
1629 }
1630 iter->ent_size = 0;
1631 return NULL;
1540} 1632}
1541 1633
1542static struct trace_entry * 1634static struct trace_entry *
@@ -2051,6 +2143,9 @@ void trace_default_header(struct seq_file *m)
2051{ 2143{
2052 struct trace_iterator *iter = m->private; 2144 struct trace_iterator *iter = m->private;
2053 2145
2146 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
2147 return;
2148
2054 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2149 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2055 /* print nothing if the buffers are empty */ 2150 /* print nothing if the buffers are empty */
2056 if (trace_empty(iter)) 2151 if (trace_empty(iter))
@@ -2701,20 +2796,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2701 size_t cnt, loff_t *ppos) 2796 size_t cnt, loff_t *ppos)
2702{ 2797{
2703 struct trace_array *tr = filp->private_data; 2798 struct trace_array *tr = filp->private_data;
2704 char buf[64];
2705 unsigned long val; 2799 unsigned long val;
2706 int ret; 2800 int ret;
2707 2801
2708 if (cnt >= sizeof(buf)) 2802 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2709 return -EINVAL; 2803 if (ret)
2710
2711 if (copy_from_user(&buf, ubuf, cnt))
2712 return -EFAULT;
2713
2714 buf[cnt] = 0;
2715
2716 ret = strict_strtoul(buf, 10, &val);
2717 if (ret < 0)
2718 return ret; 2804 return ret;
2719 2805
2720 val = !!val; 2806 val = !!val;
@@ -2767,7 +2853,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
2767 return t->init(tr); 2853 return t->init(tr);
2768} 2854}
2769 2855
2770static int tracing_resize_ring_buffer(unsigned long size) 2856static int __tracing_resize_ring_buffer(unsigned long size)
2771{ 2857{
2772 int ret; 2858 int ret;
2773 2859
@@ -2819,6 +2905,41 @@ static int tracing_resize_ring_buffer(unsigned long size)
2819 return ret; 2905 return ret;
2820} 2906}
2821 2907
2908static ssize_t tracing_resize_ring_buffer(unsigned long size)
2909{
2910 int cpu, ret = size;
2911
2912 mutex_lock(&trace_types_lock);
2913
2914 tracing_stop();
2915
2916 /* disable all cpu buffers */
2917 for_each_tracing_cpu(cpu) {
2918 if (global_trace.data[cpu])
2919 atomic_inc(&global_trace.data[cpu]->disabled);
2920 if (max_tr.data[cpu])
2921 atomic_inc(&max_tr.data[cpu]->disabled);
2922 }
2923
2924 if (size != global_trace.entries)
2925 ret = __tracing_resize_ring_buffer(size);
2926
2927 if (ret < 0)
2928 ret = -ENOMEM;
2929
2930 for_each_tracing_cpu(cpu) {
2931 if (global_trace.data[cpu])
2932 atomic_dec(&global_trace.data[cpu]->disabled);
2933 if (max_tr.data[cpu])
2934 atomic_dec(&max_tr.data[cpu]->disabled);
2935 }
2936
2937 tracing_start();
2938 mutex_unlock(&trace_types_lock);
2939
2940 return ret;
2941}
2942
2822 2943
2823/** 2944/**
2824 * tracing_update_buffers - used by tracing facility to expand ring buffers 2945 * tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -2836,7 +2957,7 @@ int tracing_update_buffers(void)
2836 2957
2837 mutex_lock(&trace_types_lock); 2958 mutex_lock(&trace_types_lock);
2838 if (!ring_buffer_expanded) 2959 if (!ring_buffer_expanded)
2839 ret = tracing_resize_ring_buffer(trace_buf_size); 2960 ret = __tracing_resize_ring_buffer(trace_buf_size);
2840 mutex_unlock(&trace_types_lock); 2961 mutex_unlock(&trace_types_lock);
2841 2962
2842 return ret; 2963 return ret;
@@ -2860,7 +2981,7 @@ static int tracing_set_tracer(const char *buf)
2860 mutex_lock(&trace_types_lock); 2981 mutex_lock(&trace_types_lock);
2861 2982
2862 if (!ring_buffer_expanded) { 2983 if (!ring_buffer_expanded) {
2863 ret = tracing_resize_ring_buffer(trace_buf_size); 2984 ret = __tracing_resize_ring_buffer(trace_buf_size);
2864 if (ret < 0) 2985 if (ret < 0)
2865 goto out; 2986 goto out;
2866 ret = 0; 2987 ret = 0;
@@ -2966,20 +3087,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
2966 size_t cnt, loff_t *ppos) 3087 size_t cnt, loff_t *ppos)
2967{ 3088{
2968 unsigned long *ptr = filp->private_data; 3089 unsigned long *ptr = filp->private_data;
2969 char buf[64];
2970 unsigned long val; 3090 unsigned long val;
2971 int ret; 3091 int ret;
2972 3092
2973 if (cnt >= sizeof(buf)) 3093 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2974 return -EINVAL; 3094 if (ret)
2975
2976 if (copy_from_user(&buf, ubuf, cnt))
2977 return -EFAULT;
2978
2979 buf[cnt] = 0;
2980
2981 ret = strict_strtoul(buf, 10, &val);
2982 if (ret < 0)
2983 return ret; 3095 return ret;
2984 3096
2985 *ptr = val * 1000; 3097 *ptr = val * 1000;
@@ -3434,67 +3546,54 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3434 size_t cnt, loff_t *ppos) 3546 size_t cnt, loff_t *ppos)
3435{ 3547{
3436 unsigned long val; 3548 unsigned long val;
3437 char buf[64]; 3549 int ret;
3438 int ret, cpu;
3439
3440 if (cnt >= sizeof(buf))
3441 return -EINVAL;
3442
3443 if (copy_from_user(&buf, ubuf, cnt))
3444 return -EFAULT;
3445
3446 buf[cnt] = 0;
3447 3550
3448 ret = strict_strtoul(buf, 10, &val); 3551 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3449 if (ret < 0) 3552 if (ret)
3450 return ret; 3553 return ret;
3451 3554
3452 /* must have at least 1 entry */ 3555 /* must have at least 1 entry */
3453 if (!val) 3556 if (!val)
3454 return -EINVAL; 3557 return -EINVAL;
3455 3558
3456 mutex_lock(&trace_types_lock);
3457
3458 tracing_stop();
3459
3460 /* disable all cpu buffers */
3461 for_each_tracing_cpu(cpu) {
3462 if (global_trace.data[cpu])
3463 atomic_inc(&global_trace.data[cpu]->disabled);
3464 if (max_tr.data[cpu])
3465 atomic_inc(&max_tr.data[cpu]->disabled);
3466 }
3467
3468 /* value is in KB */ 3559 /* value is in KB */
3469 val <<= 10; 3560 val <<= 10;
3470 3561
3471 if (val != global_trace.entries) { 3562 ret = tracing_resize_ring_buffer(val);
3472 ret = tracing_resize_ring_buffer(val); 3563 if (ret < 0)
3473 if (ret < 0) { 3564 return ret;
3474 cnt = ret;
3475 goto out;
3476 }
3477 }
3478 3565
3479 *ppos += cnt; 3566 *ppos += cnt;
3480 3567
3481 /* If check pages failed, return ENOMEM */ 3568 return cnt;
3482 if (tracing_disabled) 3569}
3483 cnt = -ENOMEM;
3484 out:
3485 for_each_tracing_cpu(cpu) {
3486 if (global_trace.data[cpu])
3487 atomic_dec(&global_trace.data[cpu]->disabled);
3488 if (max_tr.data[cpu])
3489 atomic_dec(&max_tr.data[cpu]->disabled);
3490 }
3491 3570
3492 tracing_start(); 3571static ssize_t
3493 mutex_unlock(&trace_types_lock); 3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos)
3574{
3575 /*
 3576 * There is no need to read what the user has written; this function
 3577 * is just to make sure that there is no error when "echo" is used.
3578 */
3579
3580 *ppos += cnt;
3494 3581
3495 return cnt; 3582 return cnt;
3496} 3583}
3497 3584
3585static int
3586tracing_free_buffer_release(struct inode *inode, struct file *filp)
3587{
3588 /* disable tracing ? */
3589 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3590 tracing_off();
3591 /* resize the ring buffer to 0 */
3592 tracing_resize_ring_buffer(0);
3593
3594 return 0;
3595}
3596
3498static int mark_printk(const char *fmt, ...) 3597static int mark_printk(const char *fmt, ...)
3499{ 3598{
3500 int ret; 3599 int ret;
@@ -3640,6 +3739,11 @@ static const struct file_operations tracing_entries_fops = {
3640 .llseek = generic_file_llseek, 3739 .llseek = generic_file_llseek,
3641}; 3740};
3642 3741
3742static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release,
3745};
3746
3643static const struct file_operations tracing_mark_fops = { 3747static const struct file_operations tracing_mark_fops = {
3644 .open = tracing_open_generic, 3748 .open = tracing_open_generic,
3645 .write = tracing_mark_write, 3749 .write = tracing_mark_write,
@@ -3696,7 +3800,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3696 return 0; 3800 return 0;
3697 3801
3698 if (!info->spare) 3802 if (!info->spare)
3699 info->spare = ring_buffer_alloc_read_page(info->tr->buffer); 3803 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
3700 if (!info->spare) 3804 if (!info->spare)
3701 return -ENOMEM; 3805 return -ENOMEM;
3702 3806
@@ -3704,8 +3808,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3704 if (info->read < PAGE_SIZE) 3808 if (info->read < PAGE_SIZE)
3705 goto read; 3809 goto read;
3706 3810
3707 info->read = 0;
3708
3709 trace_access_lock(info->cpu); 3811 trace_access_lock(info->cpu);
3710 ret = ring_buffer_read_page(info->tr->buffer, 3812 ret = ring_buffer_read_page(info->tr->buffer,
3711 &info->spare, 3813 &info->spare,
@@ -3715,6 +3817,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3715 if (ret < 0) 3817 if (ret < 0)
3716 return 0; 3818 return 0;
3717 3819
3820 info->read = 0;
3821
3718read: 3822read:
3719 size = PAGE_SIZE - info->read; 3823 size = PAGE_SIZE - info->read;
3720 if (size > count) 3824 if (size > count)
@@ -3853,7 +3957,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3853 3957
3854 ref->ref = 1; 3958 ref->ref = 1;
3855 ref->buffer = info->tr->buffer; 3959 ref->buffer = info->tr->buffer;
3856 ref->page = ring_buffer_alloc_read_page(ref->buffer); 3960 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
3857 if (!ref->page) { 3961 if (!ref->page) {
3858 kfree(ref); 3962 kfree(ref);
3859 break; 3963 break;
@@ -3862,8 +3966,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3862 r = ring_buffer_read_page(ref->buffer, &ref->page, 3966 r = ring_buffer_read_page(ref->buffer, &ref->page,
3863 len, info->cpu, 1); 3967 len, info->cpu, 1);
3864 if (r < 0) { 3968 if (r < 0) {
3865 ring_buffer_free_read_page(ref->buffer, 3969 ring_buffer_free_read_page(ref->buffer, ref->page);
3866 ref->page);
3867 kfree(ref); 3970 kfree(ref);
3868 break; 3971 break;
3869 } 3972 }
@@ -4099,19 +4202,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4099{ 4202{
4100 struct trace_option_dentry *topt = filp->private_data; 4203 struct trace_option_dentry *topt = filp->private_data;
4101 unsigned long val; 4204 unsigned long val;
4102 char buf[64];
4103 int ret; 4205 int ret;
4104 4206
4105 if (cnt >= sizeof(buf)) 4207 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4106 return -EINVAL; 4208 if (ret)
4107
4108 if (copy_from_user(&buf, ubuf, cnt))
4109 return -EFAULT;
4110
4111 buf[cnt] = 0;
4112
4113 ret = strict_strtoul(buf, 10, &val);
4114 if (ret < 0)
4115 return ret; 4209 return ret;
4116 4210
4117 if (val != 0 && val != 1) 4211 if (val != 0 && val != 1)
@@ -4159,20 +4253,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4159 loff_t *ppos) 4253 loff_t *ppos)
4160{ 4254{
4161 long index = (long)filp->private_data; 4255 long index = (long)filp->private_data;
4162 char buf[64];
4163 unsigned long val; 4256 unsigned long val;
4164 int ret; 4257 int ret;
4165 4258
4166 if (cnt >= sizeof(buf)) 4259 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4167 return -EINVAL; 4260 if (ret)
4168
4169 if (copy_from_user(&buf, ubuf, cnt))
4170 return -EFAULT;
4171
4172 buf[cnt] = 0;
4173
4174 ret = strict_strtoul(buf, 10, &val);
4175 if (ret < 0)
4176 return ret; 4261 return ret;
4177 4262
4178 if (val != 0 && val != 1) 4263 if (val != 0 && val != 1)
@@ -4365,6 +4450,9 @@ static __init int tracer_init_debugfs(void)
4365 trace_create_file("buffer_size_kb", 0644, d_tracer, 4450 trace_create_file("buffer_size_kb", 0644, d_tracer,
4366 &global_trace, &tracing_entries_fops); 4451 &global_trace, &tracing_entries_fops);
4367 4452
4453 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops);
4455
4368 trace_create_file("trace_marker", 0220, d_tracer, 4456 trace_create_file("trace_marker", 0220, d_tracer,
4369 NULL, &tracing_mark_fops); 4457 NULL, &tracing_mark_fops);
4370 4458
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 229f8591f61..616846bcfee 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -2,7 +2,7 @@
2#define _LINUX_KERNEL_TRACE_H 2#define _LINUX_KERNEL_TRACE_H
3 3
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <asm/atomic.h> 5#include <linux/atomic.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
@@ -278,6 +278,29 @@ struct tracer {
278}; 278};
279 279
280 280
281/* Only current can touch trace_recursion */
282#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
283#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
284
285/* Ring buffer has the 10 LSB bits to count */
286#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
287
288/* for function tracing recursion */
289#define TRACE_INTERNAL_BIT (1<<11)
290#define TRACE_GLOBAL_BIT (1<<12)
291/*
292 * Abuse of the trace_recursion.
293 * As we need a way to maintain state if we are tracing the function
294 * graph in irq because we want to trace a particular function that
295 * was called in irq context but we have irq tracing off. Since this
296 * can only be modified by current, we can reuse trace_recursion.
297 */
298#define TRACE_IRQ_BIT (1<<13)
299
300#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
301#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
302#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
303
281#define TRACE_PIPE_ALL_CPU -1 304#define TRACE_PIPE_ALL_CPU -1
282 305
283int tracer_init(struct tracer *t, struct trace_array *tr); 306int tracer_init(struct tracer *t, struct trace_array *tr);
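
With the trace_recursion helpers relocated here, a caller flags per-task state by setting or clearing bits in current->trace_recursion; the new TRACE_IRQ_BIT is used exactly that way by ftrace_graph_addr() further down in this patch. A minimal usage sketch:

#include <linux/hardirq.h>
#include "trace.h"	/* trace_recursion_*() and TRACE_IRQ_BIT from above */

static void note_irq_context(void)
{
	/* Remember whether the matched function was hit from irq context. */
	if (in_irq())
		trace_recursion_set(TRACE_IRQ_BIT);
	else
		trace_recursion_clear(TRACE_IRQ_BIT);
}

static int hit_in_irq(void)
{
	return trace_recursion_test(TRACE_IRQ_BIT) != 0;
}
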
@@ -389,6 +412,9 @@ void update_max_tr_single(struct trace_array *tr,
389void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 412void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
390 int skip, int pc); 413 int skip, int pc);
391 414
415void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
416 int skip, int pc, struct pt_regs *regs);
417
392void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, 418void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
393 int pc); 419 int pc);
394 420
@@ -400,6 +426,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer,
400{ 426{
401} 427}
402 428
429static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
430 unsigned long flags, int skip,
431 int pc, struct pt_regs *regs)
432{
433}
434
403static inline void ftrace_trace_userstack(struct ring_buffer *buffer, 435static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
404 unsigned long flags, int pc) 436 unsigned long flags, int pc)
405{ 437{
@@ -507,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr)
507 return 1; 539 return 1;
508 540
509 for (i = 0; i < ftrace_graph_count; i++) { 541 for (i = 0; i < ftrace_graph_count; i++) {
510 if (addr == ftrace_graph_funcs[i]) 542 if (addr == ftrace_graph_funcs[i]) {
543 /*
544 * If no irqs are to be traced, but a set_graph_function
545 * is set, and called by an interrupt handler, we still
546 * want to trace it.
547 */
548 if (in_irq())
549 trace_recursion_set(TRACE_IRQ_BIT);
550 else
551 trace_recursion_clear(TRACE_IRQ_BIT);
511 return 1; 552 return 1;
553 }
512 } 554 }
513 555
514 return 0; 556 return 0;
@@ -609,6 +651,7 @@ enum trace_iterator_flags {
609 TRACE_ITER_GRAPH_TIME = 0x80000, 651 TRACE_ITER_GRAPH_TIME = 0x80000,
610 TRACE_ITER_RECORD_CMD = 0x100000, 652 TRACE_ITER_RECORD_CMD = 0x100000,
611 TRACE_ITER_OVERWRITE = 0x200000, 653 TRACE_ITER_OVERWRITE = 0x200000,
654 TRACE_ITER_STOP_ON_FREE = 0x400000,
612}; 655};
613 656
614/* 657/*
@@ -677,6 +720,7 @@ struct event_subsystem {
677 struct dentry *entry; 720 struct dentry *entry;
678 struct event_filter *filter; 721 struct event_filter *filter;
679 int nr_events; 722 int nr_events;
723 int ref_count;
680}; 724};
681 725
682#define FILTER_PRED_INVALID ((unsigned short)-1) 726#define FILTER_PRED_INVALID ((unsigned short)-1)
@@ -784,19 +828,4 @@ extern const char *__stop___trace_bprintk_fmt[];
784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 828 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
785#include "trace_entries.h" 829#include "trace_entries.h"
786 830
787/* Only current can touch trace_recursion */
788#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
789#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
790
791/* Ring buffer has the 10 LSB bits to count */
792#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
793
794/* for function tracing recursion */
795#define TRACE_INTERNAL_BIT (1<<11)
796#define TRACE_GLOBAL_BIT (1<<12)
797
798#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
799#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
800#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
801
802#endif /* _LINUX_KERNEL_TRACE_H */ 831#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e32744c84d9..93365907f21 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
161 TRACE_STACK, 161 TRACE_STACK,
162 162
163 F_STRUCT( 163 F_STRUCT(
164 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 164 __field( int, size )
165 __dynamic_array(unsigned long, caller )
165 ), 166 ),
166 167
167 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 168 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 686ec399f2a..c212a7f934e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -244,6 +244,35 @@ static void ftrace_clear_events(void)
244 mutex_unlock(&event_mutex); 244 mutex_unlock(&event_mutex);
245} 245}
246 246
247static void __put_system(struct event_subsystem *system)
248{
249 struct event_filter *filter = system->filter;
250
251 WARN_ON_ONCE(system->ref_count == 0);
252 if (--system->ref_count)
253 return;
254
255 if (filter) {
256 kfree(filter->filter_string);
257 kfree(filter);
258 }
259 kfree(system->name);
260 kfree(system);
261}
262
263static void __get_system(struct event_subsystem *system)
264{
265 WARN_ON_ONCE(system->ref_count == 0);
266 system->ref_count++;
267}
268
269static void put_system(struct event_subsystem *system)
270{
271 mutex_lock(&event_mutex);
272 __put_system(system);
273 mutex_unlock(&event_mutex);
274}
275
247/* 276/*
248 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 277 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
249 */ 278 */
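
The helpers above add a plain reference count to event_subsystem: __get_system()/__put_system() expect event_mutex to be held, and put_system() is the locking convenience wrapper. Their pairing is established later in this patch (subsystem_open() takes the reference, subsystem_release() drops it); a compressed sketch of that lifetime, with placeholder file-operation names:

/* Placeholder names; the real pairing is subsystem_open()/subsystem_release(). */
static int example_open(struct inode *inode, struct file *filp)
{
	struct event_subsystem *system = inode->i_private;

	mutex_lock(&event_mutex);
	__get_system(system);		/* keep it alive while the file is open */
	mutex_unlock(&event_mutex);

	return 0;
}

static int example_release(struct inode *inode, struct file *filp)
{
	put_system(inode->i_private);	/* may free the subsystem on the last put */
	return 0;
}
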
@@ -486,20 +515,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
486 loff_t *ppos) 515 loff_t *ppos)
487{ 516{
488 struct ftrace_event_call *call = filp->private_data; 517 struct ftrace_event_call *call = filp->private_data;
489 char buf[64];
490 unsigned long val; 518 unsigned long val;
491 int ret; 519 int ret;
492 520
493 if (cnt >= sizeof(buf)) 521 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
494 return -EINVAL; 522 if (ret)
495
496 if (copy_from_user(&buf, ubuf, cnt))
497 return -EFAULT;
498
499 buf[cnt] = 0;
500
501 ret = strict_strtoul(buf, 10, &val);
502 if (ret < 0)
503 return ret; 523 return ret;
504 524
505 ret = tracing_update_buffers(); 525 ret = tracing_update_buffers();
@@ -528,7 +548,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
528 loff_t *ppos) 548 loff_t *ppos)
529{ 549{
530 const char set_to_char[4] = { '?', '0', '1', 'X' }; 550 const char set_to_char[4] = { '?', '0', '1', 'X' };
531 const char *system = filp->private_data; 551 struct event_subsystem *system = filp->private_data;
532 struct ftrace_event_call *call; 552 struct ftrace_event_call *call;
533 char buf[2]; 553 char buf[2];
534 int set = 0; 554 int set = 0;
@@ -539,7 +559,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
539 if (!call->name || !call->class || !call->class->reg) 559 if (!call->name || !call->class || !call->class->reg)
540 continue; 560 continue;
541 561
542 if (system && strcmp(call->class->system, system) != 0) 562 if (system && strcmp(call->class->system, system->name) != 0)
543 continue; 563 continue;
544 564
545 /* 565 /*
@@ -569,21 +589,13 @@ static ssize_t
569system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 589system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
570 loff_t *ppos) 590 loff_t *ppos)
571{ 591{
572 const char *system = filp->private_data; 592 struct event_subsystem *system = filp->private_data;
593 const char *name = NULL;
573 unsigned long val; 594 unsigned long val;
574 char buf[64];
575 ssize_t ret; 595 ssize_t ret;
576 596
577 if (cnt >= sizeof(buf)) 597 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
578 return -EINVAL; 598 if (ret)
579
580 if (copy_from_user(&buf, ubuf, cnt))
581 return -EFAULT;
582
583 buf[cnt] = 0;
584
585 ret = strict_strtoul(buf, 10, &val);
586 if (ret < 0)
587 return ret; 599 return ret;
588 600
589 ret = tracing_update_buffers(); 601 ret = tracing_update_buffers();
@@ -593,7 +605,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
593 if (val != 0 && val != 1) 605 if (val != 0 && val != 1)
594 return -EINVAL; 606 return -EINVAL;
595 607
596 ret = __ftrace_set_clr_event(NULL, system, NULL, val); 608 /*
609 * Opening of "enable" adds a ref count to system,
610 * so the name is safe to use.
611 */
612 if (system)
613 name = system->name;
614
615 ret = __ftrace_set_clr_event(NULL, name, NULL, val);
597 if (ret) 616 if (ret)
598 goto out; 617 goto out;
599 618
@@ -826,6 +845,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
826 return cnt; 845 return cnt;
827} 846}
828 847
848static LIST_HEAD(event_subsystems);
849
850static int subsystem_open(struct inode *inode, struct file *filp)
851{
852 struct event_subsystem *system = NULL;
853 int ret;
854
855 if (!inode->i_private)
856 goto skip_search;
857
858 /* Make sure the system still exists */
859 mutex_lock(&event_mutex);
860 list_for_each_entry(system, &event_subsystems, list) {
861 if (system == inode->i_private) {
862 /* Don't open systems with no events */
863 if (!system->nr_events) {
864 system = NULL;
865 break;
866 }
867 __get_system(system);
868 break;
869 }
870 }
871 mutex_unlock(&event_mutex);
872
873 if (system != inode->i_private)
874 return -ENODEV;
875
876 skip_search:
877 ret = tracing_open_generic(inode, filp);
878 if (ret < 0 && system)
879 put_system(system);
880
881 return ret;
882}
883
884static int subsystem_release(struct inode *inode, struct file *file)
885{
886 struct event_subsystem *system = inode->i_private;
887
888 if (system)
889 put_system(system);
890
891 return 0;
892}
893
829static ssize_t 894static ssize_t
830subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 895subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
831 loff_t *ppos) 896 loff_t *ppos)
@@ -963,17 +1028,19 @@ static const struct file_operations ftrace_event_filter_fops = {
963}; 1028};
964 1029
965static const struct file_operations ftrace_subsystem_filter_fops = { 1030static const struct file_operations ftrace_subsystem_filter_fops = {
966 .open = tracing_open_generic, 1031 .open = subsystem_open,
967 .read = subsystem_filter_read, 1032 .read = subsystem_filter_read,
968 .write = subsystem_filter_write, 1033 .write = subsystem_filter_write,
969 .llseek = default_llseek, 1034 .llseek = default_llseek,
1035 .release = subsystem_release,
970}; 1036};
971 1037
972static const struct file_operations ftrace_system_enable_fops = { 1038static const struct file_operations ftrace_system_enable_fops = {
973 .open = tracing_open_generic, 1039 .open = subsystem_open,
974 .read = system_enable_read, 1040 .read = system_enable_read,
975 .write = system_enable_write, 1041 .write = system_enable_write,
976 .llseek = default_llseek, 1042 .llseek = default_llseek,
1043 .release = subsystem_release,
977}; 1044};
978 1045
979static const struct file_operations ftrace_show_header_fops = { 1046static const struct file_operations ftrace_show_header_fops = {
@@ -1002,8 +1069,6 @@ static struct dentry *event_trace_events_dir(void)
1002 return d_events; 1069 return d_events;
1003} 1070}
1004 1071
1005static LIST_HEAD(event_subsystems);
1006
1007static struct dentry * 1072static struct dentry *
1008event_subsystem_dir(const char *name, struct dentry *d_events) 1073event_subsystem_dir(const char *name, struct dentry *d_events)
1009{ 1074{
@@ -1035,6 +1100,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1035 } 1100 }
1036 1101
1037 system->nr_events = 1; 1102 system->nr_events = 1;
1103 system->ref_count = 1;
1038 system->name = kstrdup(name, GFP_KERNEL); 1104 system->name = kstrdup(name, GFP_KERNEL);
1039 if (!system->name) { 1105 if (!system->name) {
1040 debugfs_remove(system->entry); 1106 debugfs_remove(system->entry);
@@ -1062,8 +1128,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1062 "'%s/filter' entry\n", name); 1128 "'%s/filter' entry\n", name);
1063 } 1129 }
1064 1130
1065 trace_create_file("enable", 0644, system->entry, 1131 trace_create_file("enable", 0644, system->entry, system,
1066 (void *)system->name,
1067 &ftrace_system_enable_fops); 1132 &ftrace_system_enable_fops);
1068 1133
1069 return system->entry; 1134 return system->entry;
@@ -1184,16 +1249,9 @@ static void remove_subsystem_dir(const char *name)
1184 list_for_each_entry(system, &event_subsystems, list) { 1249 list_for_each_entry(system, &event_subsystems, list) {
1185 if (strcmp(system->name, name) == 0) { 1250 if (strcmp(system->name, name) == 0) {
1186 if (!--system->nr_events) { 1251 if (!--system->nr_events) {
1187 struct event_filter *filter = system->filter;
1188
1189 debugfs_remove_recursive(system->entry); 1252 debugfs_remove_recursive(system->entry);
1190 list_del(&system->list); 1253 list_del(&system->list);
1191 if (filter) { 1254 __put_system(system);
1192 kfree(filter->filter_string);
1193 kfree(filter);
1194 }
1195 kfree(system->name);
1196 kfree(system);
1197 } 1255 }
1198 break; 1256 break;
1199 } 1257 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8008ddcfbf2..bd3c6369f80 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1766,7 +1766,7 @@ static int replace_system_preds(struct event_subsystem *system,
1766 * replace the filter for the call. 1766 * replace the filter for the call.
1767 */ 1767 */
1768 filter = call->filter; 1768 filter = call->filter;
1769 call->filter = filter_item->filter; 1769 rcu_assign_pointer(call->filter, filter_item->filter);
1770 filter_item->filter = filter; 1770 filter_item->filter = filter;
1771 1771
1772 fail = false; 1772 fail = false;
@@ -1821,7 +1821,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1821 filter = call->filter; 1821 filter = call->filter;
1822 if (!filter) 1822 if (!filter)
1823 goto out_unlock; 1823 goto out_unlock;
1824 call->filter = NULL; 1824 RCU_INIT_POINTER(call->filter, NULL);
1825 /* Make sure the filter is not being used */ 1825 /* Make sure the filter is not being used */
1826 synchronize_sched(); 1826 synchronize_sched();
1827 __free_filter(filter); 1827 __free_filter(filter);
@@ -1862,7 +1862,7 @@ out:
1862 * string 1862 * string
1863 */ 1863 */
1864 tmp = call->filter; 1864 tmp = call->filter;
1865 call->filter = filter; 1865 rcu_assign_pointer(call->filter, filter);
1866 if (tmp) { 1866 if (tmp) {
1867 /* Make sure the call is done with the filter */ 1867 /* Make sure the call is done with the filter */
1868 synchronize_sched(); 1868 synchronize_sched();
@@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1886 1886
1887 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1888 1888
1889 /* Make sure the system still has events */
1890 if (!system->nr_events) {
1891 err = -ENODEV;
1892 goto out_unlock;
1893 }
1894
1889 if (!strcmp(strstrip(filter_string), "0")) { 1895 if (!strcmp(strstrip(filter_string), "0")) {
1890 filter_free_subsystem_preds(system); 1896 filter_free_subsystem_preds(system);
1891 remove_filter_string(system->filter); 1897 remove_filter_string(system->filter);
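The filter changes above switch bare pointer assignments to rcu_assign_pointer()/RCU_INIT_POINTER(): a new filter must be fully initialized and published with release semantics before readers can see it, and the old one is only freed after synchronize_sched() guarantees no reader still holds it. The sketch below imitates only the publish side with C11 release/acquire atomics; the grace-period wait is reduced to a comment, since a faithful user-space version would need liburcu. struct filter, current_filter and install_filter() are names invented for the sketch.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct filter {
    char *filter_string;
};

/* Readers load this with acquire ordering, like rcu_dereference(). */
static _Atomic(struct filter *) current_filter;

/* Publish a new filter, then reclaim the old one. */
static void install_filter(const char *str)
{
    struct filter *nf = malloc(sizeof(*nf));
    struct filter *old;

    nf->filter_string = strdup(str);

    /* Release store: the fully built filter becomes visible first. */
    old = atomic_exchange_explicit(&current_filter, nf,
                                   memory_order_acq_rel);

    /*
     * The kernel calls synchronize_sched() at this point so no reader can
     * still hold "old" when it is freed; a user-space port would use
     * synchronize_rcu() from liburcu instead.
     */
    if (old) {
        free(old->filter_string);
        free(old);
    }
}

static void read_filter(void)
{
    struct filter *f = atomic_load_explicit(&current_filter,
                                            memory_order_acquire);
    if (f)
        printf("active filter: %s\n", f->filter_string);
}

int main(void)
{
    install_filter("common_pid != 0");
    read_filter();
    install_filter("0");    /* replacing frees the previous filter */
    read_filter();
    return 0;
}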
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8d0e1cc4e97..c7b0c6a7db0 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -324,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
324} 324}
325 325
326static int 326static int
327ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) 327ftrace_trace_onoff_callback(struct ftrace_hash *hash,
328 char *glob, char *cmd, char *param, int enable)
328{ 329{
329 struct ftrace_probe_ops *ops; 330 struct ftrace_probe_ops *ops;
330 void *count = (void *)-1; 331 void *count = (void *)-1;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 962cdb24ed8..a7d2a4c653d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = {
74 74
75static struct trace_array *graph_array; 75static struct trace_array *graph_array;
76 76
77/*
 78 * The DURATION column is also used to display IRQ signs; the
 79 * following values are used by print_graph_irq and others
 80 * to fill in space in the DURATION column.
81 */
82enum {
83 DURATION_FILL_FULL = -1,
84 DURATION_FILL_START = -2,
85 DURATION_FILL_END = -3,
86};
87
88static enum print_line_t
89print_graph_duration(unsigned long long duration, struct trace_seq *s,
90 u32 flags);
77 91
78/* Add a function return address to the trace stack on thread info.*/ 92/* Add a function return address to the trace stack on thread info.*/
79int 93int
@@ -213,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr,
213 227
214static inline int ftrace_graph_ignore_irqs(void) 228static inline int ftrace_graph_ignore_irqs(void)
215{ 229{
216 if (!ftrace_graph_skip_irqs) 230 if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
217 return 0; 231 return 0;
218 232
219 return in_irq(); 233 return in_irq();
@@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter,
577 return next; 591 return next;
578} 592}
579 593
580/* Signal a overhead of time execution to the output */
581static int
582print_graph_overhead(unsigned long long duration, struct trace_seq *s,
583 u32 flags)
584{
585 /* If duration disappear, we don't need anything */
586 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
587 return 1;
588
589 /* Non nested entry or return */
590 if (duration == -1)
591 return trace_seq_printf(s, " ");
592
593 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
594 /* Duration exceeded 100 msecs */
595 if (duration > 100000ULL)
596 return trace_seq_printf(s, "! ");
597
598 /* Duration exceeded 10 msecs */
599 if (duration > 10000ULL)
600 return trace_seq_printf(s, "+ ");
601 }
602
603 return trace_seq_printf(s, " ");
604}
605
606static int print_graph_abs_time(u64 t, struct trace_seq *s) 594static int print_graph_abs_time(u64 t, struct trace_seq *s)
607{ 595{
608 unsigned long usecs_rem; 596 unsigned long usecs_rem;
@@ -625,34 +613,36 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
625 addr >= (unsigned long)__irqentry_text_end) 613 addr >= (unsigned long)__irqentry_text_end)
626 return TRACE_TYPE_UNHANDLED; 614 return TRACE_TYPE_UNHANDLED;
627 615
628 /* Absolute time */ 616 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
629 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 617 /* Absolute time */
630 ret = print_graph_abs_time(iter->ts, s); 618 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
631 if (!ret) 619 ret = print_graph_abs_time(iter->ts, s);
632 return TRACE_TYPE_PARTIAL_LINE; 620 if (!ret)
633 } 621 return TRACE_TYPE_PARTIAL_LINE;
622 }
634 623
635 /* Cpu */ 624 /* Cpu */
636 if (flags & TRACE_GRAPH_PRINT_CPU) { 625 if (flags & TRACE_GRAPH_PRINT_CPU) {
637 ret = print_graph_cpu(s, cpu); 626 ret = print_graph_cpu(s, cpu);
638 if (ret == TRACE_TYPE_PARTIAL_LINE) 627 if (ret == TRACE_TYPE_PARTIAL_LINE)
639 return TRACE_TYPE_PARTIAL_LINE; 628 return TRACE_TYPE_PARTIAL_LINE;
640 } 629 }
641 630
642 /* Proc */ 631 /* Proc */
643 if (flags & TRACE_GRAPH_PRINT_PROC) { 632 if (flags & TRACE_GRAPH_PRINT_PROC) {
644 ret = print_graph_proc(s, pid); 633 ret = print_graph_proc(s, pid);
645 if (ret == TRACE_TYPE_PARTIAL_LINE) 634 if (ret == TRACE_TYPE_PARTIAL_LINE)
646 return TRACE_TYPE_PARTIAL_LINE; 635 return TRACE_TYPE_PARTIAL_LINE;
647 ret = trace_seq_printf(s, " | "); 636 ret = trace_seq_printf(s, " | ");
648 if (!ret) 637 if (!ret)
649 return TRACE_TYPE_PARTIAL_LINE; 638 return TRACE_TYPE_PARTIAL_LINE;
639 }
650 } 640 }
651 641
652 /* No overhead */ 642 /* No overhead */
653 ret = print_graph_overhead(-1, s, flags); 643 ret = print_graph_duration(DURATION_FILL_START, s, flags);
654 if (!ret) 644 if (ret != TRACE_TYPE_HANDLED)
655 return TRACE_TYPE_PARTIAL_LINE; 645 return ret;
656 646
657 if (type == TRACE_GRAPH_ENT) 647 if (type == TRACE_GRAPH_ENT)
658 ret = trace_seq_printf(s, "==========>"); 648 ret = trace_seq_printf(s, "==========>");
@@ -662,9 +652,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
662 if (!ret) 652 if (!ret)
663 return TRACE_TYPE_PARTIAL_LINE; 653 return TRACE_TYPE_PARTIAL_LINE;
664 654
665 /* Don't close the duration column if haven't one */ 655 ret = print_graph_duration(DURATION_FILL_END, s, flags);
666 if (flags & TRACE_GRAPH_PRINT_DURATION) 656 if (ret != TRACE_TYPE_HANDLED)
667 trace_seq_printf(s, " |"); 657 return ret;
658
668 ret = trace_seq_printf(s, "\n"); 659 ret = trace_seq_printf(s, "\n");
669 660
670 if (!ret) 661 if (!ret)
@@ -716,9 +707,49 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
716} 707}
717 708
718static enum print_line_t 709static enum print_line_t
719print_graph_duration(unsigned long long duration, struct trace_seq *s) 710print_graph_duration(unsigned long long duration, struct trace_seq *s,
711 u32 flags)
720{ 712{
721 int ret; 713 int ret = -1;
714
715 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
716 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
717 return TRACE_TYPE_HANDLED;
718
 719 /* No real data, just filling the column with spaces */
720 switch (duration) {
721 case DURATION_FILL_FULL:
722 ret = trace_seq_printf(s, " | ");
723 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
724 case DURATION_FILL_START:
725 ret = trace_seq_printf(s, " ");
726 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
727 case DURATION_FILL_END:
728 ret = trace_seq_printf(s, " |");
729 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
730 }
731
 732 /* Signal an overhead of execution time to the output */
733 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
734 /* Duration exceeded 100 msecs */
735 if (duration > 100000ULL)
736 ret = trace_seq_printf(s, "! ");
737 /* Duration exceeded 10 msecs */
738 else if (duration > 10000ULL)
739 ret = trace_seq_printf(s, "+ ");
740 }
741
742 /*
 743 * The -1 means we either did not exceed the duration thresholds
 744 * or we don't want to print out the overhead. Either way we need
745 * to fill out the space.
746 */
747 if (ret == -1)
748 ret = trace_seq_printf(s, " ");
749
 750 /* Catch any failure that happened above */
751 if (!ret)
752 return TRACE_TYPE_PARTIAL_LINE;
722 753
723 ret = trace_print_graph_duration(duration, s); 754 ret = trace_print_graph_duration(duration, s);
724 if (ret != TRACE_TYPE_HANDLED) 755 if (ret != TRACE_TYPE_HANDLED)
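After this hunk, print_graph_duration() absorbs the old print_graph_overhead(): the negative DURATION_FILL_* sentinels only pad the column, real durations get a '!' or '+' overhead marker ahead of the microsecond value, and everything is skipped when context info is disabled. Below is a condensed user-space sketch of that decision logic; the sentinel values and the 100 ms / 10 ms thresholds come from the patch, while the flag names, column widths and print_duration() itself are simplified stand-ins.

#include <stdio.h>

enum {
    DURATION_FILL_FULL  = -1,   /* whole column of spaces plus "|" */
    DURATION_FILL_START = -2,   /* leading pad before an IRQ marker */
    DURATION_FILL_END   = -3,   /* trailing " |" after an IRQ marker */
};

#define PRINT_DURATION 0x1
#define PRINT_OVERHEAD 0x2

/* Emit the DURATION column for one trace line. */
static void print_duration(long long duration, unsigned int flags)
{
    if (!(flags & PRINT_DURATION))
        return;

    /* No real data, just fill the column with spaces. */
    switch (duration) {
    case DURATION_FILL_FULL:
        fputs("          |  ", stdout);
        return;
    case DURATION_FILL_START:
        fputs("  ", stdout);
        return;
    case DURATION_FILL_END:
        fputs(" |", stdout);
        return;
    }

    /* Overhead marker: '!' above 100 ms, '+' above 10 ms (values in usecs). */
    if ((flags & PRINT_OVERHEAD) && duration > 100000LL)
        fputs("! ", stdout);
    else if ((flags & PRINT_OVERHEAD) && duration > 10000LL)
        fputs("+ ", stdout);
    else
        fputs("  ", stdout);

    printf("%llu us |", (unsigned long long)duration);
}

int main(void)
{
    print_duration(DURATION_FILL_FULL, PRINT_DURATION);
    puts("funcgraph_entry()");
    print_duration(123456, PRINT_DURATION | PRINT_OVERHEAD);
    puts(" }");     /* prints "! 123456 us |" before the closing brace */
    return 0;
}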
@@ -767,18 +798,11 @@ print_graph_entry_leaf(struct trace_iterator *iter,
767 cpu_data->enter_funcs[call->depth] = 0; 798 cpu_data->enter_funcs[call->depth] = 0;
768 } 799 }
769 800
770 /* Overhead */ 801 /* Overhead and duration */
771 ret = print_graph_overhead(duration, s, flags); 802 ret = print_graph_duration(duration, s, flags);
772 if (!ret) 803 if (ret == TRACE_TYPE_PARTIAL_LINE)
773 return TRACE_TYPE_PARTIAL_LINE; 804 return TRACE_TYPE_PARTIAL_LINE;
774 805
775 /* Duration */
776 if (flags & TRACE_GRAPH_PRINT_DURATION) {
777 ret = print_graph_duration(duration, s);
778 if (ret == TRACE_TYPE_PARTIAL_LINE)
779 return TRACE_TYPE_PARTIAL_LINE;
780 }
781
782 /* Function */ 806 /* Function */
783 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 807 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
784 ret = trace_seq_printf(s, " "); 808 ret = trace_seq_printf(s, " ");
@@ -815,17 +839,10 @@ print_graph_entry_nested(struct trace_iterator *iter,
815 cpu_data->enter_funcs[call->depth] = call->func; 839 cpu_data->enter_funcs[call->depth] = call->func;
816 } 840 }
817 841
818 /* No overhead */
819 ret = print_graph_overhead(-1, s, flags);
820 if (!ret)
821 return TRACE_TYPE_PARTIAL_LINE;
822
823 /* No time */ 842 /* No time */
824 if (flags & TRACE_GRAPH_PRINT_DURATION) { 843 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
825 ret = trace_seq_printf(s, " | "); 844 if (ret != TRACE_TYPE_HANDLED)
826 if (!ret) 845 return ret;
827 return TRACE_TYPE_PARTIAL_LINE;
828 }
829 846
830 /* Function */ 847 /* Function */
831 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 848 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -865,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
865 return TRACE_TYPE_PARTIAL_LINE; 882 return TRACE_TYPE_PARTIAL_LINE;
866 } 883 }
867 884
885 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
886 return 0;
887
868 /* Absolute time */ 888 /* Absolute time */
869 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 889 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
870 ret = print_graph_abs_time(iter->ts, s); 890 ret = print_graph_abs_time(iter->ts, s);
@@ -1078,18 +1098,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1078 if (print_graph_prologue(iter, s, 0, 0, flags)) 1098 if (print_graph_prologue(iter, s, 0, 0, flags))
1079 return TRACE_TYPE_PARTIAL_LINE; 1099 return TRACE_TYPE_PARTIAL_LINE;
1080 1100
1081 /* Overhead */ 1101 /* Overhead and duration */
1082 ret = print_graph_overhead(duration, s, flags); 1102 ret = print_graph_duration(duration, s, flags);
1083 if (!ret) 1103 if (ret == TRACE_TYPE_PARTIAL_LINE)
1084 return TRACE_TYPE_PARTIAL_LINE; 1104 return TRACE_TYPE_PARTIAL_LINE;
1085 1105
1086 /* Duration */
1087 if (flags & TRACE_GRAPH_PRINT_DURATION) {
1088 ret = print_graph_duration(duration, s);
1089 if (ret == TRACE_TYPE_PARTIAL_LINE)
1090 return TRACE_TYPE_PARTIAL_LINE;
1091 }
1092
1093 /* Closing brace */ 1106 /* Closing brace */
1094 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1107 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
1095 ret = trace_seq_printf(s, " "); 1108 ret = trace_seq_printf(s, " ");
@@ -1146,17 +1159,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1146 if (print_graph_prologue(iter, s, 0, 0, flags)) 1159 if (print_graph_prologue(iter, s, 0, 0, flags))
1147 return TRACE_TYPE_PARTIAL_LINE; 1160 return TRACE_TYPE_PARTIAL_LINE;
1148 1161
1149 /* No overhead */
1150 ret = print_graph_overhead(-1, s, flags);
1151 if (!ret)
1152 return TRACE_TYPE_PARTIAL_LINE;
1153
1154 /* No time */ 1162 /* No time */
1155 if (flags & TRACE_GRAPH_PRINT_DURATION) { 1163 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
1156 ret = trace_seq_printf(s, " | "); 1164 if (ret != TRACE_TYPE_HANDLED)
1157 if (!ret) 1165 return ret;
1158 return TRACE_TYPE_PARTIAL_LINE;
1159 }
1160 1166
1161 /* Indentation */ 1167 /* Indentation */
1162 if (depth > 0) 1168 if (depth > 0)
@@ -1207,7 +1213,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1207 1213
1208 1214
1209enum print_line_t 1215enum print_line_t
1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1216print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1211{ 1217{
1212 struct ftrace_graph_ent_entry *field; 1218 struct ftrace_graph_ent_entry *field;
1213 struct fgraph_data *data = iter->private; 1219 struct fgraph_data *data = iter->private;
@@ -1270,18 +1276,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1270static enum print_line_t 1276static enum print_line_t
1271print_graph_function(struct trace_iterator *iter) 1277print_graph_function(struct trace_iterator *iter)
1272{ 1278{
1273 return __print_graph_function_flags(iter, tracer_flags.val); 1279 return print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1285} 1280}
1286 1281
1287static enum print_line_t 1282static enum print_line_t
@@ -1309,8 +1304,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1309 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); 1304 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
1310 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); 1305 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
1311 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); 1306 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
1312 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); 1307 seq_printf(s, "#%.*s||| / \n", size, spaces);
1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1314} 1308}
1315 1309
1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags) 1310static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
@@ -1329,7 +1323,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1329 if (flags & TRACE_GRAPH_PRINT_PROC) 1323 if (flags & TRACE_GRAPH_PRINT_PROC)
1330 seq_printf(s, " TASK/PID "); 1324 seq_printf(s, " TASK/PID ");
1331 if (lat) 1325 if (lat)
1332 seq_printf(s, "|||||"); 1326 seq_printf(s, "||||");
1333 if (flags & TRACE_GRAPH_PRINT_DURATION) 1327 if (flags & TRACE_GRAPH_PRINT_DURATION)
1334 seq_printf(s, " DURATION "); 1328 seq_printf(s, " DURATION ");
1335 seq_printf(s, " FUNCTION CALLS\n"); 1329 seq_printf(s, " FUNCTION CALLS\n");
@@ -1343,7 +1337,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1343 if (flags & TRACE_GRAPH_PRINT_PROC) 1337 if (flags & TRACE_GRAPH_PRINT_PROC)
1344 seq_printf(s, " | | "); 1338 seq_printf(s, " | | ");
1345 if (lat) 1339 if (lat)
1346 seq_printf(s, "|||||"); 1340 seq_printf(s, "||||");
1347 if (flags & TRACE_GRAPH_PRINT_DURATION) 1341 if (flags & TRACE_GRAPH_PRINT_DURATION)
1348 seq_printf(s, " | | "); 1342 seq_printf(s, " | | ");
1349 seq_printf(s, " | | | |\n"); 1343 seq_printf(s, " | | | |\n");
@@ -1358,15 +1352,16 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{ 1352{
1359 struct trace_iterator *iter = s->private; 1353 struct trace_iterator *iter = s->private;
1360 1354
1355 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
1356 return;
1357
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 1358 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */ 1359 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter)) 1360 if (trace_empty(iter))
1364 return; 1361 return;
1365 1362
1366 print_trace_header(s, iter); 1363 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION; 1364 }
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370 1365
1371 __print_graph_headers_flags(s, flags); 1366 __print_graph_headers_flags(s, flags);
1372} 1367}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index c77424be284..667aa8cc0cf 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -226,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
226} 226}
227 227
228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ 228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
229 TRACE_GRAPH_PRINT_PROC) 229 TRACE_GRAPH_PRINT_PROC | \
230 TRACE_GRAPH_PRINT_ABS_TIME | \
231 TRACE_GRAPH_PRINT_DURATION)
230 232
231static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 233static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
232{ 234{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 27d13b36b8b..00d527c945a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -343,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref)
343DEFINE_FETCH_deref(string) 343DEFINE_FETCH_deref(string)
344DEFINE_FETCH_deref(string_size) 344DEFINE_FETCH_deref(string_size)
345 345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
346static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
347{ 355{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
@@ -377,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string_size NULL 385#define fetch_bitfield_string_size NULL
378 386
379static __kprobes void 387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
380free_bitfield_fetch_param(struct bitfield_fetch_param *data) 401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
381{ 402{
382 /* 403 /*
@@ -389,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
389 free_symbol_cache(data->orig.data); 410 free_symbol_cache(data->orig.data);
390 kfree(data); 411 kfree(data);
391} 412}
413
392/* Default (unsigned long) fetch type */ 414/* Default (unsigned long) fetch type */
393#define __DEFAULT_FETCH_TYPE(t) u##t 415#define __DEFAULT_FETCH_TYPE(t) u##t
394#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -536,6 +558,7 @@ struct probe_arg {
536/* Flags for trace_probe */ 558/* Flags for trace_probe */
537#define TP_FLAG_TRACE 1 559#define TP_FLAG_TRACE 1
538#define TP_FLAG_PROFILE 2 560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
539 562
540struct trace_probe { 563struct trace_probe {
541 struct list_head list; 564 struct list_head list;
@@ -555,16 +578,49 @@ struct trace_probe {
555 (sizeof(struct probe_arg) * (n))) 578 (sizeof(struct probe_arg) * (n)))
556 579
557 580
558static __kprobes int probe_is_return(struct trace_probe *tp) 581static __kprobes int trace_probe_is_return(struct trace_probe *tp)
559{ 582{
560 return tp->rp.handler != NULL; 583 return tp->rp.handler != NULL;
561} 584}
562 585
563static __kprobes const char *probe_symbol(struct trace_probe *tp) 586static __kprobes const char *trace_probe_symbol(struct trace_probe *tp)
564{ 587{
565 return tp->symbol ? tp->symbol : "unknown"; 588 return tp->symbol ? tp->symbol : "unknown";
566} 589}
567 590
591static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp)
592{
593 return tp->rp.kp.offset;
594}
595
596static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp)
597{
598 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
599}
600
601static __kprobes bool trace_probe_is_registered(struct trace_probe *tp)
602{
603 return !!(tp->flags & TP_FLAG_REGISTERED);
604}
605
606static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
607{
608 return !!(kprobe_gone(&tp->rp.kp));
609}
610
611static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
612 struct module *mod)
613{
614 int len = strlen(mod->name);
615 const char *name = trace_probe_symbol(tp);
616 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
617}
618
619static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
620{
621 return !!strchr(trace_probe_symbol(tp), ':');
622}
623
568static int register_probe_event(struct trace_probe *tp); 624static int register_probe_event(struct trace_probe *tp);
569static void unregister_probe_event(struct trace_probe *tp); 625static void unregister_probe_event(struct trace_probe *tp);
570 626
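trace_probe_within_module() and trace_probe_is_on_module() above decide whether a probe's symbol has the "module:symbol" form and whether it belongs to a given module. The string test is simple enough to show standalone; symbol_is_on_module() and symbol_within_module() below are hypothetical names for the same comparisons.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* True if the probed symbol names any module at all ("mod:sym"). */
static bool symbol_is_on_module(const char *symbol)
{
    return strchr(symbol, ':') != NULL;
}

/* True if the probed symbol belongs to the module named mod. */
static bool symbol_within_module(const char *symbol, const char *mod)
{
    size_t len = strlen(mod);

    return strncmp(symbol, mod, len) == 0 && symbol[len] == ':';
}

int main(void)
{
    const char *sym = "ext4:ext4_sync_file";

    printf("on a module: %d\n", symbol_is_on_module(sym));          /* 1 */
    printf("within ext4: %d\n", symbol_within_module(sym, "ext4")); /* 1 */
    printf("within ext3: %d\n", symbol_within_module(sym, "ext3")); /* 0 */
    return 0;
}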
@@ -646,6 +702,16 @@ error:
646 return ERR_PTR(ret); 702 return ERR_PTR(ret);
647} 703}
648 704
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
649static void free_probe_arg(struct probe_arg *arg) 715static void free_probe_arg(struct probe_arg *arg)
650{ 716{
651 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) 717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
@@ -671,7 +737,7 @@ static void free_trace_probe(struct trace_probe *tp)
671 kfree(tp); 737 kfree(tp);
672} 738}
673 739
674static struct trace_probe *find_probe_event(const char *event, 740static struct trace_probe *find_trace_probe(const char *event,
675 const char *group) 741 const char *group)
676{ 742{
677 struct trace_probe *tp; 743 struct trace_probe *tp;
@@ -683,15 +749,104 @@ static struct trace_probe *find_probe_event(const char *event,
683 return NULL; 749 return NULL;
684} 750}
685 751
686/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 752/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
687static void unregister_trace_probe(struct trace_probe *tp) 753static int enable_trace_probe(struct trace_probe *tp, int flag)
688{ 754{
689 if (probe_is_return(tp)) 755 int ret = 0;
690 unregister_kretprobe(&tp->rp); 756
757 tp->flags |= flag;
758 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&
759 !trace_probe_has_gone(tp)) {
760 if (trace_probe_is_return(tp))
761 ret = enable_kretprobe(&tp->rp);
762 else
763 ret = enable_kprobe(&tp->rp.kp);
764 }
765
766 return ret;
767}
768
769/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
770static void disable_trace_probe(struct trace_probe *tp, int flag)
771{
772 tp->flags &= ~flag;
773 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {
774 if (trace_probe_is_return(tp))
775 disable_kretprobe(&tp->rp);
776 else
777 disable_kprobe(&tp->rp.kp);
778 }
779}
780
781/* Internal register function - just handle k*probes and flags */
782static int __register_trace_probe(struct trace_probe *tp)
783{
784 int i, ret;
785
786 if (trace_probe_is_registered(tp))
787 return -EINVAL;
788
789 for (i = 0; i < tp->nr_args; i++)
790 update_probe_arg(&tp->args[i]);
791
792 /* Set/clear disabled flag according to tp->flag */
793 if (trace_probe_is_enabled(tp))
794 tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
795 else
796 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
797
798 if (trace_probe_is_return(tp))
799 ret = register_kretprobe(&tp->rp);
691 else 800 else
692 unregister_kprobe(&tp->rp.kp); 801 ret = register_kprobe(&tp->rp.kp);
802
803 if (ret == 0)
804 tp->flags |= TP_FLAG_REGISTERED;
805 else {
806 pr_warning("Could not insert probe at %s+%lu: %d\n",
807 trace_probe_symbol(tp), trace_probe_offset(tp), ret);
808 if (ret == -ENOENT && trace_probe_is_on_module(tp)) {
809 pr_warning("This probe might be able to register after"
810 "target module is loaded. Continue.\n");
811 ret = 0;
812 } else if (ret == -EILSEQ) {
813 pr_warning("Probing address(0x%p) is not an "
814 "instruction boundary.\n",
815 tp->rp.kp.addr);
816 ret = -EINVAL;
817 }
818 }
819
820 return ret;
821}
822
823/* Internal unregister function - just handle k*probes and flags */
824static void __unregister_trace_probe(struct trace_probe *tp)
825{
826 if (trace_probe_is_registered(tp)) {
827 if (trace_probe_is_return(tp))
828 unregister_kretprobe(&tp->rp);
829 else
830 unregister_kprobe(&tp->rp.kp);
831 tp->flags &= ~TP_FLAG_REGISTERED;
832 /* Cleanup kprobe for reuse */
833 if (tp->rp.kp.symbol_name)
834 tp->rp.kp.addr = NULL;
835 }
836}
837
838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
839static int unregister_trace_probe(struct trace_probe *tp)
840{
841 /* Enabled event can not be unregistered */
842 if (trace_probe_is_enabled(tp))
843 return -EBUSY;
844
845 __unregister_trace_probe(tp);
693 list_del(&tp->list); 846 list_del(&tp->list);
694 unregister_probe_event(tp); 847 unregister_probe_event(tp);
848
849 return 0;
695} 850}
696 851
697/* Register a trace_probe and probe_event */ 852/* Register a trace_probe and probe_event */
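enable_trace_probe() and disable_trace_probe() above replace the four per-consumer enable/disable functions deleted later in this patch: TP_FLAG_TRACE and TP_FLAG_PROFILE record which consumers want the probe, TP_FLAG_REGISTERED records whether the k*probe is actually inserted, and the probe is only armed while a registered probe has at least one consumer. Here is a small user-space model of that flag logic; probe_arm()/probe_disarm() are invented stand-ins for enable_kprobe()/disable_kprobe(), and the "gone" check is omitted.

#include <stdbool.h>
#include <stdio.h>

#define TP_FLAG_TRACE      0x1  /* ftrace event consumer */
#define TP_FLAG_PROFILE    0x2  /* perf consumer */
#define TP_FLAG_REGISTERED 0x4  /* k*probe actually inserted */

struct probe {
    const char *name;
    unsigned int flags;
};

static bool probe_is_enabled(const struct probe *p)
{
    return p->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
}

/* Stand-ins for arming and disarming the underlying k*probe. */
static void probe_arm(struct probe *p)    { printf("%s: armed\n", p->name); }
static void probe_disarm(struct probe *p) { printf("%s: disarmed\n", p->name); }

static void probe_enable(struct probe *p, unsigned int flag)
{
    p->flags |= flag;
    if (probe_is_enabled(p) && (p->flags & TP_FLAG_REGISTERED))
        probe_arm(p);   /* arming twice is harmless in this sketch */
}

static void probe_disable(struct probe *p, unsigned int flag)
{
    p->flags &= ~flag;
    /* Disarm only once no consumer (trace or perf) is left. */
    if (!probe_is_enabled(p) && (p->flags & TP_FLAG_REGISTERED))
        probe_disarm(p);
}

int main(void)
{
    struct probe p = { .name = "testprobe", .flags = TP_FLAG_REGISTERED };

    probe_enable(&p, TP_FLAG_TRACE);    /* armed */
    probe_enable(&p, TP_FLAG_PROFILE);  /* still armed */
    probe_disable(&p, TP_FLAG_TRACE);   /* still held by perf */
    probe_disable(&p, TP_FLAG_PROFILE); /* disarmed */
    return 0;
}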
@@ -702,41 +857,68 @@ static int register_trace_probe(struct trace_probe *tp)
702 857
703 mutex_lock(&probe_lock); 858 mutex_lock(&probe_lock);
704 859
705 /* register as an event */ 860 /* Delete old (same name) event if it exists */
706 old_tp = find_probe_event(tp->call.name, tp->call.class->system); 861 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
707 if (old_tp) { 862 if (old_tp) {
708 /* delete old event */ 863 ret = unregister_trace_probe(old_tp);
709 unregister_trace_probe(old_tp); 864 if (ret < 0)
865 goto end;
710 free_trace_probe(old_tp); 866 free_trace_probe(old_tp);
711 } 867 }
868
869 /* Register new event */
712 ret = register_probe_event(tp); 870 ret = register_probe_event(tp);
713 if (ret) { 871 if (ret) {
714 pr_warning("Failed to register probe event(%d)\n", ret); 872 pr_warning("Failed to register probe event(%d)\n", ret);
715 goto end; 873 goto end;
716 } 874 }
717 875
718 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; 876 /* Register k*probe */
719 if (probe_is_return(tp)) 877 ret = __register_trace_probe(tp);
720 ret = register_kretprobe(&tp->rp); 878 if (ret < 0)
721 else
722 ret = register_kprobe(&tp->rp.kp);
723
724 if (ret) {
725 pr_warning("Could not insert probe(%d)\n", ret);
726 if (ret == -EILSEQ) {
727 pr_warning("Probing address(0x%p) is not an "
728 "instruction boundary.\n",
729 tp->rp.kp.addr);
730 ret = -EINVAL;
731 }
732 unregister_probe_event(tp); 879 unregister_probe_event(tp);
733 } else 880 else
734 list_add_tail(&tp->list, &probe_list); 881 list_add_tail(&tp->list, &probe_list);
882
735end: 883end:
736 mutex_unlock(&probe_lock); 884 mutex_unlock(&probe_lock);
737 return ret; 885 return ret;
738} 886}
739 887
888/* Module notifier call back, checking event on the module */
889static int trace_probe_module_callback(struct notifier_block *nb,
890 unsigned long val, void *data)
891{
892 struct module *mod = data;
893 struct trace_probe *tp;
894 int ret;
895
896 if (val != MODULE_STATE_COMING)
897 return NOTIFY_DONE;
898
899 /* Update probes on coming module */
900 mutex_lock(&probe_lock);
901 list_for_each_entry(tp, &probe_list, list) {
902 if (trace_probe_within_module(tp, mod)) {
903 /* Don't need to check busy - this should have gone. */
904 __unregister_trace_probe(tp);
905 ret = __register_trace_probe(tp);
906 if (ret)
907 pr_warning("Failed to re-register probe %s on"
908 "%s: %d\n",
909 tp->call.name, mod->name, ret);
910 }
911 }
912 mutex_unlock(&probe_lock);
913
914 return NOTIFY_DONE;
915}
916
917static struct notifier_block trace_probe_module_nb = {
918 .notifier_call = trace_probe_module_callback,
919 .priority = 1 /* Invoked after kprobe module callback */
920};
921
740/* Split symbol and offset. */ 922/* Split symbol and offset. */
741static int split_symbol_offset(char *symbol, unsigned long *offset) 923static int split_symbol_offset(char *symbol, unsigned long *offset)
742{ 924{
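The module notifier registered above lets a probe created with a "MOD:symbol" target before MOD is loaded get re-registered when the module actually arrives (MODULE_STATE_COMING). The toy notifier chain below only illustrates the shape of that mechanism — callbacks on a list, invoked with an event code and a payload; every identifier in it is invented, and it has none of the locking or priority handling of the real notifier API.

#include <stdio.h>

enum { MODULE_COMING = 1 };     /* stand-in for MODULE_STATE_COMING */

struct notifier {
    int (*call)(struct notifier *nb, unsigned long event, void *data);
    struct notifier *next;
};

static struct notifier *chain;

static void notifier_register(struct notifier *nb)
{
    nb->next = chain;
    chain = nb;
}

static void notifier_call_chain(unsigned long event, void *data)
{
    for (struct notifier *nb = chain; nb; nb = nb->next)
        nb->call(nb, event, data);
}

/* Probe-side callback: re-register anything that waited for this module. */
static int probe_module_callback(struct notifier *nb, unsigned long event,
                                 void *data)
{
    const char *mod = data;

    (void)nb;
    if (event != MODULE_COMING)
        return 0;
    printf("module %s loaded, re-registering matching probes\n", mod);
    return 0;
}

int main(void)
{
    struct notifier nb = { .call = probe_module_callback };

    notifier_register(&nb);
    notifier_call_chain(MODULE_COMING, "ext4");
    return 0;
}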
@@ -962,8 +1144,8 @@ static int create_trace_probe(int argc, char **argv)
962{ 1144{
963 /* 1145 /*
964 * Argument syntax: 1146 * Argument syntax:
965 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 1147 * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
966 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 1148 * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
967 * Fetch args: 1149 * Fetch args:
968 * $retval : fetch return value 1150 * $retval : fetch return value
969 * $stack : fetch stack address 1151 * $stack : fetch stack address
@@ -1025,17 +1207,18 @@ static int create_trace_probe(int argc, char **argv)
1025 return -EINVAL; 1207 return -EINVAL;
1026 } 1208 }
1027 mutex_lock(&probe_lock); 1209 mutex_lock(&probe_lock);
1028 tp = find_probe_event(event, group); 1210 tp = find_trace_probe(event, group);
1029 if (!tp) { 1211 if (!tp) {
1030 mutex_unlock(&probe_lock); 1212 mutex_unlock(&probe_lock);
1031 pr_info("Event %s/%s doesn't exist.\n", group, event); 1213 pr_info("Event %s/%s doesn't exist.\n", group, event);
1032 return -ENOENT; 1214 return -ENOENT;
1033 } 1215 }
1034 /* delete an event */ 1216 /* delete an event */
1035 unregister_trace_probe(tp); 1217 ret = unregister_trace_probe(tp);
1036 free_trace_probe(tp); 1218 if (ret == 0)
1219 free_trace_probe(tp);
1037 mutex_unlock(&probe_lock); 1220 mutex_unlock(&probe_lock);
1038 return 0; 1221 return ret;
1039 } 1222 }
1040 1223
1041 if (argc < 2) { 1224 if (argc < 2) {
@@ -1144,20 +1327,30 @@ error:
1144 return ret; 1327 return ret;
1145} 1328}
1146 1329
1147static void cleanup_all_probes(void) 1330static int release_all_trace_probes(void)
1148{ 1331{
1149 struct trace_probe *tp; 1332 struct trace_probe *tp;
1333 int ret = 0;
1150 1334
1151 mutex_lock(&probe_lock); 1335 mutex_lock(&probe_lock);
1336 /* Ensure no probe is in use. */
1337 list_for_each_entry(tp, &probe_list, list)
1338 if (trace_probe_is_enabled(tp)) {
1339 ret = -EBUSY;
1340 goto end;
1341 }
1152 /* TODO: Use batch unregistration */ 1342 /* TODO: Use batch unregistration */
1153 while (!list_empty(&probe_list)) { 1343 while (!list_empty(&probe_list)) {
1154 tp = list_entry(probe_list.next, struct trace_probe, list); 1344 tp = list_entry(probe_list.next, struct trace_probe, list);
1155 unregister_trace_probe(tp); 1345 unregister_trace_probe(tp);
1156 free_trace_probe(tp); 1346 free_trace_probe(tp);
1157 } 1347 }
1348
1349end:
1158 mutex_unlock(&probe_lock); 1350 mutex_unlock(&probe_lock);
1159}
1160 1351
1352 return ret;
1353}
1161 1354
1162/* Probes listing interfaces */ 1355/* Probes listing interfaces */
1163static void *probes_seq_start(struct seq_file *m, loff_t *pos) 1356static void *probes_seq_start(struct seq_file *m, loff_t *pos)
@@ -1181,15 +1374,16 @@ static int probes_seq_show(struct seq_file *m, void *v)
1181 struct trace_probe *tp = v; 1374 struct trace_probe *tp = v;
1182 int i; 1375 int i;
1183 1376
1184 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 1377 seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p');
1185 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); 1378 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
1186 1379
1187 if (!tp->symbol) 1380 if (!tp->symbol)
1188 seq_printf(m, " 0x%p", tp->rp.kp.addr); 1381 seq_printf(m, " 0x%p", tp->rp.kp.addr);
1189 else if (tp->rp.kp.offset) 1382 else if (tp->rp.kp.offset)
1190 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); 1383 seq_printf(m, " %s+%u", trace_probe_symbol(tp),
1384 tp->rp.kp.offset);
1191 else 1385 else
1192 seq_printf(m, " %s", probe_symbol(tp)); 1386 seq_printf(m, " %s", trace_probe_symbol(tp));
1193 1387
1194 for (i = 0; i < tp->nr_args; i++) 1388 for (i = 0; i < tp->nr_args; i++)
1195 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); 1389 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
@@ -1207,9 +1401,13 @@ static const struct seq_operations probes_seq_op = {
1207 1401
1208static int probes_open(struct inode *inode, struct file *file) 1402static int probes_open(struct inode *inode, struct file *file)
1209{ 1403{
1210 if ((file->f_mode & FMODE_WRITE) && 1404 int ret;
1211 (file->f_flags & O_TRUNC)) 1405
1212 cleanup_all_probes(); 1406 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
1407 ret = release_all_trace_probes();
1408 if (ret < 0)
1409 return ret;
1410 }
1213 1411
1214 return seq_open(file, &probes_seq_op); 1412 return seq_open(file, &probes_seq_op);
1215} 1413}
@@ -1397,7 +1595,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1397 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1595 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1398 1596
1399 if (!filter_current_check_discard(buffer, call, entry, event)) 1597 if (!filter_current_check_discard(buffer, call, entry, event))
1400 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1598 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1599 irq_flags, pc, regs);
1401} 1600}
1402 1601
1403/* Kretprobe handler */ 1602/* Kretprobe handler */
@@ -1429,7 +1628,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1429 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1628 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1430 1629
1431 if (!filter_current_check_discard(buffer, call, entry, event)) 1630 if (!filter_current_check_discard(buffer, call, entry, event))
1432 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1631 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1632 irq_flags, pc, regs);
1433} 1633}
1434 1634
1435/* Event entry printers */ 1635/* Event entry printers */
@@ -1511,30 +1711,6 @@ partial:
1511 return TRACE_TYPE_PARTIAL_LINE; 1711 return TRACE_TYPE_PARTIAL_LINE;
1512} 1712}
1513 1713
1514static int probe_event_enable(struct ftrace_event_call *call)
1515{
1516 struct trace_probe *tp = (struct trace_probe *)call->data;
1517
1518 tp->flags |= TP_FLAG_TRACE;
1519 if (probe_is_return(tp))
1520 return enable_kretprobe(&tp->rp);
1521 else
1522 return enable_kprobe(&tp->rp.kp);
1523}
1524
1525static void probe_event_disable(struct ftrace_event_call *call)
1526{
1527 struct trace_probe *tp = (struct trace_probe *)call->data;
1528
1529 tp->flags &= ~TP_FLAG_TRACE;
1530 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1531 if (probe_is_return(tp))
1532 disable_kretprobe(&tp->rp);
1533 else
1534 disable_kprobe(&tp->rp.kp);
1535 }
1536}
1537
1538#undef DEFINE_FIELD 1714#undef DEFINE_FIELD
1539#define DEFINE_FIELD(type, item, name, is_signed) \ 1715#define DEFINE_FIELD(type, item, name, is_signed) \
1540 do { \ 1716 do { \
@@ -1596,7 +1772,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1596 1772
1597 const char *fmt, *arg; 1773 const char *fmt, *arg;
1598 1774
1599 if (!probe_is_return(tp)) { 1775 if (!trace_probe_is_return(tp)) {
1600 fmt = "(%lx)"; 1776 fmt = "(%lx)";
1601 arg = "REC->" FIELD_STRING_IP; 1777 arg = "REC->" FIELD_STRING_IP;
1602 } else { 1778 } else {
@@ -1713,49 +1889,25 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1713 head = this_cpu_ptr(call->perf_events); 1889 head = this_cpu_ptr(call->perf_events);
1714 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1890 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1715} 1891}
1716
1717static int probe_perf_enable(struct ftrace_event_call *call)
1718{
1719 struct trace_probe *tp = (struct trace_probe *)call->data;
1720
1721 tp->flags |= TP_FLAG_PROFILE;
1722
1723 if (probe_is_return(tp))
1724 return enable_kretprobe(&tp->rp);
1725 else
1726 return enable_kprobe(&tp->rp.kp);
1727}
1728
1729static void probe_perf_disable(struct ftrace_event_call *call)
1730{
1731 struct trace_probe *tp = (struct trace_probe *)call->data;
1732
1733 tp->flags &= ~TP_FLAG_PROFILE;
1734
1735 if (!(tp->flags & TP_FLAG_TRACE)) {
1736 if (probe_is_return(tp))
1737 disable_kretprobe(&tp->rp);
1738 else
1739 disable_kprobe(&tp->rp.kp);
1740 }
1741}
1742#endif /* CONFIG_PERF_EVENTS */ 1892#endif /* CONFIG_PERF_EVENTS */
1743 1893
1744static __kprobes 1894static __kprobes
1745int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) 1895int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1746{ 1896{
1897 struct trace_probe *tp = (struct trace_probe *)event->data;
1898
1747 switch (type) { 1899 switch (type) {
1748 case TRACE_REG_REGISTER: 1900 case TRACE_REG_REGISTER:
1749 return probe_event_enable(event); 1901 return enable_trace_probe(tp, TP_FLAG_TRACE);
1750 case TRACE_REG_UNREGISTER: 1902 case TRACE_REG_UNREGISTER:
1751 probe_event_disable(event); 1903 disable_trace_probe(tp, TP_FLAG_TRACE);
1752 return 0; 1904 return 0;
1753 1905
1754#ifdef CONFIG_PERF_EVENTS 1906#ifdef CONFIG_PERF_EVENTS
1755 case TRACE_REG_PERF_REGISTER: 1907 case TRACE_REG_PERF_REGISTER:
1756 return probe_perf_enable(event); 1908 return enable_trace_probe(tp, TP_FLAG_PROFILE);
1757 case TRACE_REG_PERF_UNREGISTER: 1909 case TRACE_REG_PERF_UNREGISTER:
1758 probe_perf_disable(event); 1910 disable_trace_probe(tp, TP_FLAG_PROFILE);
1759 return 0; 1911 return 0;
1760#endif 1912#endif
1761 } 1913 }
@@ -1805,7 +1957,7 @@ static int register_probe_event(struct trace_probe *tp)
1805 1957
1806 /* Initialize ftrace_event_call */ 1958 /* Initialize ftrace_event_call */
1807 INIT_LIST_HEAD(&call->class->fields); 1959 INIT_LIST_HEAD(&call->class->fields);
1808 if (probe_is_return(tp)) { 1960 if (trace_probe_is_return(tp)) {
1809 call->event.funcs = &kretprobe_funcs; 1961 call->event.funcs = &kretprobe_funcs;
1810 call->class->define_fields = kretprobe_event_define_fields; 1962 call->class->define_fields = kretprobe_event_define_fields;
1811 } else { 1963 } else {
@@ -1844,6 +1996,9 @@ static __init int init_kprobe_trace(void)
1844 struct dentry *d_tracer; 1996 struct dentry *d_tracer;
1845 struct dentry *entry; 1997 struct dentry *entry;
1846 1998
1999 if (register_module_notifier(&trace_probe_module_nb))
2000 return -EINVAL;
2001
1847 d_tracer = tracing_init_dentry(); 2002 d_tracer = tracing_init_dentry();
1848 if (!d_tracer) 2003 if (!d_tracer)
1849 return 0; 2004 return 0;
@@ -1897,12 +2052,12 @@ static __init int kprobe_trace_self_tests_init(void)
1897 warn++; 2052 warn++;
1898 } else { 2053 } else {
1899 /* Enable trace point */ 2054 /* Enable trace point */
1900 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); 2055 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
1901 if (WARN_ON_ONCE(tp == NULL)) { 2056 if (WARN_ON_ONCE(tp == NULL)) {
1902 pr_warning("error on getting new probe.\n"); 2057 pr_warning("error on getting new probe.\n");
1903 warn++; 2058 warn++;
1904 } else 2059 } else
1905 probe_event_enable(&tp->call); 2060 enable_trace_probe(tp, TP_FLAG_TRACE);
1906 } 2061 }
1907 2062
1908 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 2063 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
@@ -1912,12 +2067,12 @@ static __init int kprobe_trace_self_tests_init(void)
1912 warn++; 2067 warn++;
1913 } else { 2068 } else {
1914 /* Enable trace point */ 2069 /* Enable trace point */
1915 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); 2070 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
1916 if (WARN_ON_ONCE(tp == NULL)) { 2071 if (WARN_ON_ONCE(tp == NULL)) {
1917 pr_warning("error on getting new probe.\n"); 2072 pr_warning("error on getting new probe.\n");
1918 warn++; 2073 warn++;
1919 } else 2074 } else
1920 probe_event_enable(&tp->call); 2075 enable_trace_probe(tp, TP_FLAG_TRACE);
1921 } 2076 }
1922 2077
1923 if (warn) 2078 if (warn)
@@ -1925,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void)
1925 2080
1926 ret = target(1, 2, 3, 4, 5, 6); 2081 ret = target(1, 2, 3, 4, 5, 6);
1927 2082
 2083 /* Disable trace points before removing them */
2084 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
2085 if (WARN_ON_ONCE(tp == NULL)) {
2086 pr_warning("error on getting test probe.\n");
2087 warn++;
2088 } else
2089 disable_trace_probe(tp, TP_FLAG_TRACE);
2090
2091 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
2092 if (WARN_ON_ONCE(tp == NULL)) {
2093 pr_warning("error on getting 2nd test probe.\n");
2094 warn++;
2095 } else
2096 disable_trace_probe(tp, TP_FLAG_TRACE);
2097
1928 ret = command_trace_probe("-:testprobe"); 2098 ret = command_trace_probe("-:testprobe");
1929 if (WARN_ON_ONCE(ret)) { 2099 if (WARN_ON_ONCE(ret)) {
1930 pr_warning("error on deleting a probe.\n"); 2100 pr_warning("error on deleting a probe.\n");
@@ -1938,7 +2108,7 @@ static __init int kprobe_trace_self_tests_init(void)
1938 } 2108 }
1939 2109
1940end: 2110end:
1941 cleanup_all_probes(); 2111 release_all_trace_probes();
1942 if (warn) 2112 if (warn)
1943 pr_cont("NG: Some tests are failed. Please check them.\n"); 2113 pr_cont("NG: Some tests are failed. Please check them.\n");
1944 else 2114 else
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 017fa376505..fd3c8aae55e 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/time.h> 13#include <linux/time.h>
14 14
15#include <asm/atomic.h> 15#include <linux/atomic.h>
16 16
17#include "trace.h" 17#include "trace.h"
18#include "trace_output.h" 18#include "trace_output.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e37de492a9e..51999309a6c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1107,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1107{ 1107{
1108 struct stack_entry *field; 1108 struct stack_entry *field;
1109 struct trace_seq *s = &iter->seq; 1109 struct trace_seq *s = &iter->seq;
1110 int i; 1110 unsigned long *p;
1111 unsigned long *end;
1111 1112
1112 trace_assign_type(field, iter->ent); 1113 trace_assign_type(field, iter->ent);
1114 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1113 1115
1114 if (!trace_seq_puts(s, "<stack trace>\n")) 1116 if (!trace_seq_puts(s, "<stack trace>\n"))
1115 goto partial; 1117 goto partial;
1116 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1118
1117 if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) 1119 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1118 break;
1119 if (!trace_seq_puts(s, " => ")) 1120 if (!trace_seq_puts(s, " => "))
1120 goto partial; 1121 goto partial;
1121 1122
1122 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1123 if (!seq_print_ip_sym(s, *p, flags))
1123 goto partial; 1124 goto partial;
1124 if (!trace_seq_puts(s, "\n")) 1125 if (!trace_seq_puts(s, "\n"))
1125 goto partial; 1126 goto partial;
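Since kernel_stack is now a __dynamic_array (see the trace_entries.h hunk earlier), trace_stack_print() can no longer iterate over a fixed FTRACE_STACK_ENTRIES; it walks caller[] until it hits the ULONG_MAX terminator or the end of the entry as computed from iter->ent_size. A user-space sketch of that bounded walk follows; the sentinel and the size bound follow the patch, while print_stack() and the sample frame values are made up, and the sketch checks the bound before dereferencing.

#include <limits.h>
#include <stdio.h>

/*
 * Walk a size-bounded array of return addresses, as trace_stack_print()
 * now does: stop at the end of the entry or at the ULONG_MAX terminator,
 * whichever comes first.
 */
static void print_stack(const unsigned long *caller, size_t ent_size)
{
    const unsigned long *p = caller;
    const unsigned long *end =
        (const unsigned long *)((const char *)caller + ent_size);

    puts("<stack trace>");
    for (; p < end && *p != ULONG_MAX; p++)
        printf(" => %#lx\n", *p);
}

int main(void)
{
    /* Three frames plus the terminator, as the ring buffer might store them. */
    unsigned long frames[] = {
        0x81000010UL, 0x81000020UL, 0x81000030UL, ULONG_MAX,
    };

    print_stack(frames, sizeof(frames));
    return 0;
}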
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index f029dd4fd2c..e4a70c0c71b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -227,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter)
227 graph_trace_close(iter); 227 graph_trace_close(iter);
228} 228}
229 229
230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) 230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \
231 TRACE_GRAPH_PRINT_ABS_TIME | \
232 TRACE_GRAPH_PRINT_DURATION)
231 233
232static enum print_line_t wakeup_print_line(struct trace_iterator *iter) 234static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
233{ 235{
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b0b53b8e4c2..77575b386d9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -156,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
156{ 156{
157 long *ptr = filp->private_data; 157 long *ptr = filp->private_data;
158 unsigned long val, flags; 158 unsigned long val, flags;
159 char buf[64];
160 int ret; 159 int ret;
161 int cpu; 160 int cpu;
162 161
163 if (count >= sizeof(buf)) 162 ret = kstrtoul_from_user(ubuf, count, 10, &val);
164 return -EINVAL; 163 if (ret)
165
166 if (copy_from_user(&buf, ubuf, count))
167 return -EFAULT;
168
169 buf[count] = 0;
170
171 ret = strict_strtoul(buf, 10, &val);
172 if (ret < 0)
173 return ret; 164 return ret;
174 165
175 local_irq_save(flags); 166 local_irq_save(flags);
diff --git a/kernel/trace/tracedump.c b/kernel/trace/tracedump.c
new file mode 100644
index 00000000000..a83532bc36d
--- /dev/null
+++ b/kernel/trace/tracedump.c
@@ -0,0 +1,682 @@
1/*
2 * kernel/trace/tracedump.c
3 *
4 * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20
21#include <linux/console.h>
22#include <linux/cpumask.h>
23#include <linux/init.h>
24#include <linux/irqflags.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/mutex.h>
28#include <linux/notifier.h>
29#include <linux/proc_fs.h>
30#include <linux/ring_buffer.h>
31#include <linux/sched.h>
32#include <linux/smp.h>
33#include <linux/string.h>
34#include <linux/threads.h>
35#include <linux/tracedump.h>
36#include <linux/uaccess.h>
37#include <linux/vmalloc.h>
38#include <linux/zlib.h>
39
40#include "trace.h"
41#include "trace_output.h"
42
43#define CPU_MAX (NR_CPUS-1)
44
45#define TRYM(fn, ...) do { \
46 int try_error = (fn); \
47 if (try_error < 0) { \
48 printk(__VA_ARGS__); \
49 return try_error; \
50 } \
51} while (0)
52
53#define TRY(fn) TRYM(fn, TAG "Caught error from %s in %s\n", #fn, __func__)
54
55/* Stolen from printk.c */
56#define for_each_console(con) \
57 for (con = console_drivers; con != NULL; con = con->next)
58
59#define TAG KERN_ERR "tracedump: "
60
61#define TD_MIN_CONSUME 2000
62#define TD_COMPRESS_CHUNK 0x8000
63
64static DEFINE_MUTEX(tracedump_proc_lock);
65
66static const char MAGIC_NUMBER[9] = "TRACEDUMP";
67static const char CPU_DELIM[7] = "CPU_END";
68#define CMDLINE_DELIM "|"
69
70/* Type of output */
71static bool current_format;
72static bool format_ascii;
73module_param(format_ascii, bool, S_IRUGO | S_IWUSR);
74MODULE_PARM_DESC(format_ascii, "Dump ascii or raw data");
75
76/* Max size of output */
77static uint panic_size = 0x80000;
78module_param(panic_size, uint, S_IRUGO | S_IWUSR);
79MODULE_PARM_DESC(panic_size, "Max dump size during kernel panic (bytes)");
80
81static uint compress_level = 9;
82module_param(compress_level, uint, S_IRUGO | S_IWUSR);
83MODULE_PARM_DESC(compress_level, "Level of compression to use. [0-9]");
84
85static char out_buf[TD_COMPRESS_CHUNK];
86static z_stream stream;
87static int compress_done;
88static int flush;
89
90static int old_trace_flags;
91
92static struct trace_iterator iter;
93static struct pager_s {
94 struct trace_array *tr;
95 void *spare;
96 int cpu;
97 int len;
98 char __user *ubuf;
99} pager;
100
101static char cmdline_buf[16+TASK_COMM_LEN];
102
103static int print_to_console(const char *buf, size_t len)
104{
105 struct console *con;
106
107 /* Stolen from printk.c */
108 for_each_console(con) {
109 if ((con->flags & CON_ENABLED) && con->write &&
110 (cpu_online(smp_processor_id()) ||
111 (con->flags & CON_ANYTIME)))
112 con->write(con, buf, len);
113 }
114 return 0;
115}
116
117static int print_to_user(const char *buf, size_t len)
118{
119 int size;
120 size = copy_to_user(pager.ubuf, buf, len);
121 if (size > 0) {
122 printk(TAG "Failed to copy to user %d bytes\n", size);
123 return -EINVAL;
124 }
125 return 0;
126}
127
128static int print(const char *buf, size_t len, int print_to)
129{
130 if (print_to == TD_PRINT_CONSOLE)
131 TRY(print_to_console(buf, len));
132 else if (print_to == TD_PRINT_USER)
133 TRY(print_to_user(buf, len));
134 return 0;
135}
136
137/* print_magic will print MAGIC_NUMBER using the
138 * print function selected by print_to.
139 */
140static inline ssize_t print_magic(int print_to)
141{
142 print(MAGIC_NUMBER, sizeof(MAGIC_NUMBER), print_to);
143 return sizeof(MAGIC_NUMBER);
144}
145
146static int iter_init(void)
147{
148 int cpu;
149
150 /* Make iter point to global ring buffer used in trace. */
151 trace_init_global_iter(&iter);
152
153 /* Disable tracing */
154 for_each_tracing_cpu(cpu) {
155 atomic_inc(&iter.tr->data[cpu]->disabled);
156 }
157
158 /* Save flags */
159 old_trace_flags = trace_flags;
160
161	/* Don't look at memory in panic mode. */
162 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
163
164 /* Prepare ring buffer iter */
165 for_each_tracing_cpu(cpu) {
166 iter.buffer_iter[cpu] =
167 ring_buffer_read_prepare(iter.tr->buffer, cpu);
168 }
169 ring_buffer_read_prepare_sync();
170 for_each_tracing_cpu(cpu) {
171 ring_buffer_read_start(iter.buffer_iter[cpu]);
172 tracing_iter_reset(&iter, cpu);
173 }
174 return 0;
175}
176
177/* iter_next gets the next entry in the ring buffer, ordered by time.
178 * If there are no more entries, returns 0.
179 */
180static ssize_t iter_next(void)
181{
182 /* Zero out the iterator's seq */
183 memset(&iter.seq, 0,
184 sizeof(struct trace_iterator) -
185 offsetof(struct trace_iterator, seq));
186
187 while (!trace_empty(&iter)) {
188 if (trace_find_next_entry_inc(&iter) == NULL) {
189 printk(TAG "trace_find_next_entry failed!\n");
190 return -EINVAL;
191 }
192
193 /* Copy the ring buffer data to iterator's seq */
194 print_trace_line(&iter);
195 if (iter.seq.len != 0)
196 return iter.seq.len;
197 }
198 return 0;
199}
200
201static int iter_deinit(void)
202{
203 int cpu;
204	/* Free the ring buffer iterators and re-enable tracing */
205 for_each_tracing_cpu(cpu) {
206 ring_buffer_read_finish(iter.buffer_iter[cpu]);
207 }
208 for_each_tracing_cpu(cpu) {
209 atomic_dec(&iter.tr->data[cpu]->disabled);
210 }
211
212 /* Restore flags */
213 trace_flags = old_trace_flags;
214 return 0;
215}
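Taken together, iter_init(), iter_next() and iter_deinit() are meant to be used as an init / next-loop / deinit sequence; a minimal consumer might look like the following sketch (example_walk_entries is illustrative only):

static ssize_t example_walk_entries(void)
{
	ssize_t len;

	TRY(iter_init());
	/* iter_next() formats one entry at a time into iter.seq, ordered
	 * by timestamp, and returns 0 once the ring buffer is exhausted. */
	while ((len = iter_next()) > 0)
		TRY(print_to_console(iter.seq.buffer, len));
	TRY(iter_deinit());
	return len;	/* 0 when done, negative on error */
}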
216
217static int pager_init(void)
218{
219 int cpu;
220
221 /* Need to do this to get a pointer to global_trace (iter.tr).
222	 * Lame, I know. */
223 trace_init_global_iter(&iter);
224
225 /* Turn off tracing */
226 for_each_tracing_cpu(cpu) {
227 atomic_inc(&iter.tr->data[cpu]->disabled);
228 }
229
230 memset(&pager, 0, sizeof(pager));
231 pager.tr = iter.tr;
232 pager.len = TD_COMPRESS_CHUNK;
233
234 return 0;
235}
236
237/* pager_next_cpu moves the pager to the next cpu.
238 * Returns 0 if pager is done, else 1.
239 */
240static ssize_t pager_next_cpu(void)
241{
242 if (pager.cpu <= CPU_MAX) {
243 pager.cpu += 1;
244 return 1;
245 }
246
247 return 0;
248}
249
250/* pager_next gets the next page of data from the ring buffer
251 * of the current cpu. Returns page size or 0 if no more data.
252 */
253static ssize_t pager_next(void)
254{
255 int ret;
256
257 if (pager.cpu > CPU_MAX)
258 return 0;
259
260 if (!pager.spare)
261 pager.spare = ring_buffer_alloc_read_page(pager.tr->buffer, pager.cpu);
262 if (!pager.spare) {
263 printk(TAG "ring_buffer_alloc_read_page failed!");
264 return -ENOMEM;
265 }
266
267 ret = ring_buffer_read_page(pager.tr->buffer,
268 &pager.spare,
269 pager.len,
270 pager.cpu, 0);
271 if (ret < 0)
272 return 0;
273
274 return PAGE_SIZE;
275}
276
277static int pager_deinit(void)
278{
279 int cpu;
280 if (pager.spare != NULL)
281 ring_buffer_free_read_page(pager.tr->buffer, pager.spare);
282
283 for_each_tracing_cpu(cpu) {
284 atomic_dec(&iter.tr->data[cpu]->disabled);
285 }
286 return 0;
287}
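The raw (non-ASCII) path follows the same shape with the pager helpers; a sketch of that flow, again illustrative only:

static ssize_t example_walk_raw_pages(void)
{
	ssize_t size = 0;

	TRY(pager_init());
	do {
		/* Drain whole pages from the current CPU's ring buffer... */
		while ((size = pager_next()) > 0)
			TRY(print_to_console(pager.spare, size));
		/* ...then move on until every CPU has been consumed. */
	} while (size >= 0 && pager_next_cpu());
	TRY(pager_deinit());
	return size;
}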
288
289/* cmdline_next gets the next saved cmdline from the trace and
290 * puts it in cmdline_buf. Returns the size of the cmdline, or 0 once all
291 * cmdlines have been returned; it resets itself on a subsequent call.
292 */
293static ssize_t cmdline_next(void)
294{
295 static int pid;
296 ssize_t size = 0;
297
298 if (pid >= PID_MAX_DEFAULT)
299 pid = -1;
300
301 while (size == 0 && pid < PID_MAX_DEFAULT) {
302 pid++;
303 trace_find_cmdline(pid, cmdline_buf);
304 if (!strncmp(cmdline_buf, "<...>", 5))
305 continue;
306
307 sprintf(&cmdline_buf[strlen(cmdline_buf)], " %d"
308 CMDLINE_DELIM, pid);
309 size = strlen(cmdline_buf);
310 }
311 return size;
312}
313
314/* consume_events removes the first 'num' entries from the ring buffer. */
315static int consume_events(size_t num)
316{
317 TRY(iter_init());
318 for (; num > 0 && !trace_empty(&iter); num--) {
319 trace_find_next_entry_inc(&iter);
320 ring_buffer_consume(iter.tr->buffer, iter.cpu, &iter.ts,
321 &iter.lost_events);
322 }
323 TRY(iter_deinit());
324 return 0;
325}
326
327static int data_init(void)
328{
329 if (current_format)
330 TRY(iter_init());
331 else
332 TRY(pager_init());
333 return 0;
334}
335
336/* data_next will figure out the right 'next' function to
337 * call and will select the right buffer to pass back
338 * to compress_next.
339 *
340 * iter_next should be used to get data entry-by-entry, ordered
341 * by time, which is what we need in order to convert it to ascii.
342 *
343 * pager_next will return a full page of raw data at a time, one
344 * CPU at a time. pager_next_cpu must be called to get the next CPU.
345 * cmdline_next will get the next saved cmdline
346 */
347static ssize_t data_next(const char **buf)
348{
349 ssize_t size;
350
351 if (current_format) {
352 TRY(size = iter_next());
353 *buf = iter.seq.buffer;
354 } else {
355 TRY(size = pager_next());
356 *buf = pager.spare;
357 if (size == 0) {
358 if (pager_next_cpu()) {
359 size = sizeof(CPU_DELIM);
360 *buf = CPU_DELIM;
361 } else {
362 TRY(size = cmdline_next());
363 *buf = cmdline_buf;
364 }
365 }
366 }
367 return size;
368}
369
370static int data_deinit(void)
371{
372 if (current_format)
373 TRY(iter_deinit());
374 else
375 TRY(pager_deinit());
376 return 0;
377}
378
379static int compress_init(void)
380{
381 int workspacesize, ret;
382
383 compress_done = 0;
384 flush = Z_NO_FLUSH;
385 stream.data_type = current_format ? Z_ASCII : Z_BINARY;
386 workspacesize = zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL);
387 stream.workspace = vmalloc(workspacesize);
388 if (!stream.workspace) {
389 printk(TAG "Could not allocate "
390 "enough memory for zlib!\n");
391 return -ENOMEM;
392 }
393 memset(stream.workspace, 0, workspacesize);
394
395 ret = zlib_deflateInit(&stream, compress_level);
396 if (ret != Z_OK) {
397 printk(TAG "%s\n", stream.msg);
398 return ret;
399 }
400 stream.avail_in = 0;
401 stream.avail_out = 0;
402 TRY(data_init());
403 return 0;
404}
405
406/* compress_next will compress up to min(max_out, TD_COMPRESS_CHUNK) bytes
407 * of data into the output buffer. It gets the data by calling data_next.
408 * It will return the most data it possibly can. If it returns 0, then
409 * there is no more data.
410 *
411 * Because of the way zlib works, each call to zlib_deflate may consume
412 * up to avail_in bytes from next_in, and will fill up to
413 * avail_out bytes in next_out. Once flush == Z_FINISH, it cannot take
414 * any more input. It will output until it is finished, and will return
415 * Z_STREAM_END.
416 */
417static ssize_t compress_next(size_t max_out)
418{
419 ssize_t ret;
420 max_out = min(max_out, (size_t)TD_COMPRESS_CHUNK);
421 stream.next_out = out_buf;
422 stream.avail_out = max_out;
423 while (stream.avail_out > 0 && !compress_done) {
424 if (stream.avail_in == 0 && flush != Z_FINISH) {
425 TRY(stream.avail_in =
426 data_next((const char **)&stream.next_in));
427 flush = (stream.avail_in == 0) ? Z_FINISH : Z_NO_FLUSH;
428 }
429 if (stream.next_in != NULL) {
430 TRYM((ret = zlib_deflate(&stream, flush)),
431 "zlib: %s\n", stream.msg);
432 compress_done = (ret == Z_STREAM_END);
433 }
434 }
435 ret = max_out - stream.avail_out;
436 return ret;
437}
438
439static int compress_deinit(void)
440{
441 TRY(data_deinit());
442
443 zlib_deflateEnd(&stream);
444 vfree(stream.workspace);
445
446 /* TODO: remove */
447 printk(TAG "Total in: %ld\n", stream.total_in);
448 printk(TAG "Total out: %ld\n", stream.total_out);
449 return stream.total_out;
450}
451
452static int compress_reset(void)
453{
454 TRY(compress_deinit());
455 TRY(compress_init());
456 return 0;
457}
458
459/* tracedump_init initializes all tracedump components.
460 * Call this before tracedump_next
461 */
462int tracedump_init(void)
463{
464 TRY(compress_init());
465 return 0;
466}
467
468/* tracedump_next will print up to max_out bytes from the tracing ring
469 * buffers using the print function selected by print_to. The data is
470 * compressed using zlib.
471 *
472 * The output type of the data is specified by the format_ascii module
473 * parameter. If format_ascii == 1, human-readable data will be output.
474 * Otherwise, it will output raw data from the ring buffer in cpu order,
475 * followed by the saved_cmdlines data.
476 */
477ssize_t tracedump_next(size_t max_out, int print_to)
478{
479 ssize_t size;
480 TRY(size = compress_next(max_out));
481 print(out_buf, size, print_to);
482 return size;
483}
484
485/* tracedump_all will print all data in the tracing ring buffers using
486 * the print function selected by print_to. The data is compressed using
487 * zlib, and is surrounded by MAGIC_NUMBER.
488 *
489 * The output type of the data is specified by the format_ascii module
490 * parameter. If format_ascii == 1, human-readable data will be output.
491 * Otherwise, it will output raw data from the ring buffer in cpu order,
492 * followed by the saved_cmdlines data.
493 */
494ssize_t tracedump_all(int print_to)
495{
496 ssize_t ret, size = 0;
497 TRY(size += print_magic(print_to));
498
499 do {
500 /* Here the size used doesn't really matter,
501 * since we're dumping everything. */
502 TRY(ret = tracedump_next(0xFFFFFFFF, print_to));
503 size += ret;
504 } while (ret > 0);
505
506 TRY(size += print_magic(print_to));
507
508 return size;
509}
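For reference, the payload between the two MAGIC_NUMBER markers is an ordinary zlib stream, so a userspace consumer could decode a dump captured from the procfs interface along these lines (illustrative sketch, not kernel code; error handling and buffering are simplified):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

/* Inflate a dump that was read() in full from /proc/tracedump. */
static int inflate_tracedump(const unsigned char *buf, size_t len, FILE *out)
{
	unsigned char chunk[0x8000];
	z_stream zs = { 0 };
	int ret;

	if (len < 18 || memcmp(buf, "TRACEDUMP", 9))
		return -1;
	buf += 9;		/* skip the leading magic... */
	len -= 18;		/* ...and drop the trailing one */

	if (inflateInit(&zs) != Z_OK)
		return -1;
	zs.next_in  = (unsigned char *)buf;
	zs.avail_in = len;
	do {
		zs.next_out  = chunk;
		zs.avail_out = sizeof(chunk);
		ret = inflate(&zs, Z_NO_FLUSH);
		if (ret != Z_OK && ret != Z_STREAM_END)
			break;
		fwrite(chunk, 1, sizeof(chunk) - zs.avail_out, out);
	} while (ret != Z_STREAM_END);
	inflateEnd(&zs);
	return ret == Z_STREAM_END ? 0 : -1;
}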
510
511/* tracedump_deinit deinitializes all tracedump components.
512 * This must be called, even on error.
513 */
514int tracedump_deinit(void)
515{
516 TRY(compress_deinit());
517 return 0;
518}
519
520/* tracedump_reset reinitializes all tracedump components. */
521int tracedump_reset(void)
522{
523 TRY(compress_reset());
524 return 0;
525}
526
527
528
529/* tracedump_open opens the tracedump file for reading. */
530static int tracedump_open(struct inode *inode, struct file *file)
531{
532 int ret;
533 mutex_lock(&tracedump_proc_lock);
534 current_format = format_ascii;
535 ret = tracedump_init();
536 if (ret < 0)
537 goto err;
538
539 ret = nonseekable_open(inode, file);
540 if (ret < 0)
541 goto err;
542 return ret;
543
544err:
545 mutex_unlock(&tracedump_proc_lock);
546 return ret;
547}
548
549/* tracedump_read reads data from tracedump_next and prints
550 * it to userspace. It will surround the data with MAGIC_NUMBER.
551 */
552static ssize_t tracedump_read(struct file *file, char __user *buf,
553 size_t len, loff_t *offset)
554{
555 static int done;
556 ssize_t size = 0;
557
558 pager.ubuf = buf;
559
560 if (*offset == 0) {
561 done = 0;
562 TRY(size = print_magic(TD_PRINT_USER));
563 } else if (!done) {
564 TRY(size = tracedump_next(len, TD_PRINT_USER));
565 if (size == 0) {
566 TRY(size = print_magic(TD_PRINT_USER));
567 done = 1;
568 }
569 }
570
571 *offset += size;
572
573 return size;
574}
575
576static int tracedump_release(struct inode *inode, struct file *file)
577{
578 int ret;
579 ret = tracedump_deinit();
580 mutex_unlock(&tracedump_proc_lock);
581 return ret;
582}
583
584/* tracedump_dump dumps all tracing data from the tracing ring buffers
585 * to all consoles. For details about the output format, see
586 * tracedump_all.
587 *
588 * At most max_out bytes are dumped. To accomplish this,
589 * tracedump_dump calls tracedump_all several times without writing the data,
590 * each time tossing out old data until it reaches its goal.
591 *
592 * Note: dumping raw pages currently does NOT follow the size limit.
593 */
594
595int tracedump_dump(size_t max_out)
596{
597 ssize_t size;
598 size_t consume;
599
600 printk(TAG "\n");
601
602 tracedump_init();
603
604 if (format_ascii) {
605 size = tracedump_all(TD_NO_PRINT);
606 if (size < 0) {
607 printk(TAG "failed to dump\n");
608 goto out;
609 }
610 while (size > max_out) {
611 TRY(tracedump_deinit());
612			/* Events take roughly 60 ASCII bytes each,
613			 * not counting compression */
614 consume = TD_MIN_CONSUME + (size - max_out) /
615 (60 / (compress_level + 1));
616 TRY(consume_events(consume));
617 TRY(tracedump_init());
618 size = tracedump_all(TD_NO_PRINT);
619 if (size < 0) {
620 printk(TAG "failed to dump\n");
621 goto out;
622 }
623 }
624
625 TRY(tracedump_reset());
626 }
627 size = tracedump_all(TD_PRINT_CONSOLE);
628 if (size < 0) {
629 printk(TAG "failed to dump\n");
630 goto out;
631 }
632
633out:
634 tracedump_deinit();
635 printk(KERN_INFO "\n" TAG " end\n");
636 return size;
637}
638
639static const struct file_operations tracedump_fops = {
640 .owner = THIS_MODULE,
641 .open = tracedump_open,
642 .read = tracedump_read,
643 .release = tracedump_release,
644};
645
646#ifdef CONFIG_TRACEDUMP_PANIC
647static int tracedump_panic_handler(struct notifier_block *this,
648 unsigned long event, void *unused)
649{
650 tracedump_dump(panic_size);
651 return 0;
652}
653
654static struct notifier_block tracedump_panic_notifier = {
655 .notifier_call = tracedump_panic_handler,
656 .next = NULL,
657 .priority = 150 /* priority: INT_MAX >= x >= 0 */
658};
659#endif
660
661static int __init tracedump_initcall(void)
662{
663#ifdef CONFIG_TRACEDUMP_PROCFS
664 struct proc_dir_entry *entry;
665
666 /* Create a procfs file for easy dumping */
667 entry = create_proc_entry("tracedump", S_IFREG | S_IRUGO, NULL);
668 if (!entry)
669 printk(TAG "failed to create proc entry\n");
670 else
671 entry->proc_fops = &tracedump_fops;
672#endif
673
674#ifdef CONFIG_TRACEDUMP_PANIC
675 /* Automatically dump to console on a kernel panic */
676 atomic_notifier_chain_register(&panic_notifier_list,
677 &tracedump_panic_notifier);
678#endif
679 return 0;
680}
681
682early_initcall(tracedump_initcall);
diff --git a/kernel/trace/tracelevel.c b/kernel/trace/tracelevel.c
new file mode 100644
index 00000000000..9f8b8eedbb5
--- /dev/null
+++ b/kernel/trace/tracelevel.c
@@ -0,0 +1,142 @@
1/*
2 * kernel/trace/tracelevel.c
3 *
4 * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20
21#include <linux/ftrace_event.h>
22#include <linux/list.h>
23#include <linux/moduleparam.h>
24#include <linux/mutex.h>
25#include <linux/tracelevel.h>
26#include <linux/vmalloc.h>
27
28#include "trace.h"
29
30#define TAG KERN_ERR "tracelevel: "
31
32struct tracelevel_record {
33 struct list_head list;
34 char *name;
35 int level;
36};
37
38static LIST_HEAD(tracelevel_list);
39
40static bool started;
41static unsigned int tracelevel_level = TRACELEVEL_DEFAULT;
42
43static DEFINE_MUTEX(tracelevel_record_lock);
44
45/* tracelevel_set_event sets a single event if set = 1, or
46 * clears an event if set = 0.
47 */
48static int tracelevel_set_event(struct tracelevel_record *evt, bool set)
49{
50 if (trace_set_clr_event(NULL, evt->name, set) < 0) {
51 printk(TAG "failed to set event %s\n", evt->name);
52 return -EINVAL;
53 }
54 return 0;
55}
56
57/* Registers an event. If possible, it also sets it.
58 * If not, we'll set it in tracelevel_init.
59 */
60int __tracelevel_register(char *name, unsigned int level)
61{
62 struct tracelevel_record *evt = (struct tracelevel_record *)
63 vmalloc(sizeof(struct tracelevel_record));
64 if (!evt) {
65 printk(TAG "failed to allocate tracelevel_record for %s\n",
66 name);
67 return -ENOMEM;
68 }
69
70 evt->name = name;
71 evt->level = level;
72
73 mutex_lock(&tracelevel_record_lock);
74 list_add(&evt->list, &tracelevel_list);
75 mutex_unlock(&tracelevel_record_lock);
76
77 if (level >= tracelevel_level && started)
78 tracelevel_set_event(evt, 1);
79 return 0;
80}
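Assuming a driver wants its events controlled by this mechanism, registration at init time might look like the sketch below (the event names and levels are illustrative):

static int __init example_register_events(void)
{
	int ret;

	/* Low level: only enabled when the global level is lowered to 1 or below. */
	ret = __tracelevel_register("example_verbose_event", 1);
	if (ret < 0)
		return ret;

	/* Highest level: enabled at any global level. */
	return __tracelevel_register("example_critical_event", TRACELEVEL_MAX);
}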
81
82/* tracelevel_set_level sets the global level, clears events below
83 * that level, and enables events at or above it.
84 */
85int tracelevel_set_level(int level)
86{
87 struct tracelevel_record *evt = NULL;
88
89 if (level < 0 || level > TRACELEVEL_MAX)
90 return -EINVAL;
91 tracelevel_level = level;
92
93 mutex_lock(&tracelevel_record_lock);
94 list_for_each_entry(evt, &tracelevel_list, list) {
95 if (evt->level >= level)
96 tracelevel_set_event(evt, 1);
97 else
98 tracelevel_set_event(evt, 0);
99 }
100 mutex_unlock(&tracelevel_record_lock);
101 return 0;
102}
103
104static int param_set_level(const char *val, const struct kernel_param *kp)
105{
106 int level, ret;
107 ret = strict_strtol(val, 0, &level);
108 if (ret < 0)
109 return ret;
110 return tracelevel_set_level(level);
111}
112
113static int param_get_level(char *buffer, const struct kernel_param *kp)
114{
115 return param_get_int(buffer, kp);
116}
117
118static struct kernel_param_ops tracelevel_level_ops = {
119 .set = param_set_level,
120 .get = param_get_level
121};
122
123module_param_cb(level, &tracelevel_level_ops, &tracelevel_level, 0644);
124
125/* Turn on the tracing that has been registered thus far. */
126static int __init tracelevel_init(void)
127{
128 int ret;
129 started = true;
130
131	/* Ring buffer is initialized to 1 page until the user sets a tracer.
132	 * Since we're doing this manually, we need to ask for an expanded buffer.
133 */
134 ret = tracing_update_buffers();
135 if (ret < 0)
136 return ret;
137
138 return tracelevel_set_level(tracelevel_level);
139}
140
141/* Tracing mechanism is set up during fs_initcall. */
142fs_initcall_sync(tracelevel_init);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 24dc60d9fa1..5bbfac85866 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -78,6 +78,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
78 78
79#define KB 1024 79#define KB 1024
80#define MB (1024*KB) 80#define MB (1024*KB)
81#define KB_MASK (~(KB-1))
81/* 82/*
82 * fill in extended accounting fields 83 * fill in extended accounting fields
83 */ 84 */
@@ -95,14 +96,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
95 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; 96 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
96 mmput(mm); 97 mmput(mm);
97 } 98 }
98 stats->read_char = p->ioac.rchar; 99 stats->read_char = p->ioac.rchar & KB_MASK;
99 stats->write_char = p->ioac.wchar; 100 stats->write_char = p->ioac.wchar & KB_MASK;
100 stats->read_syscalls = p->ioac.syscr; 101 stats->read_syscalls = p->ioac.syscr & KB_MASK;
101 stats->write_syscalls = p->ioac.syscw; 102 stats->write_syscalls = p->ioac.syscw & KB_MASK;
102#ifdef CONFIG_TASK_IO_ACCOUNTING 103#ifdef CONFIG_TASK_IO_ACCOUNTING
103 stats->read_bytes = p->ioac.read_bytes; 104 stats->read_bytes = p->ioac.read_bytes & KB_MASK;
104 stats->write_bytes = p->ioac.write_bytes; 105 stats->write_bytes = p->ioac.write_bytes & KB_MASK;
105 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; 106 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK;
106#else 107#else
107 stats->read_bytes = 0; 108 stats->read_bytes = 0;
108 stats->write_bytes = 0; 109 stats->write_bytes = 0;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56ad479..36491cd5b7d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -200,6 +200,7 @@ static int is_softlockup(unsigned long touch_ts)
200} 200}
201 201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR 202#ifdef CONFIG_HARDLOCKUP_DETECTOR
203
203static struct perf_event_attr wd_hw_attr = { 204static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE, 205 .type = PERF_TYPE_HARDWARE,
205 .config = PERF_COUNT_HW_CPU_CYCLES, 206 .config = PERF_COUNT_HW_CPU_CYCLES,
@@ -209,7 +210,7 @@ static struct perf_event_attr wd_hw_attr = {
209}; 210};
210 211
211/* Callback function for perf event subsystem */ 212/* Callback function for perf event subsystem */
212static void watchdog_overflow_callback(struct perf_event *event, int nmi, 213static void watchdog_overflow_callback(struct perf_event *event,
213 struct perf_sample_data *data, 214 struct perf_sample_data *data,
214 struct pt_regs *regs) 215 struct pt_regs *regs)
215{ 216{
@@ -368,10 +369,11 @@ static int watchdog_nmi_enable(int cpu)
368 if (event != NULL) 369 if (event != NULL)
369 goto out_enable; 370 goto out_enable;
370 371
371 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr; 372 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); 373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
374 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); 374
375 /* Try to register using hardware perf events */
376 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
375 if (!IS_ERR(event)) { 377 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 378 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save; 379 goto out_save;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0400553f0d0..1783aabc612 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t;
221 * per-CPU workqueues: 221 * per-CPU workqueues:
222 */ 222 */
223struct workqueue_struct { 223struct workqueue_struct {
224 unsigned int flags; /* I: WQ_* flags */ 224 unsigned int flags; /* W: WQ_* flags */
225 union { 225 union {
226 struct cpu_workqueue_struct __percpu *pcpu; 226 struct cpu_workqueue_struct __percpu *pcpu;
227 struct cpu_workqueue_struct *single; 227 struct cpu_workqueue_struct *single;
@@ -240,6 +240,7 @@ struct workqueue_struct {
240 mayday_mask_t mayday_mask; /* cpus requesting rescue */ 240 mayday_mask_t mayday_mask; /* cpus requesting rescue */
241 struct worker *rescuer; /* I: rescue worker */ 241 struct worker *rescuer; /* I: rescue worker */
242 242
243 int nr_drainers; /* W: drain in progress */
243 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
244 const char *name; /* I: workqueue name */ 245 const char *name; /* I: workqueue name */
245#ifdef CONFIG_LOCKDEP 246#ifdef CONFIG_LOCKDEP
@@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
990 debug_work_activate(work); 991 debug_work_activate(work);
991 992
992 /* if dying, only works from the same workqueue are allowed */ 993 /* if dying, only works from the same workqueue are allowed */
993 if (unlikely(wq->flags & WQ_DYING) && 994 if (unlikely(wq->flags & WQ_DRAINING) &&
994 WARN_ON_ONCE(!is_chained_work(wq))) 995 WARN_ON_ONCE(!is_chained_work(wq)))
995 return; 996 return;
996 997
@@ -2381,6 +2382,59 @@ out_unlock:
2381} 2382}
2382EXPORT_SYMBOL_GPL(flush_workqueue); 2383EXPORT_SYMBOL_GPL(flush_workqueue);
2383 2384
2385/**
2386 * drain_workqueue - drain a workqueue
2387 * @wq: workqueue to drain
2388 *
2389 * Wait until the workqueue becomes empty. While draining is in progress,
2390 * only chain queueing is allowed. IOW, only currently pending or running
2391 * work items on @wq can queue further work items on it. @wq is flushed
2392 * repeatedly until it becomes empty. The number of flushing is detemined
2393 * by the depth of chaining and should be relatively short. Whine if it
2394 * takes too long.
2395 */
2396void drain_workqueue(struct workqueue_struct *wq)
2397{
2398 unsigned int flush_cnt = 0;
2399 unsigned int cpu;
2400
2401 /*
2402 * __queue_work() needs to test whether there are drainers, is much
2403 * hotter than drain_workqueue() and already looks at @wq->flags.
2404 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
2405 */
2406 spin_lock(&workqueue_lock);
2407 if (!wq->nr_drainers++)
2408 wq->flags |= WQ_DRAINING;
2409 spin_unlock(&workqueue_lock);
2410reflush:
2411 flush_workqueue(wq);
2412
2413 for_each_cwq_cpu(cpu, wq) {
2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2415 bool drained;
2416
2417 spin_lock_irq(&cwq->gcwq->lock);
2418 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2419 spin_unlock_irq(&cwq->gcwq->lock);
2420
2421 if (drained)
2422 continue;
2423
2424 if (++flush_cnt == 10 ||
2425 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2426 pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
2427 wq->name, flush_cnt);
2428 goto reflush;
2429 }
2430
2431 spin_lock(&workqueue_lock);
2432 if (!--wq->nr_drainers)
2433 wq->flags &= ~WQ_DRAINING;
2434 spin_unlock(&workqueue_lock);
2435}
2436EXPORT_SYMBOL_GPL(drain_workqueue);
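Beyond its use in destroy_workqueue(), the export lets a caller quiesce a queue it intends to keep using; a hypothetical example:

/* Wait until my_wq is empty (only chained work may requeue meanwhile),
 * then keep submitting to it afterwards instead of destroying it.
 */
static void example_quiesce(struct workqueue_struct *my_wq)
{
	drain_workqueue(my_wq);
}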
2437
2384static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 2438static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2385 bool wait_executing) 2439 bool wait_executing)
2386{ 2440{
@@ -3009,34 +3063,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
3009 */ 3063 */
3010void destroy_workqueue(struct workqueue_struct *wq) 3064void destroy_workqueue(struct workqueue_struct *wq)
3011{ 3065{
3012 unsigned int flush_cnt = 0;
3013 unsigned int cpu; 3066 unsigned int cpu;
3014 3067
3015 /* 3068 /* drain it before proceeding with destruction */
3016 * Mark @wq dying and drain all pending works. Once WQ_DYING is 3069 drain_workqueue(wq);
3017 * set, only chain queueing is allowed. IOW, only currently
3018 * pending or running work items on @wq can queue further work
3019 * items on it. @wq is flushed repeatedly until it becomes empty.
3020 * The number of flushing is detemined by the depth of chaining and
3021 * should be relatively short. Whine if it takes too long.
3022 */
3023 wq->flags |= WQ_DYING;
3024reflush:
3025 flush_workqueue(wq);
3026
3027 for_each_cwq_cpu(cpu, wq) {
3028 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3029
3030 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3031 continue;
3032
3033 if (++flush_cnt == 10 ||
3034 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3035 printk(KERN_WARNING "workqueue %s: flush on "
3036 "destruction isn't complete after %u tries\n",
3037 wq->name, flush_cnt);
3038 goto reflush;
3039 }
3040 3070
3041 /* 3071 /*
3042 * wq list is used to freeze wq, remove from list after 3072 * wq list is used to freeze wq, remove from list after