path: root/kernel
author    Eric Paris <eparis@redhat.com>  2014-03-07 11:41:32 -0500
committer Eric Paris <eparis@redhat.com>  2014-03-07 11:41:32 -0500
commit    b7d3622a39fde7658170b7f3cf6c6889bb8db30d (patch)
tree      64f4e781ecb2a85d675e234072b988560bcd25f1 /kernel
parent    f3411cb2b2e396a41ed3a439863f028db7140a34 (diff)
parent    d8ec26d7f8287f5788a494f56e8814210f0e64be (diff)
Merge tag 'v3.13' into for-3.15
Linux 3.13

Conflicts:
	include/net/xfrm.h

Simple merge where v3.13 removed 'extern' from definitions and the audit tree
did s/u32/unsigned int/ to the same definitions.
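As an aside, a minimal sketch of the kind of resolution described above, using a made-up declaration (xfrm_audit_example and its parameter are hypothetical; the real include/net/xfrm.h hunk is not reproduced here). The merge simply keeps both sides' edits to the same declaration:

	/* Hypothetical declaration, for illustration only. */
	typedef unsigned int u32;                              /* kernel typedef, shown so this compiles standalone */

	extern void xfrm_audit_example(u32 auditid);           /* common ancestor                      */
	void xfrm_audit_example(u32 auditid);                  /* v3.13 side: 'extern' dropped         */
	extern void xfrm_audit_example(unsigned int auditid);  /* audit-tree side: u32 -> unsigned int */
	void xfrm_audit_example(unsigned int auditid);         /* merged result keeps both changes     */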
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 1
-rw-r--r--  kernel/Kconfig.hz | 2
-rw-r--r--  kernel/Makefile | 82
-rw-r--r--  kernel/bounds.c | 6
-rw-r--r--  kernel/cgroup.c | 340
-rw-r--r--  kernel/context_tracking.c | 2
-rw-r--r--  kernel/cpu.c | 49
-rw-r--r--  kernel/cpu/idle.c | 16
-rw-r--r--  kernel/cpuset.c | 8
-rw-r--r--  kernel/debug/debug_core.c | 32
-rw-r--r--  kernel/debug/debug_core.h | 3
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 5
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 3
-rw-r--r--  kernel/delayacct.c | 7
-rw-r--r--  kernel/elfcore.c | 10
-rw-r--r--  kernel/events/core.c | 201
-rw-r--r--  kernel/events/internal.h | 35
-rw-r--r--  kernel/events/ring_buffer.c | 101
-rw-r--r--  kernel/events/uprobes.c | 223
-rw-r--r--  kernel/extable.c | 4
-rw-r--r--  kernel/fork.c | 16
-rw-r--r--  kernel/freezer.c | 6
-rw-r--r--  kernel/futex.c | 9
-rw-r--r--  kernel/gcov/Kconfig | 30
-rw-r--r--  kernel/gcov/Makefile | 32
-rw-r--r--  kernel/gcov/base.c | 32
-rw-r--r--  kernel/gcov/fs.c | 52
-rw-r--r--  kernel/gcov/gcc_3_4.c | 115
-rw-r--r--  kernel/gcov/gcc_4_7.c | 560
-rw-r--r--  kernel/gcov/gcov.h | 65
-rw-r--r--  kernel/hung_task.c | 17
-rw-r--r--  kernel/irq/chip.c | 2
-rw-r--r--  kernel/irq/irqdomain.c | 13
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/pm.c | 2
-rw-r--r--  kernel/irq/settings.h | 7
-rw-r--r--  kernel/irq/spurious.c | 12
-rw-r--r--  kernel/jump_label.c | 5
-rw-r--r--  kernel/kexec.c | 7
-rw-r--r--  kernel/kprobes.c | 4
-rw-r--r--  kernel/kthread.c | 73
-rw-r--r--  kernel/locking/Makefile | 25
-rw-r--r--  kernel/locking/lglock.c (renamed from kernel/lglock.c) | 0
-rw-r--r--  kernel/locking/lockdep.c (renamed from kernel/lockdep.c) | 8
-rw-r--r--  kernel/locking/lockdep_internals.h (renamed from kernel/lockdep_internals.h) | 0
-rw-r--r--  kernel/locking/lockdep_proc.c (renamed from kernel/lockdep_proc.c) | 15
-rw-r--r--  kernel/locking/lockdep_states.h (renamed from kernel/lockdep_states.h) | 0
-rw-r--r--  kernel/locking/mutex-debug.c (renamed from kernel/mutex-debug.c) | 0
-rw-r--r--  kernel/locking/mutex-debug.h (renamed from kernel/mutex-debug.h) | 0
-rw-r--r--  kernel/locking/mutex.c (renamed from kernel/mutex.c) | 2
-rw-r--r--  kernel/locking/mutex.h (renamed from kernel/mutex.h) | 0
-rw-r--r--  kernel/locking/percpu-rwsem.c | 165
-rw-r--r--  kernel/locking/rtmutex-debug.c (renamed from kernel/rtmutex-debug.c) | 0
-rw-r--r--  kernel/locking/rtmutex-debug.h (renamed from kernel/rtmutex-debug.h) | 0
-rw-r--r--  kernel/locking/rtmutex-tester.c (renamed from kernel/rtmutex-tester.c) | 0
-rw-r--r--  kernel/locking/rtmutex.c (renamed from kernel/rtmutex.c) | 0
-rw-r--r--  kernel/locking/rtmutex.h (renamed from kernel/rtmutex.h) | 0
-rw-r--r--  kernel/locking/rtmutex_common.h (renamed from kernel/rtmutex_common.h) | 0
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 296
-rw-r--r--  kernel/locking/rwsem-xadd.c | 293
-rw-r--r--  kernel/locking/rwsem.c (renamed from kernel/rwsem.c) | 0
-rw-r--r--  kernel/locking/semaphore.c (renamed from kernel/semaphore.c) | 0
-rw-r--r--  kernel/locking/spinlock.c (renamed from kernel/spinlock.c) | 0
-rw-r--r--  kernel/locking/spinlock_debug.c | 302
-rw-r--r--  kernel/modsign_certificate.S | 12
-rw-r--r--  kernel/modsign_pubkey.c | 104
-rw-r--r--  kernel/module-internal.h | 2
-rw-r--r--  kernel/module.c | 169
-rw-r--r--  kernel/module_signing.c | 11
-rw-r--r--  kernel/padata.c | 9
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/pid_namespace.c | 8
-rw-r--r--  kernel/power/Kconfig | 16
-rw-r--r--  kernel/power/console.c | 1
-rw-r--r--  kernel/power/qos.c | 26
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/user.c | 21
-rw-r--r--  kernel/printk/printk.c | 35
-rw-r--r--  kernel/ptrace.c | 3
-rw-r--r--  kernel/rcu/Makefile | 6
-rw-r--r--  kernel/rcu/rcu.h (renamed from kernel/rcu.h) | 7
-rw-r--r--  kernel/rcu/srcu.c (renamed from kernel/srcu.c) | 0
-rw-r--r--  kernel/rcu/tiny.c (renamed from kernel/rcutiny.c) | 37
-rw-r--r--  kernel/rcu/tiny_plugin.h (renamed from kernel/rcutiny_plugin.h) | 0
-rw-r--r--  kernel/rcu/torture.c (renamed from kernel/rcutorture.c) | 6
-rw-r--r--  kernel/rcu/tree.c (renamed from kernel/rcutree.c) | 200
-rw-r--r--  kernel/rcu/tree.h (renamed from kernel/rcutree.h) | 2
-rw-r--r--  kernel/rcu/tree_plugin.h (renamed from kernel/rcutree_plugin.h) | 88
-rw-r--r--  kernel/rcu/tree_trace.c (renamed from kernel/rcutree_trace.c) | 2
-rw-r--r--  kernel/rcu/update.c (renamed from kernel/rcupdate.c) | 10
-rw-r--r--  kernel/reboot.c | 2
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/completion.c | 299
-rw-r--r--  kernel/sched/core.c | 705
-rw-r--r--  kernel/sched/debug.c | 68
-rw-r--r--  kernel/sched/fair.c | 1594
-rw-r--r--  kernel/sched/features.h | 19
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 36
-rw-r--r--  kernel/sched/sched.h | 54
-rw-r--r--  kernel/sched/stats.h | 46
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/sched/wait.c (renamed from kernel/wait.c) | 127
-rw-r--r--  kernel/signal.c | 2
-rw-r--r--  kernel/smp.c | 19
-rw-r--r--  kernel/softirq.c | 184
-rw-r--r--  kernel/stop_machine.c | 303
-rw-r--r--  kernel/sys.c | 1
-rw-r--r--  kernel/sysctl.c | 34
-rw-r--r--  kernel/sysctl_binary.c | 6
-rw-r--r--  kernel/system_certificates.S | 20
-rw-r--r--  kernel/system_keyring.c | 105
-rw-r--r--  kernel/taskstats.c | 54
-rw-r--r--  kernel/time/Kconfig | 2
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/clocksource.c | 52
-rw-r--r--  kernel/time/ntp.c | 3
-rw-r--r--  kernel/time/sched_clock.c | 114
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/tick-common.c | 15
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/tick-sched.c | 25
-rw-r--r--  kernel/time/timekeeping.c | 5
-rw-r--r--  kernel/time/timer_stats.c | 8
-rw-r--r--  kernel/timer.c | 13
-rw-r--r--  kernel/trace/blktrace.c | 36
-rw-r--r--  kernel/trace/ftrace.c | 227
-rw-r--r--  kernel/trace/trace.c | 85
-rw-r--r--  kernel/trace/trace.h | 51
-rw-r--r--  kernel/trace/trace_branch.c | 2
-rw-r--r--  kernel/trace/trace_event_perf.c | 10
-rw-r--r--  kernel/trace/trace_events.c | 35
-rw-r--r--  kernel/trace/trace_events_filter.c | 218
-rw-r--r--  kernel/trace/trace_export.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 82
-rw-r--r--  kernel/trace/trace_kprobe.c | 4
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 4
-rw-r--r--  kernel/trace/trace_output.c | 19
-rw-r--r--  kernel/trace/trace_sched_switch.c | 4
-rw-r--r--  kernel/trace/trace_stat.c | 41
-rw-r--r--  kernel/trace/trace_syscalls.c | 32
-rw-r--r--  kernel/trace/trace_uprobe.c | 3
-rw-r--r--  kernel/up.c | 11
-rw-r--r--  kernel/user.c | 4
-rw-r--r--  kernel/user_namespace.c | 6
-rw-r--r--  kernel/workqueue.c | 82
147 files changed, 6384 insertions, 2588 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..790d83c7d160 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,3 +5,4 @@ config_data.h
 config_data.gz
 timeconst.h
 hz.bc
+x509_certificate_list
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 94fabd534b03..2a202a846757 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
+	def_bool HIGH_RES_TIMERS
diff --git a/kernel/Makefile b/kernel/Makefile
index 1ce47553fb02..bc010ee272b6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,56 +6,44 @@ obj-y = fork.o exec_domain.o panic.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 extable.o params.o posix-timers.o \
10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o sys_ni.o posix-cpu-timers.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o nsproxy.o \
12 notifier.o ksysfs.o cred.o reboot.o \ 12 notifier.o ksysfs.o cred.o reboot.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o groups.o smpboot.o
14 14
15ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
17CFLAGS_REMOVE_lockdep.o = -pg
18CFLAGS_REMOVE_lockdep_proc.o = -pg
19CFLAGS_REMOVE_mutex-debug.o = -pg
20CFLAGS_REMOVE_rtmutex-debug.o = -pg
21CFLAGS_REMOVE_cgroup-debug.o = -pg 17CFLAGS_REMOVE_cgroup-debug.o = -pg
22CFLAGS_REMOVE_irq_work.o = -pg 18CFLAGS_REMOVE_irq_work.o = -pg
23endif 19endif
24 20
25obj-y += sched/ 21obj-y += sched/
22obj-y += locking/
26obj-y += power/ 23obj-y += power/
27obj-y += printk/ 24obj-y += printk/
28obj-y += cpu/ 25obj-y += cpu/
29obj-y += irq/ 26obj-y += irq/
27obj-y += rcu/
30 28
31obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o 29obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
32obj-$(CONFIG_FREEZER) += freezer.o 30obj-$(CONFIG_FREEZER) += freezer.o
33obj-$(CONFIG_PROFILING) += profile.o 31obj-$(CONFIG_PROFILING) += profile.o
34obj-$(CONFIG_STACKTRACE) += stacktrace.o 32obj-$(CONFIG_STACKTRACE) += stacktrace.o
35obj-y += time/ 33obj-y += time/
36obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
37obj-$(CONFIG_LOCKDEP) += lockdep.o
38ifeq ($(CONFIG_PROC_FS),y)
39obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
40endif
41obj-$(CONFIG_FUTEX) += futex.o 34obj-$(CONFIG_FUTEX) += futex.o
42ifeq ($(CONFIG_COMPAT),y) 35ifeq ($(CONFIG_COMPAT),y)
43obj-$(CONFIG_FUTEX) += futex_compat.o 36obj-$(CONFIG_FUTEX) += futex_compat.o
44endif 37endif
45obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
46obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
47obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
48obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 38obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
49obj-$(CONFIG_SMP) += smp.o 39obj-$(CONFIG_SMP) += smp.o
50ifneq ($(CONFIG_SMP),y) 40ifneq ($(CONFIG_SMP),y)
51obj-y += up.o 41obj-y += up.o
52endif 42endif
53obj-$(CONFIG_SMP) += spinlock.o
54obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
55obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
56obj-$(CONFIG_UID16) += uid16.o 43obj-$(CONFIG_UID16) += uid16.o
44obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
57obj-$(CONFIG_MODULES) += module.o 45obj-$(CONFIG_MODULES) += module.o
58obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o 46obj-$(CONFIG_MODULE_SIG) += module_signing.o
59obj-$(CONFIG_KALLSYMS) += kallsyms.o 47obj-$(CONFIG_KALLSYMS) += kallsyms.o
60obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 48obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
61obj-$(CONFIG_KEXEC) += kexec.o 49obj-$(CONFIG_KEXEC) += kexec.o
@@ -81,12 +69,6 @@ obj-$(CONFIG_KGDB) += debug/
81obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 69obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
82obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o 70obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
83obj-$(CONFIG_SECCOMP) += seccomp.o 71obj-$(CONFIG_SECCOMP) += seccomp.o
84obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
85obj-$(CONFIG_TREE_RCU) += rcutree.o
86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
88obj-$(CONFIG_TINY_RCU) += rcutiny.o
89obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
90obj-$(CONFIG_RELAY) += relay.o 72obj-$(CONFIG_RELAY) += relay.o
91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 73obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 74obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -141,19 +123,53 @@ targets += timeconst.h
141$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE 123$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
142 $(call if_changed,bc) 124 $(call if_changed,bc)
143 125
144ifeq ($(CONFIG_MODULE_SIG),y) 126###############################################################################
127#
128# Roll all the X.509 certificates that we can find together and pull them into
129# the kernel so that they get loaded into the system trusted keyring during
130# boot.
145# 131#
146# Pull the signing certificate and any extra certificates into the kernel 132# We look in the source root and the build root for all files whose name ends
133# in ".x509". Unfortunately, this will generate duplicate filenames, so we
134# have make canonicalise the pathnames and then sort them to discard the
135# duplicates.
147# 136#
137###############################################################################
138ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
139X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
140X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
141X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
142 $(or $(realpath $(CERT)),$(CERT))))
143X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
144
145ifeq ($(X509_CERTIFICATES),)
146$(warning *** No X.509 certificates found ***)
147endif
148 148
149quiet_cmd_touch = TOUCH $@ 149ifneq ($(wildcard $(obj)/.x509.list),)
150 cmd_touch = touch $@ 150ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
151$(info X.509 certificate list changed)
152$(shell rm $(obj)/.x509.list)
153endif
154endif
155
156kernel/system_certificates.o: $(obj)/x509_certificate_list
157
158quiet_cmd_x509certs = CERTS $@
159 cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
151 160
152extra_certificates: 161targets += $(obj)/x509_certificate_list
153 $(call cmd,touch) 162$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
163 $(call if_changed,x509certs)
154 164
155kernel/modsign_certificate.o: signing_key.x509 extra_certificates 165targets += $(obj)/.x509.list
166$(obj)/.x509.list:
167 @echo $(X509_CERTIFICATES) >$@
168endif
169
170clean-files := x509_certificate_list .x509.list
156 171
172ifeq ($(CONFIG_MODULE_SIG),y)
157############################################################################### 173###############################################################################
158# 174#
159# If module signing is requested, say by allyesconfig, but a key has not been 175# If module signing is requested, say by allyesconfig, but a key has not been
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..9fd4246b04b8 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,8 @@
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
 #include <linux/page_cgroup.h>
+#include <linux/log2.h>
+#include <linux/spinlock_types.h>
 
 void foo(void)
 {
@@ -17,5 +19,9 @@ void foo(void)
 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
 	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
+#ifdef CONFIG_SMP
+	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+#endif
+	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
 	/* End of constants */
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8bd9cfdc70d7..bc1dcabe9217 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex);
90static DEFINE_MUTEX(cgroup_root_mutex); 90static DEFINE_MUTEX(cgroup_root_mutex);
91 91
92/* 92/*
93 * cgroup destruction makes heavy use of work items and there can be a lot
94 * of concurrent destructions. Use a separate workqueue so that cgroup
95 * destruction work items don't end up filling up max_active of system_wq
96 * which may lead to deadlock.
97 */
98static struct workqueue_struct *cgroup_destroy_wq;
99
100/*
93 * Generate an array of cgroup subsystem pointers. At boot time, this is 101 * Generate an array of cgroup subsystem pointers. At boot time, this is
94 * populated with the built in subsystems, and modular subsystems are 102 * populated with the built in subsystems, and modular subsystems are
95 * registered after that. The mutable section of this array is protected by 103 * registered after that. The mutable section of this array is protected by
@@ -125,38 +133,6 @@ struct cfent {
125}; 133};
126 134
127/* 135/*
128 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
129 * cgroup_subsys->use_id != 0.
130 */
131#define CSS_ID_MAX (65535)
132struct css_id {
133 /*
134 * The css to which this ID points. This pointer is set to valid value
135 * after cgroup is populated. If cgroup is removed, this will be NULL.
136 * This pointer is expected to be RCU-safe because destroy()
137 * is called after synchronize_rcu(). But for safe use, css_tryget()
138 * should be used for avoiding race.
139 */
140 struct cgroup_subsys_state __rcu *css;
141 /*
142 * ID of this css.
143 */
144 unsigned short id;
145 /*
146 * Depth in hierarchy which this ID belongs to.
147 */
148 unsigned short depth;
149 /*
150 * ID is freed by RCU. (and lookup routine is RCU safe.)
151 */
152 struct rcu_head rcu_head;
153 /*
154 * Hierarchy of CSS ID belongs to.
155 */
156 unsigned short stack[0]; /* Array of Length (depth+1) */
157};
158
159/*
160 * cgroup_event represents events which userspace want to receive. 136 * cgroup_event represents events which userspace want to receive.
161 */ 137 */
162struct cgroup_event { 138struct cgroup_event {
@@ -223,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
223static int cgroup_destroy_locked(struct cgroup *cgrp); 199static int cgroup_destroy_locked(struct cgroup *cgrp);
224static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 200static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
225 bool is_add); 201 bool is_add);
202static int cgroup_file_release(struct inode *inode, struct file *file);
226 203
227/** 204/**
228 * cgroup_css - obtain a cgroup's css for the specified subsystem 205 * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -387,9 +364,6 @@ struct cgrp_cset_link {
387static struct css_set init_css_set; 364static struct css_set init_css_set;
388static struct cgrp_cset_link init_cgrp_cset_link; 365static struct cgrp_cset_link init_cgrp_cset_link;
389 366
390static int cgroup_init_idr(struct cgroup_subsys *ss,
391 struct cgroup_subsys_state *css);
392
393/* 367/*
394 * css_set_lock protects the list of css_set objects, and the chain of 368 * css_set_lock protects the list of css_set objects, and the chain of
395 * tasks off each css_set. Nests outside task->alloc_lock due to 369 * tasks off each css_set. Nests outside task->alloc_lock due to
@@ -841,8 +815,6 @@ static struct backing_dev_info cgroup_backing_dev_info = {
841 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 815 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
842}; 816};
843 817
844static int alloc_css_id(struct cgroup_subsys_state *child_css);
845
846static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 818static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
847{ 819{
848 struct inode *inode = new_inode(sb); 820 struct inode *inode = new_inode(sb);
@@ -908,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
908 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 880 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
909 881
910 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); 882 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
911 schedule_work(&cgrp->destroy_work); 883 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
912} 884}
913 885
914static void cgroup_diput(struct dentry *dentry, struct inode *inode) 886static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -918,6 +890,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
918 struct cgroup *cgrp = dentry->d_fsdata; 890 struct cgroup *cgrp = dentry->d_fsdata;
919 891
920 BUG_ON(!(cgroup_is_dead(cgrp))); 892 BUG_ON(!(cgroup_is_dead(cgrp)));
893
894 /*
895 * XXX: cgrp->id is only used to look up css's. As cgroup
896 * and css's lifetimes will be decoupled, it should be made
897 * per-subsystem and moved to css->id so that lookups are
898 * successful until the target css is released.
899 */
900 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
901 cgrp->id = -1;
902
921 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 903 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
922 } else { 904 } else {
923 struct cfent *cfe = __d_cfe(dentry); 905 struct cfent *cfe = __d_cfe(dentry);
@@ -932,11 +914,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
932 iput(inode); 914 iput(inode);
933} 915}
934 916
935static int cgroup_delete(const struct dentry *d)
936{
937 return 1;
938}
939
940static void remove_dir(struct dentry *d) 917static void remove_dir(struct dentry *d)
941{ 918{
942 struct dentry *parent = dget(d->d_parent); 919 struct dentry *parent = dget(d->d_parent);
@@ -1523,7 +1500,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
1523{ 1500{
1524 static const struct dentry_operations cgroup_dops = { 1501 static const struct dentry_operations cgroup_dops = {
1525 .d_iput = cgroup_diput, 1502 .d_iput = cgroup_diput,
1526 .d_delete = cgroup_delete, 1503 .d_delete = always_delete_dentry,
1527 }; 1504 };
1528 1505
1529 struct inode *inode = 1506 struct inode *inode =
@@ -2463,7 +2440,7 @@ static const struct file_operations cgroup_seqfile_operations = {
2463 .read = seq_read, 2440 .read = seq_read,
2464 .write = cgroup_file_write, 2441 .write = cgroup_file_write,
2465 .llseek = seq_lseek, 2442 .llseek = seq_lseek,
2466 .release = single_release, 2443 .release = cgroup_file_release,
2467}; 2444};
2468 2445
2469static int cgroup_file_open(struct inode *inode, struct file *file) 2446static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2524,6 +2501,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
2524 ret = cft->release(inode, file); 2501 ret = cft->release(inode, file);
2525 if (css->ss) 2502 if (css->ss)
2526 css_put(css); 2503 css_put(css);
2504 if (file->f_op == &cgroup_seqfile_operations)
2505 single_release(inode, file);
2527 return ret; 2506 return ret;
2528} 2507}
2529 2508
@@ -4240,21 +4219,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4240 goto err; 4219 goto err;
4241 } 4220 }
4242 } 4221 }
4243
4244 /* This cgroup is ready now */
4245 for_each_root_subsys(cgrp->root, ss) {
4246 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4247 struct css_id *id = rcu_dereference_protected(css->id, true);
4248
4249 /*
4250 * Update id->css pointer and make this css visible from
4251 * CSS ID functions. This pointer will be dereferened
4252 * from RCU-read-side without locks.
4253 */
4254 if (id)
4255 rcu_assign_pointer(id->css, css);
4256 }
4257
4258 return 0; 4222 return 0;
4259err: 4223err:
4260 cgroup_clear_dir(cgrp, subsys_mask); 4224 cgroup_clear_dir(cgrp, subsys_mask);
@@ -4306,7 +4270,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
4306 * css_put(). dput() requires process context which we don't have. 4270 * css_put(). dput() requires process context which we don't have.
4307 */ 4271 */
4308 INIT_WORK(&css->destroy_work, css_free_work_fn); 4272 INIT_WORK(&css->destroy_work, css_free_work_fn);
4309 schedule_work(&css->destroy_work); 4273 queue_work(cgroup_destroy_wq, &css->destroy_work);
4310} 4274}
4311 4275
4312static void css_release(struct percpu_ref *ref) 4276static void css_release(struct percpu_ref *ref)
@@ -4314,6 +4278,7 @@ static void css_release(struct percpu_ref *ref)
4314 struct cgroup_subsys_state *css = 4278 struct cgroup_subsys_state *css =
4315 container_of(ref, struct cgroup_subsys_state, refcnt); 4279 container_of(ref, struct cgroup_subsys_state, refcnt);
4316 4280
4281 rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
4317 call_rcu(&css->rcu_head, css_free_rcu_fn); 4282 call_rcu(&css->rcu_head, css_free_rcu_fn);
4318} 4283}
4319 4284
@@ -4323,7 +4288,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4323 css->cgroup = cgrp; 4288 css->cgroup = cgrp;
4324 css->ss = ss; 4289 css->ss = ss;
4325 css->flags = 0; 4290 css->flags = 0;
4326 css->id = NULL;
4327 4291
4328 if (cgrp->parent) 4292 if (cgrp->parent)
4329 css->parent = cgroup_css(cgrp->parent, ss); 4293 css->parent = cgroup_css(cgrp->parent, ss);
@@ -4455,12 +4419,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4455 goto err_free_all; 4419 goto err_free_all;
4456 4420
4457 init_css(css, ss, cgrp); 4421 init_css(css, ss, cgrp);
4458
4459 if (ss->use_id) {
4460 err = alloc_css_id(css);
4461 if (err)
4462 goto err_free_all;
4463 }
4464 } 4422 }
4465 4423
4466 /* 4424 /*
@@ -4479,14 +4437,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4479 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4437 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4480 root->number_of_cgroups++; 4438 root->number_of_cgroups++;
4481 4439
4482 /* each css holds a ref to the cgroup's dentry and the parent css */
4483 for_each_root_subsys(root, ss) {
4484 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4485
4486 dget(dentry);
4487 css_get(css->parent);
4488 }
4489
4490 /* hold a ref to the parent's dentry */ 4440 /* hold a ref to the parent's dentry */
4491 dget(parent->dentry); 4441 dget(parent->dentry);
4492 4442
@@ -4498,6 +4448,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4498 if (err) 4448 if (err)
4499 goto err_destroy; 4449 goto err_destroy;
4500 4450
4451 /* each css holds a ref to the cgroup's dentry and parent css */
4452 dget(dentry);
4453 css_get(css->parent);
4454
4455 /* mark it consumed for error path */
4456 css_ar[ss->subsys_id] = NULL;
4457
4501 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4458 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4502 parent->parent) { 4459 parent->parent) {
4503 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 4460 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -4544,6 +4501,14 @@ err_free_cgrp:
4544 return err; 4501 return err;
4545 4502
4546err_destroy: 4503err_destroy:
4504 for_each_root_subsys(root, ss) {
4505 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4506
4507 if (css) {
4508 percpu_ref_cancel_init(&css->refcnt);
4509 ss->css_free(css);
4510 }
4511 }
4547 cgroup_destroy_locked(cgrp); 4512 cgroup_destroy_locked(cgrp);
4548 mutex_unlock(&cgroup_mutex); 4513 mutex_unlock(&cgroup_mutex);
4549 mutex_unlock(&dentry->d_inode->i_mutex); 4514 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -4603,7 +4568,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
4603 container_of(ref, struct cgroup_subsys_state, refcnt); 4568 container_of(ref, struct cgroup_subsys_state, refcnt);
4604 4569
4605 INIT_WORK(&css->destroy_work, css_killed_work_fn); 4570 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4606 schedule_work(&css->destroy_work); 4571 queue_work(cgroup_destroy_wq, &css->destroy_work);
4607} 4572}
4608 4573
4609/** 4574/**
@@ -4705,8 +4670,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4705 * will be invoked to perform the rest of destruction once the 4670 * will be invoked to perform the rest of destruction once the
4706 * percpu refs of all css's are confirmed to be killed. 4671 * percpu refs of all css's are confirmed to be killed.
4707 */ 4672 */
4708 for_each_root_subsys(cgrp->root, ss) 4673 for_each_root_subsys(cgrp->root, ss) {
4709 kill_css(cgroup_css(cgrp, ss)); 4674 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4675
4676 if (css)
4677 kill_css(css);
4678 }
4710 4679
4711 /* 4680 /*
4712 * Mark @cgrp dead. This prevents further task migration and child 4681 * Mark @cgrp dead. This prevents further task migration and child
@@ -4775,14 +4744,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4775 /* delete this cgroup from parent->children */ 4744 /* delete this cgroup from parent->children */
4776 list_del_rcu(&cgrp->sibling); 4745 list_del_rcu(&cgrp->sibling);
4777 4746
4778 /*
4779 * We should remove the cgroup object from idr before its grace
4780 * period starts, so we won't be looking up a cgroup while the
4781 * cgroup is being freed.
4782 */
4783 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4784 cgrp->id = -1;
4785
4786 dput(d); 4747 dput(d);
4787 4748
4788 set_bit(CGRP_RELEASABLE, &parent->flags); 4749 set_bit(CGRP_RELEASABLE, &parent->flags);
@@ -4925,12 +4886,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4925 4886
4926 /* our new subsystem will be attached to the dummy hierarchy. */ 4887 /* our new subsystem will be attached to the dummy hierarchy. */
4927 init_css(css, ss, cgroup_dummy_top); 4888 init_css(css, ss, cgroup_dummy_top);
4928 /* init_idr must be after init_css() because it sets css->id. */
4929 if (ss->use_id) {
4930 ret = cgroup_init_idr(ss, css);
4931 if (ret)
4932 goto err_unload;
4933 }
4934 4889
4935 /* 4890 /*
4936 * Now we need to entangle the css into the existing css_sets. unlike 4891 * Now we need to entangle the css into the existing css_sets. unlike
@@ -4996,9 +4951,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4996 4951
4997 offline_css(cgroup_css(cgroup_dummy_top, ss)); 4952 offline_css(cgroup_css(cgroup_dummy_top, ss));
4998 4953
4999 if (ss->use_id)
5000 idr_destroy(&ss->idr);
5001
5002 /* deassign the subsys_id */ 4954 /* deassign the subsys_id */
5003 cgroup_subsys[ss->subsys_id] = NULL; 4955 cgroup_subsys[ss->subsys_id] = NULL;
5004 4956
@@ -5025,8 +4977,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
5025 /* 4977 /*
5026 * remove subsystem's css from the cgroup_dummy_top and free it - 4978 * remove subsystem's css from the cgroup_dummy_top and free it -
5027 * need to free before marking as null because ss->css_free needs 4979 * need to free before marking as null because ss->css_free needs
5028 * the cgrp->subsys pointer to find their state. note that this 4980 * the cgrp->subsys pointer to find their state.
5029 * also takes care of freeing the css_id.
5030 */ 4981 */
5031 ss->css_free(cgroup_css(cgroup_dummy_top, ss)); 4982 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
5032 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); 4983 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
@@ -5097,8 +5048,6 @@ int __init cgroup_init(void)
5097 for_each_builtin_subsys(ss, i) { 5048 for_each_builtin_subsys(ss, i) {
5098 if (!ss->early_init) 5049 if (!ss->early_init)
5099 cgroup_init_subsys(ss); 5050 cgroup_init_subsys(ss);
5100 if (ss->use_id)
5101 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
5102 } 5051 }
5103 5052
5104 /* allocate id for the dummy hierarchy */ 5053 /* allocate id for the dummy hierarchy */
@@ -5139,6 +5088,22 @@ out:
5139 return err; 5088 return err;
5140} 5089}
5141 5090
5091static int __init cgroup_wq_init(void)
5092{
5093 /*
5094 * There isn't much point in executing destruction path in
5095 * parallel. Good chunk is serialized with cgroup_mutex anyway.
5096 * Use 1 for @max_active.
5097 *
5098 * We would prefer to do this in cgroup_init() above, but that
5099 * is called before init_workqueues(): so leave this until after.
5100 */
5101 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5102 BUG_ON(!cgroup_destroy_wq);
5103 return 0;
5104}
5105core_initcall(cgroup_wq_init);
5106
5142/* 5107/*
5143 * proc_cgroup_show() 5108 * proc_cgroup_show()
5144 * - Print task's cgroup paths into seq_file, one line for each hierarchy 5109 * - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -5518,181 +5483,6 @@ static int __init cgroup_disable(char *str)
5518} 5483}
5519__setup("cgroup_disable=", cgroup_disable); 5484__setup("cgroup_disable=", cgroup_disable);
5520 5485
5521/*
5522 * Functons for CSS ID.
5523 */
5524
5525/* to get ID other than 0, this should be called when !cgroup_is_dead() */
5526unsigned short css_id(struct cgroup_subsys_state *css)
5527{
5528 struct css_id *cssid;
5529
5530 /*
5531 * This css_id() can return correct value when somone has refcnt
5532 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5533 * it's unchanged until freed.
5534 */
5535 cssid = rcu_dereference_raw(css->id);
5536
5537 if (cssid)
5538 return cssid->id;
5539 return 0;
5540}
5541EXPORT_SYMBOL_GPL(css_id);
5542
5543/**
5544 * css_is_ancestor - test "root" css is an ancestor of "child"
5545 * @child: the css to be tested.
5546 * @root: the css supporsed to be an ancestor of the child.
5547 *
5548 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
5549 * this function reads css->id, the caller must hold rcu_read_lock().
5550 * But, considering usual usage, the csses should be valid objects after test.
5551 * Assuming that the caller will do some action to the child if this returns
5552 * returns true, the caller must take "child";s reference count.
5553 * If "child" is valid object and this returns true, "root" is valid, too.
5554 */
5555
5556bool css_is_ancestor(struct cgroup_subsys_state *child,
5557 const struct cgroup_subsys_state *root)
5558{
5559 struct css_id *child_id;
5560 struct css_id *root_id;
5561
5562 child_id = rcu_dereference(child->id);
5563 if (!child_id)
5564 return false;
5565 root_id = rcu_dereference(root->id);
5566 if (!root_id)
5567 return false;
5568 if (child_id->depth < root_id->depth)
5569 return false;
5570 if (child_id->stack[root_id->depth] != root_id->id)
5571 return false;
5572 return true;
5573}
5574
5575void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
5576{
5577 struct css_id *id = rcu_dereference_protected(css->id, true);
5578
5579 /* When this is called before css_id initialization, id can be NULL */
5580 if (!id)
5581 return;
5582
5583 BUG_ON(!ss->use_id);
5584
5585 rcu_assign_pointer(id->css, NULL);
5586 rcu_assign_pointer(css->id, NULL);
5587 spin_lock(&ss->id_lock);
5588 idr_remove(&ss->idr, id->id);
5589 spin_unlock(&ss->id_lock);
5590 kfree_rcu(id, rcu_head);
5591}
5592EXPORT_SYMBOL_GPL(free_css_id);
5593
5594/*
5595 * This is called by init or create(). Then, calls to this function are
5596 * always serialized (By cgroup_mutex() at create()).
5597 */
5598
5599static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5600{
5601 struct css_id *newid;
5602 int ret, size;
5603
5604 BUG_ON(!ss->use_id);
5605
5606 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
5607 newid = kzalloc(size, GFP_KERNEL);
5608 if (!newid)
5609 return ERR_PTR(-ENOMEM);
5610
5611 idr_preload(GFP_KERNEL);
5612 spin_lock(&ss->id_lock);
5613 /* Don't use 0. allocates an ID of 1-65535 */
5614 ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
5615 spin_unlock(&ss->id_lock);
5616 idr_preload_end();
5617
5618 /* Returns error when there are no free spaces for new ID.*/
5619 if (ret < 0)
5620 goto err_out;
5621
5622 newid->id = ret;
5623 newid->depth = depth;
5624 return newid;
5625err_out:
5626 kfree(newid);
5627 return ERR_PTR(ret);
5628
5629}
5630
5631static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5632 struct cgroup_subsys_state *rootcss)
5633{
5634 struct css_id *newid;
5635
5636 spin_lock_init(&ss->id_lock);
5637 idr_init(&ss->idr);
5638
5639 newid = get_new_cssid(ss, 0);
5640 if (IS_ERR(newid))
5641 return PTR_ERR(newid);
5642
5643 newid->stack[0] = newid->id;
5644 RCU_INIT_POINTER(newid->css, rootcss);
5645 RCU_INIT_POINTER(rootcss->id, newid);
5646 return 0;
5647}
5648
5649static int alloc_css_id(struct cgroup_subsys_state *child_css)
5650{
5651 struct cgroup_subsys_state *parent_css = css_parent(child_css);
5652 struct css_id *child_id, *parent_id;
5653 int i, depth;
5654
5655 parent_id = rcu_dereference_protected(parent_css->id, true);
5656 depth = parent_id->depth + 1;
5657
5658 child_id = get_new_cssid(child_css->ss, depth);
5659 if (IS_ERR(child_id))
5660 return PTR_ERR(child_id);
5661
5662 for (i = 0; i < depth; i++)
5663 child_id->stack[i] = parent_id->stack[i];
5664 child_id->stack[depth] = child_id->id;
5665 /*
5666 * child_id->css pointer will be set after this cgroup is available
5667 * see cgroup_populate_dir()
5668 */
5669 rcu_assign_pointer(child_css->id, child_id);
5670
5671 return 0;
5672}
5673
5674/**
5675 * css_lookup - lookup css by id
5676 * @ss: cgroup subsys to be looked into.
5677 * @id: the id
5678 *
5679 * Returns pointer to cgroup_subsys_state if there is valid one with id.
5680 * NULL if not. Should be called under rcu_read_lock()
5681 */
5682struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5683{
5684 struct css_id *cssid = NULL;
5685
5686 BUG_ON(!ss->use_id);
5687 cssid = idr_find(&ss->idr, id);
5688
5689 if (unlikely(!cssid))
5690 return NULL;
5691
5692 return rcu_dereference(cssid->css);
5693}
5694EXPORT_SYMBOL_GPL(css_lookup);
5695
5696/** 5486/**
5697 * css_from_dir - get corresponding css from the dentry of a cgroup dir 5487 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5698 * @dentry: directory dentry of interest 5488 * @dentry: directory dentry of interest
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 859c8dfd78a1..e5f3917aa05b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
  * instead of preempt_schedule() to exit user context if needed before
  * calling the scheduler.
  */
-void __sched notrace preempt_schedule_context(void)
+asmlinkage void __sched notrace preempt_schedule_context(void)
 {
 	enum ctx_state prev_ctx;
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d7f07a2da5a6..deff2e693766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -306,8 +306,28 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 			    __func__, cpu);
 		goto out_release;
 	}
+
+	/*
+	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+	 * and RCU users of this state to go away such that all new such users
+	 * will observe it.
+	 *
+	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+	 * not imply sync_sched(), so explicitly call both.
+	 *
+	 * Do sync before park smpboot threads to take care the rcu boost case.
+	 */
+#ifdef CONFIG_PREEMPT
+	synchronize_sched();
+#endif
+	synchronize_rcu();
+
 	smpboot_park_threads(cpu);
 
+	/*
+	 * So now all preempt/rcu users must observe !cpu_active().
+	 */
+
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
@@ -420,11 +440,6 @@ int cpu_up(unsigned int cpu)
 {
 	int err = 0;
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-	int nid;
-	pg_data_t *pgdat;
-#endif
-
 	if (!cpu_possible(cpu)) {
 		printk(KERN_ERR "can't online cpu %d because it is not "
 			"configured as may-hotadd at boot time\n", cpu);
@@ -435,27 +450,9 @@ int cpu_up(unsigned int cpu)
 		return -EINVAL;
 	}
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-	nid = cpu_to_node(cpu);
-	if (!node_online(nid)) {
-		err = mem_online_node(nid);
-		if (err)
-			return err;
-	}
-
-	pgdat = NODE_DATA(nid);
-	if (!pgdat) {
-		printk(KERN_ERR
-			"Can't online cpu %d due to NULL pgdat\n", cpu);
-		return -ENOMEM;
-	}
-
-	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
-		mutex_lock(&zonelists_mutex);
-		build_all_zonelists(NULL, NULL);
-		mutex_unlock(&zonelists_mutex);
-	}
-#endif
+	err = try_online_node(cpu_to_node(cpu));
+	if (err)
+		return err;
 
 	cpu_maps_update_begin();
 
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb5..988573a9a387 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
 	rcu_idle_enter();
 	trace_cpu_idle_rcuidle(0, smp_processor_id());
 	local_irq_enable();
-	while (!need_resched())
+	while (!tif_need_resched())
 		cpu_relax();
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 	rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
 		if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
 			cpu_idle_poll();
 		} else {
-			current_clr_polling();
-			if (!need_resched()) {
+			if (!current_clr_polling_and_test()) {
 				stop_critical_timings();
 				rcu_idle_enter();
 				arch_cpu_idle();
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
 			} else {
 				local_irq_enable();
 			}
-			current_set_polling();
+			__current_set_polling();
 		}
 		arch_cpu_idle_exit();
+		/*
+		 * We need to test and propagate the TIF_NEED_RESCHED
+		 * bit here because we might not have send the
+		 * reschedule IPI to idle tasks.
+		 */
+		if (tif_need_resched())
+			set_preempt_need_resched();
 	}
 	tick_nohz_idle_exit();
 	schedule_preempt_disabled();
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
 	 */
 	boot_init_stack_canary();
 #endif
-	current_set_polling();
+	__current_set_polling();
 	arch_cpu_idle_prepare();
 	cpu_idle_loop();
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6bf981e13c43..4772034b4b17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 	need_loop = task_has_mempolicy(tsk) ||
 			!nodes_intersects(*newmems, tsk->mems_allowed);
 
-	if (need_loop)
+	if (need_loop) {
+		local_irq_disable();
 		write_seqcount_begin(&tsk->mems_allowed_seq);
+	}
 
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
@@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
 	tsk->mems_allowed = *newmems;
 
-	if (need_loop)
+	if (need_loop) {
 		write_seqcount_end(&tsk->mems_allowed_seq);
+		local_irq_enable();
+	}
 
 	task_unlock(tsk);
 }
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0506d447aed2..7d2f35e5df2f 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -575,8 +575,12 @@ return_normal:
 	raw_spin_lock(&dbg_slave_lock);
 
 #ifdef CONFIG_SMP
+	/* If send_ready set, slaves are already waiting */
+	if (ks->send_ready)
+		atomic_set(ks->send_ready, 1);
+
 	/* Signal the other CPUs to enter kgdb_wait() */
-	if ((!kgdb_single_step) && kgdb_do_roundup)
+	else if ((!kgdb_single_step) && kgdb_do_roundup)
 		kgdb_roundup_cpus(flags);
 #endif
 
@@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
 	if (arch_kgdb_ops.enable_nmi)
 		arch_kgdb_ops.enable_nmi(0);
 
+	memset(ks, 0, sizeof(struct kgdb_state));
 	ks->cpu			= raw_smp_processor_id();
 	ks->ex_vector		= evector;
 	ks->signo		= signo;
 	ks->err_code		= ecode;
-	ks->kgdb_usethreadid	= 0;
 	ks->linux_regs		= regs;
 
 	if (kgdb_reenter_check(ks))
@@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs)
 	return 1;
 }
 
+int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready)
+{
+#ifdef CONFIG_SMP
+	if (!kgdb_io_ready(0) || !send_ready)
+		return 1;
+
+	if (kgdb_info[cpu].enter_kgdb == 0) {
+		struct kgdb_state kgdb_var;
+		struct kgdb_state *ks = &kgdb_var;
+
+		memset(ks, 0, sizeof(struct kgdb_state));
+		ks->cpu			= cpu;
+		ks->ex_vector		= trapnr;
+		ks->signo		= SIGTRAP;
+		ks->err_code		= KGDB_KDB_REASON_SYSTEM_NMI;
+		ks->linux_regs		= regs;
+		ks->send_ready		= send_ready;
+		kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+		return 0;
+	}
+#endif
+	return 1;
+}
+
 static void kgdb_console_write(struct console *co, const char *s,
 			       unsigned count)
 {
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 2235967e78b0..572aa4f5677c 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -26,6 +26,7 @@ struct kgdb_state {
 	unsigned long		threadid;
 	long			kgdb_usethreadid;
 	struct pt_regs		*linux_regs;
+	atomic_t		*send_ready;
 };
 
 /* Exception state values */
@@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks);
 extern int kdb_parse(const char *cmdstr);
 extern int kdb_common_init_state(struct kgdb_state *ks);
 extern int kdb_common_deinit_state(void);
+#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI
 #else /* ! CONFIG_KGDB_KDB */
 static inline int kdb_stub(struct kgdb_state *ks)
 {
 	return DBG_PASS_EVENT;
 }
+#define KGDB_KDB_REASON_SYSTEM_NMI 0
 #endif /* CONFIG_KGDB_KDB */
 
 #endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 328d18ef31e4..8859ca34dcfe 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks)
 	if (atomic_read(&kgdb_setting_breakpoint))
 		reason = KDB_REASON_KEYBOARD;
 
-	if (in_nmi())
+	if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
+		reason = KDB_REASON_SYSTEM_NMI;
+
+	else if (in_nmi())
 		reason = KDB_REASON_NMI;
 
 	for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 00eb8f7fbf41..0b097c8a1e50 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
 			   instruction_pointer(regs));
 		kdb_dumpregs(regs);
 		break;
+	case KDB_REASON_SYSTEM_NMI:
+		kdb_printf("due to System NonMaskable Interrupt\n");
+		break;
 	case KDB_REASON_NMI:
 		kdb_printf("due to NonMaskable Interrupt @ "
 			   kdb_machreg_fmt "\n",
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index d473988c1d0b..54996b71e66d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -108,12 +108,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	struct timespec ts;
 	cputime_t utime, stime, stimescaled, utimescaled;
 
-	/* Though tsk->delays accessed later, early exit avoids
-	 * unnecessary returning of other data
-	 */
-	if (!tsk->delays)
-		goto done;
-
 	tmp = (s64)d->cpu_run_real_total;
 	task_cputime(tsk, &utime, &stime);
 	cputime_to_timespec(utime + stime, &ts);
@@ -158,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	d->freepages_count += tsk->delays->freepages_count;
 	spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
-done:
 	return 0;
 }
 
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index ff915efef66d..e556751d15d9 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -1,23 +1,19 @@
 #include <linux/elf.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
-
-#include <asm/elf.h>
-
+#include <linux/binfmts.h>
 
 Elf_Half __weak elf_core_extra_phdrs(void)
 {
 	return 0;
 }
 
-int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
-				      unsigned long limit)
+int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
 {
 	return 1;
 }
 
-int __weak elf_core_write_extra_data(struct file *file, size_t *size,
-				     unsigned long limit)
+int __weak elf_core_write_extra_data(struct coredump_params *cprm)
 {
 	return 1;
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 953c14348375..f5744010a8d2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
175static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 175static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
176static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; 176static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
177 177
178static atomic_t perf_sample_allowed_ns __read_mostly = 178static int perf_sample_allowed_ns __read_mostly =
179 ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); 179 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
180 180
181void update_perf_cpu_limits(void) 181void update_perf_cpu_limits(void)
182{ 182{
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
184 184
185 tmp *= sysctl_perf_cpu_time_max_percent; 185 tmp *= sysctl_perf_cpu_time_max_percent;
186 do_div(tmp, 100); 186 do_div(tmp, 100);
187 atomic_set(&perf_sample_allowed_ns, tmp); 187 ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
188} 188}
189 189
190static int perf_rotate_context(struct perf_cpu_context *cpuctx); 190static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
193 void __user *buffer, size_t *lenp, 193 void __user *buffer, size_t *lenp,
194 loff_t *ppos) 194 loff_t *ppos)
195{ 195{
196 int ret = proc_dointvec(table, write, buffer, lenp, ppos); 196 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
197 197
198 if (ret || !write) 198 if (ret || !write)
199 return ret; 199 return ret;
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
228 * we detect that events are taking too long. 228 * we detect that events are taking too long.
229 */ 229 */
230#define NR_ACCUMULATED_SAMPLES 128 230#define NR_ACCUMULATED_SAMPLES 128
231DEFINE_PER_CPU(u64, running_sample_length); 231static DEFINE_PER_CPU(u64, running_sample_length);
232 232
233void perf_sample_event_took(u64 sample_len_ns) 233void perf_sample_event_took(u64 sample_len_ns)
234{ 234{
235 u64 avg_local_sample_len; 235 u64 avg_local_sample_len;
236 u64 local_samples_len; 236 u64 local_samples_len;
237 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
237 238
238 if (atomic_read(&perf_sample_allowed_ns) == 0) 239 if (allowed_ns == 0)
239 return; 240 return;
240 241
241 /* decay the counter by 1 average sample */ 242 /* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
251 */ 252 */
252 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; 253 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
253 254
254 if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) 255 if (avg_local_sample_len <= allowed_ns)
255 return; 256 return;
256 257
257 if (max_samples_per_tick <= 1) 258 if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
262 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 263 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
263 264
264 printk_ratelimited(KERN_WARNING 265 printk_ratelimited(KERN_WARNING
265 "perf samples too long (%lld > %d), lowering " 266 "perf samples too long (%lld > %lld), lowering "
266 "kernel.perf_event_max_sample_rate to %d\n", 267 "kernel.perf_event_max_sample_rate to %d\n",
267 avg_local_sample_len, 268 avg_local_sample_len, allowed_ns,
268 atomic_read(&perf_sample_allowed_ns),
269 sysctl_perf_event_sample_rate); 269 sysctl_perf_event_sample_rate);
270 270
271 update_perf_cpu_limits(); 271 update_perf_cpu_limits();
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
899 put_ctx(ctx->parent_ctx); 899 put_ctx(ctx->parent_ctx);
900 ctx->parent_ctx = NULL; 900 ctx->parent_ctx = NULL;
901 } 901 }
902 ctx->generation++;
902} 903}
903 904
904static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 905static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1136 ctx->nr_events++; 1137 ctx->nr_events++;
1137 if (event->attr.inherit_stat) 1138 if (event->attr.inherit_stat)
1138 ctx->nr_stat++; 1139 ctx->nr_stat++;
1140
1141 ctx->generation++;
1139} 1142}
1140 1143
1141/* 1144/*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
1201 if (sample_type & PERF_SAMPLE_DATA_SRC) 1204 if (sample_type & PERF_SAMPLE_DATA_SRC)
1202 size += sizeof(data->data_src.val); 1205 size += sizeof(data->data_src.val);
1203 1206
1207 if (sample_type & PERF_SAMPLE_TRANSACTION)
1208 size += sizeof(data->txn);
1209
1204 event->header_size = size; 1210 event->header_size = size;
1205} 1211}
1206 1212
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1310 */ 1316 */
1311 if (event->state > PERF_EVENT_STATE_OFF) 1317 if (event->state > PERF_EVENT_STATE_OFF)
1312 event->state = PERF_EVENT_STATE_OFF; 1318 event->state = PERF_EVENT_STATE_OFF;
1319
1320 ctx->generation++;
1313} 1321}
1314 1322
1315static void perf_group_detach(struct perf_event *event) 1323static void perf_group_detach(struct perf_event *event)
@@ -1388,6 +1396,8 @@ event_sched_out(struct perf_event *event,
1388 if (event->state != PERF_EVENT_STATE_ACTIVE) 1396 if (event->state != PERF_EVENT_STATE_ACTIVE)
1389 return; 1397 return;
1390 1398
1399 perf_pmu_disable(event->pmu);
1400
1391 event->state = PERF_EVENT_STATE_INACTIVE; 1401 event->state = PERF_EVENT_STATE_INACTIVE;
1392 if (event->pending_disable) { 1402 if (event->pending_disable) {
1393 event->pending_disable = 0; 1403 event->pending_disable = 0;
@@ -1404,6 +1414,8 @@ event_sched_out(struct perf_event *event,
1404 ctx->nr_freq--; 1414 ctx->nr_freq--;
1405 if (event->attr.exclusive || !cpuctx->active_oncpu) 1415 if (event->attr.exclusive || !cpuctx->active_oncpu)
1406 cpuctx->exclusive = 0; 1416 cpuctx->exclusive = 0;
1417
1418 perf_pmu_enable(event->pmu);
1407} 1419}
1408 1420
1409static void 1421static void
@@ -1644,6 +1656,7 @@ event_sched_in(struct perf_event *event,
1644 struct perf_event_context *ctx) 1656 struct perf_event_context *ctx)
1645{ 1657{
1646 u64 tstamp = perf_event_time(event); 1658 u64 tstamp = perf_event_time(event);
1659 int ret = 0;
1647 1660
1648 if (event->state <= PERF_EVENT_STATE_OFF) 1661 if (event->state <= PERF_EVENT_STATE_OFF)
1649 return 0; 1662 return 0;
@@ -1666,10 +1679,13 @@ event_sched_in(struct perf_event *event,
1666 */ 1679 */
1667 smp_wmb(); 1680 smp_wmb();
1668 1681
1682 perf_pmu_disable(event->pmu);
1683
1669 if (event->pmu->add(event, PERF_EF_START)) { 1684 if (event->pmu->add(event, PERF_EF_START)) {
1670 event->state = PERF_EVENT_STATE_INACTIVE; 1685 event->state = PERF_EVENT_STATE_INACTIVE;
1671 event->oncpu = -1; 1686 event->oncpu = -1;
1672 return -EAGAIN; 1687 ret = -EAGAIN;
1688 goto out;
1673 } 1689 }
1674 1690
1675 event->tstamp_running += tstamp - event->tstamp_stopped; 1691 event->tstamp_running += tstamp - event->tstamp_stopped;
@@ -1685,7 +1701,10 @@ event_sched_in(struct perf_event *event,
1685 if (event->attr.exclusive) 1701 if (event->attr.exclusive)
1686 cpuctx->exclusive = 1; 1702 cpuctx->exclusive = 1;
1687 1703
1688 return 0; 1704out:
1705 perf_pmu_enable(event->pmu);
1706
1707 return ret;
1689} 1708}
1690 1709
1691static int 1710static int
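
The event_sched_out()/event_sched_in() hunks above bracket the PMU-specific work with perf_pmu_disable()/perf_pmu_enable() and route the failure path through a single exit label. A generic sketch of that bracket-and-single-exit shape; the device_* helpers are invented stand-ins, not perf APIs.

#include <stdio.h>

static void device_disable(void) { printf("disable\n"); }
static void device_enable(void)  { printf("enable\n"); }
static int  device_add(int fail) { return fail ? -1 : 0; }

/*
 * Keep the disable/enable pair balanced on every path by funnelling
 * errors through one label instead of returning early.
 */
static int sched_in(int fail)
{
        int ret = 0;

        device_disable();

        if (device_add(fail)) {
                ret = -1;
                goto out;
        }

        /* ... bookkeeping that must stay inside the bracket ... */

out:
        device_enable();
        return ret;
}

int main(void)
{
        printf("ok path:   %d\n", sched_in(0));
        printf("fail path: %d\n", sched_in(1));
        return 0;
}
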
@@ -2146,22 +2165,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2146} 2165}
2147 2166
2148/* 2167/*
2149 * Test whether two contexts are equivalent, i.e. whether they 2168 * Test whether two contexts are equivalent, i.e. whether they have both been
2150 * have both been cloned from the same version of the same context 2169 * cloned from the same version of the same context.
2151 * and they both have the same number of enabled events. 2170 *
2152 * If the number of enabled events is the same, then the set 2171 * Equivalence is measured using a generation number in the context that is
2153 * of enabled events should be the same, because these are both 2172 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2154 * inherited contexts, therefore we can't access individual events 2173 * and list_del_event().
2155 * in them directly with an fd; we can only enable/disable all
2156 * events via prctl, or enable/disable all events in a family
2157 * via ioctl, which will have the same effect on both contexts.
2158 */ 2174 */
2159static int context_equiv(struct perf_event_context *ctx1, 2175static int context_equiv(struct perf_event_context *ctx1,
2160 struct perf_event_context *ctx2) 2176 struct perf_event_context *ctx2)
2161{ 2177{
2162 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx 2178 /* Pinning disables the swap optimization */
2163 && ctx1->parent_gen == ctx2->parent_gen 2179 if (ctx1->pin_count || ctx2->pin_count)
2164 && !ctx1->pin_count && !ctx2->pin_count; 2180 return 0;
2181
2182 /* If ctx1 is the parent of ctx2 */
2183 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2184 return 1;
2185
2186 /* If ctx2 is the parent of ctx1 */
2187 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2188 return 1;
2189
2190 /*
2191 * If ctx1 and ctx2 have the same parent; we flatten the parent
2192 * hierarchy, see perf_event_init_context().
2193 */
2194 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2195 ctx1->parent_gen == ctx2->parent_gen)
2196 return 1;
2197
2198 /* Unmatched */
2199 return 0;
2165} 2200}
2166 2201
2167static void __perf_event_sync_stat(struct perf_event *event, 2202static void __perf_event_sync_stat(struct perf_event *event,
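
A standalone sketch of the generation-number test the rewritten context_equiv() performs (pin_count handling omitted). The struct below is a simplified stand-in, not the kernel's perf_event_context.

#include <stdbool.h>
#include <stdio.h>

struct ctx {
        struct ctx *parent;             /* context this was cloned from */
        unsigned long long gen;         /* bumped on every modification */
        unsigned long long parent_gen;  /* parent->gen captured at clone time */
};

/* A clone is only interchangeable if nothing changed since it was cloned. */
static bool ctx_equiv(struct ctx *c1, struct ctx *c2)
{
        if (c1 == c2->parent && c1->gen == c2->parent_gen)
                return true;            /* c1 is c2's unmodified parent */
        if (c1->parent == c2 && c1->parent_gen == c2->gen)
                return true;            /* c2 is c1's unmodified parent */
        if (c1->parent && c1->parent == c2->parent &&
            c1->parent_gen == c2->parent_gen)
                return true;            /* siblings cloned from the same state */
        return false;
}

int main(void)
{
        struct ctx parent = { .gen = 5 };
        struct ctx child  = { .parent = &parent, .parent_gen = 5 };

        printf("%d\n", ctx_equiv(&parent, &child)); /* 1: equivalent */
        parent.gen++;                               /* parent modified */
        printf("%d\n", ctx_equiv(&parent, &child)); /* 0: not clones any more */
        return 0;
}
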
@@ -2210,9 +2245,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
2210 perf_event_update_userpage(next_event); 2245 perf_event_update_userpage(next_event);
2211} 2246}
2212 2247
2213#define list_next_entry(pos, member) \
2214 list_entry(pos->member.next, typeof(*pos), member)
2215
2216static void perf_event_sync_stat(struct perf_event_context *ctx, 2248static void perf_event_sync_stat(struct perf_event_context *ctx,
2217 struct perf_event_context *next_ctx) 2249 struct perf_event_context *next_ctx)
2218{ 2250{
@@ -2244,7 +2276,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2244{ 2276{
2245 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 2277 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2246 struct perf_event_context *next_ctx; 2278 struct perf_event_context *next_ctx;
2247 struct perf_event_context *parent; 2279 struct perf_event_context *parent, *next_parent;
2248 struct perf_cpu_context *cpuctx; 2280 struct perf_cpu_context *cpuctx;
2249 int do_switch = 1; 2281 int do_switch = 1;
2250 2282
@@ -2256,10 +2288,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2256 return; 2288 return;
2257 2289
2258 rcu_read_lock(); 2290 rcu_read_lock();
2259 parent = rcu_dereference(ctx->parent_ctx);
2260 next_ctx = next->perf_event_ctxp[ctxn]; 2291 next_ctx = next->perf_event_ctxp[ctxn];
2261 if (parent && next_ctx && 2292 if (!next_ctx)
2262 rcu_dereference(next_ctx->parent_ctx) == parent) { 2293 goto unlock;
2294
2295 parent = rcu_dereference(ctx->parent_ctx);
2296 next_parent = rcu_dereference(next_ctx->parent_ctx);
2297
 2298 /* If neither context has a parent context, they cannot be clones. */
2299 if (!parent && !next_parent)
2300 goto unlock;
2301
2302 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2263 /* 2303 /*
2264 * Looks like the two contexts are clones, so we might be 2304 * Looks like the two contexts are clones, so we might be
2265 * able to optimize the context switch. We lock both 2305 * able to optimize the context switch. We lock both
@@ -2287,6 +2327,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2287 raw_spin_unlock(&next_ctx->lock); 2327 raw_spin_unlock(&next_ctx->lock);
2288 raw_spin_unlock(&ctx->lock); 2328 raw_spin_unlock(&ctx->lock);
2289 } 2329 }
2330unlock:
2290 rcu_read_unlock(); 2331 rcu_read_unlock();
2291 2332
2292 if (do_switch) { 2333 if (do_switch) {
@@ -2713,6 +2754,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2713 if (!event_filter_match(event)) 2754 if (!event_filter_match(event))
2714 continue; 2755 continue;
2715 2756
2757 perf_pmu_disable(event->pmu);
2758
2716 hwc = &event->hw; 2759 hwc = &event->hw;
2717 2760
2718 if (hwc->interrupts == MAX_INTERRUPTS) { 2761 if (hwc->interrupts == MAX_INTERRUPTS) {
@@ -2722,7 +2765,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2722 } 2765 }
2723 2766
2724 if (!event->attr.freq || !event->attr.sample_freq) 2767 if (!event->attr.freq || !event->attr.sample_freq)
2725 continue; 2768 goto next;
2726 2769
2727 /* 2770 /*
2728 * stop the event and update event->count 2771 * stop the event and update event->count
@@ -2744,6 +2787,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2744 perf_adjust_period(event, period, delta, false); 2787 perf_adjust_period(event, period, delta, false);
2745 2788
2746 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); 2789 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2790 next:
2791 perf_pmu_enable(event->pmu);
2747 } 2792 }
2748 2793
2749 perf_pmu_enable(ctx->pmu); 2794 perf_pmu_enable(ctx->pmu);
@@ -4572,6 +4617,9 @@ void perf_output_sample(struct perf_output_handle *handle,
4572 if (sample_type & PERF_SAMPLE_DATA_SRC) 4617 if (sample_type & PERF_SAMPLE_DATA_SRC)
4573 perf_output_put(handle, data->data_src.val); 4618 perf_output_put(handle, data->data_src.val);
4574 4619
4620 if (sample_type & PERF_SAMPLE_TRANSACTION)
4621 perf_output_put(handle, data->txn);
4622
4575 if (!event->attr.watermark) { 4623 if (!event->attr.watermark) {
4576 int wakeup_events = event->attr.wakeup_events; 4624 int wakeup_events = event->attr.wakeup_events;
4577 4625
@@ -5100,27 +5148,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5100 unsigned int size; 5148 unsigned int size;
5101 char tmp[16]; 5149 char tmp[16];
5102 char *buf = NULL; 5150 char *buf = NULL;
5103 const char *name; 5151 char *name;
5104
5105 memset(tmp, 0, sizeof(tmp));
5106 5152
5107 if (file) { 5153 if (file) {
5108 struct inode *inode; 5154 struct inode *inode;
5109 dev_t dev; 5155 dev_t dev;
5156
5157 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5158 if (!buf) {
5159 name = "//enomem";
5160 goto cpy_name;
5161 }
5110 /* 5162 /*
5111 * d_path works from the end of the rb backwards, so we 5163 * d_path() works from the end of the rb backwards, so we
5112 * need to add enough zero bytes after the string to handle 5164 * need to add enough zero bytes after the string to handle
5113 * the 64bit alignment we do later. 5165 * the 64bit alignment we do later.
5114 */ 5166 */
5115 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); 5167 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5116 if (!buf) {
5117 name = strncpy(tmp, "//enomem", sizeof(tmp));
5118 goto got_name;
5119 }
5120 name = d_path(&file->f_path, buf, PATH_MAX);
5121 if (IS_ERR(name)) { 5168 if (IS_ERR(name)) {
5122 name = strncpy(tmp, "//toolong", sizeof(tmp)); 5169 name = "//toolong";
5123 goto got_name; 5170 goto cpy_name;
5124 } 5171 }
5125 inode = file_inode(vma->vm_file); 5172 inode = file_inode(vma->vm_file);
5126 dev = inode->i_sb->s_dev; 5173 dev = inode->i_sb->s_dev;
@@ -5128,34 +5175,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5128 gen = inode->i_generation; 5175 gen = inode->i_generation;
5129 maj = MAJOR(dev); 5176 maj = MAJOR(dev);
5130 min = MINOR(dev); 5177 min = MINOR(dev);
5131 5178 goto got_name;
5132 } else { 5179 } else {
5133 if (arch_vma_name(mmap_event->vma)) { 5180 name = (char *)arch_vma_name(vma);
5134 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 5181 if (name)
5135 sizeof(tmp) - 1); 5182 goto cpy_name;
5136 tmp[sizeof(tmp) - 1] = '\0';
5137 goto got_name;
5138 }
5139 5183
5140 if (!vma->vm_mm) { 5184 if (vma->vm_start <= vma->vm_mm->start_brk &&
5141 name = strncpy(tmp, "[vdso]", sizeof(tmp));
5142 goto got_name;
5143 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
5144 vma->vm_end >= vma->vm_mm->brk) { 5185 vma->vm_end >= vma->vm_mm->brk) {
5145 name = strncpy(tmp, "[heap]", sizeof(tmp)); 5186 name = "[heap]";
5146 goto got_name; 5187 goto cpy_name;
5147 } else if (vma->vm_start <= vma->vm_mm->start_stack && 5188 }
5189 if (vma->vm_start <= vma->vm_mm->start_stack &&
5148 vma->vm_end >= vma->vm_mm->start_stack) { 5190 vma->vm_end >= vma->vm_mm->start_stack) {
5149 name = strncpy(tmp, "[stack]", sizeof(tmp)); 5191 name = "[stack]";
5150 goto got_name; 5192 goto cpy_name;
5151 } 5193 }
5152 5194
5153 name = strncpy(tmp, "//anon", sizeof(tmp)); 5195 name = "//anon";
5154 goto got_name; 5196 goto cpy_name;
5155 } 5197 }
5156 5198
5199cpy_name:
5200 strlcpy(tmp, name, sizeof(tmp));
5201 name = tmp;
5157got_name: 5202got_name:
5158 size = ALIGN(strlen(name)+1, sizeof(u64)); 5203 /*
5204 * Since our buffer works in 8 byte units we need to align our string
5205 * size to a multiple of 8. However, we must guarantee the tail end is
5206 * zero'd out to avoid leaking random bits to userspace.
5207 */
5208 size = strlen(name)+1;
5209 while (!IS_ALIGNED(size, sizeof(u64)))
5210 name[size++] = '\0';
5159 5211
5160 mmap_event->file_name = name; 5212 mmap_event->file_name = name;
5161 mmap_event->file_size = size; 5213 mmap_event->file_size = size;
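
The got_name hunk pads the name to a multiple of 8 bytes with explicit zero bytes instead of rounding up over uninitialised memory. A small sketch of that padding step; is_aligned() reproduces the power-of-two test done by the kernel's IS_ALIGNED(), and the buffer contents are made up.

#include <stdio.h>
#include <string.h>

static int is_aligned(size_t x, size_t a)
{
        return (x & (a - 1)) == 0;      /* 'a' must be a power of two */
}

int main(void)
{
        char buf[32] = "libfoo.so";     /* assumed name, with room to pad */
        size_t size = strlen(buf) + 1;  /* include the terminating NUL */

        /* Pad with zero bytes so nothing past the NUL leaks out. */
        while (!is_aligned(size, sizeof(unsigned long long)))
                buf[size++] = '\0';

        printf("padded size: %zu\n", size);     /* 16 for "libfoo.so" */
        return 0;
}
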
@@ -5643,11 +5695,6 @@ static void swevent_hlist_put(struct perf_event *event)
5643{ 5695{
5644 int cpu; 5696 int cpu;
5645 5697
5646 if (event->cpu != -1) {
5647 swevent_hlist_put_cpu(event, event->cpu);
5648 return;
5649 }
5650
5651 for_each_possible_cpu(cpu) 5698 for_each_possible_cpu(cpu)
5652 swevent_hlist_put_cpu(event, cpu); 5699 swevent_hlist_put_cpu(event, cpu);
5653} 5700}
@@ -5681,9 +5728,6 @@ static int swevent_hlist_get(struct perf_event *event)
5681 int err; 5728 int err;
5682 int cpu, failed_cpu; 5729 int cpu, failed_cpu;
5683 5730
5684 if (event->cpu != -1)
5685 return swevent_hlist_get_cpu(event, event->cpu);
5686
5687 get_online_cpus(); 5731 get_online_cpus();
5688 for_each_possible_cpu(cpu) { 5732 for_each_possible_cpu(cpu) {
5689 err = swevent_hlist_get_cpu(event, cpu); 5733 err = swevent_hlist_get_cpu(event, cpu);
@@ -6292,6 +6336,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
6292 6336
6293 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 6337 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
6294} 6338}
6339static DEVICE_ATTR_RO(type);
6295 6340
6296static ssize_t 6341static ssize_t
6297perf_event_mux_interval_ms_show(struct device *dev, 6342perf_event_mux_interval_ms_show(struct device *dev,
@@ -6336,17 +6381,19 @@ perf_event_mux_interval_ms_store(struct device *dev,
6336 6381
6337 return count; 6382 return count;
6338} 6383}
6384static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
6339 6385
6340static struct device_attribute pmu_dev_attrs[] = { 6386static struct attribute *pmu_dev_attrs[] = {
6341 __ATTR_RO(type), 6387 &dev_attr_type.attr,
6342 __ATTR_RW(perf_event_mux_interval_ms), 6388 &dev_attr_perf_event_mux_interval_ms.attr,
6343 __ATTR_NULL, 6389 NULL,
6344}; 6390};
6391ATTRIBUTE_GROUPS(pmu_dev);
6345 6392
6346static int pmu_bus_running; 6393static int pmu_bus_running;
6347static struct bus_type pmu_bus = { 6394static struct bus_type pmu_bus = {
6348 .name = "event_source", 6395 .name = "event_source",
6349 .dev_attrs = pmu_dev_attrs, 6396 .dev_groups = pmu_dev_groups,
6350}; 6397};
6351 6398
6352static void pmu_dev_release(struct device *dev) 6399static void pmu_dev_release(struct device *dev)
@@ -7126,7 +7173,6 @@ SYSCALL_DEFINE5(perf_event_open,
7126 } 7173 }
7127 7174
7128 perf_install_in_context(ctx, event, event->cpu); 7175 perf_install_in_context(ctx, event, event->cpu);
7129 ++ctx->generation;
7130 perf_unpin_context(ctx); 7176 perf_unpin_context(ctx);
7131 mutex_unlock(&ctx->mutex); 7177 mutex_unlock(&ctx->mutex);
7132 7178
@@ -7209,7 +7255,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7209 WARN_ON_ONCE(ctx->parent_ctx); 7255 WARN_ON_ONCE(ctx->parent_ctx);
7210 mutex_lock(&ctx->mutex); 7256 mutex_lock(&ctx->mutex);
7211 perf_install_in_context(ctx, event, cpu); 7257 perf_install_in_context(ctx, event, cpu);
7212 ++ctx->generation;
7213 perf_unpin_context(ctx); 7258 perf_unpin_context(ctx);
7214 mutex_unlock(&ctx->mutex); 7259 mutex_unlock(&ctx->mutex);
7215 7260
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca6599723be5..569b218782ad 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
82} 82}
83 83
84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ 84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
85static inline unsigned int \ 85static inline unsigned long \
86func_name(struct perf_output_handle *handle, \ 86func_name(struct perf_output_handle *handle, \
87 const void *buf, unsigned int len) \ 87 const void *buf, unsigned long len) \
88{ \ 88{ \
89 unsigned long size, written; \ 89 unsigned long size, written; \
90 \ 90 \
91 do { \ 91 do { \
92 size = min_t(unsigned long, handle->size, len); \ 92 size = min(handle->size, len); \
93 \
94 written = memcpy_func(handle->addr, buf, size); \ 93 written = memcpy_func(handle->addr, buf, size); \
94 written = size - written; \
95 \ 95 \
96 len -= written; \ 96 len -= written; \
97 handle->addr += written; \ 97 handle->addr += written; \
@@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \
110 return len; \ 110 return len; \
111} 111}
112 112
113static inline int memcpy_common(void *dst, const void *src, size_t n) 113static inline unsigned long
114memcpy_common(void *dst, const void *src, unsigned long n)
114{ 115{
115 memcpy(dst, src, n); 116 memcpy(dst, src, n);
116 return n; 117 return 0;
117} 118}
118 119
119DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) 120DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
120 121
121#define MEMCPY_SKIP(dst, src, n) (n) 122static inline unsigned long
123memcpy_skip(void *dst, const void *src, unsigned long n)
124{
125 return 0;
126}
122 127
123DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) 128DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
124 129
125#ifndef arch_perf_out_copy_user 130#ifndef arch_perf_out_copy_user
126#define arch_perf_out_copy_user __copy_from_user_inatomic 131#define arch_perf_out_copy_user arch_perf_out_copy_user
132
133static inline unsigned long
134arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
135{
136 unsigned long ret;
137
138 pagefault_disable();
139 ret = __copy_from_user_inatomic(dst, src, n);
140 pagefault_enable();
141
142 return ret;
143}
127#endif 144#endif
128 145
129DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) 146DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
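
The internal.h changes make every copy helper follow the copy_from_user() convention: return the number of bytes that could not be copied, 0 on success. A plain userspace sketch of the loop shape DEFINE_OUTPUT_COPY() expands to under that convention; the names are illustrative and the chunking is simulated with a fixed window size.

#include <stdio.h>
#include <string.h>

/* Returns how many bytes could NOT be copied (always 0 here). */
static unsigned long copy_chunk(void *dst, const void *src, unsigned long n)
{
        memcpy(dst, src, n);
        return 0;
}

/* Copy 'len' bytes through windows of at most 'chunk' bytes. */
static unsigned long output_copy(char *dst, const char *src,
                                 unsigned long len, unsigned long chunk)
{
        while (len) {
                unsigned long size = len < chunk ? len : chunk;
                unsigned long left = copy_chunk(dst, src, size);
                unsigned long written = size - left;

                if (!written)
                        break;          /* no progress: report what is left */

                len -= written;
                dst += written;
                src += written;
        }
        return len;                     /* 0 means everything was copied */
}

int main(void)
{
        char src[64] = "ring buffer payload";
        char dst[64] = { 0 };

        printf("left: %lu, dst: %s\n",
               output_copy(dst, src, sizeof(src), 8), dst);
        return 0;
}
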
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 9c2ddfbf4525..e8b168af135b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
12#include <linux/perf_event.h> 12#include <linux/perf_event.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/circ_buf.h>
15 16
16#include "internal.h" 17#include "internal.h"
17 18
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long sz = perf_data_size(rb);
22 unsigned long mask = sz - 1;
23
24 /*
25 * check if user-writable
26 * overwrite : over-write its own tail
27 * !overwrite: buffer possibly drops events.
28 */
29 if (rb->overwrite)
30 return true;
31
32 /*
33 * verify that payload is not bigger than buffer
34 * otherwise masking logic may fail to detect
35 * the "not enough space" condition
36 */
37 if ((head - offset) > sz)
38 return false;
39
40 offset = (offset - tail) & mask;
41 head = (head - tail) & mask;
42
43 if ((int)(head - offset) < 0)
44 return false;
45
46 return true;
47}
48
49static void perf_output_wakeup(struct perf_output_handle *handle) 19static void perf_output_wakeup(struct perf_output_handle *handle)
50{ 20{
51 atomic_set(&handle->rb->poll, POLL_IN); 21 atomic_set(&handle->rb->poll, POLL_IN);
@@ -115,8 +85,8 @@ again:
115 rb->user_page->data_head = head; 85 rb->user_page->data_head = head;
116 86
117 /* 87 /*
118 * Now check if we missed an update, rely on the (compiler) 88 * Now check if we missed an update -- rely on previous implied
119 * barrier in atomic_dec_and_test() to re-read rb->head. 89 * compiler barriers to force a re-read.
120 */ 90 */
121 if (unlikely(head != local_read(&rb->head))) { 91 if (unlikely(head != local_read(&rb->head))) {
122 local_inc(&rb->nest); 92 local_inc(&rb->nest);
@@ -135,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle,
135{ 105{
136 struct ring_buffer *rb; 106 struct ring_buffer *rb;
137 unsigned long tail, offset, head; 107 unsigned long tail, offset, head;
138 int have_lost; 108 int have_lost, page_shift;
139 struct perf_sample_data sample_data;
140 struct { 109 struct {
141 struct perf_event_header header; 110 struct perf_event_header header;
142 u64 id; 111 u64 id;
@@ -151,57 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle,
151 event = event->parent; 120 event = event->parent;
152 121
153 rb = rcu_dereference(event->rb); 122 rb = rcu_dereference(event->rb);
154 if (!rb) 123 if (unlikely(!rb))
155 goto out; 124 goto out;
156 125
157 handle->rb = rb; 126 if (unlikely(!rb->nr_pages))
158 handle->event = event;
159
160 if (!rb->nr_pages)
161 goto out; 127 goto out;
162 128
129 handle->rb = rb;
130 handle->event = event;
131
163 have_lost = local_read(&rb->lost); 132 have_lost = local_read(&rb->lost);
164 if (have_lost) { 133 if (unlikely(have_lost)) {
165 lost_event.header.size = sizeof(lost_event); 134 size += sizeof(lost_event);
166 perf_event_header__init_id(&lost_event.header, &sample_data, 135 if (event->attr.sample_id_all)
167 event); 136 size += event->id_header_size;
168 size += lost_event.header.size;
169 } 137 }
170 138
171 perf_output_get_handle(handle); 139 perf_output_get_handle(handle);
172 140
173 do { 141 do {
174 /*
175 * Userspace could choose to issue a mb() before updating the
176 * tail pointer. So that all reads will be completed before the
177 * write is issued.
178 *
179 * See perf_output_put_handle().
180 */
181 tail = ACCESS_ONCE(rb->user_page->data_tail); 142 tail = ACCESS_ONCE(rb->user_page->data_tail);
182 smp_mb();
183 offset = head = local_read(&rb->head); 143 offset = head = local_read(&rb->head);
184 head += size; 144 if (!rb->overwrite &&
185 if (unlikely(!perf_output_space(rb, tail, offset, head))) 145 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
186 goto fail; 146 goto fail;
147 head += size;
187 } while (local_cmpxchg(&rb->head, offset, head) != offset); 148 } while (local_cmpxchg(&rb->head, offset, head) != offset);
188 149
189 if (head - local_read(&rb->wakeup) > rb->watermark) 150 /*
151 * Separate the userpage->tail read from the data stores below.
152 * Matches the MB userspace SHOULD issue after reading the data
153 * and before storing the new tail position.
154 *
155 * See perf_output_put_handle().
156 */
157 smp_mb();
158
159 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
190 local_add(rb->watermark, &rb->wakeup); 160 local_add(rb->watermark, &rb->wakeup);
191 161
192 handle->page = offset >> (PAGE_SHIFT + page_order(rb)); 162 page_shift = PAGE_SHIFT + page_order(rb);
193 handle->page &= rb->nr_pages - 1;
194 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
195 handle->addr = rb->data_pages[handle->page];
196 handle->addr += handle->size;
197 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
198 163
199 if (have_lost) { 164 handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
165 offset &= (1UL << page_shift) - 1;
166 handle->addr = rb->data_pages[handle->page] + offset;
167 handle->size = (1UL << page_shift) - offset;
168
169 if (unlikely(have_lost)) {
170 struct perf_sample_data sample_data;
171
172 lost_event.header.size = sizeof(lost_event);
200 lost_event.header.type = PERF_RECORD_LOST; 173 lost_event.header.type = PERF_RECORD_LOST;
201 lost_event.header.misc = 0; 174 lost_event.header.misc = 0;
202 lost_event.id = event->id; 175 lost_event.id = event->id;
203 lost_event.lost = local_xchg(&rb->lost, 0); 176 lost_event.lost = local_xchg(&rb->lost, 0);
204 177
178 perf_event_header__init_id(&lost_event.header,
179 &sample_data, event);
205 perf_output_put(handle, lost_event); 180 perf_output_put(handle, lost_event);
206 perf_event__output_id_sample(event, handle, &sample_data); 181 perf_event__output_id_sample(event, handle, &sample_data);
207 } 182 }
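
The rewritten perf_output_begin() delegates the free-space check to the circ_buf helpers. A minimal sketch of that power-of-two ring arithmetic; the two macros are written to mirror include/linux/circ_buf.h, but verify against the header rather than treating this as authoritative.

#include <stdio.h>

/* Occupancy and free space of a power-of-two sized ring. */
#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

int main(void)
{
        unsigned long size = 16;                /* must be a power of two */
        unsigned long head = 0, tail = 0;       /* producer / consumer */
        unsigned long want = 6;                 /* bytes to reserve */

        if (CIRC_SPACE(head, tail, size) >= want)
                head += want;                   /* space available: reserve it */

        printf("used %lu, free %lu\n",
               CIRC_CNT(head, tail, size),
               CIRC_SPACE(head, tail, size));   /* used 6, free 9 */
        return 0;
}
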
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ad8e1bdca70e..24b7d6ca871b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,6 +35,7 @@
35#include <linux/kdebug.h> /* notifier mechanism */ 35#include <linux/kdebug.h> /* notifier mechanism */
36#include "../../mm/internal.h" /* munlock_vma_page */ 36#include "../../mm/internal.h" /* munlock_vma_page */
37#include <linux/percpu-rwsem.h> 37#include <linux/percpu-rwsem.h>
38#include <linux/task_work.h>
38 39
39#include <linux/uprobes.h> 40#include <linux/uprobes.h>
40 41
@@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
244 * the architecture. If an arch has variable length instruction and the 245 * the architecture. If an arch has variable length instruction and the
245 * breakpoint instruction is not of the smallest length instruction 246 * breakpoint instruction is not of the smallest length instruction
246 * supported by that architecture then we need to modify is_trap_at_addr and 247 * supported by that architecture then we need to modify is_trap_at_addr and
247 * write_opcode accordingly. This would never be a problem for archs that 248 * uprobe_write_opcode accordingly. This would never be a problem for archs
248 * have fixed length instructions. 249 * that have fixed length instructions.
249 */ 250 */
250 251
251/* 252/*
252 * write_opcode - write the opcode at a given virtual address. 253 * uprobe_write_opcode - write the opcode at a given virtual address.
253 * @mm: the probed process address space. 254 * @mm: the probed process address space.
254 * @vaddr: the virtual address to store the opcode. 255 * @vaddr: the virtual address to store the opcode.
255 * @opcode: opcode to be written at @vaddr. 256 * @opcode: opcode to be written at @vaddr.
@@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
260 * For mm @mm, write the opcode at @vaddr. 261 * For mm @mm, write the opcode at @vaddr.
261 * Return 0 (success) or a negative errno. 262 * Return 0 (success) or a negative errno.
262 */ 263 */
263static int write_opcode(struct mm_struct *mm, unsigned long vaddr, 264int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
264 uprobe_opcode_t opcode) 265 uprobe_opcode_t opcode)
265{ 266{
266 struct page *old_page, *new_page; 267 struct page *old_page, *new_page;
@@ -314,7 +315,7 @@ put_old:
314 */ 315 */
315int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 316int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
316{ 317{
317 return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); 318 return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
318} 319}
319 320
320/** 321/**
@@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
329int __weak 330int __weak
330set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 331set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
331{ 332{
332 return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); 333 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
333} 334}
334 335
335static int match_uprobe(struct uprobe *l, struct uprobe *r) 336static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
503 return ret; 504 return ret;
504} 505}
505 506
506static int 507static int __copy_insn(struct address_space *mapping, struct file *filp,
507__copy_insn(struct address_space *mapping, struct file *filp, char *insn, 508 void *insn, int nbytes, loff_t offset)
508 unsigned long nbytes, loff_t offset)
509{ 509{
510 struct page *page; 510 struct page *page;
511 511
@@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
527 527
528static int copy_insn(struct uprobe *uprobe, struct file *filp) 528static int copy_insn(struct uprobe *uprobe, struct file *filp)
529{ 529{
530 struct address_space *mapping; 530 struct address_space *mapping = uprobe->inode->i_mapping;
531 unsigned long nbytes; 531 loff_t offs = uprobe->offset;
532 int bytes; 532 void *insn = uprobe->arch.insn;
533 533 int size = MAX_UINSN_BYTES;
534 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); 534 int len, err = -EIO;
535 mapping = uprobe->inode->i_mapping;
536 535
537 /* Instruction at end of binary; copy only available bytes */ 536 /* Copy only available bytes, -EIO if nothing was read */
538 if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) 537 do {
539 bytes = uprobe->inode->i_size - uprobe->offset; 538 if (offs >= i_size_read(uprobe->inode))
540 else 539 break;
541 bytes = MAX_UINSN_BYTES;
542 540
543 /* Instruction at the page-boundary; copy bytes in second page */ 541 len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
544 if (nbytes < bytes) { 542 err = __copy_insn(mapping, filp, insn, len, offs);
545 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
546 bytes - nbytes, uprobe->offset + nbytes);
547 if (err) 543 if (err)
548 return err; 544 break;
549 bytes = nbytes; 545
550 } 546 insn += len;
551 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); 547 offs += len;
548 size -= len;
549 } while (size);
550
551 return err;
552} 552}
553 553
554static int prepare_uprobe(struct uprobe *uprobe, struct file *file, 554static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
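
The reworked copy_insn() above copies the instruction bytes in chunks that never cross a page boundary. A hedged sketch of just that chunking arithmetic, with the actual copy replaced by a printf; the offsets and sizes are arbitrary examples.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Split a copy of 'size' bytes at offset 'offs' at every page boundary. */
static void copy_in_page_chunks(unsigned long long offs, unsigned long size)
{
        while (size) {
                unsigned long in_page = PAGE_SIZE - (offs & ~PAGE_MASK);
                unsigned long len = size < in_page ? size : in_page;

                printf("copy %lu bytes at offset %llu\n", len, offs);

                offs += len;
                size -= len;
        }
}

int main(void)
{
        /* 16 instruction bytes that straddle the boundary at 4096. */
        copy_in_page_chunks(4090, 16);
        return 0;
}
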
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
576 if (ret) 576 if (ret)
577 goto out; 577 goto out;
578 578
579 /* write_opcode() assumes we don't cross page boundary */ 579 /* uprobe_write_opcode() assumes we don't cross page boundary */
580 BUG_ON((uprobe->offset & ~PAGE_MASK) + 580 BUG_ON((uprobe->offset & ~PAGE_MASK) +
581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); 581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
582 582
@@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1096} 1096}
1097 1097
1098/* Slot allocation for XOL */ 1098/* Slot allocation for XOL */
1099static int xol_add_vma(struct xol_area *area) 1099static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1100{ 1100{
1101 struct mm_struct *mm = current->mm;
1102 int ret = -EALREADY; 1101 int ret = -EALREADY;
1103 1102
1104 down_write(&mm->mmap_sem); 1103 down_write(&mm->mmap_sem);
1105 if (mm->uprobes_state.xol_area) 1104 if (mm->uprobes_state.xol_area)
1106 goto fail; 1105 goto fail;
1107 1106
1108 ret = -ENOMEM; 1107 if (!area->vaddr) {
1109 /* Try to map as high as possible, this is only a hint. */ 1108 /* Try to map as high as possible, this is only a hint. */
1110 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1109 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1111 if (area->vaddr & ~PAGE_MASK) { 1110 PAGE_SIZE, 0, 0);
1112 ret = area->vaddr; 1111 if (area->vaddr & ~PAGE_MASK) {
1113 goto fail; 1112 ret = area->vaddr;
1113 goto fail;
1114 }
1114 } 1115 }
1115 1116
1116 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, 1117 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area)
1120 1121
1121 smp_wmb(); /* pairs with get_xol_area() */ 1122 smp_wmb(); /* pairs with get_xol_area() */
1122 mm->uprobes_state.xol_area = area; 1123 mm->uprobes_state.xol_area = area;
1123 ret = 0;
1124 fail: 1124 fail:
1125 up_write(&mm->mmap_sem); 1125 up_write(&mm->mmap_sem);
1126 1126
1127 return ret; 1127 return ret;
1128} 1128}
1129 1129
1130/* 1130static struct xol_area *__create_xol_area(unsigned long vaddr)
1131 * get_xol_area - Allocate process's xol_area if necessary.
1132 * This area will be used for storing instructions for execution out of line.
1133 *
1134 * Returns the allocated area or NULL.
1135 */
1136static struct xol_area *get_xol_area(void)
1137{ 1131{
1138 struct mm_struct *mm = current->mm; 1132 struct mm_struct *mm = current->mm;
1139 struct xol_area *area;
1140 uprobe_opcode_t insn = UPROBE_SWBP_INSN; 1133 uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1134 struct xol_area *area;
1141 1135
1142 area = mm->uprobes_state.xol_area; 1136 area = kmalloc(sizeof(*area), GFP_KERNEL);
1143 if (area)
1144 goto ret;
1145
1146 area = kzalloc(sizeof(*area), GFP_KERNEL);
1147 if (unlikely(!area)) 1137 if (unlikely(!area))
1148 goto out; 1138 goto out;
1149 1139
@@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void)
1155 if (!area->page) 1145 if (!area->page)
1156 goto free_bitmap; 1146 goto free_bitmap;
1157 1147
1158 /* allocate first slot of task's xol_area for the return probes */ 1148 area->vaddr = vaddr;
1149 init_waitqueue_head(&area->wq);
1150 /* Reserve the 1st slot for get_trampoline_vaddr() */
1159 set_bit(0, area->bitmap); 1151 set_bit(0, area->bitmap);
1160 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1161 atomic_set(&area->slot_count, 1); 1152 atomic_set(&area->slot_count, 1);
1162 init_waitqueue_head(&area->wq); 1153 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1163 1154
1164 if (!xol_add_vma(area)) 1155 if (!xol_add_vma(mm, area))
1165 return area; 1156 return area;
1166 1157
1167 __free_page(area->page); 1158 __free_page(area->page);
@@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void)
1170 free_area: 1161 free_area:
1171 kfree(area); 1162 kfree(area);
1172 out: 1163 out:
1164 return NULL;
1165}
1166
1167/*
1168 * get_xol_area - Allocate process's xol_area if necessary.
1169 * This area will be used for storing instructions for execution out of line.
1170 *
1171 * Returns the allocated area or NULL.
1172 */
1173static struct xol_area *get_xol_area(void)
1174{
1175 struct mm_struct *mm = current->mm;
1176 struct xol_area *area;
1177
1178 if (!mm->uprobes_state.xol_area)
1179 __create_xol_area(0);
1180
1173 area = mm->uprobes_state.xol_area; 1181 area = mm->uprobes_state.xol_area;
1174 ret: 1182 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1175 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1176 return area; 1183 return area;
1177} 1184}
1178 1185
@@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1256 return 0; 1263 return 0;
1257 1264
1258 /* Initialize the slot */ 1265 /* Initialize the slot */
1259 copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); 1266 copy_to_page(area->page, xol_vaddr,
1267 uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1260 /* 1268 /*
1261 * We probably need flush_icache_user_range() but it needs vma. 1269 * We probably need flush_icache_user_range() but it needs vma.
1262 * This should work on supported architectures too. 1270 * This should work on supported architectures too.
@@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t)
1345} 1353}
1346 1354
1347/* 1355/*
1348 * Called in context of a new clone/fork from copy_process.
1349 */
1350void uprobe_copy_process(struct task_struct *t)
1351{
1352 t->utask = NULL;
1353}
1354
1355/*
1356 * Allocate a uprobe_task object for the task if necessary. 1356
1357 * Called when the thread hits a breakpoint. 1357 * Called when the thread hits a breakpoint.
1358 * 1358 *
@@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void)
1367 return current->utask; 1367 return current->utask;
1368} 1368}
1369 1369
1370static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
1371{
1372 struct uprobe_task *n_utask;
1373 struct return_instance **p, *o, *n;
1374
1375 n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1376 if (!n_utask)
1377 return -ENOMEM;
1378 t->utask = n_utask;
1379
1380 p = &n_utask->return_instances;
1381 for (o = o_utask->return_instances; o; o = o->next) {
1382 n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1383 if (!n)
1384 return -ENOMEM;
1385
1386 *n = *o;
1387 atomic_inc(&n->uprobe->ref);
1388 n->next = NULL;
1389
1390 *p = n;
1391 p = &n->next;
1392 n_utask->depth++;
1393 }
1394
1395 return 0;
1396}
1397
1398static void uprobe_warn(struct task_struct *t, const char *msg)
1399{
1400 pr_warn("uprobe: %s:%d failed to %s\n",
1401 current->comm, current->pid, msg);
1402}
1403
1404static void dup_xol_work(struct callback_head *work)
1405{
1406 kfree(work);
1407
1408 if (current->flags & PF_EXITING)
1409 return;
1410
1411 if (!__create_xol_area(current->utask->vaddr))
1412 uprobe_warn(current, "dup xol area");
1413}
1414
1415/*
1416 * Called in context of a new clone/fork from copy_process.
1417 */
1418void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1419{
1420 struct uprobe_task *utask = current->utask;
1421 struct mm_struct *mm = current->mm;
1422 struct callback_head *work;
1423 struct xol_area *area;
1424
1425 t->utask = NULL;
1426
1427 if (!utask || !utask->return_instances)
1428 return;
1429
1430 if (mm == t->mm && !(flags & CLONE_VFORK))
1431 return;
1432
1433 if (dup_utask(t, utask))
1434 return uprobe_warn(t, "dup ret instances");
1435
1436 /* The task can fork() after dup_xol_work() fails */
1437 area = mm->uprobes_state.xol_area;
1438 if (!area)
1439 return uprobe_warn(t, "dup xol area");
1440
1441 if (mm == t->mm)
1442 return;
1443
1444 /* TODO: move it into the union in uprobe_task */
1445 work = kmalloc(sizeof(*work), GFP_KERNEL);
1446 if (!work)
1447 return uprobe_warn(t, "dup xol area");
1448
1449 t->utask->vaddr = area->vaddr;
1450 init_task_work(work, dup_xol_work);
1451 task_work_add(t, work, true);
1452}
1453
1370/* 1454/*
1371 * Current area->vaddr notion assumes the trampoline address is always 1455
1372 * equal to area->vaddr. 1456
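
dup_utask() in the hunk above copies the return_instance chain with a pointer-to-pointer tail, so the copy keeps its order without special-casing the first node. A generic sketch of that list-duplication pattern (plain C; error unwinding and freeing omitted for brevity).

#include <stdio.h>
#include <stdlib.h>

struct node {
        int val;
        struct node *next;
};

/* Duplicate a singly linked list in one pass, preserving order. */
static struct node *dup_list(const struct node *src)
{
        struct node *head = NULL;
        struct node **tail = &head;     /* where the next copy gets linked */

        for (const struct node *o = src; o; o = o->next) {
                struct node *n = malloc(sizeof(*n));
                if (!n)
                        break;          /* a real caller would unwind here */

                n->val = o->val;
                n->next = NULL;

                *tail = n;              /* link behind the previous copy */
                tail = &n->next;
        }
        return head;
}

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

        for (struct node *n = dup_list(&a); n; n = n->next)
                printf("%d ", n->val);  /* prints: 1 2 3 */
        printf("\n");
        return 0;
}
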
@@ -1857,9 +1941,4 @@ static int __init init_uprobes(void)
1857 1941
1858 return register_die_notifier(&uprobe_exception_nb); 1942 return register_die_notifier(&uprobe_exception_nb);
1859} 1943}
1860module_init(init_uprobes); 1944__initcall(init_uprobes);
1861
1862static void __exit exit_uprobes(void)
1863{
1864}
1865module_exit(exit_uprobes);
diff --git a/kernel/extable.c b/kernel/extable.c
index 832cb28105bb..763faf037ec1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
61static inline int init_kernel_text(unsigned long addr) 61static inline int init_kernel_text(unsigned long addr)
62{ 62{
63 if (addr >= (unsigned long)_sinittext && 63 if (addr >= (unsigned long)_sinittext &&
64 addr <= (unsigned long)_einittext) 64 addr < (unsigned long)_einittext)
65 return 1; 65 return 1;
66 return 0; 66 return 0;
67} 67}
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
69int core_kernel_text(unsigned long addr) 69int core_kernel_text(unsigned long addr)
70{ 70{
71 if (addr >= (unsigned long)_stext && 71 if (addr >= (unsigned long)_stext &&
72 addr <= (unsigned long)_etext) 72 addr < (unsigned long)_etext)
73 return 1; 73 return 1;
74 74
75 if (system_state == SYSTEM_BOOTING && 75 if (system_state == SYSTEM_BOOTING &&
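
Both extable.c hunks switch from '<=' to '<' because _etext and _einittext label the first byte past their sections, so the valid range is half-open. A tiny sketch of that check with made-up addresses.

#include <stdio.h>

/* Section [start, end): 'end' is the first address past the section. */
static int in_section(unsigned long addr, unsigned long start, unsigned long end)
{
        return addr >= start && addr < end;     /* '<', not '<=' */
}

int main(void)
{
        unsigned long start = 0x1000, end = 0x2000;     /* assumed layout */

        printf("%d\n", in_section(0x1fff, start, end)); /* 1: last byte */
        printf("%d\n", in_section(0x2000, start, end)); /* 0: one past the end */
        return 0;
}
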
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6bd..dfa736c98d17 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -532,11 +532,12 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
532 mm->flags = (current->mm) ? 532 mm->flags = (current->mm) ?
533 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 533 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
534 mm->core_state = NULL; 534 mm->core_state = NULL;
535 mm->nr_ptes = 0; 535 atomic_long_set(&mm->nr_ptes, 0);
536 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 536 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
537 spin_lock_init(&mm->page_table_lock); 537 spin_lock_init(&mm->page_table_lock);
538 mm_init_aio(mm); 538 mm_init_aio(mm);
539 mm_init_owner(mm, p); 539 mm_init_owner(mm, p);
540 clear_tlb_flush_pending(mm);
540 541
541 if (likely(!mm_alloc_pgd(mm))) { 542 if (likely(!mm_alloc_pgd(mm))) {
542 mm->def_flags = 0; 543 mm->def_flags = 0;
@@ -560,7 +561,7 @@ static void check_mm(struct mm_struct *mm)
560 "mm:%p idx:%d val:%ld\n", mm, i, x); 561 "mm:%p idx:%d val:%ld\n", mm, i, x);
561 } 562 }
562 563
563#ifdef CONFIG_TRANSPARENT_HUGEPAGE 564#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
564 VM_BUG_ON(mm->pmd_huge_pte); 565 VM_BUG_ON(mm->pmd_huge_pte);
565#endif 566#endif
566} 567}
@@ -814,12 +815,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
814 memcpy(mm, oldmm, sizeof(*mm)); 815 memcpy(mm, oldmm, sizeof(*mm));
815 mm_init_cpumask(mm); 816 mm_init_cpumask(mm);
816 817
817#ifdef CONFIG_TRANSPARENT_HUGEPAGE 818#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
818 mm->pmd_huge_pte = NULL; 819 mm->pmd_huge_pte = NULL;
819#endif 820#endif
820#ifdef CONFIG_NUMA_BALANCING
821 mm->first_nid = NUMA_PTE_SCAN_INIT;
822#endif
823 if (!mm_init(mm, tsk)) 821 if (!mm_init(mm, tsk))
824 goto fail_nomem; 822 goto fail_nomem;
825 823
@@ -1174,7 +1172,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1174 * do not allow it to share a thread group or signal handlers or 1172 * do not allow it to share a thread group or signal handlers or
1175 * parent with the forking task. 1173 * parent with the forking task.
1176 */ 1174 */
1177 if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { 1175 if (clone_flags & CLONE_SIGHAND) {
1178 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || 1176 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1179 (task_active_pid_ns(current) != 1177 (task_active_pid_ns(current) !=
1180 current->nsproxy->pid_ns_for_children)) 1178 current->nsproxy->pid_ns_for_children))
@@ -1313,7 +1311,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1313#endif 1311#endif
1314 1312
1315 /* Perform scheduler related setup. Assign this task to a CPU. */ 1313 /* Perform scheduler related setup. Assign this task to a CPU. */
1316 sched_fork(p); 1314 sched_fork(clone_flags, p);
1317 1315
1318 retval = perf_event_init_task(p); 1316 retval = perf_event_init_task(p);
1319 if (retval) 1317 if (retval)
@@ -1373,7 +1371,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1373 INIT_LIST_HEAD(&p->pi_state_list); 1371 INIT_LIST_HEAD(&p->pi_state_list);
1374 p->pi_state_cache = NULL; 1372 p->pi_state_cache = NULL;
1375#endif 1373#endif
1376 uprobe_copy_process(p);
1377 /* 1374 /*
1378 * sigaltstack should be cleared when sharing the same VM 1375 * sigaltstack should be cleared when sharing the same VM
1379 */ 1376 */
@@ -1490,6 +1487,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1490 perf_event_fork(p); 1487 perf_event_fork(p);
1491 1488
1492 trace_task_newtask(p, clone_flags); 1489 trace_task_newtask(p, clone_flags);
1490 uprobe_copy_process(p, clone_flags);
1493 1491
1494 return p; 1492 return p;
1495 1493
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b462fa197517..aa6a8aadb911 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt);
19bool pm_freezing; 19bool pm_freezing;
20bool pm_nosig_freezing; 20bool pm_nosig_freezing;
21 21
22/*
23 * Temporary export for the deadlock workaround in ata_scsi_hotplug().
24 * Remove once the hack becomes unnecessary.
25 */
26EXPORT_SYMBOL_GPL(pm_freezing);
27
22/* protects freezing and frozen transitions */ 28/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock); 29static DEFINE_SPINLOCK(freezer_lock);
24 30
diff --git a/kernel/futex.c b/kernel/futex.c
index c3a1a55a5214..f6ff0191ecf7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -66,7 +66,7 @@
66 66
67#include <asm/futex.h> 67#include <asm/futex.h>
68 68
69#include "rtmutex_common.h" 69#include "locking/rtmutex_common.h"
70 70
71int __read_mostly futex_cmpxchg_enabled; 71int __read_mostly futex_cmpxchg_enabled;
72 72
@@ -251,6 +251,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
251 return -EINVAL; 251 return -EINVAL;
252 address -= key->both.offset; 252 address -= key->both.offset;
253 253
254 if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
255 return -EFAULT;
256
254 /* 257 /*
255 * PROCESS_PRIVATE futexes are fast. 258 * PROCESS_PRIVATE futexes are fast.
256 * As the mm cannot disappear under us and the 'key' only needs 259 * As the mm cannot disappear under us and the 'key' only needs
@@ -259,8 +262,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
259 * but access_ok() should be faster than find_vma() 262 * but access_ok() should be faster than find_vma()
260 */ 263 */
261 if (!fshared) { 264 if (!fshared) {
262 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
263 return -EFAULT;
264 key->private.mm = mm; 265 key->private.mm = mm;
265 key->private.address = address; 266 key->private.address = address;
266 get_futex_key_refs(key); 267 get_futex_key_refs(key);
@@ -288,7 +289,7 @@ again:
288 put_page(page); 289 put_page(page);
289 /* serialize against __split_huge_page_splitting() */ 290 /* serialize against __split_huge_page_splitting() */
290 local_irq_disable(); 291 local_irq_disable();
291 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { 292 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
292 page_head = compound_head(page); 293 page_head = compound_head(page);
293 /* 294 /*
294 * page_head is valid pointer but we must pin 295 * page_head is valid pointer but we must pin
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index d4da55d1fb65..d04ce8ac4399 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -46,4 +46,34 @@ config GCOV_PROFILE_ALL
46 larger and run slower. Also be sure to exclude files from profiling 46 larger and run slower. Also be sure to exclude files from profiling
47 which are not linked to the kernel image to prevent linker errors. 47 which are not linked to the kernel image to prevent linker errors.
48 48
49choice
50 prompt "Specify GCOV format"
51 depends on GCOV_KERNEL
52 default GCOV_FORMAT_AUTODETECT
53 ---help---
54 The gcov format is usually determined by the GCC version, but there are
55 exceptions where format changes are integrated in lower-version GCCs.
56 In such a case use this option to adjust the format used in the kernel
57 accordingly.
58
59 If unsure, choose "Autodetect".
60
61config GCOV_FORMAT_AUTODETECT
62 bool "Autodetect"
63 ---help---
64 Select this option to use the format that corresponds to your GCC
65 version.
66
67config GCOV_FORMAT_3_4
68 bool "GCC 3.4 format"
69 ---help---
70 Select this option to use the format defined by GCC 3.4.
71
72config GCOV_FORMAT_4_7
73 bool "GCC 4.7 format"
74 ---help---
75 Select this option to use the format defined by GCC 4.7.
76
77endchoice
78
49endmenu 79endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index e97ca59e2520..52aa7e8de927 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,33 @@
1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2 2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o 3# if-lt
4# Usage VAR := $(call if-lt, $(a), $(b))
5# Returns 1 if (a < b)
6if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
7
8ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
9 cc-ver := 0304
10else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
11 cc-ver := 0407
12else
13# Use cc-version if available, otherwise set 0
14#
15# scripts/Kbuild.include, which contains cc-version function, is not included
16# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
17# Meaning cc-ver is empty, causing the if-lt test to fail with
18# "/bin/sh: line 0: [: -lt: unary operator expected" error message.
19# This has no effect on the clean phase, but the error message could be
20# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
21# is not available. We could probably move if-lt into Kbuild.include (so it is
22# likewise undefined during clean), or include Kbuild.include from
23# scripts/Makefile.clean. But the workaround below seems least invasive.
24 cc-ver := $(if $(call cc-version),$(call cc-version),0)
25endif
26
27obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
28
29ifeq ($(call if-lt, $(cc-ver), 0407),1)
30 obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
31else
32 obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
33endif
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9b22d03cc581..f45b75b713c0 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -20,7 +20,6 @@
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include "gcov.h" 21#include "gcov.h"
22 22
23static struct gcov_info *gcov_info_head;
24static int gcov_events_enabled; 23static int gcov_events_enabled;
25static DEFINE_MUTEX(gcov_lock); 24static DEFINE_MUTEX(gcov_lock);
26 25
@@ -34,7 +33,7 @@ void __gcov_init(struct gcov_info *info)
34 33
35 mutex_lock(&gcov_lock); 34 mutex_lock(&gcov_lock);
36 if (gcov_version == 0) { 35 if (gcov_version == 0) {
37 gcov_version = info->version; 36 gcov_version = gcov_info_version(info);
38 /* 37 /*
39 * Printing gcc's version magic may prove useful for debugging 38 * Printing gcc's version magic may prove useful for debugging
40 * incompatibility reports. 39 * incompatibility reports.
@@ -45,8 +44,7 @@ void __gcov_init(struct gcov_info *info)
45 * Add new profiling data structure to list and inform event 44 * Add new profiling data structure to list and inform event
46 * listener. 45 * listener.
47 */ 46 */
48 info->next = gcov_info_head; 47 gcov_info_link(info);
49 gcov_info_head = info;
50 if (gcov_events_enabled) 48 if (gcov_events_enabled)
51 gcov_event(GCOV_ADD, info); 49 gcov_event(GCOV_ADD, info);
52 mutex_unlock(&gcov_lock); 50 mutex_unlock(&gcov_lock);
@@ -81,6 +79,12 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
81} 79}
82EXPORT_SYMBOL(__gcov_merge_delta); 80EXPORT_SYMBOL(__gcov_merge_delta);
83 81
82void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
83{
84 /* Unused. */
85}
86EXPORT_SYMBOL(__gcov_merge_ior);
87
84/** 88/**
85 * gcov_enable_events - enable event reporting through gcov_event() 89 * gcov_enable_events - enable event reporting through gcov_event()
86 * 90 *
@@ -91,13 +95,15 @@ EXPORT_SYMBOL(__gcov_merge_delta);
91 */ 95 */
92void gcov_enable_events(void) 96void gcov_enable_events(void)
93{ 97{
94 struct gcov_info *info; 98 struct gcov_info *info = NULL;
95 99
96 mutex_lock(&gcov_lock); 100 mutex_lock(&gcov_lock);
97 gcov_events_enabled = 1; 101 gcov_events_enabled = 1;
102
98 /* Perform event callback for previously registered entries. */ 103 /* Perform event callback for previously registered entries. */
99 for (info = gcov_info_head; info; info = info->next) 104 while ((info = gcov_info_next(info)))
100 gcov_event(GCOV_ADD, info); 105 gcov_event(GCOV_ADD, info);
106
101 mutex_unlock(&gcov_lock); 107 mutex_unlock(&gcov_lock);
102} 108}
103 109
@@ -112,25 +118,23 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
112 void *data) 118 void *data)
113{ 119{
114 struct module *mod = data; 120 struct module *mod = data;
115 struct gcov_info *info; 121 struct gcov_info *info = NULL;
116 struct gcov_info *prev; 122 struct gcov_info *prev = NULL;
117 123
118 if (event != MODULE_STATE_GOING) 124 if (event != MODULE_STATE_GOING)
119 return NOTIFY_OK; 125 return NOTIFY_OK;
120 mutex_lock(&gcov_lock); 126 mutex_lock(&gcov_lock);
121 prev = NULL; 127
122 /* Remove entries located in module from linked list. */ 128 /* Remove entries located in module from linked list. */
123 for (info = gcov_info_head; info; info = info->next) { 129 while ((info = gcov_info_next(info))) {
124 if (within(info, mod->module_core, mod->core_size)) { 130 if (within(info, mod->module_core, mod->core_size)) {
125 if (prev) 131 gcov_info_unlink(prev, info);
126 prev->next = info->next;
127 else
128 gcov_info_head = info->next;
129 if (gcov_events_enabled) 132 if (gcov_events_enabled)
130 gcov_event(GCOV_REMOVE, info); 133 gcov_event(GCOV_REMOVE, info);
131 } else 134 } else
132 prev = info; 135 prev = info;
133 } 136 }
137
134 mutex_unlock(&gcov_lock); 138 mutex_unlock(&gcov_lock);
135 139
136 return NOTIFY_OK; 140 return NOTIFY_OK;
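
base.c now walks the gcov list through accessors: the loop starts from NULL and gcov_info_next() yields the head, while gcov_info_unlink() takes the predecessor. A hedged sketch of that iterate-and-unlink shape over a hypothetical list type (not the gcov structures).

#include <stdio.h>
#include <stddef.h>

struct item {
        int id;
        struct item *next;
};

static struct item c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
static struct item *head = &a;

/* Passing NULL yields the first entry, mirroring the NULL-started loop. */
static struct item *item_next(struct item *it)
{
        return it ? it->next : head;
}

/* Unlink 'it' given its predecessor (NULL when 'it' is the head). */
static void item_unlink(struct item *prev, struct item *it)
{
        if (prev)
                prev->next = it->next;
        else
                head = it->next;
}

int main(void)
{
        struct item *it = NULL, *prev = NULL;

        while ((it = item_next(it))) {
                if (it->id == 2)
                        item_unlink(prev, it);  /* drop matching entries */
                else
                        prev = it;
        }

        for (it = NULL; (it = item_next(it)); )
                printf("%d ", it->id);          /* prints: 1 3 */
        printf("\n");
        return 0;
}
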
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 7a7d2ee96d42..15ff01a76379 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -75,7 +75,7 @@ static int __init gcov_persist_setup(char *str)
75 unsigned long val; 75 unsigned long val;
76 76
77 if (kstrtoul(str, 0, &val)) { 77 if (kstrtoul(str, 0, &val)) {
78 pr_warning("invalid gcov_persist parameter '%s'\n", str); 78 pr_warn("invalid gcov_persist parameter '%s'\n", str);
79 return 0; 79 return 0;
80 } 80 }
81 gcov_persist = val; 81 gcov_persist = val;
@@ -242,7 +242,7 @@ static struct gcov_node *get_node_by_name(const char *name)
242 242
243 list_for_each_entry(node, &all_head, all) { 243 list_for_each_entry(node, &all_head, all) {
244 info = get_node_info(node); 244 info = get_node_info(node);
245 if (info && (strcmp(info->filename, name) == 0)) 245 if (info && (strcmp(gcov_info_filename(info), name) == 0))
246 return node; 246 return node;
247 } 247 }
248 248
@@ -279,7 +279,7 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
279 seq = file->private_data; 279 seq = file->private_data;
280 info = gcov_iter_get_info(seq->private); 280 info = gcov_iter_get_info(seq->private);
281 mutex_lock(&node_lock); 281 mutex_lock(&node_lock);
282 node = get_node_by_name(info->filename); 282 node = get_node_by_name(gcov_info_filename(info));
283 if (node) { 283 if (node) {
284 /* Reset counts or remove node for unloaded modules. */ 284 /* Reset counts or remove node for unloaded modules. */
285 if (node->num_loaded == 0) 285 if (node->num_loaded == 0)
@@ -365,7 +365,7 @@ static const char *deskew(const char *basename)
365 */ 365 */
366static void add_links(struct gcov_node *node, struct dentry *parent) 366static void add_links(struct gcov_node *node, struct dentry *parent)
367{ 367{
368 char *basename; 368 const char *basename;
369 char *target; 369 char *target;
370 int num; 370 int num;
371 int i; 371 int i;
@@ -376,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
376 if (!node->links) 376 if (!node->links)
377 return; 377 return;
378 for (i = 0; i < num; i++) { 378 for (i = 0; i < num; i++) {
379 target = get_link_target(get_node_info(node)->filename, 379 target = get_link_target(
380 &gcov_link[i]); 380 gcov_info_filename(get_node_info(node)),
381 &gcov_link[i]);
381 if (!target) 382 if (!target)
382 goto out_err; 383 goto out_err;
383 basename = strrchr(target, '/'); 384 basename = kbasename(target);
384 if (!basename) 385 if (basename == target)
385 goto out_err; 386 goto out_err;
386 basename++;
387 node->links[i] = debugfs_create_symlink(deskew(basename), 387 node->links[i] = debugfs_create_symlink(deskew(basename),
388 parent, target); 388 parent, target);
389 if (!node->links[i]) 389 if (!node->links[i])
@@ -450,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
450 } else 450 } else
451 node->dentry = debugfs_create_dir(node->name, parent->dentry); 451 node->dentry = debugfs_create_dir(node->name, parent->dentry);
452 if (!node->dentry) { 452 if (!node->dentry) {
453 pr_warning("could not create file\n"); 453 pr_warn("could not create file\n");
454 kfree(node); 454 kfree(node);
455 return NULL; 455 return NULL;
456 } 456 }
@@ -463,7 +463,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
463 463
464err_nomem: 464err_nomem:
465 kfree(node); 465 kfree(node);
466 pr_warning("out of memory\n"); 466 pr_warn("out of memory\n");
467 return NULL; 467 return NULL;
468} 468}
469 469
@@ -576,7 +576,7 @@ static void add_node(struct gcov_info *info)
576 struct gcov_node *parent; 576 struct gcov_node *parent;
577 struct gcov_node *node; 577 struct gcov_node *node;
578 578
579 filename = kstrdup(info->filename, GFP_KERNEL); 579 filename = kstrdup(gcov_info_filename(info), GFP_KERNEL);
580 if (!filename) 580 if (!filename)
581 return; 581 return;
582 parent = &root_node; 582 parent = &root_node;
@@ -630,8 +630,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
630 */ 630 */
631 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); 631 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
632 if (!loaded_info) { 632 if (!loaded_info) {
633 pr_warning("could not add '%s' (out of memory)\n", 633 pr_warn("could not add '%s' (out of memory)\n",
634 info->filename); 634 gcov_info_filename(info));
635 return; 635 return;
636 } 636 }
637 memcpy(loaded_info, node->loaded_info, 637 memcpy(loaded_info, node->loaded_info,
@@ -644,8 +644,9 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
644 * data set replaces the copy of the last one. 644 * data set replaces the copy of the last one.
645 */ 645 */
646 if (!gcov_info_is_compatible(node->unloaded_info, info)) { 646 if (!gcov_info_is_compatible(node->unloaded_info, info)) {
647 pr_warning("discarding saved data for %s " 647 pr_warn("discarding saved data for %s "
648 "(incompatible version)\n", info->filename); 648 "(incompatible version)\n",
649 gcov_info_filename(info));
649 gcov_info_free(node->unloaded_info); 650 gcov_info_free(node->unloaded_info);
650 node->unloaded_info = NULL; 651 node->unloaded_info = NULL;
651 } 652 }
@@ -655,8 +656,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
655 * The initial one takes precedence. 656 * The initial one takes precedence.
656 */ 657 */
657 if (!gcov_info_is_compatible(node->loaded_info[0], info)) { 658 if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
658 pr_warning("could not add '%s' (incompatible " 659 pr_warn("could not add '%s' (incompatible "
659 "version)\n", info->filename); 660 "version)\n", gcov_info_filename(info));
660 kfree(loaded_info); 661 kfree(loaded_info);
661 return; 662 return;
662 } 663 }
@@ -691,8 +692,9 @@ static void save_info(struct gcov_node *node, struct gcov_info *info)
691 else { 692 else {
692 node->unloaded_info = gcov_info_dup(info); 693 node->unloaded_info = gcov_info_dup(info);
693 if (!node->unloaded_info) { 694 if (!node->unloaded_info) {
694 pr_warning("could not save data for '%s' " 695 pr_warn("could not save data for '%s' "
695 "(out of memory)\n", info->filename); 696 "(out of memory)\n",
697 gcov_info_filename(info));
696 } 698 }
697 } 699 }
698} 700}
@@ -707,8 +709,8 @@ static void remove_info(struct gcov_node *node, struct gcov_info *info)
707 709
708 i = get_info_index(node, info); 710 i = get_info_index(node, info);
709 if (i < 0) { 711 if (i < 0) {
710 pr_warning("could not remove '%s' (not found)\n", 712 pr_warn("could not remove '%s' (not found)\n",
711 info->filename); 713 gcov_info_filename(info));
712 return; 714 return;
713 } 715 }
714 if (gcov_persist) 716 if (gcov_persist)
@@ -735,7 +737,7 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
735 struct gcov_node *node; 737 struct gcov_node *node;
736 738
737 mutex_lock(&node_lock); 739 mutex_lock(&node_lock);
738 node = get_node_by_name(info->filename); 740 node = get_node_by_name(gcov_info_filename(info));
739 switch (action) { 741 switch (action) {
740 case GCOV_ADD: 742 case GCOV_ADD:
741 if (node) 743 if (node)
@@ -747,8 +749,8 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
747 if (node) 749 if (node)
748 remove_info(node, info); 750 remove_info(node, info);
749 else { 751 else {
750 pr_warning("could not remove '%s' (not found)\n", 752 pr_warn("could not remove '%s' (not found)\n",
751 info->filename); 753 gcov_info_filename(info));
752 } 754 }
753 break; 755 break;
754 } 756 }
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index ae5bb4260033..27bc88a35013 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -21,6 +21,121 @@
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include "gcov.h" 22#include "gcov.h"
23 23
24#define GCOV_COUNTERS 5
25
26static struct gcov_info *gcov_info_head;
27
28/**
29 * struct gcov_fn_info - profiling meta data per function
30 * @ident: object file-unique function identifier
31 * @checksum: function checksum
32 * @n_ctrs: number of values per counter type belonging to this function
33 *
34 * This data is generated by gcc during compilation and doesn't change
35 * at run-time.
36 */
37struct gcov_fn_info {
38 unsigned int ident;
39 unsigned int checksum;
40 unsigned int n_ctrs[0];
41};
42
43/**
44 * struct gcov_ctr_info - profiling data per counter type
45 * @num: number of counter values for this type
46 * @values: array of counter values for this type
47 * @merge: merge function for counter values of this type (unused)
48 *
49 * This data is generated by gcc during compilation and doesn't change
50 * at run-time with the exception of the values array.
51 */
52struct gcov_ctr_info {
53 unsigned int num;
54 gcov_type *values;
55 void (*merge)(gcov_type *, unsigned int);
56};
57
58/**
59 * struct gcov_info - profiling data per object file
60 * @version: gcov version magic indicating the gcc version used for compilation
61 * @next: list head for a singly-linked list
62 * @stamp: time stamp
63 * @filename: name of the associated gcov data file
64 * @n_functions: number of instrumented functions
65 * @functions: function data
66 * @ctr_mask: mask specifying which counter types are active
67 * @counts: counter data per counter type
68 *
69 * This data is generated by gcc during compilation and doesn't change
70 * at run-time with the exception of the next pointer.
71 */
72struct gcov_info {
73 unsigned int version;
74 struct gcov_info *next;
75 unsigned int stamp;
76 const char *filename;
77 unsigned int n_functions;
78 const struct gcov_fn_info *functions;
79 unsigned int ctr_mask;
80 struct gcov_ctr_info counts[0];
81};
82
83/**
84 * gcov_info_filename - return info filename
85 * @info: profiling data set
86 */
87const char *gcov_info_filename(struct gcov_info *info)
88{
89 return info->filename;
90}
91
92/**
93 * gcov_info_version - return info version
94 * @info: profiling data set
95 */
96unsigned int gcov_info_version(struct gcov_info *info)
97{
98 return info->version;
99}
100
101/**
102 * gcov_info_next - return next profiling data set
103 * @info: profiling data set
104 *
105 * Returns next gcov_info following @info or first gcov_info in the chain if
106 * @info is %NULL.
107 */
108struct gcov_info *gcov_info_next(struct gcov_info *info)
109{
110 if (!info)
111 return gcov_info_head;
112
113 return info->next;
114}
115
116/**
117 * gcov_info_link - link/add profiling data set to the list
118 * @info: profiling data set
119 */
120void gcov_info_link(struct gcov_info *info)
121{
122 info->next = gcov_info_head;
123 gcov_info_head = info;
124}
125
126/**
127 * gcov_info_unlink - unlink/remove profiling data set from the list
128 * @prev: previous profiling data set
129 * @info: profiling data set
130 */
131void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
132{
133 if (prev)
134 prev->next = info->next;
135 else
136 gcov_info_head = info->next;
137}
138
24/* Symbolic links to be created for each profiling data file. */ 139/* Symbolic links to be created for each profiling data file. */
25const struct gcov_link gcov_link[] = { 140const struct gcov_link gcov_link[] = {
26 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ 141 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
new file mode 100644
index 000000000000..2c6e4631c814
--- /dev/null
+++ b/kernel/gcov/gcc_4_7.c
@@ -0,0 +1,560 @@
1/*
2 * This code provides functions to handle gcc's profiling data format
3 * introduced with gcc 4.7.
4 *
5 * This file is based heavily on gcc_3_4.c file.
6 *
7 * For a better understanding, refer to gcc source:
8 * gcc/gcov-io.h
9 * libgcc/libgcov.c
10 *
11 * Uses gcc-internal data definitions.
12 */
13
14#include <linux/errno.h>
15#include <linux/slab.h>
16#include <linux/string.h>
17#include <linux/seq_file.h>
18#include <linux/vmalloc.h>
19#include "gcov.h"
20
21#define GCOV_COUNTERS 8
22#define GCOV_TAG_FUNCTION_LENGTH 3
23
24static struct gcov_info *gcov_info_head;
25
26/**
27 * struct gcov_ctr_info - information about counters for a single function
28 * @num: number of counter values for this type
29 * @values: array of counter values for this type
30 *
31 * This data is generated by gcc during compilation and doesn't change
32 * at run-time with the exception of the values array.
33 */
34struct gcov_ctr_info {
35 unsigned int num;
36 gcov_type *values;
37};
38
39/**
40 * struct gcov_fn_info - profiling meta data per function
41 * @key: comdat key
42 * @ident: unique ident of function
 43 * @lineno_checksum: function lineno checksum
44 * @cfg_checksum: function cfg checksum
45 * @ctrs: instrumented counters
46 *
47 * This data is generated by gcc during compilation and doesn't change
48 * at run-time.
49 *
50 * Information about a single function. This uses the trailing array
51 * idiom. The number of counters is determined from the merge pointer
52 * array in gcov_info. The key is used to detect which of a set of
53 * comdat functions was selected -- it points to the gcov_info object
54 * of the object file containing the selected comdat function.
55 */
56struct gcov_fn_info {
57 const struct gcov_info *key;
58 unsigned int ident;
59 unsigned int lineno_checksum;
60 unsigned int cfg_checksum;
61 struct gcov_ctr_info ctrs[0];
62};
63
64/**
65 * struct gcov_info - profiling data per object file
66 * @version: gcov version magic indicating the gcc version used for compilation
67 * @next: list head for a singly-linked list
68 * @stamp: uniquifying time stamp
69 * @filename: name of the associated gcov data file
70 * @merge: merge functions (null for unused counter type)
71 * @n_functions: number of instrumented functions
72 * @functions: pointer to pointers to function information
73 *
74 * This data is generated by gcc during compilation and doesn't change
75 * at run-time with the exception of the next pointer.
76 */
77struct gcov_info {
78 unsigned int version;
79 struct gcov_info *next;
80 unsigned int stamp;
81 const char *filename;
82 void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
83 unsigned int n_functions;
84 struct gcov_fn_info **functions;
85};
86
87/**
88 * gcov_info_filename - return info filename
89 * @info: profiling data set
90 */
91const char *gcov_info_filename(struct gcov_info *info)
92{
93 return info->filename;
94}
95
96/**
97 * gcov_info_version - return info version
98 * @info: profiling data set
99 */
100unsigned int gcov_info_version(struct gcov_info *info)
101{
102 return info->version;
103}
104
105/**
106 * gcov_info_next - return next profiling data set
107 * @info: profiling data set
108 *
109 * Returns next gcov_info following @info or first gcov_info in the chain if
110 * @info is %NULL.
111 */
112struct gcov_info *gcov_info_next(struct gcov_info *info)
113{
114 if (!info)
115 return gcov_info_head;
116
117 return info->next;
118}
119
120/**
121 * gcov_info_link - link/add profiling data set to the list
122 * @info: profiling data set
123 */
124void gcov_info_link(struct gcov_info *info)
125{
126 info->next = gcov_info_head;
127 gcov_info_head = info;
128}
129
130/**
131 * gcov_info_unlink - unlink/remove profiling data set from the list
132 * @prev: previous profiling data set
133 * @info: profiling data set
134 */
135void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
136{
137 if (prev)
138 prev->next = info->next;
139 else
140 gcov_info_head = info->next;
141}
142
143/* Symbolic links to be created for each profiling data file. */
144const struct gcov_link gcov_link[] = {
145 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
146 { 0, NULL},
147};
148
149/*
150 * Determine whether a counter is active. Doesn't change at run-time.
151 */
152static int counter_active(struct gcov_info *info, unsigned int type)
153{
154 return info->merge[type] ? 1 : 0;
155}
156
157/* Determine number of active counters. Based on gcc magic. */
158static unsigned int num_counter_active(struct gcov_info *info)
159{
160 unsigned int i;
161 unsigned int result = 0;
162
163 for (i = 0; i < GCOV_COUNTERS; i++) {
164 if (counter_active(info, i))
165 result++;
166 }
167 return result;
168}
169
170/**
171 * gcov_info_reset - reset profiling data to zero
172 * @info: profiling data set
173 */
174void gcov_info_reset(struct gcov_info *info)
175{
176 struct gcov_ctr_info *ci_ptr;
177 unsigned int fi_idx;
178 unsigned int ct_idx;
179
180 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
181 ci_ptr = info->functions[fi_idx]->ctrs;
182
183 for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
184 if (!counter_active(info, ct_idx))
185 continue;
186
187 memset(ci_ptr->values, 0,
188 sizeof(gcov_type) * ci_ptr->num);
189 ci_ptr++;
190 }
191 }
192}
193
194/**
195 * gcov_info_is_compatible - check if profiling data can be added
196 * @info1: first profiling data set
197 * @info2: second profiling data set
198 *
199 * Returns non-zero if profiling data can be added, zero otherwise.
200 */
201int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
202{
203 return (info1->stamp == info2->stamp);
204}
205
206/**
207 * gcov_info_add - add up profiling data
208 * @dest: profiling data set to which data is added
209 * @source: profiling data set which is added
210 *
211 * Adds profiling counts of @source to @dest.
212 */
213void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
214{
215 struct gcov_ctr_info *dci_ptr;
216 struct gcov_ctr_info *sci_ptr;
217 unsigned int fi_idx;
218 unsigned int ct_idx;
219 unsigned int val_idx;
220
221 for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) {
222 dci_ptr = dst->functions[fi_idx]->ctrs;
223 sci_ptr = src->functions[fi_idx]->ctrs;
224
225 for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
226 if (!counter_active(src, ct_idx))
227 continue;
228
229 for (val_idx = 0; val_idx < sci_ptr->num; val_idx++)
230 dci_ptr->values[val_idx] +=
231 sci_ptr->values[val_idx];
232
233 dci_ptr++;
234 sci_ptr++;
235 }
236 }
237}
238
239/**
240 * gcov_info_dup - duplicate profiling data set
241 * @info: profiling data set to duplicate
242 *
243 * Return newly allocated duplicate on success, %NULL on error.
244 */
245struct gcov_info *gcov_info_dup(struct gcov_info *info)
246{
247 struct gcov_info *dup;
248 struct gcov_ctr_info *dci_ptr; /* dst counter info */
249 struct gcov_ctr_info *sci_ptr; /* src counter info */
250 unsigned int active;
251 unsigned int fi_idx; /* function info idx */
252 unsigned int ct_idx; /* counter type idx */
253 size_t fi_size; /* function info size */
254 size_t cv_size; /* counter values size */
255
256 dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
257 if (!dup)
258 return NULL;
259
260 dup->next = NULL;
261 dup->filename = NULL;
262 dup->functions = NULL;
263
264 dup->filename = kstrdup(info->filename, GFP_KERNEL);
265 if (!dup->filename)
266 goto err_free;
267
268 dup->functions = kcalloc(info->n_functions,
269 sizeof(struct gcov_fn_info *), GFP_KERNEL);
270 if (!dup->functions)
271 goto err_free;
272
273 active = num_counter_active(info);
274 fi_size = sizeof(struct gcov_fn_info);
275 fi_size += sizeof(struct gcov_ctr_info) * active;
276
277 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
278 dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL);
279 if (!dup->functions[fi_idx])
280 goto err_free;
281
282 *(dup->functions[fi_idx]) = *(info->functions[fi_idx]);
283
284 sci_ptr = info->functions[fi_idx]->ctrs;
285 dci_ptr = dup->functions[fi_idx]->ctrs;
286
287 for (ct_idx = 0; ct_idx < active; ct_idx++) {
288
289 cv_size = sizeof(gcov_type) * sci_ptr->num;
290
291 dci_ptr->values = vmalloc(cv_size);
292
293 if (!dci_ptr->values)
294 goto err_free;
295
296 dci_ptr->num = sci_ptr->num;
297 memcpy(dci_ptr->values, sci_ptr->values, cv_size);
298
299 sci_ptr++;
300 dci_ptr++;
301 }
302 }
303
304 return dup;
305err_free:
306 gcov_info_free(dup);
307 return NULL;
308}
309
310/**
311 * gcov_info_free - release memory for profiling data set duplicate
312 * @info: profiling data set duplicate to free
313 */
314void gcov_info_free(struct gcov_info *info)
315{
316 unsigned int active;
317 unsigned int fi_idx;
318 unsigned int ct_idx;
319 struct gcov_ctr_info *ci_ptr;
320
321 if (!info->functions)
322 goto free_info;
323
324 active = num_counter_active(info);
325
326 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
327 if (!info->functions[fi_idx])
328 continue;
329
330 ci_ptr = info->functions[fi_idx]->ctrs;
331
332 for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
333 vfree(ci_ptr->values);
334
335 kfree(info->functions[fi_idx]);
336 }
337
338free_info:
339 kfree(info->functions);
340 kfree(info->filename);
341 kfree(info);
342}
343
344#define ITER_STRIDE PAGE_SIZE
345
346/**
347 * struct gcov_iterator - specifies current file position in logical records
348 * @info: associated profiling data
349 * @buffer: buffer containing file data
350 * @size: size of buffer
351 * @pos: current position in file
352 */
353struct gcov_iterator {
354 struct gcov_info *info;
355 void *buffer;
356 size_t size;
357 loff_t pos;
358};
359
360/**
361 * store_gcov_u32 - store 32 bit number in gcov format to buffer
362 * @buffer: target buffer or NULL
363 * @off: offset into the buffer
364 * @v: value to be stored
365 *
366 * Number format defined by gcc: numbers are recorded in the 32 bit
367 * unsigned binary form of the endianness of the machine generating the
368 * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
369 * store anything.
370 */
371static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
372{
373 u32 *data;
374
375 if (buffer) {
376 data = buffer + off;
377 *data = v;
378 }
379
380 return sizeof(*data);
381}
382
383/**
384 * store_gcov_u64 - store 64 bit number in gcov format to buffer
385 * @buffer: target buffer or NULL
386 * @off: offset into the buffer
387 * @v: value to be stored
388 *
389 * Number format defined by gcc: numbers are recorded in the 32 bit
390 * unsigned binary form of the endianness of the machine generating the
391 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
392 * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
393 * anything.
394 */
395static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
396{
397 u32 *data;
398
399 if (buffer) {
400 data = buffer + off;
401
402 data[0] = (v & 0xffffffffUL);
403 data[1] = (v >> 32);
404 }
405
406 return sizeof(*data) * 2;
407}
408
409/**
410 * convert_to_gcda - convert profiling data set to gcda file format
411 * @buffer: the buffer to store file data or %NULL if no data should be stored
412 * @info: profiling data set to be converted
413 *
414 * Returns the number of bytes that were/would have been stored into the buffer.
415 */
416static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
417{
418 struct gcov_fn_info *fi_ptr;
419 struct gcov_ctr_info *ci_ptr;
420 unsigned int fi_idx;
421 unsigned int ct_idx;
422 unsigned int cv_idx;
423 size_t pos = 0;
424
425 /* File header. */
426 pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
427 pos += store_gcov_u32(buffer, pos, info->version);
428 pos += store_gcov_u32(buffer, pos, info->stamp);
429
430 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
431 fi_ptr = info->functions[fi_idx];
432
433 /* Function record. */
434 pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
435 pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
436 pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
437 pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
438 pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
439
440 ci_ptr = fi_ptr->ctrs;
441
442 for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
443 if (!counter_active(info, ct_idx))
444 continue;
445
446 /* Counter record. */
447 pos += store_gcov_u32(buffer, pos,
448 GCOV_TAG_FOR_COUNTER(ct_idx));
449 pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
450
451 for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
452 pos += store_gcov_u64(buffer, pos,
453 ci_ptr->values[cv_idx]);
454 }
455
456 ci_ptr++;
457 }
458 }
459
460 return pos;
461}
462
463/**
464 * gcov_iter_new - allocate and initialize profiling data iterator
465 * @info: profiling data set to be iterated
466 *
467 * Return file iterator on success, %NULL otherwise.
468 */
469struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
470{
471 struct gcov_iterator *iter;
472
473 iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
474 if (!iter)
475 goto err_free;
476
477 iter->info = info;
478 /* Dry-run to get the actual buffer size. */
479 iter->size = convert_to_gcda(NULL, info);
480 iter->buffer = vmalloc(iter->size);
481 if (!iter->buffer)
482 goto err_free;
483
484 convert_to_gcda(iter->buffer, info);
485
486 return iter;
487
488err_free:
489 kfree(iter);
490 return NULL;
491}
492
493
494/**
 495 * gcov_iter_free - release iterator data
496 * @iter: file iterator
497 */
498void gcov_iter_free(struct gcov_iterator *iter)
499{
500 vfree(iter->buffer);
501 kfree(iter);
502}
503
504/**
505 * gcov_iter_get_info - return profiling data set for given file iterator
506 * @iter: file iterator
507 */
508struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
509{
510 return iter->info;
511}
512
513/**
514 * gcov_iter_start - reset file iterator to starting position
515 * @iter: file iterator
516 */
517void gcov_iter_start(struct gcov_iterator *iter)
518{
519 iter->pos = 0;
520}
521
522/**
523 * gcov_iter_next - advance file iterator to next logical record
524 * @iter: file iterator
525 *
526 * Return zero if new position is valid, non-zero if iterator has reached end.
527 */
528int gcov_iter_next(struct gcov_iterator *iter)
529{
530 if (iter->pos < iter->size)
531 iter->pos += ITER_STRIDE;
532
533 if (iter->pos >= iter->size)
534 return -EINVAL;
535
536 return 0;
537}
538
539/**
540 * gcov_iter_write - write data for current pos to seq_file
541 * @iter: file iterator
542 * @seq: seq_file handle
543 *
544 * Return zero on success, non-zero otherwise.
545 */
546int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
547{
548 size_t len;
549
550 if (iter->pos >= iter->size)
551 return -EINVAL;
552
553 len = ITER_STRIDE;
554 if (iter->pos + len > iter->size)
555 len = iter->size - iter->pos;
556
557 seq_write(seq, iter->buffer + iter->pos, len);
558
559 return 0;
560}
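gcov_iter_new() above relies on a two-pass idiom: convert_to_gcda() is first called with a NULL buffer so that the store_gcov_u32()/store_gcov_u64() helpers only count bytes, and is then called again to fill the allocated buffer. A hedged sketch of that idiom, assuming it sits in this same file next to the static convert_to_gcda(); build_gcda_image() is an illustrative name:

#include <linux/vmalloc.h>

/* Dry-run to size the .gcda image, then convert for real. */
static void *build_gcda_image(struct gcov_info *info, size_t *sizep)
{
	size_t size = convert_to_gcda(NULL, info);	/* pass 1: size only */
	void *buf = vmalloc(size);

	if (buf)
		convert_to_gcda(buf, info);		/* pass 2: fill buffer */
	*sizep = size;
	return buf;
}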
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 060073ebf7a6..92c8e22a29ed 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -21,7 +21,6 @@
21 * gcc and need to be kept as close to the original definition as possible to 21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible. 22 * remain compatible.
23 */ 23 */
24#define GCOV_COUNTERS 5
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) 24#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) 25#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) 26#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
@@ -34,60 +33,18 @@ typedef long gcov_type;
34typedef long long gcov_type; 33typedef long long gcov_type;
35#endif 34#endif
36 35
 37/** 36/* Opaque gcov_info. The gcov structures can change between gcc versions (for
 38 * struct gcov_fn_info - profiling meta data per function 37 * example in gcc 4.7), so the full definition cannot live here; it belongs in the
 39 * @ident: object file-unique function identifier 38 * gcc-specific implementation. Generic code therefore must not access the members
 40 * @checksum: function checksum 39 * directly and has to use the interface below. */
41 * @n_ctrs: number of values per counter type belonging to this function 40struct gcov_info;
42 *
43 * This data is generated by gcc during compilation and doesn't change
44 * at run-time.
45 */
46struct gcov_fn_info {
47 unsigned int ident;
48 unsigned int checksum;
49 unsigned int n_ctrs[0];
50};
51
52/**
53 * struct gcov_ctr_info - profiling data per counter type
54 * @num: number of counter values for this type
55 * @values: array of counter values for this type
56 * @merge: merge function for counter values of this type (unused)
57 *
58 * This data is generated by gcc during compilation and doesn't change
59 * at run-time with the exception of the values array.
60 */
61struct gcov_ctr_info {
62 unsigned int num;
63 gcov_type *values;
64 void (*merge)(gcov_type *, unsigned int);
65};
66 41
67/** 42/* Interface to access gcov_info data */
68 * struct gcov_info - profiling data per object file 43const char *gcov_info_filename(struct gcov_info *info);
69 * @version: gcov version magic indicating the gcc version used for compilation 44unsigned int gcov_info_version(struct gcov_info *info);
70 * @next: list head for a singly-linked list 45struct gcov_info *gcov_info_next(struct gcov_info *info);
71 * @stamp: time stamp 46void gcov_info_link(struct gcov_info *info);
72 * @filename: name of the associated gcov data file 47void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
73 * @n_functions: number of instrumented functions
74 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active
76 * @counts: counter data per counter type
77 *
78 * This data is generated by gcc during compilation and doesn't change
79 * at run-time with the exception of the next pointer.
80 */
81struct gcov_info {
82 unsigned int version;
83 struct gcov_info *next;
84 unsigned int stamp;
85 const char *filename;
86 unsigned int n_functions;
87 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask;
89 struct gcov_ctr_info counts[0];
90};
91 48
92/* Base interface. */ 49/* Base interface. */
93enum gcov_action { 50enum gcov_action {
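With struct gcov_info opaque, generic gcov code is expected to reach the per-object-file data only through the accessors declared above. A hedged sketch of the resulting pattern (the function name and pr_info() output are illustrative, and locking of the list is omitted):

#include <linux/printk.h>
#include "gcov.h"

/* Walk every registered profiling data set via the opaque interface. */
static void dump_gcov_filenames(void)
{
	struct gcov_info *info;

	for (info = gcov_info_next(NULL); info; info = gcov_info_next(info))
		pr_info("gcov: %s (version %#x)\n",
			gcov_info_filename(info), gcov_info_version(info));
}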
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 3e97fb126e6b..9328b80eaf14 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -16,11 +16,12 @@
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18#include <linux/utsname.h> 18#include <linux/utsname.h>
19#include <trace/events/sched.h>
19 20
20/* 21/*
21 * The number of tasks checked: 22 * The number of tasks checked:
22 */ 23 */
23unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; 24int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
24 25
25/* 26/*
26 * Limit number of tasks checked in a batch. 27 * Limit number of tasks checked in a batch.
@@ -92,6 +93,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
92 t->last_switch_count = switch_count; 93 t->last_switch_count = switch_count;
93 return; 94 return;
94 } 95 }
96
97 trace_sched_process_hang(t);
98
95 if (!sysctl_hung_task_warnings) 99 if (!sysctl_hung_task_warnings)
96 return; 100 return;
97 sysctl_hung_task_warnings--; 101 sysctl_hung_task_warnings--;
@@ -203,6 +207,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
203 return ret; 207 return ret;
204} 208}
205 209
210static atomic_t reset_hung_task = ATOMIC_INIT(0);
211
212void reset_hung_task_detector(void)
213{
214 atomic_set(&reset_hung_task, 1);
215}
216EXPORT_SYMBOL_GPL(reset_hung_task_detector);
217
206/* 218/*
207 * kthread which checks for tasks stuck in D state 219 * kthread which checks for tasks stuck in D state
208 */ 220 */
@@ -216,6 +228,9 @@ static int watchdog(void *dummy)
216 while (schedule_timeout_interruptible(timeout_jiffies(timeout))) 228 while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
217 timeout = sysctl_hung_task_timeout_secs; 229 timeout = sysctl_hung_task_timeout_secs;
218 230
231 if (atomic_xchg(&reset_hung_task, 0))
232 continue;
233
219 check_hung_uninterruptible_tasks(timeout); 234 check_hung_uninterruptible_tasks(timeout);
220 } 235 }
221 236
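The new reset_hung_task_detector() export lets code that legitimately keeps tasks in D state for a long time ask the watchdog to skip its next scan (see the atomic_xchg() added to watchdog() above). A hedged usage sketch; the caller is hypothetical and the declaration is assumed to sit next to the other hung-task knobs:

#include <linux/sched/sysctl.h>	/* assumed location of the declaration */

static void hypothetical_long_blocking_op(void)
{
	/* ... work that may hold tasks in TASK_UNINTERRUPTIBLE ... */

	/* Expected stall: make the hung-task watchdog skip one cycle. */
	reset_hung_task_detector();
}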
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a3bb14fbe5c6..dc04c166c54d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -214,7 +214,7 @@ void irq_enable(struct irq_desc *desc)
214} 214}
215 215
216/** 216/**
217 * irq_disable - Mark interupt disabled 217 * irq_disable - Mark interrupt disabled
218 * @desc: irq descriptor which should be disabled 218 * @desc: irq descriptor which should be disabled
219 * 219 *
220 * If the chip does not implement the irq_disable callback, we 220 * If the chip does not implement the irq_disable callback, we
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 706724e9835d..cf68bb36fe58 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -465,27 +465,26 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
465} 465}
466EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 466EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
467 467
468unsigned int irq_create_of_mapping(struct device_node *controller, 468unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
469 const u32 *intspec, unsigned int intsize)
470{ 469{
471 struct irq_domain *domain; 470 struct irq_domain *domain;
472 irq_hw_number_t hwirq; 471 irq_hw_number_t hwirq;
473 unsigned int type = IRQ_TYPE_NONE; 472 unsigned int type = IRQ_TYPE_NONE;
474 unsigned int virq; 473 unsigned int virq;
475 474
476 domain = controller ? irq_find_host(controller) : irq_default_domain; 475 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
477 if (!domain) { 476 if (!domain) {
478 pr_warn("no irq domain found for %s !\n", 477 pr_warn("no irq domain found for %s !\n",
479 of_node_full_name(controller)); 478 of_node_full_name(irq_data->np));
480 return 0; 479 return 0;
481 } 480 }
482 481
483 /* If domain has no translation, then we assume interrupt line */ 482 /* If domain has no translation, then we assume interrupt line */
484 if (domain->ops->xlate == NULL) 483 if (domain->ops->xlate == NULL)
485 hwirq = intspec[0]; 484 hwirq = irq_data->args[0];
486 else { 485 else {
487 if (domain->ops->xlate(domain, controller, intspec, intsize, 486 if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
488 &hwirq, &type)) 487 irq_data->args_count, &hwirq, &type))
489 return 0; 488 return 0;
490 } 489 }
491 490
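irq_create_of_mapping() now takes one struct of_phandle_args carrying the controller node and the interrupt specifier together, instead of a node plus a raw cell array. A hedged sketch of the new calling convention (the hand-built specifier is illustrative; real callers normally get the structure from the DT interrupt-parsing helpers):

#include <linux/of.h>
#include <linux/irqdomain.h>	/* assumed home of irq_create_of_mapping() */

static unsigned int map_example(struct device_node *ctrl)
{
	struct of_phandle_args oirq = {
		.np         = ctrl,	/* interrupt controller node */
		.args_count = 1,	/* one specifier cell */
		.args       = { 7 },	/* hwirq 7, made up for the sketch */
	};

	return irq_create_of_mapping(&oirq);
}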
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 514bcfd855a8..481a13c43b17 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -786,7 +786,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
786} 786}
787 787
788/* 788/*
789 * Interrupts explicitely requested as threaded interupts want to be 789 * Interrupts explicitly requested as threaded interrupts want to be
790 * preemtible - many of them need to sleep and wait for slow busses to 790 * preemtible - many of them need to sleep and wait for slow busses to
791 * complete. 791 * complete.
792 */ 792 */
@@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
956 goto out_mput; 956 goto out_mput;
957 } 957 }
958 958
959 sched_setscheduler(t, SCHED_FIFO, &param); 959 sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
960 960
961 /* 961 /*
962 * We keep the reference to the task struct even if 962 * We keep the reference to the task struct even if
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cb228bf21760..abcd6ca86cb7 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -50,7 +50,7 @@ static void resume_irqs(bool want_early)
50 bool is_early = desc->action && 50 bool is_early = desc->action &&
51 desc->action->flags & IRQF_EARLY_RESUME; 51 desc->action->flags & IRQF_EARLY_RESUME;
52 52
53 if (is_early != want_early) 53 if (!is_early && want_early)
54 continue; 54 continue;
55 55
56 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 1162f1030f18..3320b84cc60f 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -14,6 +14,7 @@ enum {
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, 15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, 16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17 _IRQ_IS_POLLED = IRQ_IS_POLLED,
17 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 18 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
18}; 19};
19 20
@@ -26,6 +27,7 @@ enum {
26#define IRQ_NOAUTOEN GOT_YOU_MORON 27#define IRQ_NOAUTOEN GOT_YOU_MORON
27#define IRQ_NESTED_THREAD GOT_YOU_MORON 28#define IRQ_NESTED_THREAD GOT_YOU_MORON
28#define IRQ_PER_CPU_DEVID GOT_YOU_MORON 29#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
30#define IRQ_IS_POLLED GOT_YOU_MORON
29#undef IRQF_MODIFY_MASK 31#undef IRQF_MODIFY_MASK
30#define IRQF_MODIFY_MASK GOT_YOU_MORON 32#define IRQF_MODIFY_MASK GOT_YOU_MORON
31 33
@@ -147,3 +149,8 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
147{ 149{
148 return desc->status_use_accessors & _IRQ_NESTED_THREAD; 150 return desc->status_use_accessors & _IRQ_NESTED_THREAD;
149} 151}
152
153static inline bool irq_settings_is_polled(struct irq_desc *desc)
154{
155 return desc->status_use_accessors & _IRQ_IS_POLLED;
156}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 7b5f012bde9d..a1d8cc63b56e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -67,8 +67,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
67 67
68 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
69 69
70 /* PER_CPU and nested thread interrupts are never polled */ 70 /*
 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) 71 * PER_CPU, nested thread interrupts and interrupts explicitly
72 * marked polled are excluded from polling.
73 */
74 if (irq_settings_is_per_cpu(desc) ||
75 irq_settings_is_nested_thread(desc) ||
76 irq_settings_is_polled(desc))
72 goto out; 77 goto out;
73 78
74 /* 79 /*
@@ -268,7 +273,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
268void note_interrupt(unsigned int irq, struct irq_desc *desc, 273void note_interrupt(unsigned int irq, struct irq_desc *desc,
269 irqreturn_t action_ret) 274 irqreturn_t action_ret)
270{ 275{
271 if (desc->istate & IRQS_POLL_INPROGRESS) 276 if (desc->istate & IRQS_POLL_INPROGRESS ||
277 irq_settings_is_polled(desc))
272 return; 278 return;
273 279
274 /* we get here again via the threaded handler */ 280 /* we get here again via the threaded handler */
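IRQ_IS_POLLED lets a driver that deliberately polls a possibly silent interrupt line keep both the misrouted-IRQ poll in try_one_irq() and the spurious accounting in note_interrupt() away from it. A hedged sketch of how a driver would mark such a line (irq_set_status_flags() is the assumed setter; the device name is made up):

#include <linux/irq.h>
#include <linux/interrupt.h>

static int setup_polled_irq(unsigned int irq, irq_handler_t handler, void *dev)
{
	/* Tell the core this line is driven by polling, not by the device. */
	irq_set_status_flags(irq, IRQ_IS_POLLED);
	return request_irq(irq, handler, 0, "polled-dev", dev);
}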
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 297a9247a3b3..9019f15deab2 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -58,6 +58,7 @@ static void jump_label_update(struct static_key *key, int enable);
58 58
59void static_key_slow_inc(struct static_key *key) 59void static_key_slow_inc(struct static_key *key)
60{ 60{
61 STATIC_KEY_CHECK_USE();
61 if (atomic_inc_not_zero(&key->enabled)) 62 if (atomic_inc_not_zero(&key->enabled))
62 return; 63 return;
63 64
@@ -103,12 +104,14 @@ static void jump_label_update_timeout(struct work_struct *work)
103 104
104void static_key_slow_dec(struct static_key *key) 105void static_key_slow_dec(struct static_key *key)
105{ 106{
107 STATIC_KEY_CHECK_USE();
106 __static_key_slow_dec(key, 0, NULL); 108 __static_key_slow_dec(key, 0, NULL);
107} 109}
108EXPORT_SYMBOL_GPL(static_key_slow_dec); 110EXPORT_SYMBOL_GPL(static_key_slow_dec);
109 111
110void static_key_slow_dec_deferred(struct static_key_deferred *key) 112void static_key_slow_dec_deferred(struct static_key_deferred *key)
111{ 113{
114 STATIC_KEY_CHECK_USE();
112 __static_key_slow_dec(&key->key, key->timeout, &key->work); 115 __static_key_slow_dec(&key->key, key->timeout, &key->work);
113} 116}
114EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); 117EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
@@ -116,6 +119,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
116void jump_label_rate_limit(struct static_key_deferred *key, 119void jump_label_rate_limit(struct static_key_deferred *key,
117 unsigned long rl) 120 unsigned long rl)
118{ 121{
122 STATIC_KEY_CHECK_USE();
119 key->timeout = rl; 123 key->timeout = rl;
120 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); 124 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
121} 125}
@@ -212,6 +216,7 @@ void __init jump_label_init(void)
212 key->next = NULL; 216 key->next = NULL;
213#endif 217#endif
214 } 218 }
219 static_key_initialized = true;
215 jump_label_unlock(); 220 jump_label_unlock();
216} 221}
217 222
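STATIC_KEY_CHECK_USE() flags static-key operations issued before jump_label_init() has run. For context, a minimal sketch of the static key API whose entry points gain the check; the key and function names below are made up:

#include <linux/jump_label.h>
#include <linux/printk.h>

static struct static_key example_key = STATIC_KEY_INIT_FALSE;

static void example_fast_path(void)
{
	/* Compiled as a no-op that is live-patched to a jump when enabled. */
	if (static_key_false(&example_key))
		pr_debug("optional slow path enabled\n");
}

static void example_enable(void)
{
	/* Must not run before jump_label_init(); that is exactly what
	 * STATIC_KEY_CHECK_USE() now warns about. */
	static_key_slow_inc(&example_key);
}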
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2a74f307c5ec..9c970167e402 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
47size_t vmcoreinfo_size; 47size_t vmcoreinfo_size;
48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
49 49
50/* Flag to indicate we are going to kexec a new kernel */
51bool kexec_in_progress = false;
52
50/* Location of the reserved area for the crash kernel */ 53/* Location of the reserved area for the crash kernel */
51struct resource crashk_res = { 54struct resource crashk_res = {
52 .name = "Crash kernel", 55 .name = "Crash kernel",
@@ -921,7 +924,7 @@ static int kimage_load_segment(struct kimage *image,
921 * reinitialize them. 924 * reinitialize them.
922 * 925 *
923 * - A machine specific part that includes the syscall number 926 * - A machine specific part that includes the syscall number
 924 * and the copies the image to it's final destination. And 927 * and then copies the image to its final destination. And
925 * jumps into the image at entry. 928 * jumps into the image at entry.
926 * 929 *
927 * kexec does not sync, or unmount filesystems so if you need 930 * kexec does not sync, or unmount filesystems so if you need
@@ -1675,7 +1678,9 @@ int kernel_kexec(void)
1675 } else 1678 } else
1676#endif 1679#endif
1677 { 1680 {
1681 kexec_in_progress = true;
1678 kernel_restart_prepare(NULL); 1682 kernel_restart_prepare(NULL);
1683 migrate_to_reboot_cpu();
1679 printk(KERN_EMERG "Starting new kernel\n"); 1684 printk(KERN_EMERG "Starting new kernel\n");
1680 machine_shutdown(); 1685 machine_shutdown();
1681 } 1686 }
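kexec_in_progress gives other subsystems a cheap way to notice that a kexec reboot has already started (it is set just before migrate_to_reboot_cpu() and machine_shutdown()). A hedged sketch of the kind of check a consumer might add; the function is hypothetical and the declaration is assumed to live in <linux/kexec.h>:

#include <linux/kexec.h>

static void hypothetical_teardown(void)
{
	/* Skip slow or hardware-touching cleanup while the new kernel
	 * is already being booted. */
	if (kexec_in_progress)
		return;

	/* ... normal teardown path ... */
}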
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a0d367a49122..ceeadfcabb76 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2066,7 +2066,7 @@ static int __init init_kprobes(void)
2066{ 2066{
2067 int i, err = 0; 2067 int i, err = 0;
2068 unsigned long offset = 0, size = 0; 2068 unsigned long offset = 0, size = 0;
2069 char *modname, namebuf[128]; 2069 char *modname, namebuf[KSYM_NAME_LEN];
2070 const char *symbol_name; 2070 const char *symbol_name;
2071 void *addr; 2071 void *addr;
2072 struct kprobe_blackpoint *kb; 2072 struct kprobe_blackpoint *kb;
@@ -2192,7 +2192,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2192 const char *sym = NULL; 2192 const char *sym = NULL;
2193 unsigned int i = *(loff_t *) v; 2193 unsigned int i = *(loff_t *) v;
2194 unsigned long offset = 0; 2194 unsigned long offset = 0;
2195 char *modname, namebuf[128]; 2195 char *modname, namebuf[KSYM_NAME_LEN];
2196 2196
2197 head = &kprobe_table[i]; 2197 head = &kprobe_table[i];
2198 preempt_disable(); 2198 preempt_disable();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 760e86df8c20..b5ae3ee860a9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -33,7 +33,7 @@ struct kthread_create_info
33 33
34 /* Result passed back to kthread_create() from kthreadd. */ 34 /* Result passed back to kthread_create() from kthreadd. */
35 struct task_struct *result; 35 struct task_struct *result;
36 struct completion done; 36 struct completion *done;
37 37
38 struct list_head list; 38 struct list_head list;
39}; 39};
@@ -178,6 +178,7 @@ static int kthread(void *_create)
178 struct kthread_create_info *create = _create; 178 struct kthread_create_info *create = _create;
179 int (*threadfn)(void *data) = create->threadfn; 179 int (*threadfn)(void *data) = create->threadfn;
180 void *data = create->data; 180 void *data = create->data;
181 struct completion *done;
181 struct kthread self; 182 struct kthread self;
182 int ret; 183 int ret;
183 184
@@ -187,10 +188,16 @@ static int kthread(void *_create)
187 init_completion(&self.parked); 188 init_completion(&self.parked);
188 current->vfork_done = &self.exited; 189 current->vfork_done = &self.exited;
189 190
191 /* If user was SIGKILLed, I release the structure. */
192 done = xchg(&create->done, NULL);
193 if (!done) {
194 kfree(create);
195 do_exit(-EINTR);
196 }
190 /* OK, tell user we're spawned, wait for stop or wakeup */ 197 /* OK, tell user we're spawned, wait for stop or wakeup */
191 __set_current_state(TASK_UNINTERRUPTIBLE); 198 __set_current_state(TASK_UNINTERRUPTIBLE);
192 create->result = current; 199 create->result = current;
193 complete(&create->done); 200 complete(done);
194 schedule(); 201 schedule();
195 202
196 ret = -EINTR; 203 ret = -EINTR;
@@ -223,8 +230,15 @@ static void create_kthread(struct kthread_create_info *create)
223 /* We want our own signal handler (we take no signals by default). */ 230 /* We want our own signal handler (we take no signals by default). */
224 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 231 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
225 if (pid < 0) { 232 if (pid < 0) {
233 /* If user was SIGKILLed, I release the structure. */
234 struct completion *done = xchg(&create->done, NULL);
235
236 if (!done) {
237 kfree(create);
238 return;
239 }
226 create->result = ERR_PTR(pid); 240 create->result = ERR_PTR(pid);
227 complete(&create->done); 241 complete(done);
228 } 242 }
229} 243}
230 244
@@ -255,36 +269,59 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
255 const char namefmt[], 269 const char namefmt[],
256 ...) 270 ...)
257{ 271{
258 struct kthread_create_info create; 272 DECLARE_COMPLETION_ONSTACK(done);
259 273 struct task_struct *task;
260 create.threadfn = threadfn; 274 struct kthread_create_info *create = kmalloc(sizeof(*create),
261 create.data = data; 275 GFP_KERNEL);
262 create.node = node; 276
263 init_completion(&create.done); 277 if (!create)
278 return ERR_PTR(-ENOMEM);
279 create->threadfn = threadfn;
280 create->data = data;
281 create->node = node;
282 create->done = &done;
264 283
265 spin_lock(&kthread_create_lock); 284 spin_lock(&kthread_create_lock);
266 list_add_tail(&create.list, &kthread_create_list); 285 list_add_tail(&create->list, &kthread_create_list);
267 spin_unlock(&kthread_create_lock); 286 spin_unlock(&kthread_create_lock);
268 287
269 wake_up_process(kthreadd_task); 288 wake_up_process(kthreadd_task);
270 wait_for_completion(&create.done); 289 /*
271 290 * Wait for completion in killable state, for I might be chosen by
272 if (!IS_ERR(create.result)) { 291 * the OOM killer while kthreadd is trying to allocate memory for
292 * new kernel thread.
293 */
294 if (unlikely(wait_for_completion_killable(&done))) {
295 /*
296 * If I was SIGKILLed before kthreadd (or new kernel thread)
297 * calls complete(), leave the cleanup of this structure to
298 * that thread.
299 */
300 if (xchg(&create->done, NULL))
301 return ERR_PTR(-ENOMEM);
302 /*
303 * kthreadd (or new kernel thread) will call complete()
304 * shortly.
305 */
306 wait_for_completion(&done);
307 }
308 task = create->result;
309 if (!IS_ERR(task)) {
273 static const struct sched_param param = { .sched_priority = 0 }; 310 static const struct sched_param param = { .sched_priority = 0 };
274 va_list args; 311 va_list args;
275 312
276 va_start(args, namefmt); 313 va_start(args, namefmt);
277 vsnprintf(create.result->comm, sizeof(create.result->comm), 314 vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
278 namefmt, args);
279 va_end(args); 315 va_end(args);
280 /* 316 /*
281 * root may have changed our (kthreadd's) priority or CPU mask. 317 * root may have changed our (kthreadd's) priority or CPU mask.
282 * The kernel thread should not inherit these properties. 318 * The kernel thread should not inherit these properties.
283 */ 319 */
284 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); 320 sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
285 set_cpus_allowed_ptr(create.result, cpu_all_mask); 321 set_cpus_allowed_ptr(task, cpu_all_mask);
286 } 322 }
287 return create.result; 323 kfree(create);
324 return task;
288} 325}
289EXPORT_SYMBOL(kthread_create_on_node); 326EXPORT_SYMBOL(kthread_create_on_node);
290 327
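The kthread_create_on_node() rework closes the "requester SIGKILLed while kthreadd still holds the request" race with an ownership handoff: each side does xchg() on the completion pointer, and whoever receives NULL knows the other side already claimed the structure. A generic sketch of that handoff under assumed names (struct req_sketch and finish_req_sketch() are illustrative, not kernel API):

#include <linux/atomic.h>
#include <linux/completion.h>
#include <linux/slab.h>

struct req_sketch {
	struct completion *done;	/* NULLed once, by whichever side wins */
	/* ... payload ... */
};

/* Producer side: either wake the waiter or inherit the cleanup. */
static void finish_req_sketch(struct req_sketch *req)
{
	struct completion *done = xchg(&req->done, NULL);

	if (done)
		complete(done);		/* waiter is still around */
	else
		kfree(req);		/* waiter gave up; we now own req */
}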
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
new file mode 100644
index 000000000000..baab8e5e7f66
--- /dev/null
+++ b/kernel/locking/Makefile
@@ -0,0 +1,25 @@
1
2obj-y += mutex.o semaphore.o rwsem.o lglock.o
3
4ifdef CONFIG_FUNCTION_TRACER
5CFLAGS_REMOVE_lockdep.o = -pg
6CFLAGS_REMOVE_lockdep_proc.o = -pg
7CFLAGS_REMOVE_mutex-debug.o = -pg
8CFLAGS_REMOVE_rtmutex-debug.o = -pg
9endif
10
11obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
12obj-$(CONFIG_LOCKDEP) += lockdep.o
13ifeq ($(CONFIG_PROC_FS),y)
14obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
15endif
16obj-$(CONFIG_SMP) += spinlock.o
17obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
18obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
19obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
20obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
21obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
22obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
23obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
24obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
25obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
diff --git a/kernel/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf004..86ae2aebf004 100644
--- a/kernel/lglock.c
+++ b/kernel/locking/lglock.c
diff --git a/kernel/lockdep.c b/kernel/locking/lockdep.c
index e16c45b9ee77..576ba756a32d 100644
--- a/kernel/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1232,7 +1232,7 @@ static int noop_count(struct lock_list *entry, void *data)
1232 return 0; 1232 return 0;
1233} 1233}
1234 1234
1235unsigned long __lockdep_count_forward_deps(struct lock_list *this) 1235static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
1236{ 1236{
1237 unsigned long count = 0; 1237 unsigned long count = 0;
1238 struct lock_list *uninitialized_var(target_entry); 1238 struct lock_list *uninitialized_var(target_entry);
@@ -1258,7 +1258,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
1258 return ret; 1258 return ret;
1259} 1259}
1260 1260
1261unsigned long __lockdep_count_backward_deps(struct lock_list *this) 1261static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
1262{ 1262{
1263 unsigned long count = 0; 1263 unsigned long count = 0;
1264 struct lock_list *uninitialized_var(target_entry); 1264 struct lock_list *uninitialized_var(target_entry);
@@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4225 !rcu_lockdep_current_cpu_online() 4225 !rcu_lockdep_current_cpu_online()
4226 ? "RCU used illegally from offline CPU!\n" 4226 ? "RCU used illegally from offline CPU!\n"
4227 : rcu_is_cpu_idle() 4227 : !rcu_is_watching()
4228 ? "RCU used illegally from idle CPU!\n" 4228 ? "RCU used illegally from idle CPU!\n"
4229 : "", 4229 : "",
4230 rcu_scheduler_active, debug_locks); 4230 rcu_scheduler_active, debug_locks);
@@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4247 * So complain bitterly if someone does call rcu_read_lock(), 4247 * So complain bitterly if someone does call rcu_read_lock(),
4248 * rcu_read_lock_bh() and so on from extended quiescent states. 4248 * rcu_read_lock_bh() and so on from extended quiescent states.
4249 */ 4249 */
4250 if (rcu_is_cpu_idle()) 4250 if (!rcu_is_watching())
4251 printk("RCU used illegally from extended quiescent state!\n"); 4251 printk("RCU used illegally from extended quiescent state!\n");
4252 4252
4253 lockdep_print_held_locks(curr); 4253 lockdep_print_held_locks(curr);
diff --git a/kernel/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 4f560cfedc8f..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
diff --git a/kernel/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index b2c71c5873e4..ef43ac4bafb5 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -421,6 +421,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
421 seq_time(m, lt->min); 421 seq_time(m, lt->min);
422 seq_time(m, lt->max); 422 seq_time(m, lt->max);
423 seq_time(m, lt->total); 423 seq_time(m, lt->total);
424 seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
424} 425}
425 426
426static void seq_stats(struct seq_file *m, struct lock_stat_data *data) 427static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
@@ -518,20 +519,20 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
518 } 519 }
519 if (i) { 520 if (i) {
520 seq_puts(m, "\n"); 521 seq_puts(m, "\n");
521 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 522 seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1));
522 seq_puts(m, "\n"); 523 seq_puts(m, "\n");
523 } 524 }
524} 525}
525 526
526static void seq_header(struct seq_file *m) 527static void seq_header(struct seq_file *m)
527{ 528{
528 seq_printf(m, "lock_stat version 0.3\n"); 529 seq_puts(m, "lock_stat version 0.4\n");
529 530
530 if (unlikely(!debug_locks)) 531 if (unlikely(!debug_locks))
531 seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); 532 seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
532 533
533 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 534 seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
534 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 535 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s "
535 "%14s %14s\n", 536 "%14s %14s\n",
536 "class name", 537 "class name",
537 "con-bounces", 538 "con-bounces",
@@ -539,12 +540,14 @@ static void seq_header(struct seq_file *m)
539 "waittime-min", 540 "waittime-min",
540 "waittime-max", 541 "waittime-max",
541 "waittime-total", 542 "waittime-total",
543 "waittime-avg",
542 "acq-bounces", 544 "acq-bounces",
543 "acquisitions", 545 "acquisitions",
544 "holdtime-min", 546 "holdtime-min",
545 "holdtime-max", 547 "holdtime-max",
546 "holdtime-total"); 548 "holdtime-total",
547 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 549 "holdtime-avg");
550 seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
548 seq_printf(m, "\n"); 551 seq_printf(m, "\n");
549} 552}
550 553
diff --git a/kernel/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc2b84c..995b0cc2b84c 100644
--- a/kernel/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
diff --git a/kernel/mutex-debug.c b/kernel/locking/mutex-debug.c
index 7e3443fe1f48..7e3443fe1f48 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
diff --git a/kernel/mutex-debug.h b/kernel/locking/mutex-debug.h
index 0799fd3e4cfa..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
diff --git a/kernel/mutex.c b/kernel/locking/mutex.c
index d24105b1b794..4dd6e4c219de 100644
--- a/kernel/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/mutex.c 2 * kernel/locking/mutex.c
3 * 3 *
4 * Mutexes: blocking mutual exclusion locks 4 * Mutexes: blocking mutual exclusion locks
5 * 5 *
diff --git a/kernel/mutex.h b/kernel/locking/mutex.h
index 4115fbf83b12..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/locking/mutex.h
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
new file mode 100644
index 000000000000..652a8ee8efe9
--- /dev/null
+++ b/kernel/locking/percpu-rwsem.c
@@ -0,0 +1,165 @@
1#include <linux/atomic.h>
2#include <linux/rwsem.h>
3#include <linux/percpu.h>
4#include <linux/wait.h>
5#include <linux/lockdep.h>
6#include <linux/percpu-rwsem.h>
7#include <linux/rcupdate.h>
8#include <linux/sched.h>
9#include <linux/errno.h>
10
11int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
12 const char *name, struct lock_class_key *rwsem_key)
13{
14 brw->fast_read_ctr = alloc_percpu(int);
15 if (unlikely(!brw->fast_read_ctr))
16 return -ENOMEM;
17
18 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
19 __init_rwsem(&brw->rw_sem, name, rwsem_key);
20 atomic_set(&brw->write_ctr, 0);
21 atomic_set(&brw->slow_read_ctr, 0);
22 init_waitqueue_head(&brw->write_waitq);
23 return 0;
24}
25
26void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
27{
28 free_percpu(brw->fast_read_ctr);
29 brw->fast_read_ctr = NULL; /* catch use after free bugs */
30}
31
32/*
33 * This is the fast-path for down_read/up_read, it only needs to ensure
34 * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
35 * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
36 * serialize with the preempt-disabled section below.
37 *
38 * The nontrivial part is that we should guarantee acquire/release semantics
39 * in case when
40 *
41 * R_W: down_write() comes after up_read(), the writer should see all
42 * changes done by the reader
43 * or
44 * W_R: down_read() comes after up_write(), the reader should see all
45 * changes done by the writer
46 *
47 * If this helper fails the callers rely on the normal rw_semaphore and
48 * atomic_dec_and_test(), so in this case we have the necessary barriers.
49 *
50 * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
51 * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
52 * reader inside the critical section. See the comments in down_write and
53 * up_write below.
54 */
55static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
56{
57 bool success = false;
58
59 preempt_disable();
60 if (likely(!atomic_read(&brw->write_ctr))) {
61 __this_cpu_add(*brw->fast_read_ctr, val);
62 success = true;
63 }
64 preempt_enable();
65
66 return success;
67}
68
69/*
70 * Like the normal down_read() this is not recursive, the writer can
71 * come after the first percpu_down_read() and create the deadlock.
72 *
73 * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
74 * percpu_up_read() does rwsem_release(). This pairs with the usage
75 * of ->rw_sem in percpu_down/up_write().
76 */
77void percpu_down_read(struct percpu_rw_semaphore *brw)
78{
79 might_sleep();
80 if (likely(update_fast_ctr(brw, +1))) {
81 rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
82 return;
83 }
84
85 down_read(&brw->rw_sem);
86 atomic_inc(&brw->slow_read_ctr);
87 /* avoid up_read()->rwsem_release() */
88 __up_read(&brw->rw_sem);
89}
90
91void percpu_up_read(struct percpu_rw_semaphore *brw)
92{
93 rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
94
95 if (likely(update_fast_ctr(brw, -1)))
96 return;
97
98 /* false-positive is possible but harmless */
99 if (atomic_dec_and_test(&brw->slow_read_ctr))
100 wake_up_all(&brw->write_waitq);
101}
102
103static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
104{
105 unsigned int sum = 0;
106 int cpu;
107
108 for_each_possible_cpu(cpu) {
109 sum += per_cpu(*brw->fast_read_ctr, cpu);
110 per_cpu(*brw->fast_read_ctr, cpu) = 0;
111 }
112
113 return sum;
114}
115
116/*
117 * A writer increments ->write_ctr to force the readers to switch to the
118 * slow mode, note the atomic_read() check in update_fast_ctr().
119 *
120 * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
121 * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
122 * counter it represents the number of active readers.
123 *
124 * Finally the writer takes ->rw_sem for writing and blocks the new readers,
125 * then waits until the slow counter becomes zero.
126 */
127void percpu_down_write(struct percpu_rw_semaphore *brw)
128{
129 /* tell update_fast_ctr() there is a pending writer */
130 atomic_inc(&brw->write_ctr);
131 /*
132 * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
133 * so that update_fast_ctr() can't succeed.
134 *
135 * 2. Ensures we see the result of every previous this_cpu_add() in
136 * update_fast_ctr().
137 *
138 * 3. Ensures that if any reader has exited its critical section via
139 * fast-path, it executes a full memory barrier before we return.
140 * See R_W case in the comment above update_fast_ctr().
141 */
142 synchronize_sched_expedited();
143
144 /* exclude other writers, and block the new readers completely */
145 down_write(&brw->rw_sem);
146
147 /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
148 atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
149
150 /* wait for all readers to complete their percpu_up_read() */
151 wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
152}
153
154void percpu_up_write(struct percpu_rw_semaphore *brw)
155{
156 /* release the lock, but the readers can't use the fast-path */
157 up_write(&brw->rw_sem);
158 /*
159 * Insert the barrier before the next fast-path in down_read,
160 * see W_R case in the comment above update_fast_ctr().
161 */
162 synchronize_sched_expedited();
163 /* the last writer unblocks update_fast_ctr() */
164 atomic_dec(&brw->write_ctr);
165}
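For completeness, a hedged sketch of how the new per-cpu rwsem is consumed: frequent readers pay only the per-cpu fast path above, while the rare writer pays two synchronize_sched_expedited() calls. percpu_init_rwsem() is assumed to be the usual init wrapper around __percpu_init_rwsem() from <linux/percpu-rwsem.h>; the example functions are illustrative:

#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore example_sem;

static int example_init(void)
{
	return percpu_init_rwsem(&example_sem);
}

static void example_reader(void)
{
	percpu_down_read(&example_sem);
	/* read side: per-cpu counter fast path in the common case */
	percpu_up_read(&example_sem);
}

static void example_writer(void)
{
	percpu_down_write(&example_sem);
	/* exclusive section: all readers have drained */
	percpu_up_write(&example_sem);
}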
diff --git a/kernel/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 13b243a323fa..13b243a323fa 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
diff --git a/kernel/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d78..14193d596d78 100644
--- a/kernel/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
diff --git a/kernel/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
index 1d96dd0d93c1..1d96dd0d93c1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/locking/rtmutex-tester.c
diff --git a/kernel/rtmutex.c b/kernel/locking/rtmutex.c
index 0dd6aec1cb6a..0dd6aec1cb6a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/locking/rtmutex.c
diff --git a/kernel/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421d..a1a1dd06421d 100644
--- a/kernel/rtmutex.h
+++ b/kernel/locking/rtmutex.h
diff --git a/kernel/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 53a66c85261b..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
new file mode 100644
index 000000000000..9be8a9144978
--- /dev/null
+++ b/kernel/locking/rwsem-spinlock.c
@@ -0,0 +1,296 @@
1/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
2 * generic spinlock implementation
3 *
4 * Copyright (c) 2001 David Howells (dhowells@redhat.com).
5 * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
6 * - Derived also from comments by Linus
7 */
8#include <linux/rwsem.h>
9#include <linux/sched.h>
10#include <linux/export.h>
11
12enum rwsem_waiter_type {
13 RWSEM_WAITING_FOR_WRITE,
14 RWSEM_WAITING_FOR_READ
15};
16
17struct rwsem_waiter {
18 struct list_head list;
19 struct task_struct *task;
20 enum rwsem_waiter_type type;
21};
22
23int rwsem_is_locked(struct rw_semaphore *sem)
24{
25 int ret = 1;
26 unsigned long flags;
27
28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
29 ret = (sem->activity != 0);
30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
31 }
32 return ret;
33}
34EXPORT_SYMBOL(rwsem_is_locked);
35
36/*
37 * initialise the semaphore
38 */
39void __init_rwsem(struct rw_semaphore *sem, const char *name,
40 struct lock_class_key *key)
41{
42#ifdef CONFIG_DEBUG_LOCK_ALLOC
43 /*
44 * Make sure we are not reinitializing a held semaphore:
45 */
46 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
47 lockdep_init_map(&sem->dep_map, name, key, 0);
48#endif
49 sem->activity = 0;
50 raw_spin_lock_init(&sem->wait_lock);
51 INIT_LIST_HEAD(&sem->wait_list);
52}
53EXPORT_SYMBOL(__init_rwsem);
54
55/*
56 * handle the lock release when processes blocked on it that can now run
57 * - if we come here, then:
58 * - the 'active count' _reached_ zero
59 * - the 'waiting count' is non-zero
60 * - the spinlock must be held by the caller
61 * - woken process blocks are discarded from the list after having task zeroed
62 * - writers are only woken if wakewrite is non-zero
63 */
64static inline struct rw_semaphore *
65__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
66{
67 struct rwsem_waiter *waiter;
68 struct task_struct *tsk;
69 int woken;
70
71 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
72
73 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
74 if (wakewrite)
75 /* Wake up a writer. Note that we do not grant it the
76 * lock - it will have to acquire it when it runs. */
77 wake_up_process(waiter->task);
78 goto out;
79 }
80
81 /* grant an infinite number of read locks to the front of the queue */
82 woken = 0;
83 do {
84 struct list_head *next = waiter->list.next;
85
86 list_del(&waiter->list);
87 tsk = waiter->task;
88 smp_mb();
89 waiter->task = NULL;
90 wake_up_process(tsk);
91 put_task_struct(tsk);
92 woken++;
93 if (next == &sem->wait_list)
94 break;
95 waiter = list_entry(next, struct rwsem_waiter, list);
96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
97
98 sem->activity += woken;
99
100 out:
101 return sem;
102}
103
104/*
105 * wake a single writer
106 */
107static inline struct rw_semaphore *
108__rwsem_wake_one_writer(struct rw_semaphore *sem)
109{
110 struct rwsem_waiter *waiter;
111
112 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
113 wake_up_process(waiter->task);
114
115 return sem;
116}
117
118/*
119 * get a read lock on the semaphore
120 */
121void __sched __down_read(struct rw_semaphore *sem)
122{
123 struct rwsem_waiter waiter;
124 struct task_struct *tsk;
125 unsigned long flags;
126
127 raw_spin_lock_irqsave(&sem->wait_lock, flags);
128
129 if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
130 /* granted */
131 sem->activity++;
132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
133 goto out;
134 }
135
136 tsk = current;
137 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
138
139 /* set up my own style of waitqueue */
140 waiter.task = tsk;
141 waiter.type = RWSEM_WAITING_FOR_READ;
142 get_task_struct(tsk);
143
144 list_add_tail(&waiter.list, &sem->wait_list);
145
146 /* we don't need to touch the semaphore struct anymore */
147 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
148
149 /* wait to be given the lock */
150 for (;;) {
151 if (!waiter.task)
152 break;
153 schedule();
154 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155 }
156
157 tsk->state = TASK_RUNNING;
158 out:
159 ;
160}
161
162/*
163 * trylock for reading -- returns 1 if successful, 0 if contention
164 */
165int __down_read_trylock(struct rw_semaphore *sem)
166{
167 unsigned long flags;
168 int ret = 0;
169
170
171 raw_spin_lock_irqsave(&sem->wait_lock, flags);
172
173 if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
174 /* granted */
175 sem->activity++;
176 ret = 1;
177 }
178
179 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
180
181 return ret;
182}
183
184/*
185 * get a write lock on the semaphore
186 */
187void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
188{
189 struct rwsem_waiter waiter;
190 struct task_struct *tsk;
191 unsigned long flags;
192
193 raw_spin_lock_irqsave(&sem->wait_lock, flags);
194
195 /* set up my own style of waitqueue */
196 tsk = current;
197 waiter.task = tsk;
198 waiter.type = RWSEM_WAITING_FOR_WRITE;
199 list_add_tail(&waiter.list, &sem->wait_list);
200
201 /* wait for someone to release the lock */
202 for (;;) {
203 /*
204 * This is the key to supporting write lock stealing: it allows a
205 * task already on a CPU to take the lock soon, rather than putting
206 * itself to sleep and waiting for the system (or another task at
207 * the head of the wait list) to wake it up.
208 */
209 if (sem->activity == 0)
210 break;
211 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
213 schedule();
214 raw_spin_lock_irqsave(&sem->wait_lock, flags);
215 }
216 /* got the lock */
217 sem->activity = -1;
218 list_del(&waiter.list);
219
220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
221}
222
223void __sched __down_write(struct rw_semaphore *sem)
224{
225 __down_write_nested(sem, 0);
226}
227
228/*
229 * trylock for writing -- returns 1 if successful, 0 if contention
230 */
231int __down_write_trylock(struct rw_semaphore *sem)
232{
233 unsigned long flags;
234 int ret = 0;
235
236 raw_spin_lock_irqsave(&sem->wait_lock, flags);
237
238 if (sem->activity == 0) {
239 /* got the lock */
240 sem->activity = -1;
241 ret = 1;
242 }
243
244 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
245
246 return ret;
247}
248
249/*
250 * release a read lock on the semaphore
251 */
252void __up_read(struct rw_semaphore *sem)
253{
254 unsigned long flags;
255
256 raw_spin_lock_irqsave(&sem->wait_lock, flags);
257
258 if (--sem->activity == 0 && !list_empty(&sem->wait_list))
259 sem = __rwsem_wake_one_writer(sem);
260
261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
262}
263
264/*
265 * release a write lock on the semaphore
266 */
267void __up_write(struct rw_semaphore *sem)
268{
269 unsigned long flags;
270
271 raw_spin_lock_irqsave(&sem->wait_lock, flags);
272
273 sem->activity = 0;
274 if (!list_empty(&sem->wait_list))
275 sem = __rwsem_do_wake(sem, 1);
276
277 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
278}
279
280/*
281 * downgrade a write lock into a read lock
282 * - just wake up any readers at the front of the queue
283 */
284void __downgrade_write(struct rw_semaphore *sem)
285{
286 unsigned long flags;
287
288 raw_spin_lock_irqsave(&sem->wait_lock, flags);
289
290 sem->activity = 1;
291 if (!list_empty(&sem->wait_list))
292 sem = __rwsem_do_wake(sem, 0);
293
294 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
295}
296
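This file only supplies the slow paths for the spinlock-based rw_semaphore; callers use the public wrappers from kernel/locking/rwsem.c (down_read()/up_read(), down_write()/up_write(), downgrade_write()). A brief sketch of that public API, with comments mapping each call to the ->activity values used above; the cfg_* names are illustrative.

#include <linux/rwsem.h>

static DECLARE_RWSEM(cfg_sem);
static int cfg_value;

int cfg_get(void)
{
	int v;

	down_read(&cfg_sem);		/* spinlock variant: sem->activity++ */
	v = cfg_value;
	up_read(&cfg_sem);		/* last reader out wakes one queued writer */
	return v;
}

void cfg_set_and_publish(int v)
{
	down_write(&cfg_sem);		/* sem->activity = -1: exclusive */
	cfg_value = v;
	downgrade_write(&cfg_sem);	/* sem->activity = 1, wakes only queued readers */
	/* readers may now run concurrently with anything done past this point */
	up_read(&cfg_sem);
}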
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
new file mode 100644
index 000000000000..19c5fa95e0b4
--- /dev/null
+++ b/kernel/locking/rwsem-xadd.c
@@ -0,0 +1,293 @@
1/* rwsem.c: R/W semaphores: contention handling functions
2 *
3 * Written by David Howells (dhowells@redhat.com).
4 * Derived from arch/i386/kernel/semaphore.c
5 *
6 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
7 * and Michel Lespinasse <walken@google.com>
8 */
9#include <linux/rwsem.h>
10#include <linux/sched.h>
11#include <linux/init.h>
12#include <linux/export.h>
13
14/*
15 * Initialize an rwsem:
16 */
17void __init_rwsem(struct rw_semaphore *sem, const char *name,
18 struct lock_class_key *key)
19{
20#ifdef CONFIG_DEBUG_LOCK_ALLOC
21 /*
22 * Make sure we are not reinitializing a held semaphore:
23 */
24 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
25 lockdep_init_map(&sem->dep_map, name, key, 0);
26#endif
27 sem->count = RWSEM_UNLOCKED_VALUE;
28 raw_spin_lock_init(&sem->wait_lock);
29 INIT_LIST_HEAD(&sem->wait_list);
30}
31
32EXPORT_SYMBOL(__init_rwsem);
33
34enum rwsem_waiter_type {
35 RWSEM_WAITING_FOR_WRITE,
36 RWSEM_WAITING_FOR_READ
37};
38
39struct rwsem_waiter {
40 struct list_head list;
41 struct task_struct *task;
42 enum rwsem_waiter_type type;
43};
44
45enum rwsem_wake_type {
46 RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
47 RWSEM_WAKE_READERS, /* Wake readers only */
48 RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
49};
50
51/*
52 * handle the lock release when there are processes blocked on it that can now run
53 * - if we come here from up_xxxx(), then:
54 * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
55 * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
56 * - there must be someone on the queue
57 * - the spinlock must be held by the caller
58 * - woken process blocks are discarded from the list after having task zeroed
59 * - writers are only woken if wake_type is RWSEM_WAKE_ANY (i.e. not downgrading)
60 */
61static struct rw_semaphore *
62__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
63{
64 struct rwsem_waiter *waiter;
65 struct task_struct *tsk;
66 struct list_head *next;
67 long oldcount, woken, loop, adjustment;
68
69 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
70 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
71 if (wake_type == RWSEM_WAKE_ANY)
72 /* Wake writer at the front of the queue, but do not
73 * grant it the lock yet as we want other writers
74 * to be able to steal it. Readers, on the other hand,
75 * will block as they will notice the queued writer.
76 */
77 wake_up_process(waiter->task);
78 goto out;
79 }
80
81 /* Writers might steal the lock before we grant it to the next reader.
82 * We prefer to do the first reader grant before counting readers
83 * so we can bail out early if a writer stole the lock.
84 */
85 adjustment = 0;
86 if (wake_type != RWSEM_WAKE_READ_OWNED) {
87 adjustment = RWSEM_ACTIVE_READ_BIAS;
88 try_reader_grant:
89 oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
90 if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
91 /* A writer stole the lock. Undo our reader grant. */
92 if (rwsem_atomic_update(-adjustment, sem) &
93 RWSEM_ACTIVE_MASK)
94 goto out;
95 /* Last active locker left. Retry waking readers. */
96 goto try_reader_grant;
97 }
98 }
99
100 /* Grant an infinite number of read locks to the readers at the front
101 * of the queue. Note we increment the 'active part' of the count by
102 * the number of readers before waking any processes up.
103 */
104 woken = 0;
105 do {
106 woken++;
107
108 if (waiter->list.next == &sem->wait_list)
109 break;
110
111 waiter = list_entry(waiter->list.next,
112 struct rwsem_waiter, list);
113
114 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
115
116 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
117 if (waiter->type != RWSEM_WAITING_FOR_WRITE)
118 /* hit end of list above */
119 adjustment -= RWSEM_WAITING_BIAS;
120
121 if (adjustment)
122 rwsem_atomic_add(adjustment, sem);
123
124 next = sem->wait_list.next;
125 loop = woken;
126 do {
127 waiter = list_entry(next, struct rwsem_waiter, list);
128 next = waiter->list.next;
129 tsk = waiter->task;
130 smp_mb();
131 waiter->task = NULL;
132 wake_up_process(tsk);
133 put_task_struct(tsk);
134 } while (--loop);
135
136 sem->wait_list.next = next;
137 next->prev = &sem->wait_list;
138
139 out:
140 return sem;
141}
142
143/*
144 * wait for the read lock to be granted
145 */
146struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
147{
148 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
149 struct rwsem_waiter waiter;
150 struct task_struct *tsk = current;
151
152 /* set up my own style of waitqueue */
153 waiter.task = tsk;
154 waiter.type = RWSEM_WAITING_FOR_READ;
155 get_task_struct(tsk);
156
157 raw_spin_lock_irq(&sem->wait_lock);
158 if (list_empty(&sem->wait_list))
159 adjustment += RWSEM_WAITING_BIAS;
160 list_add_tail(&waiter.list, &sem->wait_list);
161
162 /* we're now waiting on the lock, but no longer actively locking */
163 count = rwsem_atomic_update(adjustment, sem);
164
165 /* If there are no active locks, wake the front queued process(es).
166 *
167 * If there are no writers and we are first in the queue,
168 * wake our own waiter to join the existing active readers !
169 */
170 if (count == RWSEM_WAITING_BIAS ||
171 (count > RWSEM_WAITING_BIAS &&
172 adjustment != -RWSEM_ACTIVE_READ_BIAS))
173 sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
174
175 raw_spin_unlock_irq(&sem->wait_lock);
176
177 /* wait to be given the lock */
178 while (true) {
179 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
180 if (!waiter.task)
181 break;
182 schedule();
183 }
184
185 tsk->state = TASK_RUNNING;
186
187 return sem;
188}
189
190/*
191 * wait until we successfully acquire the write lock
192 */
193struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
194{
195 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
196 struct rwsem_waiter waiter;
197 struct task_struct *tsk = current;
198
199 /* set up my own style of waitqueue */
200 waiter.task = tsk;
201 waiter.type = RWSEM_WAITING_FOR_WRITE;
202
203 raw_spin_lock_irq(&sem->wait_lock);
204 if (list_empty(&sem->wait_list))
205 adjustment += RWSEM_WAITING_BIAS;
206 list_add_tail(&waiter.list, &sem->wait_list);
207
208 /* we're now waiting on the lock, but no longer actively locking */
209 count = rwsem_atomic_update(adjustment, sem);
210
211 /* If there were already threads queued before us and there are no
212 * active writers, the lock must be read owned; so we try to wake
213 * any read locks that were queued ahead of us. */
214 if (count > RWSEM_WAITING_BIAS &&
215 adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
216 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
217
218 /* wait until we successfully acquire the lock */
219 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
220 while (true) {
221 if (!(count & RWSEM_ACTIVE_MASK)) {
222 /* Try acquiring the write lock. */
223 count = RWSEM_ACTIVE_WRITE_BIAS;
224 if (!list_is_singular(&sem->wait_list))
225 count += RWSEM_WAITING_BIAS;
226
227 if (sem->count == RWSEM_WAITING_BIAS &&
228 cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
229 RWSEM_WAITING_BIAS)
230 break;
231 }
232
233 raw_spin_unlock_irq(&sem->wait_lock);
234
235 /* Block until there are no active lockers. */
236 do {
237 schedule();
238 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
239 } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
240
241 raw_spin_lock_irq(&sem->wait_lock);
242 }
243
244 list_del(&waiter.list);
245 raw_spin_unlock_irq(&sem->wait_lock);
246 tsk->state = TASK_RUNNING;
247
248 return sem;
249}
250
251/*
252 * handle waking up a waiter on the semaphore
253 * - up_read/up_write has decremented the active part of count if we come here
254 */
255struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
256{
257 unsigned long flags;
258
259 raw_spin_lock_irqsave(&sem->wait_lock, flags);
260
261 /* do nothing if list empty */
262 if (!list_empty(&sem->wait_list))
263 sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
264
265 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
266
267 return sem;
268}
269
270/*
271 * downgrade a write lock into a read lock
272 * - caller incremented waiting part of count and discovered it still negative
273 * - just wake up any readers at the front of the queue
274 */
275struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
276{
277 unsigned long flags;
278
279 raw_spin_lock_irqsave(&sem->wait_lock, flags);
280
281 /* do nothing if list empty */
282 if (!list_empty(&sem->wait_list))
283 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
284
285 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
286
287 return sem;
288}
289
290EXPORT_SYMBOL(rwsem_down_read_failed);
291EXPORT_SYMBOL(rwsem_down_write_failed);
292EXPORT_SYMBOL(rwsem_wake);
293EXPORT_SYMBOL(rwsem_downgrade_wake);
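The xadd implementation packs the active and waiting lockers into a single count word; the masks quoted in the comment above correspond to the generic 32-bit layout, and 64-bit kernels use the same scheme with a wider active mask. The standalone snippet below is an illustration only, using bias constants matching that generic 32-bit layout (the exact values are architecture-specific), and prints a few representative count states.

/* Illustration only: generic 32-bit rwsem count layout assumed by the
 * "&0x0000ffff" / "&0xffff0000" comment above. */
#include <stdio.h>

#define RWSEM_UNLOCKED_VALUE	0x00000000L
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_ACTIVE_MASK	0x0000ffffL
#define RWSEM_WAITING_BIAS	(-0x00010000L)
#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	long count;

	count = RWSEM_UNLOCKED_VALUE + 2 * RWSEM_ACTIVE_READ_BIAS;
	printf("two readers:            %#010lx\n", count & 0xffffffffL);

	count = RWSEM_ACTIVE_WRITE_BIAS;		/* one writer, nobody queued */
	printf("one writer:             %#010lx\n", count & 0xffffffffL);

	count += RWSEM_WAITING_BIAS;			/* a waiter queues behind it */
	printf("writer + queued waiter: %#010lx\n", count & 0xffffffffL);

	/* rwsem_down_write_failed() only tries to steal the lock once the
	 * active part is zero, i.e. (count & RWSEM_ACTIVE_MASK) == 0 */
	printf("active part:            %#06lx\n", count & RWSEM_ACTIVE_MASK);
	return 0;
}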
diff --git a/kernel/rwsem.c b/kernel/locking/rwsem.c
index cfff1435bdfb..cfff1435bdfb 100644
--- a/kernel/rwsem.c
+++ b/kernel/locking/rwsem.c
diff --git a/kernel/semaphore.c b/kernel/locking/semaphore.c
index 6815171a4fff..6815171a4fff 100644
--- a/kernel/semaphore.c
+++ b/kernel/locking/semaphore.c
diff --git a/kernel/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..4b082b5cac9e 100644
--- a/kernel/spinlock.c
+++ b/kernel/locking/spinlock.c
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
new file mode 100644
index 000000000000..0374a596cffa
--- /dev/null
+++ b/kernel/locking/spinlock_debug.c
@@ -0,0 +1,302 @@
1/*
2 * Copyright 2005, Red Hat, Inc., Ingo Molnar
3 * Released under the General Public License (GPL).
4 *
5 * This file contains the spinlock/rwlock implementations for
6 * DEBUG_SPINLOCK.
7 */
8
9#include <linux/spinlock.h>
10#include <linux/nmi.h>
11#include <linux/interrupt.h>
12#include <linux/debug_locks.h>
13#include <linux/delay.h>
14#include <linux/export.h>
15
16void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
17 struct lock_class_key *key)
18{
19#ifdef CONFIG_DEBUG_LOCK_ALLOC
20 /*
21 * Make sure we are not reinitializing a held lock:
22 */
23 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
24 lockdep_init_map(&lock->dep_map, name, key, 0);
25#endif
26 lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
27 lock->magic = SPINLOCK_MAGIC;
28 lock->owner = SPINLOCK_OWNER_INIT;
29 lock->owner_cpu = -1;
30}
31
32EXPORT_SYMBOL(__raw_spin_lock_init);
33
34void __rwlock_init(rwlock_t *lock, const char *name,
35 struct lock_class_key *key)
36{
37#ifdef CONFIG_DEBUG_LOCK_ALLOC
38 /*
39 * Make sure we are not reinitializing a held lock:
40 */
41 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
42 lockdep_init_map(&lock->dep_map, name, key, 0);
43#endif
44 lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED;
45 lock->magic = RWLOCK_MAGIC;
46 lock->owner = SPINLOCK_OWNER_INIT;
47 lock->owner_cpu = -1;
48}
49
50EXPORT_SYMBOL(__rwlock_init);
51
52static void spin_dump(raw_spinlock_t *lock, const char *msg)
53{
54 struct task_struct *owner = NULL;
55
56 if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
57 owner = lock->owner;
58 printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
59 msg, raw_smp_processor_id(),
60 current->comm, task_pid_nr(current));
61 printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
62 ".owner_cpu: %d\n",
63 lock, lock->magic,
64 owner ? owner->comm : "<none>",
65 owner ? task_pid_nr(owner) : -1,
66 lock->owner_cpu);
67 dump_stack();
68}
69
70static void spin_bug(raw_spinlock_t *lock, const char *msg)
71{
72 if (!debug_locks_off())
73 return;
74
75 spin_dump(lock, msg);
76}
77
78#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg)
79
80static inline void
81debug_spin_lock_before(raw_spinlock_t *lock)
82{
83 SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
84 SPIN_BUG_ON(lock->owner == current, lock, "recursion");
85 SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
86 lock, "cpu recursion");
87}
88
89static inline void debug_spin_lock_after(raw_spinlock_t *lock)
90{
91 lock->owner_cpu = raw_smp_processor_id();
92 lock->owner = current;
93}
94
95static inline void debug_spin_unlock(raw_spinlock_t *lock)
96{
97 SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
98 SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked");
99 SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
100 SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
101 lock, "wrong CPU");
102 lock->owner = SPINLOCK_OWNER_INIT;
103 lock->owner_cpu = -1;
104}
105
106static void __spin_lock_debug(raw_spinlock_t *lock)
107{
108 u64 i;
109 u64 loops = loops_per_jiffy * HZ;
110
111 for (i = 0; i < loops; i++) {
112 if (arch_spin_trylock(&lock->raw_lock))
113 return;
114 __delay(1);
115 }
116 /* lockup suspected: */
117 spin_dump(lock, "lockup suspected");
118#ifdef CONFIG_SMP
119 trigger_all_cpu_backtrace();
120#endif
121
122 /*
123 * The trylock above was causing a livelock. Give the lower level arch
124 * specific lock code a chance to acquire the lock. We have already
125 * printed a warning/backtrace at this point. The non-debug arch
126 * specific code might actually succeed in acquiring the lock. If it is
127 * not successful, the end-result is the same - there is no forward
128 * progress.
129 */
130 arch_spin_lock(&lock->raw_lock);
131}
132
133void do_raw_spin_lock(raw_spinlock_t *lock)
134{
135 debug_spin_lock_before(lock);
136 if (unlikely(!arch_spin_trylock(&lock->raw_lock)))
137 __spin_lock_debug(lock);
138 debug_spin_lock_after(lock);
139}
140
141int do_raw_spin_trylock(raw_spinlock_t *lock)
142{
143 int ret = arch_spin_trylock(&lock->raw_lock);
144
145 if (ret)
146 debug_spin_lock_after(lock);
147#ifndef CONFIG_SMP
148 /*
149 * Must not happen on UP:
150 */
151 SPIN_BUG_ON(!ret, lock, "trylock failure on UP");
152#endif
153 return ret;
154}
155
156void do_raw_spin_unlock(raw_spinlock_t *lock)
157{
158 debug_spin_unlock(lock);
159 arch_spin_unlock(&lock->raw_lock);
160}
161
162static void rwlock_bug(rwlock_t *lock, const char *msg)
163{
164 if (!debug_locks_off())
165 return;
166
167 printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n",
168 msg, raw_smp_processor_id(), current->comm,
169 task_pid_nr(current), lock);
170 dump_stack();
171}
172
173#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
174
175#if 0 /* __write_lock_debug() can lock up - maybe this can too? */
176static void __read_lock_debug(rwlock_t *lock)
177{
178 u64 i;
179 u64 loops = loops_per_jiffy * HZ;
180 int print_once = 1;
181
182 for (;;) {
183 for (i = 0; i < loops; i++) {
184 if (arch_read_trylock(&lock->raw_lock))
185 return;
186 __delay(1);
187 }
188 /* lockup suspected: */
189 if (print_once) {
190 print_once = 0;
191 printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, "
192 "%s/%d, %p\n",
193 raw_smp_processor_id(), current->comm,
194 current->pid, lock);
195 dump_stack();
196 }
197 }
198}
199#endif
200
201void do_raw_read_lock(rwlock_t *lock)
202{
203 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
204 arch_read_lock(&lock->raw_lock);
205}
206
207int do_raw_read_trylock(rwlock_t *lock)
208{
209 int ret = arch_read_trylock(&lock->raw_lock);
210
211#ifndef CONFIG_SMP
212 /*
213 * Must not happen on UP:
214 */
215 RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
216#endif
217 return ret;
218}
219
220void do_raw_read_unlock(rwlock_t *lock)
221{
222 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
223 arch_read_unlock(&lock->raw_lock);
224}
225
226static inline void debug_write_lock_before(rwlock_t *lock)
227{
228 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
229 RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
230 RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
231 lock, "cpu recursion");
232}
233
234static inline void debug_write_lock_after(rwlock_t *lock)
235{
236 lock->owner_cpu = raw_smp_processor_id();
237 lock->owner = current;
238}
239
240static inline void debug_write_unlock(rwlock_t *lock)
241{
242 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
243 RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
244 RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
245 lock, "wrong CPU");
246 lock->owner = SPINLOCK_OWNER_INIT;
247 lock->owner_cpu = -1;
248}
249
250#if 0 /* This can cause lockups */
251static void __write_lock_debug(rwlock_t *lock)
252{
253 u64 i;
254 u64 loops = loops_per_jiffy * HZ;
255 int print_once = 1;
256
257 for (;;) {
258 for (i = 0; i < loops; i++) {
259 if (arch_write_trylock(&lock->raw_lock))
260 return;
261 __delay(1);
262 }
263 /* lockup suspected: */
264 if (print_once) {
265 print_once = 0;
266 printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, "
267 "%s/%d, %p\n",
268 raw_smp_processor_id(), current->comm,
269 current->pid, lock);
270 dump_stack();
271 }
272 }
273}
274#endif
275
276void do_raw_write_lock(rwlock_t *lock)
277{
278 debug_write_lock_before(lock);
279 arch_write_lock(&lock->raw_lock);
280 debug_write_lock_after(lock);
281}
282
283int do_raw_write_trylock(rwlock_t *lock)
284{
285 int ret = arch_write_trylock(&lock->raw_lock);
286
287 if (ret)
288 debug_write_lock_after(lock);
289#ifndef CONFIG_SMP
290 /*
291 * Must not happen on UP:
292 */
293 RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
294#endif
295 return ret;
296}
297
298void do_raw_write_unlock(rwlock_t *lock)
299{
300 debug_write_unlock(lock);
301 arch_write_unlock(&lock->raw_lock);
302}
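The debug checks above come down to tracking an owner task and owner CPU next to the raw lock and complaining on recursion, double unlock, or unlock from the wrong context. A rough userspace analogue of that idea using pthreads follows; it is an illustration, not the kernel code, and all names in it are made up.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct dbg_lock {
	pthread_mutex_t raw;
	pthread_t owner;
	int owned;
};

static void dbg_lock_init(struct dbg_lock *l)
{
	pthread_mutex_init(&l->raw, NULL);
	l->owned = 0;
}

static void dbg_lock(struct dbg_lock *l)
{
	/* "recursion": the calling thread already owns the lock */
	assert(!(l->owned && pthread_equal(l->owner, pthread_self())));
	pthread_mutex_lock(&l->raw);
	l->owner = pthread_self();
	l->owned = 1;
}

static void dbg_unlock(struct dbg_lock *l)
{
	assert(l->owned);					/* "already unlocked" */
	assert(pthread_equal(l->owner, pthread_self()));	/* "wrong owner"     */
	l->owned = 0;
	pthread_mutex_unlock(&l->raw);
}

int main(void)
{
	struct dbg_lock l;

	dbg_lock_init(&l);
	dbg_lock(&l);
	dbg_unlock(&l);
	puts("ok");
	return 0;
}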
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
deleted file mode 100644
index 4a9a86d12c8b..000000000000
--- a/kernel/modsign_certificate.S
+++ /dev/null
@@ -1,12 +0,0 @@
1#include <linux/export.h>
2
3#define GLOBAL(name) \
4 .globl VMLINUX_SYMBOL(name); \
5 VMLINUX_SYMBOL(name):
6
7 .section ".init.data","aw"
8
9GLOBAL(modsign_certificate_list)
10 .incbin "signing_key.x509"
11 .incbin "extra_certificates"
12GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
deleted file mode 100644
index 7cbd4507a7e6..000000000000
--- a/kernel/modsign_pubkey.c
+++ /dev/null
@@ -1,104 +0,0 @@
1/* Public keys for module signature verification
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/cred.h>
15#include <linux/err.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19struct key *modsign_keyring;
20
21extern __initconst const u8 modsign_certificate_list[];
22extern __initconst const u8 modsign_certificate_list_end[];
23
24/*
25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
26 * if modsign.pub changes.
27 */
28static __initconst const char annoy_ccache[] = __TIME__ "foo";
29
30/*
31 * Load the compiled-in keys
32 */
33static __init int module_verify_init(void)
34{
35 pr_notice("Initialise module verification\n");
36
37 modsign_keyring = keyring_alloc(".module_sign",
38 KUIDT_INIT(0), KGIDT_INIT(0),
39 current_cred(),
40 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
41 KEY_USR_VIEW | KEY_USR_READ),
42 KEY_ALLOC_NOT_IN_QUOTA, NULL);
43 if (IS_ERR(modsign_keyring))
44 panic("Can't allocate module signing keyring\n");
45
46 return 0;
47}
48
49/*
50 * Must be initialised before we try and load the keys into the keyring.
51 */
52device_initcall(module_verify_init);
53
54/*
55 * Load the compiled-in keys
56 */
57static __init int load_module_signing_keys(void)
58{
59 key_ref_t key;
60 const u8 *p, *end;
61 size_t plen;
62
63 pr_notice("Loading module verification certificates\n");
64
65 end = modsign_certificate_list_end;
66 p = modsign_certificate_list;
67 while (p < end) {
68 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
69 * than 256 bytes in size.
70 */
71 if (end - p < 4)
72 goto dodgy_cert;
73 if (p[0] != 0x30 &&
74 p[1] != 0x82)
75 goto dodgy_cert;
76 plen = (p[2] << 8) | p[3];
77 plen += 4;
78 if (plen > end - p)
79 goto dodgy_cert;
80
81 key = key_create_or_update(make_key_ref(modsign_keyring, 1),
82 "asymmetric",
83 NULL,
84 p,
85 plen,
86 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
87 KEY_USR_VIEW,
88 KEY_ALLOC_NOT_IN_QUOTA);
89 if (IS_ERR(key))
90 pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
91 PTR_ERR(key));
92 else
93 pr_notice("MODSIGN: Loaded cert '%s'\n",
94 key_ref_to_ptr(key)->description);
95 p += plen;
96 }
97
98 return 0;
99
100dodgy_cert:
101 pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
102 return 0;
103}
104late_initcall(load_module_signing_keys);
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
index 24f9247b7d02..915e123a430f 100644
--- a/kernel/module-internal.h
+++ b/kernel/module-internal.h
@@ -9,6 +9,4 @@
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12extern struct key *modsign_keyring;
13
14extern int mod_verify_sig(const void *mod, unsigned long *_modlen); 12extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index dc582749fa13..f5a3b1e8ec51 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -378,23 +378,21 @@ static bool check_symbol(const struct symsearch *syms,
378 if (syms->licence == GPL_ONLY) 378 if (syms->licence == GPL_ONLY)
379 return false; 379 return false;
380 if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { 380 if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
381 printk(KERN_WARNING "Symbol %s is being used " 381 pr_warn("Symbol %s is being used by a non-GPL module, "
382 "by a non-GPL module, which will not " 382 "which will not be allowed in the future\n",
383 "be allowed in the future\n", fsa->name); 383 fsa->name);
384 } 384 }
385 } 385 }
386 386
387#ifdef CONFIG_UNUSED_SYMBOLS 387#ifdef CONFIG_UNUSED_SYMBOLS
388 if (syms->unused && fsa->warn) { 388 if (syms->unused && fsa->warn) {
389 printk(KERN_WARNING "Symbol %s is marked as UNUSED, " 389 pr_warn("Symbol %s is marked as UNUSED, however this module is "
390 "however this module is using it.\n", fsa->name); 390 "using it.\n", fsa->name);
391 printk(KERN_WARNING 391 pr_warn("This symbol will go away in the future.\n");
392 "This symbol will go away in the future.\n"); 392 pr_warn("Please evalute if this is the right api to use and if "
393 printk(KERN_WARNING 393 "it really is, submit a report the linux kernel "
394 "Please evalute if this is the right api to use and if " 394 "mailinglist together with submitting your code for "
395 "it really is, submit a report the linux kernel " 395 "inclusion.\n");
396 "mailinglist together with submitting your code for "
397 "inclusion.\n");
398 } 396 }
399#endif 397#endif
400 398
@@ -492,16 +490,15 @@ static int percpu_modalloc(struct module *mod, struct load_info *info)
492 return 0; 490 return 0;
493 491
494 if (align > PAGE_SIZE) { 492 if (align > PAGE_SIZE) {
495 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 493 pr_warn("%s: per-cpu alignment %li > %li\n",
496 mod->name, align, PAGE_SIZE); 494 mod->name, align, PAGE_SIZE);
497 align = PAGE_SIZE; 495 align = PAGE_SIZE;
498 } 496 }
499 497
500 mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); 498 mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
501 if (!mod->percpu) { 499 if (!mod->percpu) {
502 printk(KERN_WARNING 500 pr_warn("%s: Could not allocate %lu bytes percpu data\n",
503 "%s: Could not allocate %lu bytes percpu data\n", 501 mod->name, (unsigned long)pcpusec->sh_size);
504 mod->name, (unsigned long)pcpusec->sh_size);
505 return -ENOMEM; 502 return -ENOMEM;
506 } 503 }
507 mod->percpu_size = pcpusec->sh_size; 504 mod->percpu_size = pcpusec->sh_size;
@@ -644,8 +641,6 @@ static int module_unload_init(struct module *mod)
644 641
645 /* Hold reference count during initialization. */ 642 /* Hold reference count during initialization. */
646 __this_cpu_write(mod->refptr->incs, 1); 643 __this_cpu_write(mod->refptr->incs, 1);
647 /* Backwards compatibility macros put refcount during init. */
648 mod->waiter = current;
649 644
650 return 0; 645 return 0;
651} 646}
@@ -679,7 +674,7 @@ static int add_module_usage(struct module *a, struct module *b)
679 pr_debug("Allocating new usage for %s.\n", a->name); 674 pr_debug("Allocating new usage for %s.\n", a->name);
680 use = kmalloc(sizeof(*use), GFP_ATOMIC); 675 use = kmalloc(sizeof(*use), GFP_ATOMIC);
681 if (!use) { 676 if (!use) {
682 printk(KERN_WARNING "%s: out of memory loading\n", a->name); 677 pr_warn("%s: out of memory loading\n", a->name);
683 return -ENOMEM; 678 return -ENOMEM;
684 } 679 }
685 680
@@ -771,16 +766,9 @@ static int __try_stop_module(void *_sref)
771 766
772static int try_stop_module(struct module *mod, int flags, int *forced) 767static int try_stop_module(struct module *mod, int flags, int *forced)
773{ 768{
774 if (flags & O_NONBLOCK) { 769 struct stopref sref = { mod, flags, forced };
775 struct stopref sref = { mod, flags, forced };
776 770
777 return stop_machine(__try_stop_module, &sref, NULL); 771 return stop_machine(__try_stop_module, &sref, NULL);
778 } else {
779 /* We don't need to stop the machine for this. */
780 mod->state = MODULE_STATE_GOING;
781 synchronize_sched();
782 return 0;
783 }
784} 772}
785 773
786unsigned long module_refcount(struct module *mod) 774unsigned long module_refcount(struct module *mod)
@@ -813,21 +801,6 @@ EXPORT_SYMBOL(module_refcount);
813/* This exists whether we can unload or not */ 801/* This exists whether we can unload or not */
814static void free_module(struct module *mod); 802static void free_module(struct module *mod);
815 803
816static void wait_for_zero_refcount(struct module *mod)
817{
818 /* Since we might sleep for some time, release the mutex first */
819 mutex_unlock(&module_mutex);
820 for (;;) {
821 pr_debug("Looking at refcount...\n");
822 set_current_state(TASK_UNINTERRUPTIBLE);
823 if (module_refcount(mod) == 0)
824 break;
825 schedule();
826 }
827 current->state = TASK_RUNNING;
828 mutex_lock(&module_mutex);
829}
830
831SYSCALL_DEFINE2(delete_module, const char __user *, name_user, 804SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
832 unsigned int, flags) 805 unsigned int, flags)
833{ 806{
@@ -842,6 +815,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
842 return -EFAULT; 815 return -EFAULT;
843 name[MODULE_NAME_LEN-1] = '\0'; 816 name[MODULE_NAME_LEN-1] = '\0';
844 817
818 if (!(flags & O_NONBLOCK)) {
819 printk(KERN_WARNING
820 "waiting module removal not supported: please upgrade");
821 }
822
845 if (mutex_lock_interruptible(&module_mutex) != 0) 823 if (mutex_lock_interruptible(&module_mutex) != 0)
846 return -EINTR; 824 return -EINTR;
847 825
@@ -859,8 +837,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
859 837
860 /* Doing init or already dying? */ 838 /* Doing init or already dying? */
861 if (mod->state != MODULE_STATE_LIVE) { 839 if (mod->state != MODULE_STATE_LIVE) {
862 /* FIXME: if (force), slam module count and wake up 840 /* FIXME: if (force), slam module count damn the torpedoes */
863 waiter --RR */
864 pr_debug("%s already dying\n", mod->name); 841 pr_debug("%s already dying\n", mod->name);
865 ret = -EBUSY; 842 ret = -EBUSY;
866 goto out; 843 goto out;
@@ -876,18 +853,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
876 } 853 }
877 } 854 }
878 855
879 /* Set this up before setting mod->state */
880 mod->waiter = current;
881
882 /* Stop the machine so refcounts can't move and disable module. */ 856 /* Stop the machine so refcounts can't move and disable module. */
883 ret = try_stop_module(mod, flags, &forced); 857 ret = try_stop_module(mod, flags, &forced);
884 if (ret != 0) 858 if (ret != 0)
885 goto out; 859 goto out;
886 860
887 /* Never wait if forced. */
888 if (!forced && module_refcount(mod) != 0)
889 wait_for_zero_refcount(mod);
890
891 mutex_unlock(&module_mutex); 861 mutex_unlock(&module_mutex);
892 /* Final destruction now no one is using it. */ 862 /* Final destruction now no one is using it. */
893 if (mod->exit != NULL) 863 if (mod->exit != NULL)
@@ -1005,9 +975,6 @@ void module_put(struct module *module)
1005 __this_cpu_inc(module->refptr->decs); 975 __this_cpu_inc(module->refptr->decs);
1006 976
1007 trace_module_put(module, _RET_IP_); 977 trace_module_put(module, _RET_IP_);
1008 /* Maybe they're waiting for us to drop reference? */
1009 if (unlikely(!module_is_live(module)))
1010 wake_up_process(module->waiter);
1011 preempt_enable(); 978 preempt_enable();
1012 } 979 }
1013} 980}
@@ -1145,8 +1112,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
1145{ 1112{
1146#ifdef CONFIG_MODULE_FORCE_LOAD 1113#ifdef CONFIG_MODULE_FORCE_LOAD
1147 if (!test_taint(TAINT_FORCED_MODULE)) 1114 if (!test_taint(TAINT_FORCED_MODULE))
1148 printk(KERN_WARNING "%s: %s: kernel tainted.\n", 1115 pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
1149 mod->name, reason);
1150 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); 1116 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
1151 return 0; 1117 return 0;
1152#else 1118#else
@@ -1199,8 +1165,7 @@ static int check_version(Elf_Shdr *sechdrs,
1199 goto bad_version; 1165 goto bad_version;
1200 } 1166 }
1201 1167
1202 printk(KERN_WARNING "%s: no symbol version for %s\n", 1168 pr_warn("%s: no symbol version for %s\n", mod->name, symname);
1203 mod->name, symname);
1204 return 0; 1169 return 0;
1205 1170
1206bad_version: 1171bad_version:
@@ -1309,8 +1274,8 @@ resolve_symbol_wait(struct module *mod,
1309 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) 1274 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
1310 || PTR_ERR(ksym) != -EBUSY, 1275 || PTR_ERR(ksym) != -EBUSY,
1311 30 * HZ) <= 0) { 1276 30 * HZ) <= 0) {
1312 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", 1277 pr_warn("%s: gave up waiting for init of module %s.\n",
1313 mod->name, owner); 1278 mod->name, owner);
1314 } 1279 }
1315 return ksym; 1280 return ksym;
1316} 1281}
@@ -1626,15 +1591,14 @@ static int mod_sysfs_init(struct module *mod)
1626 struct kobject *kobj; 1591 struct kobject *kobj;
1627 1592
1628 if (!module_sysfs_initialized) { 1593 if (!module_sysfs_initialized) {
1629 printk(KERN_ERR "%s: module sysfs not initialized\n", 1594 pr_err("%s: module sysfs not initialized\n", mod->name);
1630 mod->name);
1631 err = -EINVAL; 1595 err = -EINVAL;
1632 goto out; 1596 goto out;
1633 } 1597 }
1634 1598
1635 kobj = kset_find_obj(module_kset, mod->name); 1599 kobj = kset_find_obj(module_kset, mod->name);
1636 if (kobj) { 1600 if (kobj) {
1637 printk(KERN_ERR "%s: module is already loaded\n", mod->name); 1601 pr_err("%s: module is already loaded\n", mod->name);
1638 kobject_put(kobj); 1602 kobject_put(kobj);
1639 err = -EINVAL; 1603 err = -EINVAL;
1640 goto out; 1604 goto out;
@@ -1961,8 +1925,7 @@ static int verify_export_symbols(struct module *mod)
1961 for (i = 0; i < ARRAY_SIZE(arr); i++) { 1925 for (i = 0; i < ARRAY_SIZE(arr); i++) {
1962 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { 1926 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
1963 if (find_symbol(s->name, &owner, NULL, true, false)) { 1927 if (find_symbol(s->name, &owner, NULL, true, false)) {
1964 printk(KERN_ERR 1928 pr_err("%s: exports duplicate symbol %s"
1965 "%s: exports duplicate symbol %s"
1966 " (owned by %s)\n", 1929 " (owned by %s)\n",
1967 mod->name, s->name, module_name(owner)); 1930 mod->name, s->name, module_name(owner));
1968 return -ENOEXEC; 1931 return -ENOEXEC;
@@ -2013,8 +1976,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
2013 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1976 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
2014 break; 1977 break;
2015 1978
2016 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", 1979 pr_warn("%s: Unknown symbol %s (err %li)\n",
2017 mod->name, name, PTR_ERR(ksym)); 1980 mod->name, name, PTR_ERR(ksym));
2018 ret = PTR_ERR(ksym) ?: -ENOENT; 1981 ret = PTR_ERR(ksym) ?: -ENOENT;
2019 break; 1982 break;
2020 1983
@@ -2168,8 +2131,8 @@ static void set_license(struct module *mod, const char *license)
2168 2131
2169 if (!license_is_gpl_compatible(license)) { 2132 if (!license_is_gpl_compatible(license)) {
2170 if (!test_taint(TAINT_PROPRIETARY_MODULE)) 2133 if (!test_taint(TAINT_PROPRIETARY_MODULE))
2171 printk(KERN_WARNING "%s: module license '%s' taints " 2134 pr_warn("%s: module license '%s' taints kernel.\n",
2172 "kernel.\n", mod->name, license); 2135 mod->name, license);
2173 add_taint_module(mod, TAINT_PROPRIETARY_MODULE, 2136 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2174 LOCKDEP_NOW_UNRELIABLE); 2137 LOCKDEP_NOW_UNRELIABLE);
2175 } 2138 }
@@ -2405,8 +2368,8 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
2405 return; 2368 return;
2406#ifdef CONFIG_DYNAMIC_DEBUG 2369#ifdef CONFIG_DYNAMIC_DEBUG
2407 if (ddebug_add_module(debug, num, debug->modname)) 2370 if (ddebug_add_module(debug, num, debug->modname))
2408 printk(KERN_ERR "dynamic debug error adding module: %s\n", 2371 pr_err("dynamic debug error adding module: %s\n",
2409 debug->modname); 2372 debug->modname);
2410#endif 2373#endif
2411} 2374}
2412 2375
@@ -2619,8 +2582,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
2619 Elf_Shdr *shdr = &info->sechdrs[i]; 2582 Elf_Shdr *shdr = &info->sechdrs[i];
2620 if (shdr->sh_type != SHT_NOBITS 2583 if (shdr->sh_type != SHT_NOBITS
2621 && info->len < shdr->sh_offset + shdr->sh_size) { 2584 && info->len < shdr->sh_offset + shdr->sh_size) {
2622 printk(KERN_ERR "Module len %lu truncated\n", 2585 pr_err("Module len %lu truncated\n", info->len);
2623 info->len);
2624 return -ENOEXEC; 2586 return -ENOEXEC;
2625 } 2587 }
2626 2588
@@ -2682,15 +2644,14 @@ static struct module *setup_load_info(struct load_info *info, int flags)
2682 2644
2683 info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); 2645 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2684 if (!info->index.mod) { 2646 if (!info->index.mod) {
2685 printk(KERN_WARNING "No module found in object\n"); 2647 pr_warn("No module found in object\n");
2686 return ERR_PTR(-ENOEXEC); 2648 return ERR_PTR(-ENOEXEC);
2687 } 2649 }
2688 /* This is temporary: point mod into copy of data. */ 2650 /* This is temporary: point mod into copy of data. */
2689 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2651 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2690 2652
2691 if (info->index.sym == 0) { 2653 if (info->index.sym == 0) {
2692 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 2654 pr_warn("%s: module has no symbols (stripped?)\n", mod->name);
2693 mod->name);
2694 return ERR_PTR(-ENOEXEC); 2655 return ERR_PTR(-ENOEXEC);
2695 } 2656 }
2696 2657
@@ -2717,7 +2678,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2717 if (err) 2678 if (err)
2718 return err; 2679 return err;
2719 } else if (!same_magic(modmagic, vermagic, info->index.vers)) { 2680 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2720 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 2681 pr_err("%s: version magic '%s' should be '%s'\n",
2721 mod->name, modmagic, vermagic); 2682 mod->name, modmagic, vermagic);
2722 return -ENOEXEC; 2683 return -ENOEXEC;
2723 } 2684 }
@@ -2727,9 +2688,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2727 2688
2728 if (get_modinfo(info, "staging")) { 2689 if (get_modinfo(info, "staging")) {
2729 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); 2690 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
2730 printk(KERN_WARNING "%s: module is from the staging directory," 2691 pr_warn("%s: module is from the staging directory, the quality "
2731 " the quality is unknown, you have been warned.\n", 2692 "is unknown, you have been warned.\n", mod->name);
2732 mod->name);
2733 } 2693 }
2734 2694
2735 /* Set up license info based on the info section */ 2695 /* Set up license info based on the info section */
@@ -2738,7 +2698,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2738 return 0; 2698 return 0;
2739} 2699}
2740 2700
2741static void find_module_sections(struct module *mod, struct load_info *info) 2701static int find_module_sections(struct module *mod, struct load_info *info)
2742{ 2702{
2743 mod->kp = section_objs(info, "__param", 2703 mod->kp = section_objs(info, "__param",
2744 sizeof(*mod->kp), &mod->num_kp); 2704 sizeof(*mod->kp), &mod->num_kp);
@@ -2768,6 +2728,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2768#ifdef CONFIG_CONSTRUCTORS 2728#ifdef CONFIG_CONSTRUCTORS
2769 mod->ctors = section_objs(info, ".ctors", 2729 mod->ctors = section_objs(info, ".ctors",
2770 sizeof(*mod->ctors), &mod->num_ctors); 2730 sizeof(*mod->ctors), &mod->num_ctors);
2731 if (!mod->ctors)
2732 mod->ctors = section_objs(info, ".init_array",
2733 sizeof(*mod->ctors), &mod->num_ctors);
2734 else if (find_sec(info, ".init_array")) {
2735 /*
2736 * This shouldn't happen with same compiler and binutils
2737 * building all parts of the module.
2738 */
2739 printk(KERN_WARNING "%s: has both .ctors and .init_array.\n",
2740 mod->name);
2741 return -EINVAL;
2742 }
2771#endif 2743#endif
2772 2744
2773#ifdef CONFIG_TRACEPOINTS 2745#ifdef CONFIG_TRACEPOINTS
@@ -2801,11 +2773,12 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2801 sizeof(*mod->extable), &mod->num_exentries); 2773 sizeof(*mod->extable), &mod->num_exentries);
2802 2774
2803 if (section_addr(info, "__obsparm")) 2775 if (section_addr(info, "__obsparm"))
2804 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2776 pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
2805 mod->name);
2806 2777
2807 info->debug = section_objs(info, "__verbose", 2778 info->debug = section_objs(info, "__verbose",
2808 sizeof(*info->debug), &info->num_debug); 2779 sizeof(*info->debug), &info->num_debug);
2780
2781 return 0;
2809} 2782}
2810 2783
2811static int move_module(struct module *mod, struct load_info *info) 2784static int move_module(struct module *mod, struct load_info *info)
@@ -3078,11 +3051,10 @@ static int do_init_module(struct module *mod)
3078 return ret; 3051 return ret;
3079 } 3052 }
3080 if (ret > 0) { 3053 if (ret > 0) {
3081 printk(KERN_WARNING 3054 pr_warn("%s: '%s'->init suspiciously returned %d, it should "
3082"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" 3055 "follow 0/-E convention\n"
3083"%s: loading module anyway...\n", 3056 "%s: loading module anyway...\n",
3084 __func__, mod->name, ret, 3057 __func__, mod->name, ret, __func__);
3085 __func__);
3086 dump_stack(); 3058 dump_stack();
3087 } 3059 }
3088 3060
@@ -3205,10 +3177,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname)
3205{ 3177{
3206 /* Check for magic 'dyndbg' arg */ 3178 /* Check for magic 'dyndbg' arg */
3207 int ret = ddebug_dyndbg_module_param_cb(param, val, modname); 3179 int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
3208 if (ret != 0) { 3180 if (ret != 0)
3209 printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", 3181 pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
3210 modname, param);
3211 }
3212 return 0; 3182 return 0;
3213} 3183}
3214 3184
@@ -3243,10 +3213,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
3243#ifdef CONFIG_MODULE_SIG 3213#ifdef CONFIG_MODULE_SIG
3244 mod->sig_ok = info->sig_ok; 3214 mod->sig_ok = info->sig_ok;
3245 if (!mod->sig_ok) { 3215 if (!mod->sig_ok) {
3246 printk_once(KERN_NOTICE 3216 pr_notice_once("%s: module verification failed: signature "
3247 "%s: module verification failed: signature and/or" 3217 "and/or required key missing - tainting "
3248 " required key missing - tainting kernel\n", 3218 "kernel\n", mod->name);
3249 mod->name);
3250 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); 3219 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
3251 } 3220 }
3252#endif 3221#endif
@@ -3263,7 +3232,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
3263 3232
3264 /* Now we've got everything in the final locations, we can 3233 /* Now we've got everything in the final locations, we can
3265 * find optional sections. */ 3234 * find optional sections. */
3266 find_module_sections(mod, info); 3235 err = find_module_sections(mod, info);
3236 if (err)
3237 goto free_unload;
3267 3238
3268 err = check_module_license_and_versions(mod); 3239 err = check_module_license_and_versions(mod);
3269 if (err) 3240 if (err)
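Note the user-visible side of the delete_module() change above: wait_for_zero_refcount() is gone and a warning is printed when O_NONBLOCK is not passed, so removal no longer blocks waiting for the refcount to drop. A small userspace sketch of how a caller invokes the syscall after this change; error handling is minimal and illustrative, and callers are expected to handle errors such as EWOULDBLOCK or EBUSY themselves.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int remove_module(const char *name)
{
	/* glibc provides no wrapper for delete_module(); call it directly */
	if (syscall(SYS_delete_module, name, O_NONBLOCK) == 0)
		return 0;
	fprintf(stderr, "delete_module(%s): %s\n", name, strerror(errno));
	return -1;
}

int main(int argc, char **argv)
{
	return argc > 1 ? remove_module(argv[1]) : 1;
}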
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index f2970bddc5ea..be5b8fac4bd0 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -14,6 +14,7 @@
14#include <crypto/public_key.h> 14#include <crypto/public_key.h>
15#include <crypto/hash.h> 15#include <crypto/hash.h>
16#include <keys/asymmetric-type.h> 16#include <keys/asymmetric-type.h>
17#include <keys/system_keyring.h>
17#include "module-internal.h" 18#include "module-internal.h"
18 19
19/* 20/*
@@ -28,7 +29,7 @@
28 */ 29 */
29struct module_signature { 30struct module_signature {
30 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ 31 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
31 u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ 32 u8 hash; /* Digest algorithm [enum hash_algo] */
32 u8 id_type; /* Key identifier type [enum pkey_id_type] */ 33 u8 id_type; /* Key identifier type [enum pkey_id_type] */
33 u8 signer_len; /* Length of signer's name */ 34 u8 signer_len; /* Length of signer's name */
34 u8 key_id_len; /* Length of key identifier */ 35 u8 key_id_len; /* Length of key identifier */
@@ -39,7 +40,7 @@ struct module_signature {
39/* 40/*
40 * Digest the module contents. 41 * Digest the module contents.
41 */ 42 */
42static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, 43static struct public_key_signature *mod_make_digest(enum hash_algo hash,
43 const void *mod, 44 const void *mod,
44 unsigned long modlen) 45 unsigned long modlen)
45{ 46{
@@ -54,7 +55,7 @@ static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
54 /* Allocate the hashing algorithm we're going to need and find out how 55 /* Allocate the hashing algorithm we're going to need and find out how
55 * big the hash operational data will be. 56 * big the hash operational data will be.
56 */ 57 */
57 tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); 58 tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
58 if (IS_ERR(tfm)) 59 if (IS_ERR(tfm))
59 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); 60 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
60 61
@@ -157,7 +158,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
157 158
158 pr_debug("Look up: \"%s\"\n", id); 159 pr_debug("Look up: \"%s\"\n", id);
159 160
160 key = keyring_search(make_key_ref(modsign_keyring, 1), 161 key = keyring_search(make_key_ref(system_trusted_keyring, 1),
161 &key_type_asymmetric, id); 162 &key_type_asymmetric, id);
162 if (IS_ERR(key)) 163 if (IS_ERR(key))
163 pr_warn("Request for unknown module key '%s' err %ld\n", 164 pr_warn("Request for unknown module key '%s' err %ld\n",
@@ -217,7 +218,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
217 return -ENOPKG; 218 return -ENOPKG;
218 219
219 if (ms.hash >= PKEY_HASH__LAST || 220 if (ms.hash >= PKEY_HASH__LAST ||
220 !pkey_hash_algo[ms.hash]) 221 !hash_algo_name[ms.hash])
221 return -ENOPKG; 222 return -ENOPKG;
222 223
223 key = request_asymmetric_key(sig, ms.signer_len, 224 key = request_asymmetric_key(sig, ms.signer_len,
diff --git a/kernel/padata.c b/kernel/padata.c
index 07af2c95dcfe..2abd25d79cc8 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
46 46
47static int padata_cpu_hash(struct parallel_data *pd) 47static int padata_cpu_hash(struct parallel_data *pd)
48{ 48{
49 unsigned int seq_nr;
49 int cpu_index; 50 int cpu_index;
50 51
51 /* 52 /*
@@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd)
53 * seq_nr mod. number of cpus in use. 54 * seq_nr mod. number of cpus in use.
54 */ 55 */
55 56
56 spin_lock(&pd->seq_lock); 57 seq_nr = atomic_inc_return(&pd->seq_nr);
57 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); 58 cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
58 pd->seq_nr++;
59 spin_unlock(&pd->seq_lock);
60 59
61 return padata_index_to_cpu(pd, cpu_index); 60 return padata_index_to_cpu(pd, cpu_index);
62} 61}
@@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
429 padata_init_pqueues(pd); 428 padata_init_pqueues(pd);
430 padata_init_squeues(pd); 429 padata_init_squeues(pd);
431 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 430 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
432 pd->seq_nr = 0; 431 atomic_set(&pd->seq_nr, -1);
433 atomic_set(&pd->reorder_objects, 0); 432 atomic_set(&pd->reorder_objects, 0);
434 atomic_set(&pd->refcnt, 0); 433 atomic_set(&pd->refcnt, 0);
435 pd->pinst = pinst; 434 pd->pinst = pinst;
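The padata hunk above replaces a spinlock-protected sequence counter with atomic_inc_return() for round-robin CPU selection, with the counter initialised to -1 so the first job hashes to index 0. A minimal userspace sketch of the same idea with C11 atomics; the names are illustrative.

#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq = ATOMIC_VAR_INIT(UINT_MAX);	/* mirrors atomic_set(&pd->seq_nr, -1) */

static unsigned int pick_slot(unsigned int nr_slots)
{
	/* fetch_add returns the old value, so "+ 1" mirrors atomic_inc_return() */
	unsigned int seq_nr = atomic_fetch_add(&seq, 1) + 1;

	return seq_nr % nr_slots;	/* no lock needed; wrap-around is harmless */
}

int main(void)
{
	for (int i = 0; i < 8; i++)
		printf("%u ", pick_slot(3));
	putchar('\n');
	return 0;
}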
diff --git a/kernel/panic.c b/kernel/panic.c
index b6c482ccc5db..c00b4ceb39e8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -233,7 +233,7 @@ static const struct tnt tnts[] = {
233 */ 233 */
234const char *print_tainted(void) 234const char *print_tainted(void)
235{ 235{
236 static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; 236 static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")];
237 237
238 if (tainted_mask) { 238 if (tainted_mask) {
239 char *s; 239 char *s;
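The panic.c change works because sizeof on a string literal already counts the terminating NUL, so ARRAY_SIZE(tnts) + sizeof("Tainted: ") leaves exactly one byte for the trailing '\0' after the taint flag characters; the old "+ 1" only wasted a byte. A two-line check:

#include <stdio.h>
#include <string.h>

int main(void)
{
	printf("strlen(\"Tainted: \") = %zu\n", strlen("Tainted: "));	/* 9 */
	printf("sizeof(\"Tainted: \") = %zu\n", sizeof("Tainted: "));	/* 10 */
	return 0;
}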
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 42086551a24a..06c62de9c711 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -132,6 +132,12 @@ out:
132 return ERR_PTR(err); 132 return ERR_PTR(err);
133} 133}
134 134
135static void delayed_free_pidns(struct rcu_head *p)
136{
137 kmem_cache_free(pid_ns_cachep,
138 container_of(p, struct pid_namespace, rcu));
139}
140
135static void destroy_pid_namespace(struct pid_namespace *ns) 141static void destroy_pid_namespace(struct pid_namespace *ns)
136{ 142{
137 int i; 143 int i;
@@ -140,7 +146,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
140 for (i = 0; i < PIDMAP_ENTRIES; i++) 146 for (i = 0; i < PIDMAP_ENTRIES; i++)
141 kfree(ns->pidmap[i].page); 147 kfree(ns->pidmap[i].page);
142 put_user_ns(ns->user_ns); 148 put_user_ns(ns->user_ns);
143 kmem_cache_free(pid_ns_cachep, ns); 149 call_rcu(&ns->rcu, delayed_free_pidns);
144} 150}
145 151
146struct pid_namespace *copy_pid_ns(unsigned long flags, 152struct pid_namespace *copy_pid_ns(unsigned long flags,
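The pid_namespace fix defers the actual free until after an RCU grace period via call_rcu(). The pattern in isolation, with illustrative names (a plain kfree() stands in here for the kmem_cache_free() used above):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;
};

static void foo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu));
}

static void foo_release(struct foo *f)
{
	/* readers that already hold an RCU reference may keep using *f
	 * until the grace period ends; only then does foo_free_rcu() run */
	call_rcu(&f->rcu, foo_free_rcu);
}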
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index d444c4e834f4..2fac9cc79b3d 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -178,6 +178,22 @@ config PM_SLEEP_DEBUG
178 def_bool y 178 def_bool y
179 depends on PM_DEBUG && PM_SLEEP 179 depends on PM_DEBUG && PM_SLEEP
180 180
181config DPM_WATCHDOG
182 bool "Device suspend/resume watchdog"
183 depends on PM_DEBUG && PSTORE
184 ---help---
185 Sets up a watchdog timer to capture drivers that are
186 locked up attempting to suspend/resume a device.
187 A detected lockup causes system panic with message
188 captured in pstore device for inspection in subsequent
189 boot session.
190
191config DPM_WATCHDOG_TIMEOUT
192 int "Watchdog timeout in seconds"
193 range 1 120
194 default 12
195 depends on DPM_WATCHDOG
196
181config PM_TRACE 197config PM_TRACE
182 bool 198 bool
183 help 199 help
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 463aa6736751..eacb8bd8cab4 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -81,6 +81,7 @@ void pm_vt_switch_unregister(struct device *dev)
81 list_for_each_entry(tmp, &pm_vt_switch_list, head) { 81 list_for_each_entry(tmp, &pm_vt_switch_list, head) {
82 if (tmp->dev == dev) { 82 if (tmp->dev == dev) {
83 list_del(&tmp->head); 83 list_del(&tmp->head);
84 kfree(tmp);
84 break; 85 break;
85 } 86 }
86 } 87 }
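The console.c hunk plugs a leak: pm_vt_switch_unregister() unlinked the matching entry but never freed it. Reduced to the bare pattern, with illustrative types:

#include <linux/list.h>
#include <linux/slab.h>

struct entry {
	struct list_head head;
	void *key;
};

static LIST_HEAD(entries);

static void unregister_entry(void *key)
{
	struct entry *tmp;

	list_for_each_entry(tmp, &entries, head) {
		if (tmp->key == key) {
			list_del(&tmp->head);
			kfree(tmp);	/* this was the missing step */
			break;
		}
	}
}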
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index a394297f8b2f..8dff9b48075a 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -558,30 +558,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
558 if (count == sizeof(s32)) { 558 if (count == sizeof(s32)) {
559 if (copy_from_user(&value, buf, sizeof(s32))) 559 if (copy_from_user(&value, buf, sizeof(s32)))
560 return -EFAULT; 560 return -EFAULT;
561 } else if (count <= 11) { /* ASCII perhaps? */ 561 } else {
562 char ascii_value[11];
563 unsigned long int ulval;
564 int ret; 562 int ret;
565 563
566 if (copy_from_user(ascii_value, buf, count)) 564 ret = kstrtos32_from_user(buf, count, 16, &value);
567 return -EFAULT; 565 if (ret)
568 566 return ret;
569 if (count > 10) {
570 if (ascii_value[10] == '\n')
571 ascii_value[10] = '\0';
572 else
573 return -EINVAL;
574 } else {
575 ascii_value[count] = '\0';
576 }
577 ret = kstrtoul(ascii_value, 16, &ulval);
578 if (ret) {
579 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
580 return -EINVAL;
581 }
582 value = (s32)lower_32_bits(ulval);
583 } else {
584 return -EINVAL;
585 } 567 }
586 568
587 req = filp->private_data; 569 req = filp->private_data;
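The qos.c rewrite replaces the hand-rolled ASCII handling with kstrtos32_from_user(), which copies from user space and parses the number in one call (base 16 here, matching the old kstrtoul(..., 16, ...)). A hedged sketch of a write handler built around the same helper; pm_foo_write is an invented name:

    #include <linux/kernel.h>
    #include <linux/fs.h>
    #include <linux/uaccess.h>

    static ssize_t pm_foo_write(struct file *filp, const char __user *buf,
                                size_t count, loff_t *f_pos)
    {
            s32 value;
            int ret;

            /* Copies at most 'count' bytes from user space, tolerates a
             * trailing newline and parses the value as hexadecimal. */
            ret = kstrtos32_from_user(buf, count, 16, &value);
            if (ret)
                    return ret;

            /* ... apply 'value' ... */
            return count;
    }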
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 98c3b34a4cff..b38109e204af 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -792,7 +792,8 @@ void free_basic_memory_bitmaps(void)
792{ 792{
793 struct memory_bitmap *bm1, *bm2; 793 struct memory_bitmap *bm1, *bm2;
794 794
795 BUG_ON(!(forbidden_pages_map && free_pages_map)); 795 if (WARN_ON(!(forbidden_pages_map && free_pages_map)))
796 return;
796 797
797 bm1 = forbidden_pages_map; 798 bm1 = forbidden_pages_map;
798 bm2 = free_pages_map; 799 bm2 = free_pages_map;
@@ -1402,7 +1403,11 @@ int hibernate_preallocate_memory(void)
1402 * highmem and non-highmem zones separately. 1403 * highmem and non-highmem zones separately.
1403 */ 1404 */
1404 pages_highmem = preallocate_image_highmem(highmem / 2); 1405 pages_highmem = preallocate_image_highmem(highmem / 2);
1405 alloc = (count - max_size) - pages_highmem; 1406 alloc = count - max_size;
1407 if (alloc > pages_highmem)
1408 alloc -= pages_highmem;
1409 else
1410 alloc = 0;
1406 pages = preallocate_image_memory(alloc, avail_normal); 1411 pages = preallocate_image_memory(alloc, avail_normal);
1407 if (pages < alloc) { 1412 if (pages < alloc) {
1408 /* We have exhausted non-highmem pages, try highmem. */ 1413 /* We have exhausted non-highmem pages, try highmem. */
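The hibernate_preallocate_memory() hunk guards an unsigned subtraction: when the highmem pass already covered everything beyond max_size, the old expression (count - max_size) - pages_highmem wrapped around to a huge value. Rough arithmetic as a comment, with made-up numbers:

    /*
     * Suppose count - max_size == 100 pages still have to be claimed and
     * preallocate_image_highmem() already took pages_highmem == 120.
     *
     *   old: alloc = (count - max_size) - pages_highmem = 100 - 120,
     *        which wraps around for an unsigned long;
     *   new: alloc = count - max_size (100), and since 100 <= 120 the
     *        else branch sets alloc = 0, so nothing more is requested.
     */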
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 957f06164ad1..98d357584cd6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -36,9 +36,9 @@ static struct snapshot_data {
36 struct snapshot_handle handle; 36 struct snapshot_handle handle;
37 int swap; 37 int swap;
38 int mode; 38 int mode;
39 char frozen; 39 bool frozen;
40 char ready; 40 bool ready;
41 char platform_support; 41 bool platform_support;
42 bool free_bitmaps; 42 bool free_bitmaps;
43} snapshot_state; 43} snapshot_state;
44 44
@@ -70,6 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
70 data->swap = swsusp_resume_device ? 70 data->swap = swsusp_resume_device ?
71 swap_type_of(swsusp_resume_device, 0, NULL) : -1; 71 swap_type_of(swsusp_resume_device, 0, NULL) : -1;
72 data->mode = O_RDONLY; 72 data->mode = O_RDONLY;
73 data->free_bitmaps = false;
73 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 74 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
74 if (error) 75 if (error)
75 pm_notifier_call_chain(PM_POST_HIBERNATION); 76 pm_notifier_call_chain(PM_POST_HIBERNATION);
@@ -93,9 +94,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
93 if (error) 94 if (error)
94 atomic_inc(&snapshot_device_available); 95 atomic_inc(&snapshot_device_available);
95 96
96 data->frozen = 0; 97 data->frozen = false;
97 data->ready = 0; 98 data->ready = false;
98 data->platform_support = 0; 99 data->platform_support = false;
99 100
100 Unlock: 101 Unlock:
101 unlock_system_sleep(); 102 unlock_system_sleep();
@@ -229,7 +230,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
229 if (error) 230 if (error)
230 thaw_processes(); 231 thaw_processes();
231 else 232 else
232 data->frozen = 1; 233 data->frozen = true;
233 234
234 break; 235 break;
235 236
@@ -240,7 +241,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
240 free_basic_memory_bitmaps(); 241 free_basic_memory_bitmaps();
241 data->free_bitmaps = false; 242 data->free_bitmaps = false;
242 thaw_processes(); 243 thaw_processes();
243 data->frozen = 0; 244 data->frozen = false;
244 break; 245 break;
245 246
246 case SNAPSHOT_CREATE_IMAGE: 247 case SNAPSHOT_CREATE_IMAGE:
@@ -270,7 +271,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
270 case SNAPSHOT_FREE: 271 case SNAPSHOT_FREE:
271 swsusp_free(); 272 swsusp_free();
272 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 273 memset(&data->handle, 0, sizeof(struct snapshot_handle));
273 data->ready = 0; 274 data->ready = false;
274 /* 275 /*
275 * It is necessary to thaw kernel threads here, because 276 * It is necessary to thaw kernel threads here, because
276 * SNAPSHOT_CREATE_IMAGE may be invoked directly after 277 * SNAPSHOT_CREATE_IMAGE may be invoked directly after
@@ -334,7 +335,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
334 * PM_HIBERNATION_PREPARE 335 * PM_HIBERNATION_PREPARE
335 */ 336 */
336 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 337 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
337 data->ready = 0; 338 data->ready = false;
338 break; 339 break;
339 340
340 case SNAPSHOT_PLATFORM_SUPPORT: 341 case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b4e8500afdb3..be7c86bae576 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -705,9 +705,9 @@ const struct file_operations kmsg_fops = {
705 705
706#ifdef CONFIG_KEXEC 706#ifdef CONFIG_KEXEC
707/* 707/*
708 * This appends the listed symbols to /proc/vmcoreinfo 708 * This appends the listed symbols to /proc/vmcore
709 * 709 *
710 * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to 710 * /proc/vmcore is used by various utilities, like crash and makedumpfile to
711 * obtain access to symbols that are otherwise very difficult to locate. These 711 * obtain access to symbols that are otherwise very difficult to locate. These
712 * symbols are specifically used so that utilities can access and extract the 712 * symbols are specifically used so that utilities can access and extract the
713 * dmesg log from a vmcore file after a crash. 713 * dmesg log from a vmcore file after a crash.
@@ -791,7 +791,7 @@ static bool __read_mostly ignore_loglevel;
791static int __init ignore_loglevel_setup(char *str) 791static int __init ignore_loglevel_setup(char *str)
792{ 792{
793 ignore_loglevel = 1; 793 ignore_loglevel = 1;
794 printk(KERN_INFO "debug: ignoring loglevel setting.\n"); 794 pr_info("debug: ignoring loglevel setting.\n");
795 795
796 return 0; 796 return 0;
797} 797}
@@ -820,9 +820,9 @@ static int __init boot_delay_setup(char *str)
820 pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " 820 pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
821 "HZ: %d, loops_per_msec: %llu\n", 821 "HZ: %d, loops_per_msec: %llu\n",
822 boot_delay, preset_lpj, lpj, HZ, loops_per_msec); 822 boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
823 return 1; 823 return 0;
824} 824}
825__setup("boot_delay=", boot_delay_setup); 825early_param("boot_delay", boot_delay_setup);
826 826
827static void boot_delay_msec(int level) 827static void boot_delay_msec(int level)
828{ 828{
@@ -2193,7 +2193,7 @@ static int __read_mostly keep_bootcon;
2193static int __init keep_bootcon_setup(char *str) 2193static int __init keep_bootcon_setup(char *str)
2194{ 2194{
2195 keep_bootcon = 1; 2195 keep_bootcon = 1;
2196 printk(KERN_INFO "debug: skip boot console de-registration.\n"); 2196 pr_info("debug: skip boot console de-registration.\n");
2197 2197
2198 return 0; 2198 return 0;
2199} 2199}
@@ -2241,7 +2241,7 @@ void register_console(struct console *newcon)
2241 /* find the last or real console */ 2241 /* find the last or real console */
2242 for_each_console(bcon) { 2242 for_each_console(bcon) {
2243 if (!(bcon->flags & CON_BOOT)) { 2243 if (!(bcon->flags & CON_BOOT)) {
2244 printk(KERN_INFO "Too late to register bootconsole %s%d\n", 2244 pr_info("Too late to register bootconsole %s%d\n",
2245 newcon->name, newcon->index); 2245 newcon->name, newcon->index);
2246 return; 2246 return;
2247 } 2247 }
@@ -2358,21 +2358,18 @@ void register_console(struct console *newcon)
2358 * users know there might be something in the kernel's log buffer that 2358 * users know there might be something in the kernel's log buffer that
2359 * went to the bootconsole (that they do not see on the real console) 2359 * went to the bootconsole (that they do not see on the real console)
2360 */ 2360 */
2361 pr_info("%sconsole [%s%d] enabled\n",
2362 (newcon->flags & CON_BOOT) ? "boot" : "" ,
2363 newcon->name, newcon->index);
2361 if (bcon && 2364 if (bcon &&
2362 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && 2365 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
2363 !keep_bootcon) { 2366 !keep_bootcon) {
2364 /* we need to iterate through twice, to make sure we print 2367 /* We need to iterate through all boot consoles, to make
2365 * everything out, before we unregister the console(s) 2368 * sure we print everything out, before we unregister them.
2366 */ 2369 */
2367 printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
2368 newcon->name, newcon->index);
2369 for_each_console(bcon) 2370 for_each_console(bcon)
2370 if (bcon->flags & CON_BOOT) 2371 if (bcon->flags & CON_BOOT)
2371 unregister_console(bcon); 2372 unregister_console(bcon);
2372 } else {
2373 printk(KERN_INFO "%sconsole [%s%d] enabled\n",
2374 (newcon->flags & CON_BOOT) ? "boot" : "" ,
2375 newcon->name, newcon->index);
2376 } 2373 }
2377} 2374}
2378EXPORT_SYMBOL(register_console); 2375EXPORT_SYMBOL(register_console);
@@ -2382,6 +2379,10 @@ int unregister_console(struct console *console)
2382 struct console *a, *b; 2379 struct console *a, *b;
2383 int res; 2380 int res;
2384 2381
2382 pr_info("%sconsole [%s%d] disabled\n",
2383 (console->flags & CON_BOOT) ? "boot" : "" ,
2384 console->name, console->index);
2385
2385 res = _braille_unregister_console(console); 2386 res = _braille_unregister_console(console);
2386 if (res) 2387 if (res)
2387 return res; 2388 return res;
@@ -2421,8 +2422,6 @@ static int __init printk_late_init(void)
2421 2422
2422 for_each_console(con) { 2423 for_each_console(con) {
2423 if (!keep_bootcon && con->flags & CON_BOOT) { 2424 if (!keep_bootcon && con->flags & CON_BOOT) {
2424 printk(KERN_INFO "turn off boot console %s%d\n",
2425 con->name, con->index);
2426 unregister_console(con); 2425 unregister_console(con);
2427 } 2426 }
2428 } 2427 }
@@ -2449,7 +2448,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
2449 2448
2450 if (pending & PRINTK_PENDING_SCHED) { 2449 if (pending & PRINTK_PENDING_SCHED) {
2451 char *buf = __get_cpu_var(printk_sched_buf); 2450 char *buf = __get_cpu_var(printk_sched_buf);
2452 printk(KERN_WARNING "[sched_delayed] %s", buf); 2451 pr_warn("[sched_delayed] %s", buf);
2453 } 2452 }
2454 2453
2455 if (pending & PRINTK_PENDING_WAKEUP) 2454 if (pending & PRINTK_PENDING_WAKEUP)
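Several printk.c hunks above are mechanical conversions from printk(KERN_INFO ...) / printk(KERN_WARNING ...) to the pr_info() / pr_warn() wrappers; the wrappers are shorter and pick up a per-file pr_fmt() prefix when one is defined. A small sketch (the "myfile: " prefix is an assumption for illustration, not something printk.c defines):

    #define pr_fmt(fmt) "myfile: " fmt      /* must precede the includes */

    #include <linux/printk.h>

    static void example(void)
    {
            printk(KERN_INFO "old style\n");        /* no automatic prefix */
            pr_info("new style\n");                 /* logs "myfile: new style" */
            pr_warn("odd state\n");                 /* KERN_WARNING level */
    }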
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dd562e9aa2c8..1f4bcb3cc21c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -257,7 +257,8 @@ ok:
257 if (task->mm) 257 if (task->mm)
258 dumpable = get_dumpable(task->mm); 258 dumpable = get_dumpable(task->mm);
259 rcu_read_lock(); 259 rcu_read_lock();
260 if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { 260 if (dumpable != SUID_DUMP_USER &&
261 !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
261 rcu_read_unlock(); 262 rcu_read_unlock();
262 return -EPERM; 263 return -EPERM;
263 } 264 }
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
new file mode 100644
index 000000000000..01e9ec37a3e3
--- /dev/null
+++ b/kernel/rcu/Makefile
@@ -0,0 +1,6 @@
1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o
3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
6obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h
index 77131966c4ad..7859a0a3951e 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -122,4 +122,11 @@ int rcu_jiffies_till_stall_check(void);
122 122
123#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 123#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
124 124
125/*
126 * Strings used in tracepoints need to be exported via the
127 * tracing system such that tools like perf and trace-cmd can
128 * translate the string address pointers to actual text.
129 */
130#define TPS(x) tracepoint_string(x)
131
125#endif /* __LINUX_RCU_H */ 132#endif /* __LINUX_RCU_H */
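The TPS() macro added to rcu.h wraps tracepoint_string(), which stores the literal in a dedicated section and exports it so perf and trace-cmd can turn the pointer recorded in the ring buffer back into text; the tiny.c and tree.c hunks below then pass TPS("...") into the rcu tracepoints. A fragmentary sketch of that usage (example_idle_enter is an invented wrapper):

    #include <linux/ftrace_event.h>         /* tracepoint_string(), as of v3.13 */
    #include <trace/events/rcu.h>

    #define TPS(x)  tracepoint_string(x)

    static void example_idle_enter(long long oldval, long long newval)
    {
            /* Only the string's address reaches the trace buffer; the
             * exported text lets tools resolve it back to "Start". */
            trace_rcu_dyntick(TPS("Start"), oldval, newval);
    }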
diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c
index 01d5ccb8bfe3..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/rcu/srcu.c
diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c
index 9ed6075dc562..1254f312d024 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,6 +35,7 @@
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/prefetch.h> 37#include <linux/prefetch.h>
38#include <linux/ftrace_event.h>
38 39
39#ifdef CONFIG_RCU_TRACE 40#ifdef CONFIG_RCU_TRACE
40#include <trace/events/rcu.h> 41#include <trace/events/rcu.h>
@@ -42,7 +43,7 @@
42 43
43#include "rcu.h" 44#include "rcu.h"
44 45
45/* Forward declarations for rcutiny_plugin.h. */ 46/* Forward declarations for tiny_plugin.h. */
46struct rcu_ctrlblk; 47struct rcu_ctrlblk;
47static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static void rcu_process_callbacks(struct softirq_action *unused); 49static void rcu_process_callbacks(struct softirq_action *unused);
@@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head,
52 53
53static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 54static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
54 55
55#include "rcutiny_plugin.h" 56#include "tiny_plugin.h"
56 57
57/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
58static void rcu_idle_enter_common(long long newval) 59static void rcu_idle_enter_common(long long newval)
59{ 60{
60 if (newval) { 61 if (newval) {
61 RCU_TRACE(trace_rcu_dyntick("--=", 62 RCU_TRACE(trace_rcu_dyntick(TPS("--="),
62 rcu_dynticks_nesting, newval)); 63 rcu_dynticks_nesting, newval));
63 rcu_dynticks_nesting = newval; 64 rcu_dynticks_nesting = newval;
64 return; 65 return;
65 } 66 }
66 RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); 67 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
68 rcu_dynticks_nesting, newval));
67 if (!is_idle_task(current)) { 69 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id()); 70 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
69 71
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", 72 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
71 rcu_dynticks_nesting, newval)); 73 rcu_dynticks_nesting, newval));
72 ftrace_dump(DUMP_ALL); 74 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 75 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit);
120static void rcu_idle_exit_common(long long oldval) 122static void rcu_idle_exit_common(long long oldval)
121{ 123{
122 if (oldval) { 124 if (oldval) {
123 RCU_TRACE(trace_rcu_dyntick("++=", 125 RCU_TRACE(trace_rcu_dyntick(TPS("++="),
124 oldval, rcu_dynticks_nesting)); 126 oldval, rcu_dynticks_nesting));
125 return; 127 return;
126 } 128 }
127 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); 129 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
128 if (!is_idle_task(current)) { 130 if (!is_idle_task(current)) {
129 struct task_struct *idle = idle_task(smp_processor_id()); 131 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
130 132
131 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", 133 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
132 oldval, rcu_dynticks_nesting)); 134 oldval, rcu_dynticks_nesting));
133 ftrace_dump(DUMP_ALL); 135 ftrace_dump(DUMP_ALL);
134 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 136 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -174,18 +176,18 @@ void rcu_irq_enter(void)
174} 176}
175EXPORT_SYMBOL_GPL(rcu_irq_enter); 177EXPORT_SYMBOL_GPL(rcu_irq_enter);
176 178
177#ifdef CONFIG_DEBUG_LOCK_ALLOC 179#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
178 180
179/* 181/*
180 * Test whether RCU thinks that the current CPU is idle. 182 * Test whether RCU thinks that the current CPU is idle.
181 */ 183 */
182int rcu_is_cpu_idle(void) 184bool notrace __rcu_is_watching(void)
183{ 185{
184 return !rcu_dynticks_nesting; 186 return rcu_dynticks_nesting;
185} 187}
186EXPORT_SYMBOL(rcu_is_cpu_idle); 188EXPORT_SYMBOL(__rcu_is_watching);
187 189
188#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 190#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
189 191
190/* 192/*
191 * Test whether the current CPU was interrupted from idle. Nested 193 * Test whether the current CPU was interrupted from idle. Nested
@@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
273 if (&rcp->rcucblist == rcp->donetail) { 275 if (&rcp->rcucblist == rcp->donetail) {
274 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); 276 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
275 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, 277 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
276 ACCESS_ONCE(rcp->rcucblist), 278 !!ACCESS_ONCE(rcp->rcucblist),
277 need_resched(), 279 need_resched(),
278 is_idle_task(current), 280 is_idle_task(current),
279 false)); 281 false));
@@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
304 RCU_TRACE(cb_count++); 306 RCU_TRACE(cb_count++);
305 } 307 }
306 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 308 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
307 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), 309 RCU_TRACE(trace_rcu_batch_end(rcp->name,
310 cb_count, 0, need_resched(),
308 is_idle_task(current), 311 is_idle_task(current),
309 false)); 312 false));
310} 313}
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 280d06cae352..280d06cae352 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c
index be63101c6175..3929cd451511 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcu/torture.c
@@ -52,6 +52,12 @@
52MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 54
55MODULE_ALIAS("rcutorture");
56#ifdef MODULE_PARAM_PREFIX
57#undef MODULE_PARAM_PREFIX
58#endif
59#define MODULE_PARAM_PREFIX "rcutorture."
60
55static int fqs_duration; 61static int fqs_duration;
56module_param(fqs_duration, int, 0444); 62module_param(fqs_duration, int, 0444);
57MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); 63MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
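The MODULE_ALIAS / MODULE_PARAM_PREFIX block added here (and, further down, to tree.c and update.c) pins the parameter namespace across the file moves: for built-in code, module_param() normally prefixes parameters with KBUILD_MODNAME, which is derived from the object name, so renaming rcutorture.c to rcu/torture.c would otherwise have renamed every boot parameter. Minimal sketch of the idiom (my_param is hypothetical):

    #include <linux/module.h>
    #include <linux/moduleparam.h>

    #ifdef MODULE_PARAM_PREFIX
    #undef MODULE_PARAM_PREFIX
    #endif
    #define MODULE_PARAM_PREFIX "rcutorture."

    static int my_param;
    module_param(my_param, int, 0444);
    /* Settable as rcutorture.my_param=1 on the kernel command line,
     * regardless of what the source file happens to be called. */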
diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c
index 32618b3fe4e6..dd081987a8ec 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcu/tree.c
@@ -41,6 +41,7 @@
41#include <linux/export.h> 41#include <linux/export.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/module.h>
44#include <linux/percpu.h> 45#include <linux/percpu.h>
45#include <linux/notifier.h> 46#include <linux/notifier.h>
46#include <linux/cpu.h> 47#include <linux/cpu.h>
@@ -56,17 +57,16 @@
56#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
57#include <linux/suspend.h> 58#include <linux/suspend.h>
58 59
59#include "rcutree.h" 60#include "tree.h"
60#include <trace/events/rcu.h> 61#include <trace/events/rcu.h>
61 62
62#include "rcu.h" 63#include "rcu.h"
63 64
64/* 65MODULE_ALIAS("rcutree");
65 * Strings used in tracepoints need to be exported via the 66#ifdef MODULE_PARAM_PREFIX
66 * tracing system such that tools like perf and trace-cmd can 67#undef MODULE_PARAM_PREFIX
67 * translate the string address pointers to actual text. 68#endif
68 */ 69#define MODULE_PARAM_PREFIX "rcutree."
69#define TPS(x) tracepoint_string(x)
70 70
71/* Data structures. */ 71/* Data structures. */
72 72
@@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu)
222} 222}
223EXPORT_SYMBOL_GPL(rcu_note_context_switch); 223EXPORT_SYMBOL_GPL(rcu_note_context_switch);
224 224
225DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 225static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
227 .dynticks = ATOMIC_INIT(1), 227 .dynticks = ATOMIC_INIT(1),
228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
@@ -371,7 +371,8 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
371{ 371{
372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
373 if (!user && !is_idle_task(current)) { 373 if (!user && !is_idle_task(current)) {
374 struct task_struct *idle = idle_task(smp_processor_id()); 374 struct task_struct *idle __maybe_unused =
375 idle_task(smp_processor_id());
375 376
376 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); 377 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
377 ftrace_dump(DUMP_ORIG); 378 ftrace_dump(DUMP_ORIG);
@@ -407,7 +408,7 @@ static void rcu_eqs_enter(bool user)
407 long long oldval; 408 long long oldval;
408 struct rcu_dynticks *rdtp; 409 struct rcu_dynticks *rdtp;
409 410
410 rdtp = &__get_cpu_var(rcu_dynticks); 411 rdtp = this_cpu_ptr(&rcu_dynticks);
411 oldval = rdtp->dynticks_nesting; 412 oldval = rdtp->dynticks_nesting;
412 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 413 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
413 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 414 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
@@ -435,7 +436,7 @@ void rcu_idle_enter(void)
435 436
436 local_irq_save(flags); 437 local_irq_save(flags);
437 rcu_eqs_enter(false); 438 rcu_eqs_enter(false);
438 rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); 439 rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
439 local_irq_restore(flags); 440 local_irq_restore(flags);
440} 441}
441EXPORT_SYMBOL_GPL(rcu_idle_enter); 442EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -478,7 +479,7 @@ void rcu_irq_exit(void)
478 struct rcu_dynticks *rdtp; 479 struct rcu_dynticks *rdtp;
479 480
480 local_irq_save(flags); 481 local_irq_save(flags);
481 rdtp = &__get_cpu_var(rcu_dynticks); 482 rdtp = this_cpu_ptr(&rcu_dynticks);
482 oldval = rdtp->dynticks_nesting; 483 oldval = rdtp->dynticks_nesting;
483 rdtp->dynticks_nesting--; 484 rdtp->dynticks_nesting--;
484 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 485 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
@@ -508,7 +509,8 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
508 rcu_cleanup_after_idle(smp_processor_id()); 509 rcu_cleanup_after_idle(smp_processor_id());
509 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 510 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
510 if (!user && !is_idle_task(current)) { 511 if (!user && !is_idle_task(current)) {
511 struct task_struct *idle = idle_task(smp_processor_id()); 512 struct task_struct *idle __maybe_unused =
513 idle_task(smp_processor_id());
512 514
513 trace_rcu_dyntick(TPS("Error on exit: not idle task"), 515 trace_rcu_dyntick(TPS("Error on exit: not idle task"),
514 oldval, rdtp->dynticks_nesting); 516 oldval, rdtp->dynticks_nesting);
@@ -528,7 +530,7 @@ static void rcu_eqs_exit(bool user)
528 struct rcu_dynticks *rdtp; 530 struct rcu_dynticks *rdtp;
529 long long oldval; 531 long long oldval;
530 532
531 rdtp = &__get_cpu_var(rcu_dynticks); 533 rdtp = this_cpu_ptr(&rcu_dynticks);
532 oldval = rdtp->dynticks_nesting; 534 oldval = rdtp->dynticks_nesting;
533 WARN_ON_ONCE(oldval < 0); 535 WARN_ON_ONCE(oldval < 0);
534 if (oldval & DYNTICK_TASK_NEST_MASK) 536 if (oldval & DYNTICK_TASK_NEST_MASK)
@@ -555,7 +557,7 @@ void rcu_idle_exit(void)
555 557
556 local_irq_save(flags); 558 local_irq_save(flags);
557 rcu_eqs_exit(false); 559 rcu_eqs_exit(false);
558 rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); 560 rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
559 local_irq_restore(flags); 561 local_irq_restore(flags);
560} 562}
561EXPORT_SYMBOL_GPL(rcu_idle_exit); 563EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -599,7 +601,7 @@ void rcu_irq_enter(void)
599 long long oldval; 601 long long oldval;
600 602
601 local_irq_save(flags); 603 local_irq_save(flags);
602 rdtp = &__get_cpu_var(rcu_dynticks); 604 rdtp = this_cpu_ptr(&rcu_dynticks);
603 oldval = rdtp->dynticks_nesting; 605 oldval = rdtp->dynticks_nesting;
604 rdtp->dynticks_nesting++; 606 rdtp->dynticks_nesting++;
605 WARN_ON_ONCE(rdtp->dynticks_nesting == 0); 607 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
@@ -620,7 +622,7 @@ void rcu_irq_enter(void)
620 */ 622 */
621void rcu_nmi_enter(void) 623void rcu_nmi_enter(void)
622{ 624{
623 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 625 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
624 626
625 if (rdtp->dynticks_nmi_nesting == 0 && 627 if (rdtp->dynticks_nmi_nesting == 0 &&
626 (atomic_read(&rdtp->dynticks) & 0x1)) 628 (atomic_read(&rdtp->dynticks) & 0x1))
@@ -642,7 +644,7 @@ void rcu_nmi_enter(void)
642 */ 644 */
643void rcu_nmi_exit(void) 645void rcu_nmi_exit(void)
644{ 646{
645 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 647 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
646 648
647 if (rdtp->dynticks_nmi_nesting == 0 || 649 if (rdtp->dynticks_nmi_nesting == 0 ||
648 --rdtp->dynticks_nmi_nesting != 0) 650 --rdtp->dynticks_nmi_nesting != 0)
@@ -655,21 +657,34 @@ void rcu_nmi_exit(void)
655} 657}
656 658
657/** 659/**
658 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 660 * __rcu_is_watching - are RCU read-side critical sections safe?
661 *
662 * Return true if RCU is watching the running CPU, which means that
663 * this CPU can safely enter RCU read-side critical sections. Unlike
664 * rcu_is_watching(), the caller of __rcu_is_watching() must have at
665 * least disabled preemption.
666 */
667bool notrace __rcu_is_watching(void)
668{
669 return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
670}
671
672/**
673 * rcu_is_watching - see if RCU thinks that the current CPU is idle
659 * 674 *
660 * If the current CPU is in its idle loop and is neither in an interrupt 675 * If the current CPU is in its idle loop and is neither in an interrupt
661 * or NMI handler, return true. 676 * or NMI handler, return true.
662 */ 677 */
663int rcu_is_cpu_idle(void) 678bool notrace rcu_is_watching(void)
664{ 679{
665 int ret; 680 int ret;
666 681
667 preempt_disable(); 682 preempt_disable();
668 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; 683 ret = __rcu_is_watching();
669 preempt_enable(); 684 preempt_enable();
670 return ret; 685 return ret;
671} 686}
672EXPORT_SYMBOL(rcu_is_cpu_idle); 687EXPORT_SYMBOL_GPL(rcu_is_watching);
673 688
674#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 689#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
675 690
@@ -703,7 +718,7 @@ bool rcu_lockdep_current_cpu_online(void)
703 if (in_nmi()) 718 if (in_nmi())
704 return 1; 719 return 1;
705 preempt_disable(); 720 preempt_disable();
706 rdp = &__get_cpu_var(rcu_sched_data); 721 rdp = this_cpu_ptr(&rcu_sched_data);
707 rnp = rdp->mynode; 722 rnp = rdp->mynode;
708 ret = (rdp->grpmask & rnp->qsmaskinit) || 723 ret = (rdp->grpmask & rnp->qsmaskinit) ||
709 !rcu_scheduler_fully_active; 724 !rcu_scheduler_fully_active;
@@ -723,7 +738,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
723 */ 738 */
724static int rcu_is_cpu_rrupt_from_idle(void) 739static int rcu_is_cpu_rrupt_from_idle(void)
725{ 740{
726 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 741 return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
727} 742}
728 743
729/* 744/*
@@ -802,8 +817,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
802 817
803static void record_gp_stall_check_time(struct rcu_state *rsp) 818static void record_gp_stall_check_time(struct rcu_state *rsp)
804{ 819{
805 rsp->gp_start = jiffies; 820 unsigned long j = ACCESS_ONCE(jiffies);
806 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 821
822 rsp->gp_start = j;
823 smp_wmb(); /* Record start time before stall time. */
824 rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();
807} 825}
808 826
809/* 827/*
@@ -898,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
898 force_quiescent_state(rsp); /* Kick them all. */ 916 force_quiescent_state(rsp); /* Kick them all. */
899} 917}
900 918
919/*
920 * This function really isn't for public consumption, but RCU is special in
921 * that context switches can allow the state machine to make progress.
922 */
923extern void resched_cpu(int cpu);
924
901static void print_cpu_stall(struct rcu_state *rsp) 925static void print_cpu_stall(struct rcu_state *rsp)
902{ 926{
903 int cpu; 927 int cpu;
@@ -927,22 +951,60 @@ static void print_cpu_stall(struct rcu_state *rsp)
927 3 * rcu_jiffies_till_stall_check() + 3; 951 3 * rcu_jiffies_till_stall_check() + 3;
928 raw_spin_unlock_irqrestore(&rnp->lock, flags); 952 raw_spin_unlock_irqrestore(&rnp->lock, flags);
929 953
930 set_need_resched(); /* kick ourselves to get things going. */ 954 /*
955 * Attempt to revive the RCU machinery by forcing a context switch.
956 *
957 * A context switch would normally allow the RCU state machine to make
958 * progress and it could be we're stuck in kernel space without context
959 * switches for an entirely unreasonable amount of time.
960 */
961 resched_cpu(smp_processor_id());
931} 962}
932 963
933static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 964static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
934{ 965{
966 unsigned long completed;
967 unsigned long gpnum;
968 unsigned long gps;
935 unsigned long j; 969 unsigned long j;
936 unsigned long js; 970 unsigned long js;
937 struct rcu_node *rnp; 971 struct rcu_node *rnp;
938 972
939 if (rcu_cpu_stall_suppress) 973 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
940 return; 974 return;
941 j = ACCESS_ONCE(jiffies); 975 j = ACCESS_ONCE(jiffies);
976
977 /*
978 * Lots of memory barriers to reject false positives.
979 *
980 * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
981 * then rsp->gp_start, and finally rsp->completed. These values
982 * are updated in the opposite order with memory barriers (or
983 * equivalent) during grace-period initialization and cleanup.
984 * Now, a false positive can occur if we get an new value of
985 * rsp->gp_start and a old value of rsp->jiffies_stall. But given
986 * the memory barriers, the only way that this can happen is if one
987 * grace period ends and another starts between these two fetches.
988 * Detect this by comparing rsp->completed with the previous fetch
989 * from rsp->gpnum.
990 *
991 * Given this check, comparisons of jiffies, rsp->jiffies_stall,
992 * and rsp->gp_start suffice to forestall false positives.
993 */
994 gpnum = ACCESS_ONCE(rsp->gpnum);
995 smp_rmb(); /* Pick up ->gpnum first... */
942 js = ACCESS_ONCE(rsp->jiffies_stall); 996 js = ACCESS_ONCE(rsp->jiffies_stall);
997 smp_rmb(); /* ...then ->jiffies_stall before the rest... */
998 gps = ACCESS_ONCE(rsp->gp_start);
999 smp_rmb(); /* ...and finally ->gp_start before ->completed. */
1000 completed = ACCESS_ONCE(rsp->completed);
1001 if (ULONG_CMP_GE(completed, gpnum) ||
1002 ULONG_CMP_LT(j, js) ||
1003 ULONG_CMP_GE(gps, js))
1004 return; /* No stall or GP completed since entering function. */
943 rnp = rdp->mynode; 1005 rnp = rdp->mynode;
944 if (rcu_gp_in_progress(rsp) && 1006 if (rcu_gp_in_progress(rsp) &&
945 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { 1007 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
946 1008
947 /* We haven't checked in, so go dump stack. */ 1009 /* We haven't checked in, so go dump stack. */
948 print_cpu_stall(rsp); 1010 print_cpu_stall(rsp);
@@ -1297,7 +1359,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1297} 1359}
1298 1360
1299/* 1361/*
1300 * Initialize a new grace period. 1362 * Initialize a new grace period. Return 0 if no grace period required.
1301 */ 1363 */
1302static int rcu_gp_init(struct rcu_state *rsp) 1364static int rcu_gp_init(struct rcu_state *rsp)
1303{ 1365{
@@ -1306,18 +1368,27 @@ static int rcu_gp_init(struct rcu_state *rsp)
1306 1368
1307 rcu_bind_gp_kthread(); 1369 rcu_bind_gp_kthread();
1308 raw_spin_lock_irq(&rnp->lock); 1370 raw_spin_lock_irq(&rnp->lock);
1371 if (rsp->gp_flags == 0) {
1372 /* Spurious wakeup, tell caller to go back to sleep. */
1373 raw_spin_unlock_irq(&rnp->lock);
1374 return 0;
1375 }
1309 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1376 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1310 1377
1311 if (rcu_gp_in_progress(rsp)) { 1378 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
1312 /* Grace period already in progress, don't start another. */ 1379 /*
1380 * Grace period already in progress, don't start another.
1381 * Not supposed to be able to happen.
1382 */
1313 raw_spin_unlock_irq(&rnp->lock); 1383 raw_spin_unlock_irq(&rnp->lock);
1314 return 0; 1384 return 0;
1315 } 1385 }
1316 1386
1317 /* Advance to a new grace period and initialize state. */ 1387 /* Advance to a new grace period and initialize state. */
1388 record_gp_stall_check_time(rsp);
1389 smp_wmb(); /* Record GP times before starting GP. */
1318 rsp->gpnum++; 1390 rsp->gpnum++;
1319 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1391 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1320 record_gp_stall_check_time(rsp);
1321 raw_spin_unlock_irq(&rnp->lock); 1392 raw_spin_unlock_irq(&rnp->lock);
1322 1393
1323 /* Exclude any concurrent CPU-hotplug operations. */ 1394 /* Exclude any concurrent CPU-hotplug operations. */
@@ -1366,7 +1437,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1366/* 1437/*
1367 * Do one round of quiescent-state forcing. 1438 * Do one round of quiescent-state forcing.
1368 */ 1439 */
1369int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1440static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1370{ 1441{
1371 int fqs_state = fqs_state_in; 1442 int fqs_state = fqs_state_in;
1372 bool isidle = false; 1443 bool isidle = false;
@@ -1451,8 +1522,12 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1451 rsp->fqs_state = RCU_GP_IDLE; 1522 rsp->fqs_state = RCU_GP_IDLE;
1452 rdp = this_cpu_ptr(rsp->rda); 1523 rdp = this_cpu_ptr(rsp->rda);
1453 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1524 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1454 if (cpu_needs_another_gp(rsp, rdp)) 1525 if (cpu_needs_another_gp(rsp, rdp)) {
1455 rsp->gp_flags = 1; 1526 rsp->gp_flags = RCU_GP_FLAG_INIT;
1527 trace_rcu_grace_period(rsp->name,
1528 ACCESS_ONCE(rsp->gpnum),
1529 TPS("newreq"));
1530 }
1456 raw_spin_unlock_irq(&rnp->lock); 1531 raw_spin_unlock_irq(&rnp->lock);
1457} 1532}
1458 1533
@@ -1462,6 +1537,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1462static int __noreturn rcu_gp_kthread(void *arg) 1537static int __noreturn rcu_gp_kthread(void *arg)
1463{ 1538{
1464 int fqs_state; 1539 int fqs_state;
1540 int gf;
1465 unsigned long j; 1541 unsigned long j;
1466 int ret; 1542 int ret;
1467 struct rcu_state *rsp = arg; 1543 struct rcu_state *rsp = arg;
@@ -1471,14 +1547,19 @@ static int __noreturn rcu_gp_kthread(void *arg)
1471 1547
1472 /* Handle grace-period start. */ 1548 /* Handle grace-period start. */
1473 for (;;) { 1549 for (;;) {
1550 trace_rcu_grace_period(rsp->name,
1551 ACCESS_ONCE(rsp->gpnum),
1552 TPS("reqwait"));
1474 wait_event_interruptible(rsp->gp_wq, 1553 wait_event_interruptible(rsp->gp_wq,
1475 rsp->gp_flags & 1554 ACCESS_ONCE(rsp->gp_flags) &
1476 RCU_GP_FLAG_INIT); 1555 RCU_GP_FLAG_INIT);
1477 if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && 1556 if (rcu_gp_init(rsp))
1478 rcu_gp_init(rsp))
1479 break; 1557 break;
1480 cond_resched(); 1558 cond_resched();
1481 flush_signals(current); 1559 flush_signals(current);
1560 trace_rcu_grace_period(rsp->name,
1561 ACCESS_ONCE(rsp->gpnum),
1562 TPS("reqwaitsig"));
1482 } 1563 }
1483 1564
1484 /* Handle quiescent-state forcing. */ 1565 /* Handle quiescent-state forcing. */
@@ -1488,10 +1569,16 @@ static int __noreturn rcu_gp_kthread(void *arg)
1488 j = HZ; 1569 j = HZ;
1489 jiffies_till_first_fqs = HZ; 1570 jiffies_till_first_fqs = HZ;
1490 } 1571 }
1572 ret = 0;
1491 for (;;) { 1573 for (;;) {
1492 rsp->jiffies_force_qs = jiffies + j; 1574 if (!ret)
1575 rsp->jiffies_force_qs = jiffies + j;
1576 trace_rcu_grace_period(rsp->name,
1577 ACCESS_ONCE(rsp->gpnum),
1578 TPS("fqswait"));
1493 ret = wait_event_interruptible_timeout(rsp->gp_wq, 1579 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1494 (rsp->gp_flags & RCU_GP_FLAG_FQS) || 1580 ((gf = ACCESS_ONCE(rsp->gp_flags)) &
1581 RCU_GP_FLAG_FQS) ||
1495 (!ACCESS_ONCE(rnp->qsmask) && 1582 (!ACCESS_ONCE(rnp->qsmask) &&
1496 !rcu_preempt_blocked_readers_cgp(rnp)), 1583 !rcu_preempt_blocked_readers_cgp(rnp)),
1497 j); 1584 j);
@@ -1500,13 +1587,23 @@ static int __noreturn rcu_gp_kthread(void *arg)
1500 !rcu_preempt_blocked_readers_cgp(rnp)) 1587 !rcu_preempt_blocked_readers_cgp(rnp))
1501 break; 1588 break;
1502 /* If time for quiescent-state forcing, do it. */ 1589 /* If time for quiescent-state forcing, do it. */
1503 if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { 1590 if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
1591 (gf & RCU_GP_FLAG_FQS)) {
1592 trace_rcu_grace_period(rsp->name,
1593 ACCESS_ONCE(rsp->gpnum),
1594 TPS("fqsstart"));
1504 fqs_state = rcu_gp_fqs(rsp, fqs_state); 1595 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1596 trace_rcu_grace_period(rsp->name,
1597 ACCESS_ONCE(rsp->gpnum),
1598 TPS("fqsend"));
1505 cond_resched(); 1599 cond_resched();
1506 } else { 1600 } else {
1507 /* Deal with stray signal. */ 1601 /* Deal with stray signal. */
1508 cond_resched(); 1602 cond_resched();
1509 flush_signals(current); 1603 flush_signals(current);
1604 trace_rcu_grace_period(rsp->name,
1605 ACCESS_ONCE(rsp->gpnum),
1606 TPS("fqswaitsig"));
1510 } 1607 }
1511 j = jiffies_till_next_fqs; 1608 j = jiffies_till_next_fqs;
1512 if (j > HZ) { 1609 if (j > HZ) {
@@ -1554,6 +1651,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1554 return; 1651 return;
1555 } 1652 }
1556 rsp->gp_flags = RCU_GP_FLAG_INIT; 1653 rsp->gp_flags = RCU_GP_FLAG_INIT;
1654 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
1655 TPS("newreq"));
1557 1656
1558 /* 1657 /*
1559 * We can't do wakeups while holding the rnp->lock, as that 1658 * We can't do wakeups while holding the rnp->lock, as that
@@ -2255,7 +2354,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2255 * If called from an extended quiescent state, invoke the RCU 2354 * If called from an extended quiescent state, invoke the RCU
2256 * core in order to force a re-evaluation of RCU's idleness. 2355 * core in order to force a re-evaluation of RCU's idleness.
2257 */ 2356 */
2258 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) 2357 if (!rcu_is_watching() && cpu_online(smp_processor_id()))
2259 invoke_rcu_core(); 2358 invoke_rcu_core();
2260 2359
2261 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2360 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2725,10 +2824,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2725 2824
2726 for_each_rcu_flavor(rsp) { 2825 for_each_rcu_flavor(rsp) {
2727 rdp = per_cpu_ptr(rsp->rda, cpu); 2826 rdp = per_cpu_ptr(rsp->rda, cpu);
2728 if (rdp->qlen != rdp->qlen_lazy) 2827 if (!rdp->nxtlist)
2828 continue;
2829 hc = true;
2830 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
2729 al = false; 2831 al = false;
2730 if (rdp->nxtlist) 2832 break;
2731 hc = true; 2833 }
2732 } 2834 }
2733 if (all_lazy) 2835 if (all_lazy)
2734 *all_lazy = al; 2836 *all_lazy = al;
@@ -3216,7 +3318,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3216 3318
3217/* 3319/*
3218 * Compute the rcu_node tree geometry from kernel parameters. This cannot 3320 * Compute the rcu_node tree geometry from kernel parameters. This cannot
3219 * replace the definitions in rcutree.h because those are needed to size 3321 * replace the definitions in tree.h because those are needed to size
3220 * the ->node array in the rcu_state structure. 3322 * the ->node array in the rcu_state structure.
3221 */ 3323 */
3222static void __init rcu_init_geometry(void) 3324static void __init rcu_init_geometry(void)
@@ -3295,8 +3397,8 @@ void __init rcu_init(void)
3295 3397
3296 rcu_bootup_announce(); 3398 rcu_bootup_announce();
3297 rcu_init_geometry(); 3399 rcu_init_geometry();
3298 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3299 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3400 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3401 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3300 __rcu_init_preempt(); 3402 __rcu_init_preempt();
3301 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3403 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3302 3404
@@ -3311,4 +3413,4 @@ void __init rcu_init(void)
3311 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3413 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3312} 3414}
3313 3415
3314#include "rcutree_plugin.h" 3416#include "tree_plugin.h"
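A large share of the tree.c hunks above are mechanical: &__get_cpu_var(var) becomes this_cpu_ptr(&var) and direct reads become __this_cpu_read(), part of the tree-wide retirement of the __get_cpu_var() interface, while rcu_is_cpu_idle() is renamed (with inverted sense in tiny.c) into __rcu_is_watching()/rcu_is_watching(). A sketch of the newer per-CPU spelling, using a made-up per-CPU variable:

    #include <linux/percpu.h>
    #include <linux/preempt.h>

    struct my_stats {
            unsigned long events;
    };
    static DEFINE_PER_CPU(struct my_stats, my_counters);

    static void count_event(void)
    {
            struct my_stats *s;

            preempt_disable();                      /* stay on this CPU */
            s = this_cpu_ptr(&my_counters);         /* was &__get_cpu_var(my_counters) */
            s->events++;
            preempt_enable();
    }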
diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h
index 5f97eab602cd..52be957c9fe2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcu/tree.h
@@ -104,6 +104,8 @@ struct rcu_dynticks {
104 /* idle-period nonlazy_posted snapshot. */ 104 /* idle-period nonlazy_posted snapshot. */
105 unsigned long last_accelerate; 105 unsigned long last_accelerate;
106 /* Last jiffy CBs were accelerated. */ 106 /* Last jiffy CBs were accelerated. */
107 unsigned long last_advance_all;
108 /* Last jiffy CBs were all advanced. */
107 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 109 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
108#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 110#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
109}; 111};
diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h
index 130c97b027f2..08a765232432 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -28,7 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include "time/tick-internal.h" 31#include "../time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
@@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void)
96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ 96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
97#ifdef CONFIG_RCU_NOCB_CPU_ALL 97#ifdef CONFIG_RCU_NOCB_CPU_ALL
98 pr_info("\tOffload RCU callbacks from all CPUs\n"); 98 pr_info("\tOffload RCU callbacks from all CPUs\n");
99 cpumask_setall(rcu_nocb_mask); 99 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ 100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ 101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
102 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
103 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
104 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
105 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
106 rcu_nocb_mask);
107 }
103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 108 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
104 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); 109 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
105 if (rcu_nocb_poll) 110 if (rcu_nocb_poll)
@@ -660,7 +665,7 @@ static void rcu_preempt_check_callbacks(int cpu)
660 665
661static void rcu_preempt_do_callbacks(void) 666static void rcu_preempt_do_callbacks(void)
662{ 667{
663 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); 668 rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
664} 669}
665 670
666#endif /* #ifdef CONFIG_RCU_BOOST */ 671#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -1128,7 +1133,7 @@ void exit_rcu(void)
1128 1133
1129#ifdef CONFIG_RCU_BOOST 1134#ifdef CONFIG_RCU_BOOST
1130 1135
1131#include "rtmutex_common.h" 1136#include "../locking/rtmutex_common.h"
1132 1137
1133#ifdef CONFIG_RCU_TRACE 1138#ifdef CONFIG_RCU_TRACE
1134 1139
@@ -1332,7 +1337,7 @@ static void invoke_rcu_callbacks_kthread(void)
1332 */ 1337 */
1333static bool rcu_is_callbacks_kthread(void) 1338static bool rcu_is_callbacks_kthread(void)
1334{ 1339{
1335 return __get_cpu_var(rcu_cpu_kthread_task) == current; 1340 return __this_cpu_read(rcu_cpu_kthread_task) == current;
1336} 1341}
1337 1342
1338#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1382,8 +1387,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1382 1387
1383static void rcu_kthread_do_work(void) 1388static void rcu_kthread_do_work(void)
1384{ 1389{
1385 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1390 rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
1386 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1391 rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
1387 rcu_preempt_do_callbacks(); 1392 rcu_preempt_do_callbacks();
1388} 1393}
1389 1394
@@ -1402,7 +1407,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu)
1402 1407
1403static int rcu_cpu_kthread_should_run(unsigned int cpu) 1408static int rcu_cpu_kthread_should_run(unsigned int cpu)
1404{ 1409{
1405 return __get_cpu_var(rcu_cpu_has_work); 1410 return __this_cpu_read(rcu_cpu_has_work);
1406} 1411}
1407 1412
1408/* 1413/*
@@ -1412,8 +1417,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)
1412 */ 1417 */
1413static void rcu_cpu_kthread(unsigned int cpu) 1418static void rcu_cpu_kthread(unsigned int cpu)
1414{ 1419{
1415 unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); 1420 unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
1416 char work, *workp = &__get_cpu_var(rcu_cpu_has_work); 1421 char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
1417 int spincnt; 1422 int spincnt;
1418 1423
1419 for (spincnt = 0; spincnt < 10; spincnt++) { 1424 for (spincnt = 0; spincnt < 10; spincnt++) {
@@ -1627,20 +1632,26 @@ module_param(rcu_idle_gp_delay, int, 0644);
1627static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; 1632static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1628module_param(rcu_idle_lazy_gp_delay, int, 0644); 1633module_param(rcu_idle_lazy_gp_delay, int, 0644);
1629 1634
1630extern int tick_nohz_enabled; 1635extern int tick_nohz_active;
1631 1636
1632/* 1637/*
1633 * Try to advance callbacks for all flavors of RCU on the current CPU. 1638 * Try to advance callbacks for all flavors of RCU on the current CPU, but
1634 * Afterwards, if there are any callbacks ready for immediate invocation, 1639 * only if it has been awhile since the last time we did so. Afterwards,
1635 * return true. 1640 * if there are any callbacks ready for immediate invocation, return true.
1636 */ 1641 */
1637static bool rcu_try_advance_all_cbs(void) 1642static bool rcu_try_advance_all_cbs(void)
1638{ 1643{
1639 bool cbs_ready = false; 1644 bool cbs_ready = false;
1640 struct rcu_data *rdp; 1645 struct rcu_data *rdp;
1646 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1641 struct rcu_node *rnp; 1647 struct rcu_node *rnp;
1642 struct rcu_state *rsp; 1648 struct rcu_state *rsp;
1643 1649
1650 /* Exit early if we advanced recently. */
1651 if (jiffies == rdtp->last_advance_all)
1652 return 0;
1653 rdtp->last_advance_all = jiffies;
1654
1644 for_each_rcu_flavor(rsp) { 1655 for_each_rcu_flavor(rsp) {
1645 rdp = this_cpu_ptr(rsp->rda); 1656 rdp = this_cpu_ptr(rsp->rda);
1646 rnp = rdp->mynode; 1657 rnp = rdp->mynode;
@@ -1718,7 +1729,7 @@ static void rcu_prepare_for_idle(int cpu)
1718 int tne; 1729 int tne;
1719 1730
1720 /* Handle nohz enablement switches conservatively. */ 1731 /* Handle nohz enablement switches conservatively. */
1721 tne = ACCESS_ONCE(tick_nohz_enabled); 1732 tne = ACCESS_ONCE(tick_nohz_active);
1722 if (tne != rdtp->tick_nohz_enabled_snap) { 1733 if (tne != rdtp->tick_nohz_enabled_snap) {
1723 if (rcu_cpu_has_callbacks(cpu, NULL)) 1734 if (rcu_cpu_has_callbacks(cpu, NULL))
1724 invoke_rcu_core(); /* force nohz to see update. */ 1735 invoke_rcu_core(); /* force nohz to see update. */
@@ -1739,6 +1750,8 @@ static void rcu_prepare_for_idle(int cpu)
1739 */ 1750 */
1740 if (rdtp->all_lazy && 1751 if (rdtp->all_lazy &&
1741 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { 1752 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1753 rdtp->all_lazy = false;
1754 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1742 invoke_rcu_core(); 1755 invoke_rcu_core();
1743 return; 1756 return;
1744 } 1757 }
@@ -1768,17 +1781,11 @@ static void rcu_prepare_for_idle(int cpu)
1768 */ 1781 */
1769static void rcu_cleanup_after_idle(int cpu) 1782static void rcu_cleanup_after_idle(int cpu)
1770{ 1783{
1771 struct rcu_data *rdp;
1772 struct rcu_state *rsp;
1773 1784
1774 if (rcu_is_nocb_cpu(cpu)) 1785 if (rcu_is_nocb_cpu(cpu))
1775 return; 1786 return;
1776 rcu_try_advance_all_cbs(); 1787 if (rcu_try_advance_all_cbs())
1777 for_each_rcu_flavor(rsp) { 1788 invoke_rcu_core();
1778 rdp = per_cpu_ptr(rsp->rda, cpu);
1779 if (cpu_has_callbacks_ready_to_invoke(rdp))
1780 invoke_rcu_core();
1781 }
1782} 1789}
1783 1790
1784/* 1791/*
@@ -2108,15 +2115,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2108 2115
2109 /* If we are not being polled and there is a kthread, awaken it ... */ 2116 /* If we are not being polled and there is a kthread, awaken it ... */
2110 t = ACCESS_ONCE(rdp->nocb_kthread); 2117 t = ACCESS_ONCE(rdp->nocb_kthread);
2111 if (rcu_nocb_poll | !t) 2118 if (rcu_nocb_poll || !t) {
2119 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2120 TPS("WakeNotPoll"));
2112 return; 2121 return;
2122 }
2113 len = atomic_long_read(&rdp->nocb_q_count); 2123 len = atomic_long_read(&rdp->nocb_q_count);
2114 if (old_rhpp == &rdp->nocb_head) { 2124 if (old_rhpp == &rdp->nocb_head) {
2115 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ 2125 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2116 rdp->qlen_last_fqs_check = 0; 2126 rdp->qlen_last_fqs_check = 0;
2127 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
2117 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2118 wake_up_process(t); /* ... or if many callbacks queued. */ 2129 wake_up_process(t); /* ... or if many callbacks queued. */
2119 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2130 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2131 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2132 } else {
2133 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2120 } 2134 }
2121 return; 2135 return;
2122} 2136}
@@ -2140,10 +2154,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2140 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2154 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2141 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2155 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2142 (unsigned long)rhp->func, 2156 (unsigned long)rhp->func,
2143 rdp->qlen_lazy, rdp->qlen); 2157 -atomic_long_read(&rdp->nocb_q_count_lazy),
2158 -atomic_long_read(&rdp->nocb_q_count));
2144 else 2159 else
2145 trace_rcu_callback(rdp->rsp->name, rhp, 2160 trace_rcu_callback(rdp->rsp->name, rhp,
2146 rdp->qlen_lazy, rdp->qlen); 2161 -atomic_long_read(&rdp->nocb_q_count_lazy),
2162 -atomic_long_read(&rdp->nocb_q_count));
2147 return 1; 2163 return 1;
2148} 2164}
2149 2165
@@ -2221,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2221static int rcu_nocb_kthread(void *arg) 2237static int rcu_nocb_kthread(void *arg)
2222{ 2238{
2223 int c, cl; 2239 int c, cl;
2240 bool firsttime = 1;
2224 struct rcu_head *list; 2241 struct rcu_head *list;
2225 struct rcu_head *next; 2242 struct rcu_head *next;
2226 struct rcu_head **tail; 2243 struct rcu_head **tail;
@@ -2229,14 +2246,27 @@ static int rcu_nocb_kthread(void *arg)
2229 /* Each pass through this loop invokes one batch of callbacks */ 2246 /* Each pass through this loop invokes one batch of callbacks */
2230 for (;;) { 2247 for (;;) {
2231 /* If not polling, wait for next batch of callbacks. */ 2248 /* If not polling, wait for next batch of callbacks. */
2232 if (!rcu_nocb_poll) 2249 if (!rcu_nocb_poll) {
2250 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2251 TPS("Sleep"));
2233 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2252 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2253 } else if (firsttime) {
2254 firsttime = 0;
2255 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2256 TPS("Poll"));
2257 }
2234 list = ACCESS_ONCE(rdp->nocb_head); 2258 list = ACCESS_ONCE(rdp->nocb_head);
2235 if (!list) { 2259 if (!list) {
2260 if (!rcu_nocb_poll)
2261 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2262 TPS("WokeEmpty"));
2236 schedule_timeout_interruptible(1); 2263 schedule_timeout_interruptible(1);
2237 flush_signals(current); 2264 flush_signals(current);
2238 continue; 2265 continue;
2239 } 2266 }
2267 firsttime = 1;
2268 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2269 TPS("WokeNonEmpty"));
2240 2270
2241 /* 2271 /*
2242 * Extract queued callbacks, update counts, and wait 2272 * Extract queued callbacks, update counts, and wait
@@ -2257,7 +2287,11 @@ static int rcu_nocb_kthread(void *arg)
2257 next = list->next; 2287 next = list->next;
2258 /* Wait for enqueuing to complete, if needed. */ 2288 /* Wait for enqueuing to complete, if needed. */
2259 while (next == NULL && &list->next != tail) { 2289 while (next == NULL && &list->next != tail) {
2290 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2291 TPS("WaitQueue"));
2260 schedule_timeout_interruptible(1); 2292 schedule_timeout_interruptible(1);
2293 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2294 TPS("WokeQueue"));
2261 next = list->next; 2295 next = list->next;
2262 } 2296 }
2263 debug_rcu_head_unqueue(list); 2297 debug_rcu_head_unqueue(list);
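The no-CBs kthread hunks above mostly add trace_rcu_nocb_wake() calls around an otherwise standard kthread loop: sleep on a waitqueue via wait_event_interruptible() until callbacks are queued, drain them, repeat. A stripped-down sketch of that loop shape (all names here are invented; the real loop is rcu_nocb_kthread()):

    #include <linux/kthread.h>
    #include <linux/wait.h>
    #include <linux/sched.h>
    #include <linux/compiler.h>

    struct my_work { struct my_work *next; };

    static DECLARE_WAIT_QUEUE_HEAD(work_wq);
    static struct my_work *work_head;       /* published by producers */

    static int my_worker(void *unused)
    {
            while (!kthread_should_stop()) {
                    /* Sleep until a producer sets work_head or we are asked
                     * to stop; a signal also wakes us, hence the flush. */
                    wait_event_interruptible(work_wq,
                                             ACCESS_ONCE(work_head) ||
                                             kthread_should_stop());
                    flush_signals(current);
                    /* ... detach work_head and invoke the callbacks ... */
            }
            return 0;
    }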
diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c
index cf6c17412932..3596797b7e46 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -44,7 +44,7 @@
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "tree.h"
48 48
49static int r_open(struct inode *inode, struct file *file, 49static int r_open(struct inode *inode, struct file *file,
50 const struct seq_operations *op) 50 const struct seq_operations *op)
diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c
index b02a339836b4..6cb3dff89e2b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcu/update.c
@@ -53,6 +53,12 @@
53 53
54#include "rcu.h" 54#include "rcu.h"
55 55
56MODULE_ALIAS("rcupdate");
57#ifdef MODULE_PARAM_PREFIX
58#undef MODULE_PARAM_PREFIX
59#endif
60#define MODULE_PARAM_PREFIX "rcupdate."
61
56module_param(rcu_expedited, int, 0); 62module_param(rcu_expedited, int, 0);
57 63
58#ifdef CONFIG_PREEMPT_RCU 64#ifdef CONFIG_PREEMPT_RCU
@@ -148,7 +154,7 @@ int rcu_read_lock_bh_held(void)
148{ 154{
149 if (!debug_lockdep_rcu_enabled()) 155 if (!debug_lockdep_rcu_enabled())
150 return 1; 156 return 1;
151 if (rcu_is_cpu_idle()) 157 if (!rcu_is_watching())
152 return 0; 158 return 0;
153 if (!rcu_lockdep_current_cpu_online()) 159 if (!rcu_lockdep_current_cpu_online())
154 return 0; 160 return 0;
@@ -298,7 +304,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
298#endif 304#endif
299 305
300int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 306int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
301int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 307static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
302 308
303module_param(rcu_cpu_stall_suppress, int, 0644); 309module_param(rcu_cpu_stall_suppress, int, 0644);
304module_param(rcu_cpu_stall_timeout, int, 0644); 310module_param(rcu_cpu_stall_timeout, int, 0644);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index f813b3474646..662c83fc16b7 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb)
104} 104}
105EXPORT_SYMBOL(unregister_reboot_notifier); 105EXPORT_SYMBOL(unregister_reboot_notifier);
106 106
107static void migrate_to_reboot_cpu(void) 107void migrate_to_reboot_cpu(void)
108{ 108{
109 /* The boot cpu is always logical cpu 0 */ 109 /* The boot cpu is always logical cpu 0 */
110 int cpu = reboot_cpu; 110 int cpu = reboot_cpu;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 54adcf35f495..7b621409cf15 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-y += wait.o completion.o
15obj-$(CONFIG_SMP) += cpupri.o 16obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 17obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 18obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
new file mode 100644
index 000000000000..a63f4dc27909
--- /dev/null
+++ b/kernel/sched/completion.c
@@ -0,0 +1,299 @@
1/*
2 * Generic wait-for-completion handler;
3 *
4 * It differs from semaphores in that the default case is the opposite:
5 * wait_for_completion() blocks by default whereas a semaphore does not. The
6 * interface also makes it easy to 'complete' multiple waiting threads,
7 * something which isn't entirely natural for semaphores.
8 *
9 * But more importantly, the primitive documents the usage. Semaphores would
10 * typically be used for exclusion which gives rise to priority inversion.
11 * Waiting for completion is typically a synchronization point, but not an exclusion point.
12 */
13
14#include <linux/sched.h>
15#include <linux/completion.h>
16
17/**
18 * complete: - signals a single thread waiting on this completion
19 * @x: holds the state of this particular completion
20 *
21 * This will wake up a single thread waiting on this completion. Threads will be
22 * awakened in the same order in which they were queued.
23 *
24 * See also complete_all(), wait_for_completion() and related routines.
25 *
26 * It may be assumed that this function implies a write memory barrier before
27 * changing the task state if and only if any tasks are woken up.
28 */
29void complete(struct completion *x)
30{
31 unsigned long flags;
32
33 spin_lock_irqsave(&x->wait.lock, flags);
34 x->done++;
35 __wake_up_locked(&x->wait, TASK_NORMAL, 1);
36 spin_unlock_irqrestore(&x->wait.lock, flags);
37}
38EXPORT_SYMBOL(complete);
39
40/**
41 * complete_all: - signals all threads waiting on this completion
42 * @x: holds the state of this particular completion
43 *
44 * This will wake up all threads waiting on this particular completion event.
45 *
46 * It may be assumed that this function implies a write memory barrier before
47 * changing the task state if and only if any tasks are woken up.
48 */
49void complete_all(struct completion *x)
50{
51 unsigned long flags;
52
53 spin_lock_irqsave(&x->wait.lock, flags);
54 x->done += UINT_MAX/2;
55 __wake_up_locked(&x->wait, TASK_NORMAL, 0);
56 spin_unlock_irqrestore(&x->wait.lock, flags);
57}
58EXPORT_SYMBOL(complete_all);
59
60static inline long __sched
61do_wait_for_common(struct completion *x,
62 long (*action)(long), long timeout, int state)
63{
64 if (!x->done) {
65 DECLARE_WAITQUEUE(wait, current);
66
67 __add_wait_queue_tail_exclusive(&x->wait, &wait);
68 do {
69 if (signal_pending_state(state, current)) {
70 timeout = -ERESTARTSYS;
71 break;
72 }
73 __set_current_state(state);
74 spin_unlock_irq(&x->wait.lock);
75 timeout = action(timeout);
76 spin_lock_irq(&x->wait.lock);
77 } while (!x->done && timeout);
78 __remove_wait_queue(&x->wait, &wait);
79 if (!x->done)
80 return timeout;
81 }
82 x->done--;
83 return timeout ?: 1;
84}
85
86static inline long __sched
87__wait_for_common(struct completion *x,
88 long (*action)(long), long timeout, int state)
89{
90 might_sleep();
91
92 spin_lock_irq(&x->wait.lock);
93 timeout = do_wait_for_common(x, action, timeout, state);
94 spin_unlock_irq(&x->wait.lock);
95 return timeout;
96}
97
98static long __sched
99wait_for_common(struct completion *x, long timeout, int state)
100{
101 return __wait_for_common(x, schedule_timeout, timeout, state);
102}
103
104static long __sched
105wait_for_common_io(struct completion *x, long timeout, int state)
106{
107 return __wait_for_common(x, io_schedule_timeout, timeout, state);
108}
109
110/**
111 * wait_for_completion: - waits for completion of a task
112 * @x: holds the state of this particular completion
113 *
114 * This waits to be signaled for completion of a specific task. It is NOT
115 * interruptible and there is no timeout.
116 *
117 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
118 * and interrupt capability. Also see complete().
119 */
120void __sched wait_for_completion(struct completion *x)
121{
122 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
123}
124EXPORT_SYMBOL(wait_for_completion);
125
126/**
127 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
128 * @x: holds the state of this particular completion
129 * @timeout: timeout value in jiffies
130 *
131 * This waits for either a completion of a specific task to be signaled or for a
132 * specified timeout to expire. The timeout is in jiffies. It is not
133 * interruptible.
134 *
135 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
136 * till timeout) if completed.
137 */
138unsigned long __sched
139wait_for_completion_timeout(struct completion *x, unsigned long timeout)
140{
141 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
142}
143EXPORT_SYMBOL(wait_for_completion_timeout);
144
145/**
146 * wait_for_completion_io: - waits for completion of a task
147 * @x: holds the state of this particular completion
148 *
149 * This waits to be signaled for completion of a specific task. It is NOT
150 * interruptible and there is no timeout. The caller is accounted as waiting
151 * for IO.
152 */
153void __sched wait_for_completion_io(struct completion *x)
154{
155 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
156}
157EXPORT_SYMBOL(wait_for_completion_io);
158
159/**
160 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
161 * @x: holds the state of this particular completion
162 * @timeout: timeout value in jiffies
163 *
164 * This waits for either a completion of a specific task to be signaled or for a
165 * specified timeout to expire. The timeout is in jiffies. It is not
166 * interruptible. The caller is accounted as waiting for IO.
167 *
168 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
169 * till timeout) if completed.
170 */
171unsigned long __sched
172wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
173{
174 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
175}
176EXPORT_SYMBOL(wait_for_completion_io_timeout);
177
178/**
179 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
180 * @x: holds the state of this particular completion
181 *
182 * This waits for completion of a specific task to be signaled. It is
183 * interruptible.
184 *
185 * Return: -ERESTARTSYS if interrupted, 0 if completed.
186 */
187int __sched wait_for_completion_interruptible(struct completion *x)
188{
189 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
190 if (t == -ERESTARTSYS)
191 return t;
192 return 0;
193}
194EXPORT_SYMBOL(wait_for_completion_interruptible);
195
196/**
197 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
198 * @x: holds the state of this particular completion
199 * @timeout: timeout value in jiffies
200 *
201 * This waits for either a completion of a specific task to be signaled or for a
202 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
203 *
204 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
205 * or number of jiffies left till timeout) if completed.
206 */
207long __sched
208wait_for_completion_interruptible_timeout(struct completion *x,
209 unsigned long timeout)
210{
211 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
212}
213EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
214
215/**
216 * wait_for_completion_killable: - waits for completion of a task (killable)
217 * @x: holds the state of this particular completion
218 *
219 * This waits to be signaled for completion of a specific task. It can be
220 * interrupted by a kill signal.
221 *
222 * Return: -ERESTARTSYS if interrupted, 0 if completed.
223 */
224int __sched wait_for_completion_killable(struct completion *x)
225{
226 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
227 if (t == -ERESTARTSYS)
228 return t;
229 return 0;
230}
231EXPORT_SYMBOL(wait_for_completion_killable);
232
233/**
234 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
235 * @x: holds the state of this particular completion
236 * @timeout: timeout value in jiffies
237 *
238 * This waits for either a completion of a specific task to be
239 * signaled or for a specified timeout to expire. It can be
240 * interrupted by a kill signal. The timeout is in jiffies.
241 *
242 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
243 * or number of jiffies left till timeout) if completed.
244 */
245long __sched
246wait_for_completion_killable_timeout(struct completion *x,
247 unsigned long timeout)
248{
249 return wait_for_common(x, timeout, TASK_KILLABLE);
250}
251EXPORT_SYMBOL(wait_for_completion_killable_timeout);
252
253/**
254 * try_wait_for_completion - try to decrement a completion without blocking
255 * @x: completion structure
256 *
257 * Return: 0 if a decrement cannot be done without blocking
258 * 1 if a decrement succeeded.
259 *
260 * If a completion is being used as a counting completion,
261 * attempt to decrement the counter without blocking. This
262 * enables us to avoid waiting if the resource the completion
263 * is protecting is not available.
264 */
265bool try_wait_for_completion(struct completion *x)
266{
267 unsigned long flags;
268 int ret = 1;
269
270 spin_lock_irqsave(&x->wait.lock, flags);
271 if (!x->done)
272 ret = 0;
273 else
274 x->done--;
275 spin_unlock_irqrestore(&x->wait.lock, flags);
276 return ret;
277}
278EXPORT_SYMBOL(try_wait_for_completion);
279
280/**
281 * completion_done - Test to see if a completion has any waiters
282 * @x: completion structure
283 *
284 * Return: 0 if there are waiters (wait_for_completion() in progress)
285 * 1 if there are no waiters.
286 *
287 */
288bool completion_done(struct completion *x)
289{
290 unsigned long flags;
291 int ret = 1;
292
293 spin_lock_irqsave(&x->wait.lock, flags);
294 if (!x->done)
295 ret = 0;
296 spin_unlock_irqrestore(&x->wait.lock, flags);
297 return ret;
298}
299EXPORT_SYMBOL(completion_done);
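A minimal usage sketch of the interface defined above, assuming a kernel build context; my_worker(), struct my_data, and the "my_worker" thread name are made up for illustration:

#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

struct my_data {
	struct completion done;
	int result;
};

static int my_worker(void *arg)
{
	struct my_data *d = arg;

	d->result = 42;           /* produce something */
	complete(&d->done);       /* wakes exactly one waiter */
	return 0;
}

static int my_wait_example(void)
{
	struct my_data d = { .result = 0 };
	struct task_struct *tsk;

	init_completion(&d.done);
	tsk = kthread_run(my_worker, &d, "my_worker");
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	wait_for_completion(&d.done);   /* uninterruptible, no timeout */
	return d.result;
}

Because the waiter blocks until complete() has signalled under the completion's own lock, the on-stack struct my_data stays valid for the worker's entire access; that stack-friendliness is one of the reasons to prefer a completion over an open-coded wait queue here.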
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ac63c9a995a..a88f4a485c5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
513 * might also involve a cross-CPU call to trigger the scheduler on 513 * might also involve a cross-CPU call to trigger the scheduler on
514 * the target CPU. 514 * the target CPU.
515 */ 515 */
516#ifdef CONFIG_SMP
517void resched_task(struct task_struct *p) 516void resched_task(struct task_struct *p)
518{ 517{
519 int cpu; 518 int cpu;
520 519
521 assert_raw_spin_locked(&task_rq(p)->lock); 520 lockdep_assert_held(&task_rq(p)->lock);
522 521
523 if (test_tsk_need_resched(p)) 522 if (test_tsk_need_resched(p))
524 return; 523 return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
526 set_tsk_need_resched(p); 525 set_tsk_need_resched(p);
527 526
528 cpu = task_cpu(p); 527 cpu = task_cpu(p);
529 if (cpu == smp_processor_id()) 528 if (cpu == smp_processor_id()) {
529 set_preempt_need_resched();
530 return; 530 return;
531 }
531 532
532 /* NEED_RESCHED must be visible before we test polling */ 533 /* NEED_RESCHED must be visible before we test polling */
533 smp_mb(); 534 smp_mb();
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
546 raw_spin_unlock_irqrestore(&rq->lock, flags); 547 raw_spin_unlock_irqrestore(&rq->lock, flags);
547} 548}
548 549
550#ifdef CONFIG_SMP
549#ifdef CONFIG_NO_HZ_COMMON 551#ifdef CONFIG_NO_HZ_COMMON
550/* 552/*
551 * In the semi idle case, use the nearest busy cpu for migrating timers 553 * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
693 } 695 }
694} 696}
695 697
696#else /* !CONFIG_SMP */
697void resched_task(struct task_struct *p)
698{
699 assert_raw_spin_locked(&task_rq(p)->lock);
700 set_tsk_need_resched(p);
701}
702#endif /* CONFIG_SMP */ 698#endif /* CONFIG_SMP */
703 699
704#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 700#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
767static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 763static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
768{ 764{
769 update_rq_clock(rq); 765 update_rq_clock(rq);
770 sched_info_queued(p); 766 sched_info_queued(rq, p);
771 p->sched_class->enqueue_task(rq, p, flags); 767 p->sched_class->enqueue_task(rq, p, flags);
772} 768}
773 769
774static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 770static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
775{ 771{
776 update_rq_clock(rq); 772 update_rq_clock(rq);
777 sched_info_dequeued(p); 773 sched_info_dequeued(rq, p);
778 p->sched_class->dequeue_task(rq, p, flags); 774 p->sched_class->dequeue_task(rq, p, flags);
779} 775}
780 776
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
987 * ttwu() will sort out the placement. 983 * ttwu() will sort out the placement.
988 */ 984 */
989 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 985 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
990 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 986 !(task_preempt_count(p) & PREEMPT_ACTIVE));
991 987
992#ifdef CONFIG_LOCKDEP 988#ifdef CONFIG_LOCKDEP
993 /* 989 /*
@@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1017 __set_task_cpu(p, new_cpu); 1013 __set_task_cpu(p, new_cpu);
1018} 1014}
1019 1015
1016static void __migrate_swap_task(struct task_struct *p, int cpu)
1017{
1018 if (p->on_rq) {
1019 struct rq *src_rq, *dst_rq;
1020
1021 src_rq = task_rq(p);
1022 dst_rq = cpu_rq(cpu);
1023
1024 deactivate_task(src_rq, p, 0);
1025 set_task_cpu(p, cpu);
1026 activate_task(dst_rq, p, 0);
1027 check_preempt_curr(dst_rq, p, 0);
1028 } else {
1029 /*
1030 * Task isn't running anymore; make it appear like we migrated
1031 * it before it went to sleep. This means on wakeup we make the
1032 * previous cpu our targer instead of where it really is.
1033 */
1034 p->wake_cpu = cpu;
1035 }
1036}
1037
1038struct migration_swap_arg {
1039 struct task_struct *src_task, *dst_task;
1040 int src_cpu, dst_cpu;
1041};
1042
1043static int migrate_swap_stop(void *data)
1044{
1045 struct migration_swap_arg *arg = data;
1046 struct rq *src_rq, *dst_rq;
1047 int ret = -EAGAIN;
1048
1049 src_rq = cpu_rq(arg->src_cpu);
1050 dst_rq = cpu_rq(arg->dst_cpu);
1051
1052 double_raw_lock(&arg->src_task->pi_lock,
1053 &arg->dst_task->pi_lock);
1054 double_rq_lock(src_rq, dst_rq);
1055 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1056 goto unlock;
1057
1058 if (task_cpu(arg->src_task) != arg->src_cpu)
1059 goto unlock;
1060
1061 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1062 goto unlock;
1063
1064 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1065 goto unlock;
1066
1067 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1068 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1069
1070 ret = 0;
1071
1072unlock:
1073 double_rq_unlock(src_rq, dst_rq);
1074 raw_spin_unlock(&arg->dst_task->pi_lock);
1075 raw_spin_unlock(&arg->src_task->pi_lock);
1076
1077 return ret;
1078}
1079
1080/*
1081 * Cross migrate two tasks
1082 */
1083int migrate_swap(struct task_struct *cur, struct task_struct *p)
1084{
1085 struct migration_swap_arg arg;
1086 int ret = -EINVAL;
1087
1088 arg = (struct migration_swap_arg){
1089 .src_task = cur,
1090 .src_cpu = task_cpu(cur),
1091 .dst_task = p,
1092 .dst_cpu = task_cpu(p),
1093 };
1094
1095 if (arg.src_cpu == arg.dst_cpu)
1096 goto out;
1097
1098 /*
1099 * These three tests are all lockless; this is OK since all of them
1100 * will be re-checked with proper locks held further down the line.
1101 */
1102 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1103 goto out;
1104
1105 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1106 goto out;
1107
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out;
1110
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112
1113out:
1114 return ret;
1115}
1116
1020struct migration_arg { 1117struct migration_arg {
1021 struct task_struct *task; 1118 struct task_struct *task;
1022 int dest_cpu; 1119 int dest_cpu;
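migrate_swap() above checks cpu_active() and the affinity masks locklessly only as an early-out; migrate_swap_stop() then repeats the same checks once the runqueue and pi locks are actually held, because the lockless answers may be stale by that point. A standalone sketch of that check-then-revalidate pattern, using a pthread mutex and two fake "cpu" fields in place of the runqueue locks (nothing here is kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int src_cpu = 0, dst_cpu = 1;

static int try_swap(int want_src, int want_dst)
{
	/* Lockless pre-checks: may race, used only to avoid useless locking. */
	if (atomic_load(&src_cpu) != want_src || atomic_load(&dst_cpu) != want_dst)
		return -1;

	pthread_mutex_lock(&lock);
	/* Re-validate under the lock; the state may have changed meanwhile. */
	if (src_cpu != want_src || dst_cpu != want_dst) {
		pthread_mutex_unlock(&lock);
		return -1;              /* -EAGAIN in the kernel version */
	}
	src_cpu = want_dst;
	dst_cpu = want_src;
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	printf("swap %s\n", try_swap(0, 1) == 0 ? "succeeded" : "raced, retry");
	return 0;
}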
@@ -1236,9 +1333,9 @@ out:
1236 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1333 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1237 */ 1334 */
1238static inline 1335static inline
1239int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1336int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1240{ 1337{
1241 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1338 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1242 1339
1243 /* 1340 /*
1244 * In order not to call set_task_cpu() on a blocking task we need 1341 * In order not to call set_task_cpu() on a blocking task we need
@@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1330 1427
1331 if (rq->idle_stamp) { 1428 if (rq->idle_stamp) {
1332 u64 delta = rq_clock(rq) - rq->idle_stamp; 1429 u64 delta = rq_clock(rq) - rq->idle_stamp;
1333 u64 max = 2*sysctl_sched_migration_cost; 1430 u64 max = 2*rq->max_idle_balance_cost;
1334 1431
1335 if (delta > max) 1432 update_avg(&rq->avg_idle, delta);
1433
1434 if (rq->avg_idle > max)
1336 rq->avg_idle = max; 1435 rq->avg_idle = max;
1337 else 1436
1338 update_avg(&rq->avg_idle, delta);
1339 rq->idle_stamp = 0; 1437 rq->idle_stamp = 0;
1340 } 1438 }
1341#endif 1439#endif
@@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void)
1396 1494
1397void scheduler_ipi(void) 1495void scheduler_ipi(void)
1398{ 1496{
1497 /*
1498 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI.
1501 */
1502 if (tif_need_resched())
1503 set_preempt_need_resched();
1504
1399 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1400 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
1401 && !got_nohz_idle_kick()) 1507 && !got_nohz_idle_kick())
@@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1513 if (p->sched_class->task_waking) 1619 if (p->sched_class->task_waking)
1514 p->sched_class->task_waking(p); 1620 p->sched_class->task_waking(p);
1515 1621
1516 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1622 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1517 if (task_cpu(p) != cpu) { 1623 if (task_cpu(p) != cpu) {
1518 wake_flags |= WF_MIGRATED; 1624 wake_flags |= WF_MIGRATED;
1519 set_task_cpu(p, cpu); 1625 set_task_cpu(p, cpu);
@@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1595 * 1701 *
1596 * __sched_fork() is basic setup used by init_idle() too: 1702 * __sched_fork() is basic setup used by init_idle() too:
1597 */ 1703 */
1598static void __sched_fork(struct task_struct *p) 1704static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1599{ 1705{
1600 p->on_rq = 0; 1706 p->on_rq = 0;
1601 1707
@@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)
1619 1725
1620#ifdef CONFIG_NUMA_BALANCING 1726#ifdef CONFIG_NUMA_BALANCING
1621 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1727 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1622 p->mm->numa_next_scan = jiffies; 1728 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1623 p->mm->numa_next_reset = jiffies;
1624 p->mm->numa_scan_seq = 0; 1729 p->mm->numa_scan_seq = 0;
1625 } 1730 }
1626 1731
1732 if (clone_flags & CLONE_VM)
1733 p->numa_preferred_nid = current->numa_preferred_nid;
1734 else
1735 p->numa_preferred_nid = -1;
1736
1627 p->node_stamp = 0ULL; 1737 p->node_stamp = 0ULL;
1628 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1738 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1629 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1630 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1739 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1631 p->numa_work.next = &p->numa_work; 1740 p->numa_work.next = &p->numa_work;
1741 p->numa_faults = NULL;
1742 p->numa_faults_buffer = NULL;
1743
1744 INIT_LIST_HEAD(&p->numa_entry);
1745 p->numa_group = NULL;
1632#endif /* CONFIG_NUMA_BALANCING */ 1746#endif /* CONFIG_NUMA_BALANCING */
1633} 1747}
1634 1748
@@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
1654/* 1768/*
1655 * fork()/clone()-time setup: 1769 * fork()/clone()-time setup:
1656 */ 1770 */
1657void sched_fork(struct task_struct *p) 1771void sched_fork(unsigned long clone_flags, struct task_struct *p)
1658{ 1772{
1659 unsigned long flags; 1773 unsigned long flags;
1660 int cpu = get_cpu(); 1774 int cpu = get_cpu();
1661 1775
1662 __sched_fork(p); 1776 __sched_fork(clone_flags, p);
1663 /* 1777 /*
1664 * We mark the process as running here. This guarantees that 1778 * We mark the process as running here. This guarantees that
1665 * nobody will actually run it, and a signal or other external 1779 * nobody will actually run it, and a signal or other external
@@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p)
1717#if defined(CONFIG_SMP) 1831#if defined(CONFIG_SMP)
1718 p->on_cpu = 0; 1832 p->on_cpu = 0;
1719#endif 1833#endif
1720#ifdef CONFIG_PREEMPT_COUNT 1834 init_task_preempt_count(p);
1721 /* Want to start with kernel preemption disabled. */
1722 task_thread_info(p)->preempt_count = 1;
1723#endif
1724#ifdef CONFIG_SMP 1835#ifdef CONFIG_SMP
1725 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1836 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1726#endif 1837#endif
@@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)
1747 * - cpus_allowed can change in the fork path 1858 * - cpus_allowed can change in the fork path
1748 * - any previously selected cpu might disappear through hotplug 1859 * - any previously selected cpu might disappear through hotplug
1749 */ 1860 */
1750 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1861 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
1751#endif 1862#endif
1752 1863
1753 /* Initialize new task's runnable average */ 1864 /* Initialize new task's runnable average */
@@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1838 struct task_struct *next) 1949 struct task_struct *next)
1839{ 1950{
1840 trace_sched_switch(prev, next); 1951 trace_sched_switch(prev, next);
1841 sched_info_switch(prev, next); 1952 sched_info_switch(rq, prev, next);
1842 perf_event_task_sched_out(prev, next); 1953 perf_event_task_sched_out(prev, next);
1843 fire_sched_out_preempt_notifiers(prev, next); 1954 fire_sched_out_preempt_notifiers(prev, next);
1844 prepare_lock_switch(rq, next); 1955 prepare_lock_switch(rq, next);
@@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1890 if (mm) 2001 if (mm)
1891 mmdrop(mm); 2002 mmdrop(mm);
1892 if (unlikely(prev_state == TASK_DEAD)) { 2003 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev);
2005
1893 /* 2006 /*
1894 * Remove function-return probe instances associated with this 2007 * Remove function-return probe instances associated with this
1895 * task and put them back on the free list. 2008 * task and put them back on the free list.
@@ -2073,7 +2186,7 @@ void sched_exec(void)
2073 int dest_cpu; 2186 int dest_cpu;
2074 2187
2075 raw_spin_lock_irqsave(&p->pi_lock, flags); 2188 raw_spin_lock_irqsave(&p->pi_lock, flags);
2076 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2189 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2077 if (dest_cpu == smp_processor_id()) 2190 if (dest_cpu == smp_processor_id())
2078 goto unlock; 2191 goto unlock;
2079 2192
@@ -2140,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2140 struct rq *rq; 2253 struct rq *rq;
2141 u64 ns = 0; 2254 u64 ns = 0;
2142 2255
2256#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2257 /*
 2258 * 64-bit doesn't need locks to atomically read a 64-bit value.
 2259 * So we have an optimization chance when the task's delta_exec is 0.
2260 * Reading ->on_cpu is racy, but this is ok.
2261 *
2262 * If we race with it leaving cpu, we'll take a lock. So we're correct.
2263 * If we race with it entering cpu, unaccounted time is 0. This is
2264 * indistinguishable from the read occurring a few cycles earlier.
2265 */
2266 if (!p->on_cpu)
2267 return p->se.sum_exec_runtime;
2268#endif
2269
2143 rq = task_rq_lock(p, &flags); 2270 rq = task_rq_lock(p, &flags);
2144 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2271 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2145 task_rq_unlock(rq, p, &flags); 2272 task_rq_unlock(rq, p, &flags);
@@ -2215,7 +2342,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2215#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2342#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2216 defined(CONFIG_PREEMPT_TRACER)) 2343 defined(CONFIG_PREEMPT_TRACER))
2217 2344
2218void __kprobes add_preempt_count(int val) 2345void __kprobes preempt_count_add(int val)
2219{ 2346{
2220#ifdef CONFIG_DEBUG_PREEMPT 2347#ifdef CONFIG_DEBUG_PREEMPT
2221 /* 2348 /*
@@ -2224,7 +2351,7 @@ void __kprobes add_preempt_count(int val)
2224 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2351 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2225 return; 2352 return;
2226#endif 2353#endif
2227 preempt_count() += val; 2354 __preempt_count_add(val);
2228#ifdef CONFIG_DEBUG_PREEMPT 2355#ifdef CONFIG_DEBUG_PREEMPT
2229 /* 2356 /*
2230 * Spinlock count overflowing soon? 2357 * Spinlock count overflowing soon?
@@ -2235,9 +2362,9 @@ void __kprobes add_preempt_count(int val)
2235 if (preempt_count() == val) 2362 if (preempt_count() == val)
2236 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2363 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2237} 2364}
2238EXPORT_SYMBOL(add_preempt_count); 2365EXPORT_SYMBOL(preempt_count_add);
2239 2366
2240void __kprobes sub_preempt_count(int val) 2367void __kprobes preempt_count_sub(int val)
2241{ 2368{
2242#ifdef CONFIG_DEBUG_PREEMPT 2369#ifdef CONFIG_DEBUG_PREEMPT
2243 /* 2370 /*
@@ -2255,9 +2382,9 @@ void __kprobes sub_preempt_count(int val)
2255 2382
2256 if (preempt_count() == val) 2383 if (preempt_count() == val)
2257 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2384 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2258 preempt_count() -= val; 2385 __preempt_count_sub(val);
2259} 2386}
2260EXPORT_SYMBOL(sub_preempt_count); 2387EXPORT_SYMBOL(preempt_count_sub);
2261 2388
2262#endif 2389#endif
2263 2390
@@ -2430,6 +2557,7 @@ need_resched:
2430 put_prev_task(rq, prev); 2557 put_prev_task(rq, prev);
2431 next = pick_next_task(rq); 2558 next = pick_next_task(rq);
2432 clear_tsk_need_resched(prev); 2559 clear_tsk_need_resched(prev);
2560 clear_preempt_need_resched();
2433 rq->skip_clock_update = 0; 2561 rq->skip_clock_update = 0;
2434 2562
2435 if (likely(prev != next)) { 2563 if (likely(prev != next)) {
@@ -2520,9 +2648,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
2520 return; 2648 return;
2521 2649
2522 do { 2650 do {
2523 add_preempt_count_notrace(PREEMPT_ACTIVE); 2651 __preempt_count_add(PREEMPT_ACTIVE);
2524 __schedule(); 2652 __schedule();
2525 sub_preempt_count_notrace(PREEMPT_ACTIVE); 2653 __preempt_count_sub(PREEMPT_ACTIVE);
2526 2654
2527 /* 2655 /*
2528 * Check again in case we missed a preemption opportunity 2656 * Check again in case we missed a preemption opportunity
@@ -2532,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
2532 } while (need_resched()); 2660 } while (need_resched());
2533} 2661}
2534EXPORT_SYMBOL(preempt_schedule); 2662EXPORT_SYMBOL(preempt_schedule);
2663#endif /* CONFIG_PREEMPT */
2535 2664
2536/* 2665/*
2537 * this is the entry point to schedule() from kernel preemption 2666 * this is the entry point to schedule() from kernel preemption
@@ -2541,20 +2670,19 @@ EXPORT_SYMBOL(preempt_schedule);
2541 */ 2670 */
2542asmlinkage void __sched preempt_schedule_irq(void) 2671asmlinkage void __sched preempt_schedule_irq(void)
2543{ 2672{
2544 struct thread_info *ti = current_thread_info();
2545 enum ctx_state prev_state; 2673 enum ctx_state prev_state;
2546 2674
2547 /* Catch callers which need to be fixed */ 2675 /* Catch callers which need to be fixed */
2548 BUG_ON(ti->preempt_count || !irqs_disabled()); 2676 BUG_ON(preempt_count() || !irqs_disabled());
2549 2677
2550 prev_state = exception_enter(); 2678 prev_state = exception_enter();
2551 2679
2552 do { 2680 do {
2553 add_preempt_count(PREEMPT_ACTIVE); 2681 __preempt_count_add(PREEMPT_ACTIVE);
2554 local_irq_enable(); 2682 local_irq_enable();
2555 __schedule(); 2683 __schedule();
2556 local_irq_disable(); 2684 local_irq_disable();
2557 sub_preempt_count(PREEMPT_ACTIVE); 2685 __preempt_count_sub(PREEMPT_ACTIVE);
2558 2686
2559 /* 2687 /*
2560 * Check again in case we missed a preemption opportunity 2688 * Check again in case we missed a preemption opportunity
@@ -2566,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void)
2566 exception_exit(prev_state); 2694 exception_exit(prev_state);
2567} 2695}
2568 2696
2569#endif /* CONFIG_PREEMPT */
2570
2571int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 2697int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2572 void *key) 2698 void *key)
2573{ 2699{
@@ -2575,393 +2701,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2575} 2701}
2576EXPORT_SYMBOL(default_wake_function); 2702EXPORT_SYMBOL(default_wake_function);
2577 2703
2578/*
2579 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
2580 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
2581 * number) then we wake all the non-exclusive tasks and one exclusive task.
2582 *
2583 * There are circumstances in which we can try to wake a task which has already
2584 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
2585 * zero in this (rare) case, and we handle it by continuing to scan the queue.
2586 */
2587static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2588 int nr_exclusive, int wake_flags, void *key)
2589{
2590 wait_queue_t *curr, *next;
2591
2592 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
2593 unsigned flags = curr->flags;
2594
2595 if (curr->func(curr, mode, wake_flags, key) &&
2596 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
2597 break;
2598 }
2599}
2600
2601/**
2602 * __wake_up - wake up threads blocked on a waitqueue.
2603 * @q: the waitqueue
2604 * @mode: which threads
2605 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2606 * @key: is directly passed to the wakeup function
2607 *
2608 * It may be assumed that this function implies a write memory barrier before
2609 * changing the task state if and only if any tasks are woken up.
2610 */
2611void __wake_up(wait_queue_head_t *q, unsigned int mode,
2612 int nr_exclusive, void *key)
2613{
2614 unsigned long flags;
2615
2616 spin_lock_irqsave(&q->lock, flags);
2617 __wake_up_common(q, mode, nr_exclusive, 0, key);
2618 spin_unlock_irqrestore(&q->lock, flags);
2619}
2620EXPORT_SYMBOL(__wake_up);
2621
2622/*
2623 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2624 */
2625void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
2626{
2627 __wake_up_common(q, mode, nr, 0, NULL);
2628}
2629EXPORT_SYMBOL_GPL(__wake_up_locked);
2630
2631void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
2632{
2633 __wake_up_common(q, mode, 1, 0, key);
2634}
2635EXPORT_SYMBOL_GPL(__wake_up_locked_key);
2636
2637/**
2638 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
2639 * @q: the waitqueue
2640 * @mode: which threads
2641 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2642 * @key: opaque value to be passed to wakeup targets
2643 *
2644 * The sync wakeup differs that the waker knows that it will schedule
2645 * away soon, so while the target thread will be woken up, it will not
2646 * be migrated to another CPU - ie. the two threads are 'synchronized'
2647 * with each other. This can prevent needless bouncing between CPUs.
2648 *
2649 * On UP it can prevent extra preemption.
2650 *
2651 * It may be assumed that this function implies a write memory barrier before
2652 * changing the task state if and only if any tasks are woken up.
2653 */
2654void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
2655 int nr_exclusive, void *key)
2656{
2657 unsigned long flags;
2658 int wake_flags = WF_SYNC;
2659
2660 if (unlikely(!q))
2661 return;
2662
2663 if (unlikely(nr_exclusive != 1))
2664 wake_flags = 0;
2665
2666 spin_lock_irqsave(&q->lock, flags);
2667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
2668 spin_unlock_irqrestore(&q->lock, flags);
2669}
2670EXPORT_SYMBOL_GPL(__wake_up_sync_key);
2671
2672/*
2673 * __wake_up_sync - see __wake_up_sync_key()
2674 */
2675void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2676{
2677 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
2678}
2679EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
2680
2681/**
2682 * complete: - signals a single thread waiting on this completion
2683 * @x: holds the state of this particular completion
2684 *
2685 * This will wake up a single thread waiting on this completion. Threads will be
2686 * awakened in the same order in which they were queued.
2687 *
2688 * See also complete_all(), wait_for_completion() and related routines.
2689 *
2690 * It may be assumed that this function implies a write memory barrier before
2691 * changing the task state if and only if any tasks are woken up.
2692 */
2693void complete(struct completion *x)
2694{
2695 unsigned long flags;
2696
2697 spin_lock_irqsave(&x->wait.lock, flags);
2698 x->done++;
2699 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
2700 spin_unlock_irqrestore(&x->wait.lock, flags);
2701}
2702EXPORT_SYMBOL(complete);
2703
2704/**
2705 * complete_all: - signals all threads waiting on this completion
2706 * @x: holds the state of this particular completion
2707 *
2708 * This will wake up all threads waiting on this particular completion event.
2709 *
2710 * It may be assumed that this function implies a write memory barrier before
2711 * changing the task state if and only if any tasks are woken up.
2712 */
2713void complete_all(struct completion *x)
2714{
2715 unsigned long flags;
2716
2717 spin_lock_irqsave(&x->wait.lock, flags);
2718 x->done += UINT_MAX/2;
2719 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
2720 spin_unlock_irqrestore(&x->wait.lock, flags);
2721}
2722EXPORT_SYMBOL(complete_all);
2723
2724static inline long __sched
2725do_wait_for_common(struct completion *x,
2726 long (*action)(long), long timeout, int state)
2727{
2728 if (!x->done) {
2729 DECLARE_WAITQUEUE(wait, current);
2730
2731 __add_wait_queue_tail_exclusive(&x->wait, &wait);
2732 do {
2733 if (signal_pending_state(state, current)) {
2734 timeout = -ERESTARTSYS;
2735 break;
2736 }
2737 __set_current_state(state);
2738 spin_unlock_irq(&x->wait.lock);
2739 timeout = action(timeout);
2740 spin_lock_irq(&x->wait.lock);
2741 } while (!x->done && timeout);
2742 __remove_wait_queue(&x->wait, &wait);
2743 if (!x->done)
2744 return timeout;
2745 }
2746 x->done--;
2747 return timeout ?: 1;
2748}
2749
2750static inline long __sched
2751__wait_for_common(struct completion *x,
2752 long (*action)(long), long timeout, int state)
2753{
2754 might_sleep();
2755
2756 spin_lock_irq(&x->wait.lock);
2757 timeout = do_wait_for_common(x, action, timeout, state);
2758 spin_unlock_irq(&x->wait.lock);
2759 return timeout;
2760}
2761
2762static long __sched
2763wait_for_common(struct completion *x, long timeout, int state)
2764{
2765 return __wait_for_common(x, schedule_timeout, timeout, state);
2766}
2767
2768static long __sched
2769wait_for_common_io(struct completion *x, long timeout, int state)
2770{
2771 return __wait_for_common(x, io_schedule_timeout, timeout, state);
2772}
2773
2774/**
2775 * wait_for_completion: - waits for completion of a task
2776 * @x: holds the state of this particular completion
2777 *
2778 * This waits to be signaled for completion of a specific task. It is NOT
2779 * interruptible and there is no timeout.
2780 *
2781 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
2782 * and interrupt capability. Also see complete().
2783 */
2784void __sched wait_for_completion(struct completion *x)
2785{
2786 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2787}
2788EXPORT_SYMBOL(wait_for_completion);
2789
2790/**
2791 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
2792 * @x: holds the state of this particular completion
2793 * @timeout: timeout value in jiffies
2794 *
2795 * This waits for either a completion of a specific task to be signaled or for a
2796 * specified timeout to expire. The timeout is in jiffies. It is not
2797 * interruptible.
2798 *
2799 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
2800 * till timeout) if completed.
2801 */
2802unsigned long __sched
2803wait_for_completion_timeout(struct completion *x, unsigned long timeout)
2804{
2805 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
2806}
2807EXPORT_SYMBOL(wait_for_completion_timeout);
2808
2809/**
2810 * wait_for_completion_io: - waits for completion of a task
2811 * @x: holds the state of this particular completion
2812 *
2813 * This waits to be signaled for completion of a specific task. It is NOT
2814 * interruptible and there is no timeout. The caller is accounted as waiting
2815 * for IO.
2816 */
2817void __sched wait_for_completion_io(struct completion *x)
2818{
2819 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2820}
2821EXPORT_SYMBOL(wait_for_completion_io);
2822
2823/**
2824 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
2825 * @x: holds the state of this particular completion
2826 * @timeout: timeout value in jiffies
2827 *
2828 * This waits for either a completion of a specific task to be signaled or for a
2829 * specified timeout to expire. The timeout is in jiffies. It is not
2830 * interruptible. The caller is accounted as waiting for IO.
2831 *
2832 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
2833 * till timeout) if completed.
2834 */
2835unsigned long __sched
2836wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
2837{
2838 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
2839}
2840EXPORT_SYMBOL(wait_for_completion_io_timeout);
2841
2842/**
2843 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
2844 * @x: holds the state of this particular completion
2845 *
2846 * This waits for completion of a specific task to be signaled. It is
2847 * interruptible.
2848 *
2849 * Return: -ERESTARTSYS if interrupted, 0 if completed.
2850 */
2851int __sched wait_for_completion_interruptible(struct completion *x)
2852{
2853 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
2854 if (t == -ERESTARTSYS)
2855 return t;
2856 return 0;
2857}
2858EXPORT_SYMBOL(wait_for_completion_interruptible);
2859
2860/**
2861 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
2862 * @x: holds the state of this particular completion
2863 * @timeout: timeout value in jiffies
2864 *
2865 * This waits for either a completion of a specific task to be signaled or for a
2866 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
2867 *
2868 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
2869 * or number of jiffies left till timeout) if completed.
2870 */
2871long __sched
2872wait_for_completion_interruptible_timeout(struct completion *x,
2873 unsigned long timeout)
2874{
2875 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
2876}
2877EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
2878
2879/**
2880 * wait_for_completion_killable: - waits for completion of a task (killable)
2881 * @x: holds the state of this particular completion
2882 *
2883 * This waits to be signaled for completion of a specific task. It can be
2884 * interrupted by a kill signal.
2885 *
2886 * Return: -ERESTARTSYS if interrupted, 0 if completed.
2887 */
2888int __sched wait_for_completion_killable(struct completion *x)
2889{
2890 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
2891 if (t == -ERESTARTSYS)
2892 return t;
2893 return 0;
2894}
2895EXPORT_SYMBOL(wait_for_completion_killable);
2896
2897/**
2898 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
2899 * @x: holds the state of this particular completion
2900 * @timeout: timeout value in jiffies
2901 *
2902 * This waits for either a completion of a specific task to be
2903 * signaled or for a specified timeout to expire. It can be
2904 * interrupted by a kill signal. The timeout is in jiffies.
2905 *
2906 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
2907 * or number of jiffies left till timeout) if completed.
2908 */
2909long __sched
2910wait_for_completion_killable_timeout(struct completion *x,
2911 unsigned long timeout)
2912{
2913 return wait_for_common(x, timeout, TASK_KILLABLE);
2914}
2915EXPORT_SYMBOL(wait_for_completion_killable_timeout);
2916
2917/**
2918 * try_wait_for_completion - try to decrement a completion without blocking
2919 * @x: completion structure
2920 *
2921 * Return: 0 if a decrement cannot be done without blocking
2922 * 1 if a decrement succeeded.
2923 *
2924 * If a completion is being used as a counting completion,
2925 * attempt to decrement the counter without blocking. This
2926 * enables us to avoid waiting if the resource the completion
2927 * is protecting is not available.
2928 */
2929bool try_wait_for_completion(struct completion *x)
2930{
2931 unsigned long flags;
2932 int ret = 1;
2933
2934 spin_lock_irqsave(&x->wait.lock, flags);
2935 if (!x->done)
2936 ret = 0;
2937 else
2938 x->done--;
2939 spin_unlock_irqrestore(&x->wait.lock, flags);
2940 return ret;
2941}
2942EXPORT_SYMBOL(try_wait_for_completion);
2943
2944/**
2945 * completion_done - Test to see if a completion has any waiters
2946 * @x: completion structure
2947 *
2948 * Return: 0 if there are waiters (wait_for_completion() in progress)
2949 * 1 if there are no waiters.
2950 *
2951 */
2952bool completion_done(struct completion *x)
2953{
2954 unsigned long flags;
2955 int ret = 1;
2956
2957 spin_lock_irqsave(&x->wait.lock, flags);
2958 if (!x->done)
2959 ret = 0;
2960 spin_unlock_irqrestore(&x->wait.lock, flags);
2961 return ret;
2962}
2963EXPORT_SYMBOL(completion_done);
2964
2965static long __sched 2704static long __sched
2966sleep_on_common(wait_queue_head_t *q, int state, long timeout) 2705sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2967{ 2706{
@@ -3598,13 +3337,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3598 struct task_struct *p; 3337 struct task_struct *p;
3599 int retval; 3338 int retval;
3600 3339
3601 get_online_cpus();
3602 rcu_read_lock(); 3340 rcu_read_lock();
3603 3341
3604 p = find_process_by_pid(pid); 3342 p = find_process_by_pid(pid);
3605 if (!p) { 3343 if (!p) {
3606 rcu_read_unlock(); 3344 rcu_read_unlock();
3607 put_online_cpus();
3608 return -ESRCH; 3345 return -ESRCH;
3609 } 3346 }
3610 3347
@@ -3661,7 +3398,6 @@ out_free_cpus_allowed:
3661 free_cpumask_var(cpus_allowed); 3398 free_cpumask_var(cpus_allowed);
3662out_put_task: 3399out_put_task:
3663 put_task_struct(p); 3400 put_task_struct(p);
3664 put_online_cpus();
3665 return retval; 3401 return retval;
3666} 3402}
3667 3403
@@ -3706,7 +3442,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3706 unsigned long flags; 3442 unsigned long flags;
3707 int retval; 3443 int retval;
3708 3444
3709 get_online_cpus();
3710 rcu_read_lock(); 3445 rcu_read_lock();
3711 3446
3712 retval = -ESRCH; 3447 retval = -ESRCH;
@@ -3719,12 +3454,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3719 goto out_unlock; 3454 goto out_unlock;
3720 3455
3721 raw_spin_lock_irqsave(&p->pi_lock, flags); 3456 raw_spin_lock_irqsave(&p->pi_lock, flags);
3722 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 3457 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3723 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3458 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3724 3459
3725out_unlock: 3460out_unlock:
3726 rcu_read_unlock(); 3461 rcu_read_unlock();
3727 put_online_cpus();
3728 3462
3729 return retval; 3463 return retval;
3730} 3464}
@@ -3794,16 +3528,11 @@ SYSCALL_DEFINE0(sched_yield)
3794 return 0; 3528 return 0;
3795} 3529}
3796 3530
3797static inline int should_resched(void)
3798{
3799 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
3800}
3801
3802static void __cond_resched(void) 3531static void __cond_resched(void)
3803{ 3532{
3804 add_preempt_count(PREEMPT_ACTIVE); 3533 __preempt_count_add(PREEMPT_ACTIVE);
3805 __schedule(); 3534 __schedule();
3806 sub_preempt_count(PREEMPT_ACTIVE); 3535 __preempt_count_sub(PREEMPT_ACTIVE);
3807} 3536}
3808 3537
3809int __sched _cond_resched(void) 3538int __sched _cond_resched(void)
@@ -4186,7 +3915,7 @@ void init_idle(struct task_struct *idle, int cpu)
4186 3915
4187 raw_spin_lock_irqsave(&rq->lock, flags); 3916 raw_spin_lock_irqsave(&rq->lock, flags);
4188 3917
4189 __sched_fork(idle); 3918 __sched_fork(0, idle);
4190 idle->state = TASK_RUNNING; 3919 idle->state = TASK_RUNNING;
4191 idle->se.exec_start = sched_clock(); 3920 idle->se.exec_start = sched_clock();
4192 3921
@@ -4212,7 +3941,7 @@ void init_idle(struct task_struct *idle, int cpu)
4212 raw_spin_unlock_irqrestore(&rq->lock, flags); 3941 raw_spin_unlock_irqrestore(&rq->lock, flags);
4213 3942
4214 /* Set the preempt count _outside_ the spinlocks! */ 3943 /* Set the preempt count _outside_ the spinlocks! */
4215 task_thread_info(idle)->preempt_count = 0; 3944 init_idle_preempt_count(idle, cpu);
4216 3945
4217 /* 3946 /*
4218 * The idle tasks have their own, simple scheduling class: 3947 * The idle tasks have their own, simple scheduling class:
@@ -4346,6 +4075,53 @@ fail:
4346 return ret; 4075 return ret;
4347} 4076}
4348 4077
4078#ifdef CONFIG_NUMA_BALANCING
4079/* Migrate current task p to target_cpu */
4080int migrate_task_to(struct task_struct *p, int target_cpu)
4081{
4082 struct migration_arg arg = { p, target_cpu };
4083 int curr_cpu = task_cpu(p);
4084
4085 if (curr_cpu == target_cpu)
4086 return 0;
4087
4088 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4089 return -EINVAL;
4090
4091 /* TODO: This is not properly updating schedstats */
4092
4093 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4094}
4095
4096/*
4097 * Requeue a task on a given node and accurately track the number of NUMA
4098 * tasks on the runqueues
4099 */
4100void sched_setnuma(struct task_struct *p, int nid)
4101{
4102 struct rq *rq;
4103 unsigned long flags;
4104 bool on_rq, running;
4105
4106 rq = task_rq_lock(p, &flags);
4107 on_rq = p->on_rq;
4108 running = task_current(rq, p);
4109
4110 if (on_rq)
4111 dequeue_task(rq, p, 0);
4112 if (running)
4113 p->sched_class->put_prev_task(rq, p);
4114
4115 p->numa_preferred_nid = nid;
4116
4117 if (running)
4118 p->sched_class->set_curr_task(rq);
4119 if (on_rq)
4120 enqueue_task(rq, p, 0);
4121 task_rq_unlock(rq, p, &flags);
4122}
4123#endif
4124
4349/* 4125/*
4350 * migration_cpu_stop - this will be executed by a highprio stopper thread 4126 * migration_cpu_stop - this will be executed by a highprio stopper thread
4351 * and performs thread migration by bumping thread off CPU then 4127 * and performs thread migration by bumping thread off CPU then
@@ -4985,7 +4761,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
4985 cpumask_clear_cpu(rq->cpu, old_rd->span); 4761 cpumask_clear_cpu(rq->cpu, old_rd->span);
4986 4762
4987 /* 4763 /*
4988 * If we dont want to free the old_rt yet then 4764 * If we dont want to free the old_rd yet then
4989 * set old_rd to NULL to skip the freeing later 4765 * set old_rd to NULL to skip the freeing later
4990 * in this function: 4766 * in this function:
4991 */ 4767 */
@@ -5119,10 +4895,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5119DEFINE_PER_CPU(struct sched_domain *, sd_llc); 4895DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5120DEFINE_PER_CPU(int, sd_llc_size); 4896DEFINE_PER_CPU(int, sd_llc_size);
5121DEFINE_PER_CPU(int, sd_llc_id); 4897DEFINE_PER_CPU(int, sd_llc_id);
4898DEFINE_PER_CPU(struct sched_domain *, sd_numa);
4899DEFINE_PER_CPU(struct sched_domain *, sd_busy);
4900DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5122 4901
5123static void update_top_cache_domain(int cpu) 4902static void update_top_cache_domain(int cpu)
5124{ 4903{
5125 struct sched_domain *sd; 4904 struct sched_domain *sd;
4905 struct sched_domain *busy_sd = NULL;
5126 int id = cpu; 4906 int id = cpu;
5127 int size = 1; 4907 int size = 1;
5128 4908
@@ -5130,11 +4910,19 @@ static void update_top_cache_domain(int cpu)
5130 if (sd) { 4910 if (sd) {
5131 id = cpumask_first(sched_domain_span(sd)); 4911 id = cpumask_first(sched_domain_span(sd));
5132 size = cpumask_weight(sched_domain_span(sd)); 4912 size = cpumask_weight(sched_domain_span(sd));
4913 busy_sd = sd->parent; /* sd_busy */
5133 } 4914 }
4915 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
5134 4916
5135 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 4917 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5136 per_cpu(sd_llc_size, cpu) = size; 4918 per_cpu(sd_llc_size, cpu) = size;
5137 per_cpu(sd_llc_id, cpu) = id; 4919 per_cpu(sd_llc_id, cpu) = id;
4920
4921 sd = lowest_flag_domain(cpu, SD_NUMA);
4922 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
4923
4924 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
4925 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5138} 4926}
5139 4927
5140/* 4928/*
@@ -5325,6 +5113,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5325 * die on a /0 trap. 5113 * die on a /0 trap.
5326 */ 5114 */
5327 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5115 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5116 sg->sgp->power_orig = sg->sgp->power;
5328 5117
5329 /* 5118 /*
5330 * Make sure the first group of this domain contains the 5119 * Make sure the first group of this domain contains the
@@ -5654,6 +5443,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5654 | 0*SD_SHARE_PKG_RESOURCES 5443 | 0*SD_SHARE_PKG_RESOURCES
5655 | 1*SD_SERIALIZE 5444 | 1*SD_SERIALIZE
5656 | 0*SD_PREFER_SIBLING 5445 | 0*SD_PREFER_SIBLING
5446 | 1*SD_NUMA
5657 | sd_local_flags(level) 5447 | sd_local_flags(level)
5658 , 5448 ,
5659 .last_balance = jiffies, 5449 .last_balance = jiffies,
@@ -6335,14 +6125,17 @@ void __init sched_init_smp(void)
6335 6125
6336 sched_init_numa(); 6126 sched_init_numa();
6337 6127
6338 get_online_cpus(); 6128 /*
6129 * There's no userspace yet to cause hotplug operations; hence all the
6130 * cpu masks are stable and all blatant races in the below code cannot
6131 * happen.
6132 */
6339 mutex_lock(&sched_domains_mutex); 6133 mutex_lock(&sched_domains_mutex);
6340 init_sched_domains(cpu_active_mask); 6134 init_sched_domains(cpu_active_mask);
6341 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6135 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6342 if (cpumask_empty(non_isolated_cpus)) 6136 if (cpumask_empty(non_isolated_cpus))
6343 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6137 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6344 mutex_unlock(&sched_domains_mutex); 6138 mutex_unlock(&sched_domains_mutex);
6345 put_online_cpus();
6346 6139
6347 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6140 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6348 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6141 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6505,6 +6298,7 @@ void __init sched_init(void)
6505 rq->online = 0; 6298 rq->online = 0;
6506 rq->idle_stamp = 0; 6299 rq->idle_stamp = 0;
6507 rq->avg_idle = 2*sysctl_sched_migration_cost; 6300 rq->avg_idle = 2*sysctl_sched_migration_cost;
6301 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6508 6302
6509 INIT_LIST_HEAD(&rq->cfs_tasks); 6303 INIT_LIST_HEAD(&rq->cfs_tasks);
6510 6304
@@ -7277,7 +7071,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7277 7071
7278 runtime_enabled = quota != RUNTIME_INF; 7072 runtime_enabled = quota != RUNTIME_INF;
7279 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7073 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7280 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7074 /*
7075 * If we need to toggle cfs_bandwidth_used, off->on must occur
7076 * before making related changes, and on->off must occur afterwards
7077 */
7078 if (runtime_enabled && !runtime_was_enabled)
7079 cfs_bandwidth_usage_inc();
7281 raw_spin_lock_irq(&cfs_b->lock); 7080 raw_spin_lock_irq(&cfs_b->lock);
7282 cfs_b->period = ns_to_ktime(period); 7081 cfs_b->period = ns_to_ktime(period);
7283 cfs_b->quota = quota; 7082 cfs_b->quota = quota;
@@ -7303,6 +7102,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7303 unthrottle_cfs_rq(cfs_rq); 7102 unthrottle_cfs_rq(cfs_rq);
7304 raw_spin_unlock_irq(&rq->lock); 7103 raw_spin_unlock_irq(&rq->lock);
7305 } 7104 }
7105 if (runtime_was_enabled && !runtime_enabled)
7106 cfs_bandwidth_usage_dec();
7306out_unlock: 7107out_unlock:
7307 mutex_unlock(&cfs_constraints_mutex); 7108 mutex_unlock(&cfs_constraints_mutex);
7308 7109
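The tg_set_cfs_bandwidth() hunk above replaces the unconditional account_cfs_bandwidth_used() call with an explicit off->on increment before the quota is installed and an on->off decrement after it is torn down. A standalone sketch of why that ordering matters, with an atomic bool standing in for the cfs_bandwidth_used() static key (the names and the simplified gate are assumptions, not the kernel's implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool gate_enabled = false;   /* ~ the static key */
static atomic_int  quota = -1;             /* -1 ~ RUNTIME_INF */

static void set_quota(int new_quota)
{
	bool enabling  = new_quota >= 0 && atomic_load(&quota) < 0;
	bool disabling = new_quota < 0 && atomic_load(&quota) >= 0;

	if (enabling)
		atomic_store(&gate_enabled, true);   /* off->on first */

	atomic_store(&quota, new_quota);             /* install/remove state */

	if (disabling)
		atomic_store(&gate_enabled, false);  /* on->off last */
}

static void hot_path(void)
{
	/* Readers only look at the quota when the gate is up, so with the
	 * ordering above they never act on a quota without the gate. */
	if (atomic_load(&gate_enabled) && atomic_load(&quota) >= 0)
		printf("enforcing quota %d\n", atomic_load(&quota));
	else
		printf("bandwidth disabled\n");
}

int main(void)
{
	hot_path();
	set_quota(100000);
	hot_path();
	set_quota(-1);
	hot_path();
	return 0;
}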
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 196559994f7c..5c34d1817e8f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18#include <linux/mempolicy.h>
18 19
19#include "sched.h" 20#include "sched.h"
20 21
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
137 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 138 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
138 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
139#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
143#endif
140#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
141 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
142#endif 146#endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 read_lock_irqsave(&tasklist_lock, flags); 163 read_lock_irqsave(&tasklist_lock, flags);
160 164
161 do_each_thread(g, p) { 165 do_each_thread(g, p) {
162 if (!p->on_rq || task_cpu(p) != rq_cpu) 166 if (task_cpu(p) != rq_cpu)
163 continue; 167 continue;
164 168
165 print_task(m, rq, p); 169 print_task(m, rq, p);
@@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
225 atomic_read(&cfs_rq->tg->runnable_avg)); 229 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 230#endif
227#endif 231#endif
232#ifdef CONFIG_CFS_BANDWIDTH
233 SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
234 cfs_rq->tg->cfs_bandwidth.timer_active);
235 SEQ_printf(m, " .%-30s: %d\n", "throttled",
236 cfs_rq->throttled);
237 SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
238 cfs_rq->throttle_count);
239#endif
228 240
229#ifdef CONFIG_FAIR_GROUP_SCHED 241#ifdef CONFIG_FAIR_GROUP_SCHED
230 print_cfs_group_stats(m, cpu, cfs_rq->tg); 242 print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)
345 cpu_clk = local_clock(); 357 cpu_clk = local_clock();
346 local_irq_restore(flags); 358 local_irq_restore(flags);
347 359
348 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", 360 SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
349 init_utsname()->release, 361 init_utsname()->release,
350 (int)strcspn(init_utsname()->version, " "), 362 (int)strcspn(init_utsname()->version, " "),
351 init_utsname()->version); 363 init_utsname()->version);
@@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void)
488 500
489__initcall(init_sched_debug_procfs); 501__initcall(init_sched_debug_procfs);
490 502
503#define __P(F) \
504 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
505#define P(F) \
506 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
507#define __PN(F) \
508 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
509#define PN(F) \
510 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
511
512
513static void sched_show_numa(struct task_struct *p, struct seq_file *m)
514{
515#ifdef CONFIG_NUMA_BALANCING
516 struct mempolicy *pol;
517 int node, i;
518
519 if (p->mm)
520 P(mm->numa_scan_seq);
521
522 task_lock(p);
523 pol = p->mempolicy;
524 if (pol && !(pol->flags & MPOL_F_MORON))
525 pol = NULL;
526 mpol_get(pol);
527 task_unlock(p);
528
529 SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
530
531 for_each_online_node(node) {
532 for (i = 0; i < 2; i++) {
533 unsigned long nr_faults = -1;
534 int cpu_current, home_node;
535
536 if (p->numa_faults)
537 nr_faults = p->numa_faults[2*node + i];
538
539 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes));
541
542 home_node = (p->numa_preferred_nid == node);
543
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults);
546 }
547 }
548
549 mpol_put(pol);
550#endif
551}
552
491void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 553void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
492{ 554{
493 unsigned long nr_switches; 555 unsigned long nr_switches;
@@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
591 SEQ_printf(m, "%-45s:%21Ld\n", 653 SEQ_printf(m, "%-45s:%21Ld\n",
592 "clock-delta", (long long)(t1-t0)); 654 "clock-delta", (long long)(t1-t0));
593 } 655 }
656
657 sched_show_numa(p, m);
594} 658}
595 659
596void proc_sched_set_task(struct task_struct *p) 660void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c70201fbc61..e64b0794060e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
178 update_sysctl(); 178 update_sysctl();
179} 179}
180 180
181#if BITS_PER_LONG == 32 181#define WMULT_CONST (~0U)
182# define WMULT_CONST (~0UL)
183#else
184# define WMULT_CONST (1UL << 32)
185#endif
186
187#define WMULT_SHIFT 32 182#define WMULT_SHIFT 32
188 183
189/* 184static void __update_inv_weight(struct load_weight *lw)
190 * Shift right and round: 185{
191 */ 186 unsigned long w;
192#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 187
188 if (likely(lw->inv_weight))
189 return;
190
191 w = scale_load_down(lw->weight);
192
193 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
194 lw->inv_weight = 1;
195 else if (unlikely(!w))
196 lw->inv_weight = WMULT_CONST;
197 else
198 lw->inv_weight = WMULT_CONST / w;
199}
193 200
194/* 201/*
195 * delta *= weight / lw 202 * delta_exec * weight / lw.weight
203 * OR
204 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
205 *
206 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
207 * we're guaranteed shift stays positive because inv_weight is guaranteed to
208 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
209 *
 210 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
211 * weight/lw.weight <= 1, and therefore our shift will also be positive.
196 */ 212 */
197static unsigned long 213static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
198calc_delta_mine(unsigned long delta_exec, unsigned long weight,
199 struct load_weight *lw)
200{ 214{
201 u64 tmp; 215 u64 fact = scale_load_down(weight);
202 216 int shift = WMULT_SHIFT;
203 /*
204 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
205 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
206 * 2^SCHED_LOAD_RESOLUTION.
207 */
208 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
209 tmp = (u64)delta_exec * scale_load_down(weight);
210 else
211 tmp = (u64)delta_exec;
212 217
213 if (!lw->inv_weight) { 218 __update_inv_weight(lw);
214 unsigned long w = scale_load_down(lw->weight);
215 219
216 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 220 if (unlikely(fact >> 32)) {
217 lw->inv_weight = 1; 221 while (fact >> 32) {
218 else if (unlikely(!w)) 222 fact >>= 1;
219 lw->inv_weight = WMULT_CONST; 223 shift--;
220 else 224 }
221 lw->inv_weight = WMULT_CONST / w;
222 } 225 }
223 226
224 /* 227 /* hint to use a 32x32->64 mul */
225 * Check whether we'd overflow the 64-bit multiplication: 228 fact = (u64)(u32)fact * lw->inv_weight;
226 */ 229
227 if (unlikely(tmp > WMULT_CONST)) 230 while (fact >> 32) {
228 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 231 fact >>= 1;
229 WMULT_SHIFT/2); 232 shift--;
230 else 233 }
231 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
232 234
233 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 235 return mul_u64_u32_shr(delta_exec, fact, shift);
234} 236}
235 237
236 238
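A minimal userspace sketch of the fixed-point arithmetic introduced by __calc_delta() above; the helper names (calc_delta_demo, mul_u64_u32_shr_demo) are illustrative, not kernel API, and scale_load_down() plus the 64-bit overflow special case are omitted. The idea is that weight/lw->weight is rewritten as weight * inv_weight with inv_weight ~ 2^32 / lw->weight, and the multiplier is halved (with a matching shift reduction) until it fits in 32 bits, so the final step is one 64x32->64 multiply plus a shift:

#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST (~0U)   /* ~2^32, as in the patch */
#define WMULT_SHIFT 32

/* GCC/Clang 128-bit arithmetic stands in for the kernel's mul_u64_u32_shr(). */
static uint64_t mul_u64_u32_shr_demo(uint64_t a, uint32_t b, int shift)
{
	return (uint64_t)(((unsigned __int128)a * b) >> shift);
}

/* delta_exec * weight / lw_weight, mirroring __calc_delta() in the hunk above. */
static uint64_t calc_delta_demo(uint64_t delta_exec, uint64_t weight, uint64_t lw_weight)
{
	uint32_t inv_weight = lw_weight ? (uint32_t)(WMULT_CONST / lw_weight) : WMULT_CONST;
	uint64_t fact = weight;
	int shift = WMULT_SHIFT;

	while (fact >> 32) {            /* keep the multiplier within 32 bits */
		fact >>= 1;
		shift--;
	}
	fact = (uint64_t)(uint32_t)fact * inv_weight;
	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}
	return mul_u64_u32_shr_demo(delta_exec, (uint32_t)fact, shift);
}

int main(void)
{
	/* 1ms of runtime, entity weight 1024 (NICE_0), runqueue weight 3072. */
	printf("%llu\n", (unsigned long long)calc_delta_demo(1000000, 1024, 3072));
	return 0;                       /* prints roughly 333333 */
}

With these inputs inv_weight is 4294967295/3072 = 1398101 and fact stays below 2^32, so the shift remains 32 and the result is 1000000 * 1024 / 3072, i.e. about 333333 ns of weighted runtime.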
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
443#endif /* CONFIG_FAIR_GROUP_SCHED */ 445#endif /* CONFIG_FAIR_GROUP_SCHED */
444 446
445static __always_inline 447static __always_inline
446void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); 448void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
447 449
448/************************************************************** 450/**************************************************************
449 * Scheduling class tree data structure manipulation methods: 451 * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
612/* 614/*
613 * delta /= w 615 * delta /= w
614 */ 616 */
615static inline unsigned long 617static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
616calc_delta_fair(unsigned long delta, struct sched_entity *se)
617{ 618{
618 if (unlikely(se->load.weight != NICE_0_LOAD)) 619 if (unlikely(se->load.weight != NICE_0_LOAD))
619 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); 620 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
620 621
621 return delta; 622 return delta;
622} 623}
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665 update_load_add(&lw, se->load.weight); 666 update_load_add(&lw, se->load.weight);
666 load = &lw; 667 load = &lw;
667 } 668 }
668 slice = calc_delta_mine(slice, se->load.weight, load); 669 slice = __calc_delta(slice, se->load.weight, load);
669 } 670 }
670 return slice; 671 return slice;
671} 672}
@@ -681,6 +682,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
681} 682}
682 683
683#ifdef CONFIG_SMP 684#ifdef CONFIG_SMP
685static unsigned long task_h_load(struct task_struct *p);
686
684static inline void __update_task_entity_contrib(struct sched_entity *se); 687static inline void __update_task_entity_contrib(struct sched_entity *se);
685 688
686/* Give new task start runnable values to heavy its load in infant time */ 689/* Give new task start runnable values to heavy its load in infant time */
@@ -701,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
701#endif 704#endif
702 705
703/* 706/*
704 * Update the current task's runtime statistics. Skip current tasks that 707 * Update the current task's runtime statistics.
705 * are not in our scheduling class.
706 */ 708 */
707static inline void
708__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
709 unsigned long delta_exec)
710{
711 unsigned long delta_exec_weighted;
712
713 schedstat_set(curr->statistics.exec_max,
714 max((u64)delta_exec, curr->statistics.exec_max));
715
716 curr->sum_exec_runtime += delta_exec;
717 schedstat_add(cfs_rq, exec_clock, delta_exec);
718 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
719
720 curr->vruntime += delta_exec_weighted;
721 update_min_vruntime(cfs_rq);
722}
723
724static void update_curr(struct cfs_rq *cfs_rq) 709static void update_curr(struct cfs_rq *cfs_rq)
725{ 710{
726 struct sched_entity *curr = cfs_rq->curr; 711 struct sched_entity *curr = cfs_rq->curr;
727 u64 now = rq_clock_task(rq_of(cfs_rq)); 712 u64 now = rq_clock_task(rq_of(cfs_rq));
728 unsigned long delta_exec; 713 u64 delta_exec;
729 714
730 if (unlikely(!curr)) 715 if (unlikely(!curr))
731 return; 716 return;
732 717
733 /* 718 delta_exec = now - curr->exec_start;
734 * Get the amount of time the current task was running 719 if (unlikely((s64)delta_exec <= 0))
735 * since the last time we changed load (this cannot
736 * overflow on 32 bits):
737 */
738 delta_exec = (unsigned long)(now - curr->exec_start);
739 if (!delta_exec)
740 return; 720 return;
741 721
742 __update_curr(cfs_rq, curr, delta_exec);
743 curr->exec_start = now; 722 curr->exec_start = now;
744 723
724 schedstat_set(curr->statistics.exec_max,
725 max(delta_exec, curr->statistics.exec_max));
726
727 curr->sum_exec_runtime += delta_exec;
728 schedstat_add(cfs_rq, exec_clock, delta_exec);
729
730 curr->vruntime += calc_delta_fair(delta_exec, curr);
731 update_min_vruntime(cfs_rq);
732
745 if (entity_is_task(curr)) { 733 if (entity_is_task(curr)) {
746 struct task_struct *curtask = task_of(curr); 734 struct task_struct *curtask = task_of(curr);
747 735
@@ -818,11 +806,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
818 806
819#ifdef CONFIG_NUMA_BALANCING 807#ifdef CONFIG_NUMA_BALANCING
820/* 808/*
821 * numa task sample period in ms 809 * Approximate time to scan a full NUMA task in ms. The task scan period is
810 * calculated based on the task's virtual memory size and
811 * numa_balancing_scan_size.
822 */ 812 */
823unsigned int sysctl_numa_balancing_scan_period_min = 100; 813unsigned int sysctl_numa_balancing_scan_period_min = 1000;
824unsigned int sysctl_numa_balancing_scan_period_max = 100*50; 814unsigned int sysctl_numa_balancing_scan_period_max = 60000;
825unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
826 815
827/* Portion of address space to scan in MB */ 816/* Portion of address space to scan in MB */
828unsigned int sysctl_numa_balancing_scan_size = 256; 817unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +819,835 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
830/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 819/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
831unsigned int sysctl_numa_balancing_scan_delay = 1000; 820unsigned int sysctl_numa_balancing_scan_delay = 1000;
832 821
833static void task_numa_placement(struct task_struct *p) 822/*
823 * After skipping a page migration on a shared page, skip N more numa page
824 * migrations unconditionally. This reduces the number of NUMA migrations
825 * in shared memory workloads, and has the effect of pulling tasks towards
826 * where their memory lives, over pulling the memory towards the task.
827 */
828unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829
830static unsigned int task_nr_scan_windows(struct task_struct *p)
831{
832 unsigned long rss = 0;
833 unsigned long nr_scan_pages;
834
835 /*
836 * Calculations based on RSS as non-present and empty pages are skipped
837 * by the PTE scanner and NUMA hinting faults should be trapped based
838 * on resident pages
839 */
840 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
841 rss = get_mm_rss(p->mm);
842 if (!rss)
843 rss = nr_scan_pages;
844
845 rss = round_up(rss, nr_scan_pages);
846 return rss / nr_scan_pages;
847}
848
849/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
850#define MAX_SCAN_WINDOW 2560
851
852static unsigned int task_scan_min(struct task_struct *p)
853{
854 unsigned int scan, floor;
855 unsigned int windows = 1;
856
857 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
858 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
859 floor = 1000 / windows;
860
861 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
862 return max_t(unsigned int, floor, scan);
863}
864
865static unsigned int task_scan_max(struct task_struct *p)
866{
867 unsigned int smin = task_scan_min(p);
868 unsigned int smax;
869
870 /* Watch for min being lower than max due to floor calculations */
871 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
872 return max(smin, smax);
873}
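A rough userspace sketch of the scan-window arithmetic above, assuming 4K pages and the sysctl defaults shown in this hunk (256MB scan size, 1000ms/60000ms min/max periods, 2560MB/sec MAX_SCAN_WINDOW); the function names are illustrative, not kernel API:

#include <stdio.h>

/* One scan window covers sysctl_numa_balancing_scan_size MB of resident pages. */
static unsigned int nr_scan_windows(unsigned long rss_pages)
{
	unsigned long nr_scan_pages = 256UL << (20 - 12);   /* 65536 pages per window */

	if (!rss_pages)
		rss_pages = nr_scan_pages;
	/* round up to whole windows, as task_nr_scan_windows() does */
	rss_pages = (rss_pages + nr_scan_pages - 1) / nr_scan_pages * nr_scan_pages;
	return rss_pages / nr_scan_pages;
}

int main(void)
{
	unsigned long rss_pages = (1UL << 30) >> 12;     /* 1GB of RSS */
	unsigned int windows = nr_scan_windows(rss_pages);
	unsigned int floor = 1000 / (2560 / 256);        /* 100ms rate-limit floor */
	unsigned int smin = 1000 / windows;
	unsigned int smax = 60000 / windows;

	printf("windows=%u min=%ums max=%ums\n",
	       windows, smin > floor ? smin : floor, smax);
	return 0;
}

For a 1GB-RSS task this prints windows=4 min=250ms max=15000ms, i.e. larger address spaces get proportionally longer scan periods so the effective scan rate in MB/sec stays bounded.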
874
875/*
876 * Once a preferred node is selected, the scheduler balancer will prefer moving
877 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
878 * scans. This will give the process the chance to accumulate more faults on
879 * the preferred node but still allow the scheduler to move the task again if
880 * the node's CPUs are overloaded.
881 */
882unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
883
884static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
885{
886 rq->nr_numa_running += (p->numa_preferred_nid != -1);
887 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
888}
889
890static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
891{
892 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
893 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
894}
895
896struct numa_group {
897 atomic_t refcount;
898
899 spinlock_t lock; /* nr_tasks, tasks */
900 int nr_tasks;
901 pid_t gid;
902 struct list_head task_list;
903
904 struct rcu_head rcu;
905 unsigned long total_faults;
906 unsigned long faults[0];
907};
908
909pid_t task_numa_group_id(struct task_struct *p)
910{
911 return p->numa_group ? p->numa_group->gid : 0;
912}
913
914static inline int task_faults_idx(int nid, int priv)
915{
916 return 2 * nid + priv;
917}
918
919static inline unsigned long task_faults(struct task_struct *p, int nid)
920{
921 if (!p->numa_faults)
922 return 0;
923
924 return p->numa_faults[task_faults_idx(nid, 0)] +
925 p->numa_faults[task_faults_idx(nid, 1)];
926}
927
928static inline unsigned long group_faults(struct task_struct *p, int nid)
929{
930 if (!p->numa_group)
931 return 0;
932
933 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
934}
935
936/*
937 * These return the fraction of accesses done by a particular task, or
938 * task group, on a particular numa node. The group weight is given a
939 * larger multiplier, in order to group tasks together that are almost
940 * evenly spread out between numa nodes.
941 */
942static inline unsigned long task_weight(struct task_struct *p, int nid)
943{
944 unsigned long total_faults;
945
946 if (!p->numa_faults)
947 return 0;
948
949 total_faults = p->total_numa_faults;
950
951 if (!total_faults)
952 return 0;
953
954 return 1000 * task_faults(p, nid) / total_faults;
955}
956
957static inline unsigned long group_weight(struct task_struct *p, int nid)
958{
959 if (!p->numa_group || !p->numa_group->total_faults)
960 return 0;
961
962 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
963}
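A hedged numerical illustration of the per-mille weights above (all fault counts invented): a task with 30 decayed faults on node 0 and 10 on node 1 has task_weight(p, 0) = 1000*30/40 = 750 and task_weight(p, 1) = 250; if its numa_group has accumulated 300 of its 1000 total faults on node 0, group_weight(p, 0) = 300. Scaling both to the same 0..1000 range is what lets the placement code below compare task and group improvements on a common scale.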
964
965static unsigned long weighted_cpuload(const int cpu);
966static unsigned long source_load(int cpu, int type);
967static unsigned long target_load(int cpu, int type);
968static unsigned long power_of(int cpu);
969static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
970
971/* Cached statistics for all CPUs within a node */
972struct numa_stats {
973 unsigned long nr_running;
974 unsigned long load;
975
976 /* Total compute capacity of CPUs on a node */
977 unsigned long power;
978
979 /* Approximate capacity in terms of runnable tasks on a node */
980 unsigned long capacity;
981 int has_capacity;
982};
983
984/*
985 * XXX borrowed from update_sg_lb_stats
986 */
987static void update_numa_stats(struct numa_stats *ns, int nid)
988{
989 int cpu, cpus = 0;
990
991 memset(ns, 0, sizeof(*ns));
992 for_each_cpu(cpu, cpumask_of_node(nid)) {
993 struct rq *rq = cpu_rq(cpu);
994
995 ns->nr_running += rq->nr_running;
996 ns->load += weighted_cpuload(cpu);
997 ns->power += power_of(cpu);
998
999 cpus++;
1000 }
1001
1002 /*
1003 * If we raced with hotplug and there are no CPUs left in our mask
1004 * the @ns structure is NULL'ed and task_numa_compare() will
1005 * not find this node attractive.
1006 *
1007 * We'll either bail at !has_capacity, or we'll detect a huge imbalance
1008 * and bail there.
1009 */
1010 if (!cpus)
1011 return;
1012
1013 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1014 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1015 ns->has_capacity = (ns->nr_running < ns->capacity);
1016}
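A rough worked example of the normalization above (illustrative numbers): a node with 4 CPUs, each at the default SCHED_POWER_SCALE of 1024, has ns->power = 4096; with a summed weighted_cpuload() of 2048 the scaled ns->load becomes 2048*1024/4096 = 512, ns->capacity is DIV_ROUND_CLOSEST(4096, 1024) = 4, and ns->has_capacity stays true while fewer than 4 tasks are running on the node.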
1017
1018struct task_numa_env {
1019 struct task_struct *p;
1020
1021 int src_cpu, src_nid;
1022 int dst_cpu, dst_nid;
1023
1024 struct numa_stats src_stats, dst_stats;
1025
1026 int imbalance_pct, idx;
1027
1028 struct task_struct *best_task;
1029 long best_imp;
1030 int best_cpu;
1031};
1032
1033static void task_numa_assign(struct task_numa_env *env,
1034 struct task_struct *p, long imp)
1035{
1036 if (env->best_task)
1037 put_task_struct(env->best_task);
1038 if (p)
1039 get_task_struct(p);
1040
1041 env->best_task = p;
1042 env->best_imp = imp;
1043 env->best_cpu = env->dst_cpu;
1044}
1045
1046/*
1047 * This checks if the overall compute and NUMA accesses of the system would
1048 * be improved if the source tasks was migrated to the target dst_cpu taking
1049 * into account that it might be best if task running on the dst_cpu should
1050 * be exchanged with the source task
1051 */
1052static void task_numa_compare(struct task_numa_env *env,
1053 long taskimp, long groupimp)
1054{
1055 struct rq *src_rq = cpu_rq(env->src_cpu);
1056 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1057 struct task_struct *cur;
1058 long dst_load, src_load;
1059 long load;
1060 long imp = (groupimp > 0) ? groupimp : taskimp;
1061
1062 rcu_read_lock();
1063 cur = ACCESS_ONCE(dst_rq->curr);
1064 if (cur->pid == 0) /* idle */
1065 cur = NULL;
1066
1067 /*
1068 * "imp" is the fault differential for the source task between the
1069 * source and destination node. Calculate the total differential for
1070 * the source task and potential destination task. The more negative
1071 * the value is, the more remote accesses would be expected to
1072 * be incurred if the tasks were swapped.
1073 */
1074 if (cur) {
1075 /* Skip this swap candidate if cannot move to the source cpu */
1076 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1077 goto unlock;
1078
1079 /*
1080 * If dst and source tasks are in the same NUMA group, or not
1081 * in any group then look only at task weights.
1082 */
1083 if (cur->numa_group == env->p->numa_group) {
1084 imp = taskimp + task_weight(cur, env->src_nid) -
1085 task_weight(cur, env->dst_nid);
1086 /*
1087 * Add some hysteresis to prevent swapping the
1088 * tasks within a group over tiny differences.
1089 */
1090 if (cur->numa_group)
1091 imp -= imp/16;
1092 } else {
1093 /*
1094 * Compare the group weights. If a task is all by
1095 * itself (not part of a group), use the task weight
1096 * instead.
1097 */
1098 if (env->p->numa_group)
1099 imp = groupimp;
1100 else
1101 imp = taskimp;
1102
1103 if (cur->numa_group)
1104 imp += group_weight(cur, env->src_nid) -
1105 group_weight(cur, env->dst_nid);
1106 else
1107 imp += task_weight(cur, env->src_nid) -
1108 task_weight(cur, env->dst_nid);
1109 }
1110 }
1111
1112 if (imp < env->best_imp)
1113 goto unlock;
1114
1115 if (!cur) {
1116 /* Is there capacity at our destination? */
1117 if (env->src_stats.has_capacity &&
1118 !env->dst_stats.has_capacity)
1119 goto unlock;
1120
1121 goto balance;
1122 }
1123
1124 /* Balance doesn't matter much if we're running a task per cpu */
1125 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1126 goto assign;
1127
1128 /*
1129 * In the overloaded case, try and keep the load balanced.
1130 */
1131balance:
1132 dst_load = env->dst_stats.load;
1133 src_load = env->src_stats.load;
1134
1135 /* XXX missing power terms */
1136 load = task_h_load(env->p);
1137 dst_load += load;
1138 src_load -= load;
1139
1140 if (cur) {
1141 load = task_h_load(cur);
1142 dst_load -= load;
1143 src_load += load;
1144 }
1145
1146 /* make src_load the smaller */
1147 if (dst_load < src_load)
1148 swap(dst_load, src_load);
1149
1150 if (src_load * env->imbalance_pct < dst_load * 100)
1151 goto unlock;
1152
1153assign:
1154 task_numa_assign(env, cur, imp);
1155unlock:
1156 rcu_read_unlock();
1157}
1158
1159static void task_numa_find_cpu(struct task_numa_env *env,
1160 long taskimp, long groupimp)
834{ 1161{
835 int seq; 1162 int cpu;
1163
1164 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1165 /* Skip this CPU if the source task cannot migrate */
1166 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1167 continue;
1168
1169 env->dst_cpu = cpu;
1170 task_numa_compare(env, taskimp, groupimp);
1171 }
1172}
1173
1174static int task_numa_migrate(struct task_struct *p)
1175{
1176 struct task_numa_env env = {
1177 .p = p,
1178
1179 .src_cpu = task_cpu(p),
1180 .src_nid = task_node(p),
1181
1182 .imbalance_pct = 112,
1183
1184 .best_task = NULL,
1185 .best_imp = 0,
1186 .best_cpu = -1
1187 };
1188 struct sched_domain *sd;
1189 unsigned long taskweight, groupweight;
1190 int nid, ret;
1191 long taskimp, groupimp;
1192
1193 /*
1194 * Pick the lowest SD_NUMA domain, as that would have the smallest
1195 * imbalance and would be the first to start moving tasks about.
1196 *
1197 * And we want to avoid any moving of tasks about, as that would create
1198 * random movement of tasks -- running counter to the numa conditions we're trying
1199 * to satisfy here.
1200 */
1201 rcu_read_lock();
1202 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1203 if (sd)
1204 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1205 rcu_read_unlock();
1206
1207 /*
1208 * Cpusets can break the scheduler domain tree into smaller
1209 * balance domains, some of which do not cross NUMA boundaries.
1210 * Tasks that are "trapped" in such domains cannot be migrated
1211 * elsewhere, so there is no point in (re)trying.
1212 */
1213 if (unlikely(!sd)) {
1214 p->numa_preferred_nid = cpu_to_node(task_cpu(p));
1215 return -EINVAL;
1216 }
1217
1218 taskweight = task_weight(p, env.src_nid);
1219 groupweight = group_weight(p, env.src_nid);
1220 update_numa_stats(&env.src_stats, env.src_nid);
1221 env.dst_nid = p->numa_preferred_nid;
1222 taskimp = task_weight(p, env.dst_nid) - taskweight;
1223 groupimp = group_weight(p, env.dst_nid) - groupweight;
1224 update_numa_stats(&env.dst_stats, env.dst_nid);
1225
1226 /* If the preferred nid has capacity, try to use it. */
1227 if (env.dst_stats.has_capacity)
1228 task_numa_find_cpu(&env, taskimp, groupimp);
1229
1230 /* No space available on the preferred nid. Look elsewhere. */
1231 if (env.best_cpu == -1) {
1232 for_each_online_node(nid) {
1233 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1234 continue;
1235
1236 /* Only consider nodes where both task and groups benefit */
1237 taskimp = task_weight(p, nid) - taskweight;
1238 groupimp = group_weight(p, nid) - groupweight;
1239 if (taskimp < 0 && groupimp < 0)
1240 continue;
1241
1242 env.dst_nid = nid;
1243 update_numa_stats(&env.dst_stats, env.dst_nid);
1244 task_numa_find_cpu(&env, taskimp, groupimp);
1245 }
1246 }
1247
1248 /* No better CPU than the current one was found. */
1249 if (env.best_cpu == -1)
1250 return -EAGAIN;
1251
1252 sched_setnuma(p, env.dst_nid);
1253
1254 /*
1255 * Reset the scan period if the task is being rescheduled on an
1256 * alternative node to recheck if the task is now properly placed.
1257 */
1258 p->numa_scan_period = task_scan_min(p);
1259
1260 if (env.best_task == NULL) {
1261 int ret = migrate_task_to(p, env.best_cpu);
1262 return ret;
1263 }
1264
1265 ret = migrate_swap(p, env.best_task);
1266 put_task_struct(env.best_task);
1267 return ret;
1268}
1269
1270/* Attempt to migrate a task to a CPU on the preferred node. */
1271static void numa_migrate_preferred(struct task_struct *p)
1272{
1273 /* This task has no NUMA fault statistics yet */
1274 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1275 return;
1276
1277 /* Periodically retry migrating the task to the preferred node */
1278 p->numa_migrate_retry = jiffies + HZ;
1279
1280 /* Success if task is already running on preferred CPU */
1281 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
1282 return;
1283
1284 /* Otherwise, try migrate to a CPU on the preferred node */
1285 task_numa_migrate(p);
1286}
1287
1288/*
1289 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1290 * increments. The more local the fault statistics are, the higher the scan
1291 * period will be for the next scan window. If local/remote ratio is below
1292 * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
1293 * scan period will decrease
1294 */
1295#define NUMA_PERIOD_SLOTS 10
1296#define NUMA_PERIOD_THRESHOLD 3
1297
1298/*
1299 * Increase the scan period (slow down scanning) if the majority of
1300 * our memory is already on our local node, or if the majority of
1301 * the page accesses are shared with other processes.
1302 * Otherwise, decrease the scan period.
1303 */
1304static void update_task_scan_period(struct task_struct *p,
1305 unsigned long shared, unsigned long private)
1306{
1307 unsigned int period_slot;
1308 int ratio;
1309 int diff;
1310
1311 unsigned long remote = p->numa_faults_locality[0];
1312 unsigned long local = p->numa_faults_locality[1];
1313
1314 /*
1315 * If there were no recorded hinting faults then either the task is
1316 * completely idle or all activity is in areas that are not of interest
1317 * to automatic numa balancing. Scan slower
1318 */
1319 if (local + shared == 0) {
1320 p->numa_scan_period = min(p->numa_scan_period_max,
1321 p->numa_scan_period << 1);
1322
1323 p->mm->numa_next_scan = jiffies +
1324 msecs_to_jiffies(p->numa_scan_period);
836 1325
837 if (!p->mm) /* for example, ksmd faulting in a user's mm */
838 return; 1326 return;
1327 }
1328
1329 /*
1330 * Prepare to scale scan period relative to the current period.
1331 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1332 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1333 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1334 */
1335 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1336 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1337 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1338 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1339 if (!slot)
1340 slot = 1;
1341 diff = slot * period_slot;
1342 } else {
1343 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1344
1345 /*
1346 * Scale scan rate increases based on sharing. There is an
1347 * inverse relationship between the degree of sharing and
1348 * the adjustment made to the scanning period. Broadly
1349 * speaking the intent is that there is little point
1350 * scanning faster if shared accesses dominate as it may
1351 * simply bounce migrations uselessly
1352 */
1353 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1354 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1355 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1356 }
1357
1358 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1359 task_scan_min(p), task_scan_max(p));
1360 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1361}
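A hedged worked example of the adjustment above (all counts invented): with numa_scan_period = 1000ms, period_slot = DIV_ROUND_UP(1000, 10) = 100ms. If 800 of 1000 hinting faults were local, ratio = 800*10/1000 = 8 >= NUMA_PERIOD_THRESHOLD, so slot = 8-3 = 5 and the period grows by 500ms to 1500ms (clamped to task_scan_max()). If only 100 were local, ratio = 1 and the raw decrease is -(3-1)*100 = -200ms; that decrease is then scaled by the private share of the faults, so with 600 private out of 1000 it becomes -200*6/10 = -120ms and the period drops to 880ms (clamped to task_scan_min()).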
1362
1363static void task_numa_placement(struct task_struct *p)
1364{
1365 int seq, nid, max_nid = -1, max_group_nid = -1;
1366 unsigned long max_faults = 0, max_group_faults = 0;
1367 unsigned long fault_types[2] = { 0, 0 };
1368 spinlock_t *group_lock = NULL;
1369
839 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1370 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
840 if (p->numa_scan_seq == seq) 1371 if (p->numa_scan_seq == seq)
841 return; 1372 return;
842 p->numa_scan_seq = seq; 1373 p->numa_scan_seq = seq;
1374 p->numa_scan_period_max = task_scan_max(p);
1375
1376 /* If the task is part of a group prevent parallel updates to group stats */
1377 if (p->numa_group) {
1378 group_lock = &p->numa_group->lock;
1379 spin_lock(group_lock);
1380 }
1381
1382 /* Find the node with the highest number of faults */
1383 for_each_online_node(nid) {
1384 unsigned long faults = 0, group_faults = 0;
1385 int priv, i;
1386
1387 for (priv = 0; priv < 2; priv++) {
1388 long diff;
1389
1390 i = task_faults_idx(nid, priv);
1391 diff = -p->numa_faults[i];
1392
1393 /* Decay existing window, copy faults since last scan */
1394 p->numa_faults[i] >>= 1;
1395 p->numa_faults[i] += p->numa_faults_buffer[i];
1396 fault_types[priv] += p->numa_faults_buffer[i];
1397 p->numa_faults_buffer[i] = 0;
1398
1399 faults += p->numa_faults[i];
1400 diff += p->numa_faults[i];
1401 p->total_numa_faults += diff;
1402 if (p->numa_group) {
1403 /* safe because we can only change our own group */
1404 p->numa_group->faults[i] += diff;
1405 p->numa_group->total_faults += diff;
1406 group_faults += p->numa_group->faults[i];
1407 }
1408 }
843 1409
844 /* FIXME: Scheduling placement policy hints go here */ 1410 if (faults > max_faults) {
1411 max_faults = faults;
1412 max_nid = nid;
1413 }
1414
1415 if (group_faults > max_group_faults) {
1416 max_group_faults = group_faults;
1417 max_group_nid = nid;
1418 }
1419 }
1420
1421 update_task_scan_period(p, fault_types[0], fault_types[1]);
1422
1423 if (p->numa_group) {
1424 /*
1425 * If the preferred task and group nids are different,
1426 * iterate over the nodes again to find the best place.
1427 */
1428 if (max_nid != max_group_nid) {
1429 unsigned long weight, max_weight = 0;
1430
1431 for_each_online_node(nid) {
1432 weight = task_weight(p, nid) + group_weight(p, nid);
1433 if (weight > max_weight) {
1434 max_weight = weight;
1435 max_nid = nid;
1436 }
1437 }
1438 }
1439
1440 spin_unlock(group_lock);
1441 }
1442
1443 /* Preferred node as the node with the most faults */
1444 if (max_faults && max_nid != p->numa_preferred_nid) {
1445 /* Update the preferred nid and migrate task if possible */
1446 sched_setnuma(p, max_nid);
1447 numa_migrate_preferred(p);
1448 }
1449}
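A hedged worked example of the decay in the loop above (counts invented): if p->numa_faults[i] was 40 and 24 new faults landed in numa_faults_buffer[i] during this scan window, the stored value becomes 40/2 + 24 = 44 and diff = +4 is folded into total_numa_faults and, when a numa_group exists, into the group's faults[] and total_faults. Halving the old count every window turns the per-node statistic into an exponentially decaying average, so the preferred node tracks recent access patterns rather than the whole task lifetime.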
1450
1451static inline int get_numa_group(struct numa_group *grp)
1452{
1453 return atomic_inc_not_zero(&grp->refcount);
1454}
1455
1456static inline void put_numa_group(struct numa_group *grp)
1457{
1458 if (atomic_dec_and_test(&grp->refcount))
1459 kfree_rcu(grp, rcu);
1460}
1461
1462static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1463 int *priv)
1464{
1465 struct numa_group *grp, *my_grp;
1466 struct task_struct *tsk;
1467 bool join = false;
1468 int cpu = cpupid_to_cpu(cpupid);
1469 int i;
1470
1471 if (unlikely(!p->numa_group)) {
1472 unsigned int size = sizeof(struct numa_group) +
1473 2*nr_node_ids*sizeof(unsigned long);
1474
1475 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1476 if (!grp)
1477 return;
1478
1479 atomic_set(&grp->refcount, 1);
1480 spin_lock_init(&grp->lock);
1481 INIT_LIST_HEAD(&grp->task_list);
1482 grp->gid = p->pid;
1483
1484 for (i = 0; i < 2*nr_node_ids; i++)
1485 grp->faults[i] = p->numa_faults[i];
1486
1487 grp->total_faults = p->total_numa_faults;
1488
1489 list_add(&p->numa_entry, &grp->task_list);
1490 grp->nr_tasks++;
1491 rcu_assign_pointer(p->numa_group, grp);
1492 }
1493
1494 rcu_read_lock();
1495 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1496
1497 if (!cpupid_match_pid(tsk, cpupid))
1498 goto no_join;
1499
1500 grp = rcu_dereference(tsk->numa_group);
1501 if (!grp)
1502 goto no_join;
1503
1504 my_grp = p->numa_group;
1505 if (grp == my_grp)
1506 goto no_join;
1507
1508 /*
1509 * Only join the other group if it's bigger; if we're the bigger group,
1510 * the other task will join us.
1511 */
1512 if (my_grp->nr_tasks > grp->nr_tasks)
1513 goto no_join;
1514
1515 /*
1516 * Tie-break on the grp address.
1517 */
1518 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1519 goto no_join;
1520
1521 /* Always join threads in the same process. */
1522 if (tsk->mm == current->mm)
1523 join = true;
1524
1525 /* Simple filter to avoid false positives due to PID collisions */
1526 if (flags & TNF_SHARED)
1527 join = true;
1528
1529 /* Update priv based on whether false sharing was detected */
1530 *priv = !join;
1531
1532 if (join && !get_numa_group(grp))
1533 goto no_join;
1534
1535 rcu_read_unlock();
1536
1537 if (!join)
1538 return;
1539
1540 double_lock(&my_grp->lock, &grp->lock);
1541
1542 for (i = 0; i < 2*nr_node_ids; i++) {
1543 my_grp->faults[i] -= p->numa_faults[i];
1544 grp->faults[i] += p->numa_faults[i];
1545 }
1546 my_grp->total_faults -= p->total_numa_faults;
1547 grp->total_faults += p->total_numa_faults;
1548
1549 list_move(&p->numa_entry, &grp->task_list);
1550 my_grp->nr_tasks--;
1551 grp->nr_tasks++;
1552
1553 spin_unlock(&my_grp->lock);
1554 spin_unlock(&grp->lock);
1555
1556 rcu_assign_pointer(p->numa_group, grp);
1557
1558 put_numa_group(my_grp);
1559 return;
1560
1561no_join:
1562 rcu_read_unlock();
1563 return;
1564}
1565
1566void task_numa_free(struct task_struct *p)
1567{
1568 struct numa_group *grp = p->numa_group;
1569 int i;
1570 void *numa_faults = p->numa_faults;
1571
1572 if (grp) {
1573 spin_lock(&grp->lock);
1574 for (i = 0; i < 2*nr_node_ids; i++)
1575 grp->faults[i] -= p->numa_faults[i];
1576 grp->total_faults -= p->total_numa_faults;
1577
1578 list_del(&p->numa_entry);
1579 grp->nr_tasks--;
1580 spin_unlock(&grp->lock);
1581 rcu_assign_pointer(p->numa_group, NULL);
1582 put_numa_group(grp);
1583 }
1584
1585 p->numa_faults = NULL;
1586 p->numa_faults_buffer = NULL;
1587 kfree(numa_faults);
845} 1588}
846 1589
847/* 1590/*
848 * Got a PROT_NONE fault for a page on @node. 1591 * Got a PROT_NONE fault for a page on @node.
849 */ 1592 */
850void task_numa_fault(int node, int pages, bool migrated) 1593void task_numa_fault(int last_cpupid, int node, int pages, int flags)
851{ 1594{
852 struct task_struct *p = current; 1595 struct task_struct *p = current;
1596 bool migrated = flags & TNF_MIGRATED;
1597 int priv;
853 1598
854 if (!numabalancing_enabled) 1599 if (!numabalancing_enabled)
855 return; 1600 return;
856 1601
857 /* FIXME: Allocate task-specific structure for placement policy here */ 1602 /* for example, ksmd faulting in a user's mm */
1603 if (!p->mm)
1604 return;
1605
1606 /* Do not worry about placement if exiting */
1607 if (p->state == TASK_DEAD)
1608 return;
1609
1610 /* Allocate buffer to track faults on a per-node basis */
1611 if (unlikely(!p->numa_faults)) {
1612 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1613
1614 /* numa_faults and numa_faults_buffer share the allocation */
1615 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1616 if (!p->numa_faults)
1617 return;
1618
1619 BUG_ON(p->numa_faults_buffer);
1620 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1621 p->total_numa_faults = 0;
1622 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1623 }
858 1624
859 /* 1625 /*
860 * If pages are properly placed (did not migrate) then scan slower. 1626 * First accesses are treated as private, otherwise consider accesses
861 * This is reset periodically in case of phase changes 1627 * to be private if the accessing pid has not changed
862 */ 1628 */
863 if (!migrated) 1629 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
864 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, 1630 priv = 1;
865 p->numa_scan_period + jiffies_to_msecs(10)); 1631 } else {
1632 priv = cpupid_match_pid(p, last_cpupid);
1633 if (!priv && !(flags & TNF_NO_GROUP))
1634 task_numa_group(p, last_cpupid, flags, &priv);
1635 }
866 1636
867 task_numa_placement(p); 1637 task_numa_placement(p);
1638
1639 /*
1640 * Retry task to preferred node migration periodically, in case it
1641 * previously failed, or the scheduler moved us.
1642 */
1643 if (time_after(jiffies, p->numa_migrate_retry))
1644 numa_migrate_preferred(p);
1645
1646 if (migrated)
1647 p->numa_pages_migrated += pages;
1648
1649 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1650 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
868} 1651}
869 1652
870static void reset_ptenuma_scan(struct task_struct *p) 1653static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1667,7 @@ void task_numa_work(struct callback_head *work)
884 struct mm_struct *mm = p->mm; 1667 struct mm_struct *mm = p->mm;
885 struct vm_area_struct *vma; 1668 struct vm_area_struct *vma;
886 unsigned long start, end; 1669 unsigned long start, end;
1670 unsigned long nr_pte_updates = 0;
887 long pages; 1671 long pages;
888 1672
889 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 1673 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1684,9 @@ void task_numa_work(struct callback_head *work)
900 if (p->flags & PF_EXITING) 1684 if (p->flags & PF_EXITING)
901 return; 1685 return;
902 1686
903 /* 1687 if (!mm->numa_next_scan) {
904 * We do not care about task placement until a task runs on a node 1688 mm->numa_next_scan = now +
905 * other than the first one used by the address space. This is 1689 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
906 * largely because migrations are driven by what CPU the task
907 * is running on. If it's never scheduled on another node, it'll
908 * not migrate so why bother trapping the fault.
909 */
910 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
911 mm->first_nid = numa_node_id();
912 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
913 /* Are we running on a new node yet? */
914 if (numa_node_id() == mm->first_nid &&
915 !sched_feat_numa(NUMA_FORCE))
916 return;
917
918 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
919 }
920
921 /*
922 * Reset the scan period if enough time has gone by. Objective is that
923 * scanning will be reduced if pages are properly placed. As tasks
924 * can enter different phases this needs to be re-examined. Lacking
925 * proper tracking of reference behaviour, this blunt hammer is used.
926 */
927 migrate = mm->numa_next_reset;
928 if (time_after(now, migrate)) {
929 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
930 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
931 xchg(&mm->numa_next_reset, next_scan);
932 } 1690 }
933 1691
934 /* 1692 /*
@@ -938,20 +1696,20 @@ void task_numa_work(struct callback_head *work)
938 if (time_before(now, migrate)) 1696 if (time_before(now, migrate))
939 return; 1697 return;
940 1698
941 if (p->numa_scan_period == 0) 1699 if (p->numa_scan_period == 0) {
942 p->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1700 p->numa_scan_period_max = task_scan_max(p);
1701 p->numa_scan_period = task_scan_min(p);
1702 }
943 1703
944 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 1704 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
945 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) 1705 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
946 return; 1706 return;
947 1707
948 /* 1708 /*
949 * Do not set pte_numa if the current running node is rate-limited. 1709 * Delay this task enough that another task of this mm will likely win
950 * This loses statistics on the fault but if we are unwilling to 1710 * the next time around.
951 * migrate to this node, it is less likely we can do useful work
952 */ 1711 */
953 if (migrate_ratelimited(numa_node_id())) 1712 p->node_stamp += 2 * TICK_NSEC;
954 return;
955 1713
956 start = mm->numa_scan_offset; 1714 start = mm->numa_scan_offset;
957 pages = sysctl_numa_balancing_scan_size; 1715 pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1725,39 @@ void task_numa_work(struct callback_head *work)
967 vma = mm->mmap; 1725 vma = mm->mmap;
968 } 1726 }
969 for (; vma; vma = vma->vm_next) { 1727 for (; vma; vma = vma->vm_next) {
970 if (!vma_migratable(vma)) 1728 if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
971 continue; 1729 continue;
972 1730
973 /* Skip small VMAs. They are not likely to be of relevance */ 1731 /*
974 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 1732 * Shared library pages mapped by multiple processes are not
1733 * migrated as it is expected they are cache replicated. Avoid
1734 * hinting faults in read-only file-backed mappings or the vdso
1735 * as migrating the pages will be of marginal benefit.
1736 */
1737 if (!vma->vm_mm ||
1738 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
1739 continue;
1740
1741 /*
1742 * Skip inaccessible VMAs to avoid any confusion between
1743 * PROT_NONE and NUMA hinting ptes
1744 */
1745 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
975 continue; 1746 continue;
976 1747
977 do { 1748 do {
978 start = max(start, vma->vm_start); 1749 start = max(start, vma->vm_start);
979 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1750 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
980 end = min(end, vma->vm_end); 1751 end = min(end, vma->vm_end);
981 pages -= change_prot_numa(vma, start, end); 1752 nr_pte_updates += change_prot_numa(vma, start, end);
1753
1754 /*
1755 * Scan sysctl_numa_balancing_scan_size but ensure that
1756 * at least one PTE is updated so that unused virtual
1757 * address space is quickly skipped.
1758 */
1759 if (nr_pte_updates)
1760 pages -= (end - start) >> PAGE_SHIFT;
982 1761
983 start = end; 1762 start = end;
984 if (pages <= 0) 1763 if (pages <= 0)
@@ -988,10 +1767,10 @@ void task_numa_work(struct callback_head *work)
988 1767
989out: 1768out:
990 /* 1769 /*
991 * It is possible to reach the end of the VMA list but the last few VMAs are 1770 * It is possible to reach the end of the VMA list but the last few
992 * not guaranteed to be vma_migratable. If they are not, we would find the 1771 * VMAs are not guaranteed to be vma_migratable. If they are not, we
993 * !migratable VMA on the next scan but not reset the scanner to the start 1772 * would find the !migratable VMA on the next scan but not reset the
994 * so check it now. 1773 * scanner to the start so check it now.
995 */ 1774 */
996 if (vma) 1775 if (vma)
997 mm->numa_scan_offset = start; 1776 mm->numa_scan_offset = start;
@@ -1025,8 +1804,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1025 1804
1026 if (now - curr->node_stamp > period) { 1805 if (now - curr->node_stamp > period) {
1027 if (!curr->node_stamp) 1806 if (!curr->node_stamp)
1028 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1807 curr->numa_scan_period = task_scan_min(curr);
1029 curr->node_stamp = now; 1808 curr->node_stamp += period;
1030 1809
1031 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 1810 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1032 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ 1811 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1817,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1038static void task_tick_numa(struct rq *rq, struct task_struct *curr) 1817static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1039{ 1818{
1040} 1819}
1820
1821static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1822{
1823}
1824
1825static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1826{
1827}
1041#endif /* CONFIG_NUMA_BALANCING */ 1828#endif /* CONFIG_NUMA_BALANCING */
1042 1829
1043static void 1830static void
@@ -1047,8 +1834,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1047 if (!parent_entity(se)) 1834 if (!parent_entity(se))
1048 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 1835 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1049#ifdef CONFIG_SMP 1836#ifdef CONFIG_SMP
1050 if (entity_is_task(se)) 1837 if (entity_is_task(se)) {
1051 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 1838 struct rq *rq = rq_of(cfs_rq);
1839
1840 account_numa_enqueue(rq, task_of(se));
1841 list_add(&se->group_node, &rq->cfs_tasks);
1842 }
1052#endif 1843#endif
1053 cfs_rq->nr_running++; 1844 cfs_rq->nr_running++;
1054} 1845}
@@ -1059,8 +1850,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1059 update_load_sub(&cfs_rq->load, se->load.weight); 1850 update_load_sub(&cfs_rq->load, se->load.weight);
1060 if (!parent_entity(se)) 1851 if (!parent_entity(se))
1061 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 1852 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1062 if (entity_is_task(se)) 1853 if (entity_is_task(se)) {
1854 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1063 list_del_init(&se->group_node); 1855 list_del_init(&se->group_node);
1856 }
1064 cfs_rq->nr_running--; 1857 cfs_rq->nr_running--;
1065} 1858}
1066 1859
@@ -1378,7 +2171,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1378 long contrib; 2171 long contrib;
1379 2172
1380 /* The fraction of a cpu used by this cfs_rq */ 2173 /* The fraction of a cpu used by this cfs_rq */
1381 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, 2174 contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
1382 sa->runnable_avg_period + 1); 2175 sa->runnable_avg_period + 1);
1383 contrib -= cfs_rq->tg_runnable_contrib; 2176 contrib -= cfs_rq->tg_runnable_contrib;
1384 2177
@@ -2070,13 +2863,14 @@ static inline bool cfs_bandwidth_used(void)
2070 return static_key_false(&__cfs_bandwidth_used); 2863 return static_key_false(&__cfs_bandwidth_used);
2071} 2864}
2072 2865
2073void account_cfs_bandwidth_used(int enabled, int was_enabled) 2866void cfs_bandwidth_usage_inc(void)
2074{ 2867{
2075 /* only need to count groups transitioning between enabled/!enabled */ 2868 static_key_slow_inc(&__cfs_bandwidth_used);
2076 if (enabled && !was_enabled) 2869}
2077 static_key_slow_inc(&__cfs_bandwidth_used); 2870
2078 else if (!enabled && was_enabled) 2871void cfs_bandwidth_usage_dec(void)
2079 static_key_slow_dec(&__cfs_bandwidth_used); 2872{
2873 static_key_slow_dec(&__cfs_bandwidth_used);
2080} 2874}
2081#else /* HAVE_JUMP_LABEL */ 2875#else /* HAVE_JUMP_LABEL */
2082static bool cfs_bandwidth_used(void) 2876static bool cfs_bandwidth_used(void)
@@ -2084,7 +2878,8 @@ static bool cfs_bandwidth_used(void)
2084 return true; 2878 return true;
2085} 2879}
2086 2880
2087void account_cfs_bandwidth_used(int enabled, int was_enabled) {} 2881void cfs_bandwidth_usage_inc(void) {}
2882void cfs_bandwidth_usage_dec(void) {}
2088#endif /* HAVE_JUMP_LABEL */ 2883#endif /* HAVE_JUMP_LABEL */
2089 2884
2090/* 2885/*
@@ -2213,8 +3008,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2213 } 3008 }
2214} 3009}
2215 3010
2216static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 3011static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
2217 unsigned long delta_exec)
2218{ 3012{
2219 /* dock delta_exec before expiring quota (as it could span periods) */ 3013 /* dock delta_exec before expiring quota (as it could span periods) */
2220 cfs_rq->runtime_remaining -= delta_exec; 3014 cfs_rq->runtime_remaining -= delta_exec;
@@ -2232,7 +3026,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2232} 3026}
2233 3027
2234static __always_inline 3028static __always_inline
2235void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) 3029void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
2236{ 3030{
2237 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) 3031 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2238 return; 3032 return;
@@ -2335,6 +3129,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2335 cfs_rq->throttled_clock = rq_clock(rq); 3129 cfs_rq->throttled_clock = rq_clock(rq);
2336 raw_spin_lock(&cfs_b->lock); 3130 raw_spin_lock(&cfs_b->lock);
2337 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3131 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3132 if (!cfs_b->timer_active)
3133 __start_cfs_bandwidth(cfs_b);
2338 raw_spin_unlock(&cfs_b->lock); 3134 raw_spin_unlock(&cfs_b->lock);
2339} 3135}
2340 3136
@@ -2448,6 +3244,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2448 if (idle) 3244 if (idle)
2449 goto out_unlock; 3245 goto out_unlock;
2450 3246
3247 /*
3248 * if we have relooped after returning idle once, we need to update our
3249 * status as actually running, so that other cpus doing
3250 * __start_cfs_bandwidth will stop trying to cancel us.
3251 */
3252 cfs_b->timer_active = 1;
3253
2451 __refill_cfs_bandwidth_runtime(cfs_b); 3254 __refill_cfs_bandwidth_runtime(cfs_b);
2452 3255
2453 if (!throttled) { 3256 if (!throttled) {
@@ -2508,7 +3311,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2508/* how long we wait to gather additional slack before distributing */ 3311/* how long we wait to gather additional slack before distributing */
2509static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; 3312static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2510 3313
2511/* are we near the end of the current quota period? */ 3314/*
3315 * Are we near the end of the current quota period?
3316 *
3317 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3318 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3319 * migrate_hrtimers, base is never cleared, so we are fine.
3320 */
2512static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) 3321static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2513{ 3322{
2514 struct hrtimer *refresh_timer = &cfs_b->period_timer; 3323 struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2584,10 +3393,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2584 u64 expires; 3393 u64 expires;
2585 3394
2586 /* confirm we're still not at a refresh boundary */ 3395 /* confirm we're still not at a refresh boundary */
2587 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) 3396 raw_spin_lock(&cfs_b->lock);
3397 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3398 raw_spin_unlock(&cfs_b->lock);
2588 return; 3399 return;
3400 }
2589 3401
2590 raw_spin_lock(&cfs_b->lock);
2591 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3402 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2592 runtime = cfs_b->runtime; 3403 runtime = cfs_b->runtime;
2593 cfs_b->runtime = 0; 3404 cfs_b->runtime = 0;
@@ -2708,11 +3519,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2708 * (timer_active==0 becomes visible before the hrtimer call-back 3519 * (timer_active==0 becomes visible before the hrtimer call-back
2709 * terminates). In either case we ensure that it's re-programmed 3520 * terminates). In either case we ensure that it's re-programmed
2710 */ 3521 */
2711 while (unlikely(hrtimer_active(&cfs_b->period_timer))) { 3522 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3523 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3524 /* bounce the lock to allow do_sched_cfs_period_timer to run */
2712 raw_spin_unlock(&cfs_b->lock); 3525 raw_spin_unlock(&cfs_b->lock);
2713 /* ensure cfs_b->lock is available while we wait */ 3526 cpu_relax();
2714 hrtimer_cancel(&cfs_b->period_timer);
2715
2716 raw_spin_lock(&cfs_b->lock); 3527 raw_spin_lock(&cfs_b->lock);
2717 /* if someone else restarted the timer then we're done */ 3528 /* if someone else restarted the timer then we're done */
2718 if (cfs_b->timer_active) 3529 if (cfs_b->timer_active)
@@ -2755,8 +3566,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2755 return rq_clock_task(rq_of(cfs_rq)); 3566 return rq_clock_task(rq_of(cfs_rq));
2756} 3567}
2757 3568
2758static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 3569static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
2759 unsigned long delta_exec) {}
2760static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3570static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2761static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3571static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2762static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3572static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -3166,8 +3976,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3166} 3976}
3167#else 3977#else
3168 3978
3169static inline unsigned long effective_load(struct task_group *tg, int cpu, 3979static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3170 unsigned long wl, unsigned long wg)
3171{ 3980{
3172 return wl; 3981 return wl;
3173} 3982}
@@ -3420,11 +4229,10 @@ done:
3420 * preempt must be disabled. 4229 * preempt must be disabled.
3421 */ 4230 */
3422static int 4231static int
3423select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 4232select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
3424{ 4233{
3425 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4234 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3426 int cpu = smp_processor_id(); 4235 int cpu = smp_processor_id();
3427 int prev_cpu = task_cpu(p);
3428 int new_cpu = cpu; 4236 int new_cpu = cpu;
3429 int want_affine = 0; 4237 int want_affine = 0;
3430 int sync = wake_flags & WF_SYNC; 4238 int sync = wake_flags & WF_SYNC;
@@ -3904,9 +4712,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3904 4712
3905static unsigned long __read_mostly max_load_balance_interval = HZ/10; 4713static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3906 4714
4715enum fbq_type { regular, remote, all };
4716
3907#define LBF_ALL_PINNED 0x01 4717#define LBF_ALL_PINNED 0x01
3908#define LBF_NEED_BREAK 0x02 4718#define LBF_NEED_BREAK 0x02
3909#define LBF_SOME_PINNED 0x04 4719#define LBF_DST_PINNED 0x04
4720#define LBF_SOME_PINNED 0x08
3910 4721
3911struct lb_env { 4722struct lb_env {
3912 struct sched_domain *sd; 4723 struct sched_domain *sd;
@@ -3929,6 +4740,8 @@ struct lb_env {
3929 unsigned int loop; 4740 unsigned int loop;
3930 unsigned int loop_break; 4741 unsigned int loop_break;
3931 unsigned int loop_max; 4742 unsigned int loop_max;
4743
4744 enum fbq_type fbq_type;
3932}; 4745};
3933 4746
3934/* 4747/*
@@ -3975,6 +4788,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3975 return delta < (s64)sysctl_sched_migration_cost; 4788 return delta < (s64)sysctl_sched_migration_cost;
3976} 4789}
3977 4790
4791#ifdef CONFIG_NUMA_BALANCING
4792/* Returns true if the destination node has incurred more faults */
4793static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4794{
4795 int src_nid, dst_nid;
4796
4797 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4798 !(env->sd->flags & SD_NUMA)) {
4799 return false;
4800 }
4801
4802 src_nid = cpu_to_node(env->src_cpu);
4803 dst_nid = cpu_to_node(env->dst_cpu);
4804
4805 if (src_nid == dst_nid)
4806 return false;
4807
4808 /* Always encourage migration to the preferred node. */
4809 if (dst_nid == p->numa_preferred_nid)
4810 return true;
4811
4812 /* If both task and group weight improve, this move is a winner. */
4813 if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4814 group_weight(p, dst_nid) > group_weight(p, src_nid))
4815 return true;
4816
4817 return false;
4818}
4819
4820
4821static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4822{
4823 int src_nid, dst_nid;
4824
4825 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4826 return false;
4827
4828 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4829 return false;
4830
4831 src_nid = cpu_to_node(env->src_cpu);
4832 dst_nid = cpu_to_node(env->dst_cpu);
4833
4834 if (src_nid == dst_nid)
4835 return false;
4836
4837 /* Migrating away from the preferred node is always bad. */
4838 if (src_nid == p->numa_preferred_nid)
4839 return true;
4840
4841 /* If either task or group weight get worse, don't do it. */
4842 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4843 group_weight(p, dst_nid) < group_weight(p, src_nid))
4844 return true;
4845
4846 return false;
4847}
4848
4849#else
4850static inline bool migrate_improves_locality(struct task_struct *p,
4851 struct lb_env *env)
4852{
4853 return false;
4854}
4855
4856static inline bool migrate_degrades_locality(struct task_struct *p,
4857 struct lb_env *env)
4858{
4859 return false;
4860}
4861#endif
4862
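The two helpers added in this hunk gate load-balancer migrations on NUMA hinting-fault statistics: a move is encouraged when it lands on the task's preferred node or on a node with a higher recorded fault weight, and resisted when it would leave the preferred node or lower either the task or group weight. A minimal user-space sketch of that decision follows; the toy_task structure and its faults[] array are illustrative stand-ins for the kernel's task_weight()/group_weight(), not the real data structures.

/*
 * Rough sketch of the locality checks above. The kernel derives
 * task_weight()/group_weight() from per-node NUMA hinting-fault statistics;
 * a plain faults[] array stands in for that here.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

struct toy_task {
	int preferred_nid;
	unsigned long faults[MAX_NODES];	/* stand-in for task_weight() */
};

static bool improves_locality(const struct toy_task *p, int src, int dst)
{
	if (src == dst)
		return false;
	if (dst == p->preferred_nid)		/* always pull toward the preferred node */
		return true;
	return p->faults[dst] > p->faults[src];	/* more faults recorded on dst */
}

static bool degrades_locality(const struct toy_task *p, int src, int dst)
{
	if (src == dst)
		return false;
	if (src == p->preferred_nid)		/* never push off the preferred node */
		return true;
	return p->faults[dst] < p->faults[src];
}

int main(void)
{
	struct toy_task p = { .preferred_nid = 1, .faults = { 10, 40, 5, 0 } };

	printf("0 -> 1 improves: %d\n", improves_locality(&p, 0, 1));	/* 1 */
	printf("1 -> 2 degrades: %d\n", degrades_locality(&p, 1, 2));	/* 1 */
	printf("2 -> 0 improves: %d\n", improves_locality(&p, 2, 0));	/* 1 */
	return 0;
}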
3978/* 4863/*
3979 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 4864 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3980 */ 4865 */
@@ -3997,6 +4882,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3997 4882
3998 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 4883 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3999 4884
4885 env->flags |= LBF_SOME_PINNED;
4886
4000 /* 4887 /*
4001 * Remember if this task can be migrated to any other cpu in 4888 * Remember if this task can be migrated to any other cpu in
4002 * our sched_group. We may want to revisit it if we couldn't 4889 * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4892,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4005 * Also avoid computing new_dst_cpu if we have already computed 4892 * Also avoid computing new_dst_cpu if we have already computed
4006 * one in current iteration. 4893 * one in current iteration.
4007 */ 4894 */
4008 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 4895 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
4009 return 0; 4896 return 0;
4010 4897
4011 /* Prevent to re-select dst_cpu via env's cpus */ 4898 /* Prevent to re-select dst_cpu via env's cpus */
4012 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 4899 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4013 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 4900 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4014 env->flags |= LBF_SOME_PINNED; 4901 env->flags |= LBF_DST_PINNED;
4015 env->new_dst_cpu = cpu; 4902 env->new_dst_cpu = cpu;
4016 break; 4903 break;
4017 } 4904 }
@@ -4030,11 +4917,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4030 4917
4031 /* 4918 /*
4032 * Aggressive migration if: 4919 * Aggressive migration if:
4033 * 1) task is cache cold, or 4920 * 1) destination numa is preferred
4034 * 2) too many balance attempts have failed. 4921 * 2) task is cache cold, or
4922 * 3) too many balance attempts have failed.
4035 */ 4923 */
4036
4037 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 4924 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4925 if (!tsk_cache_hot)
4926 tsk_cache_hot = migrate_degrades_locality(p, env);
4927
4928 if (migrate_improves_locality(p, env)) {
4929#ifdef CONFIG_SCHEDSTATS
4930 if (tsk_cache_hot) {
4931 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4932 schedstat_inc(p, se.statistics.nr_forced_migrations);
4933 }
4934#endif
4935 return 1;
4936 }
4937
4038 if (!tsk_cache_hot || 4938 if (!tsk_cache_hot ||
4039 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 4939 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4040 4940
@@ -4077,8 +4977,6 @@ static int move_one_task(struct lb_env *env)
4077 return 0; 4977 return 0;
4078} 4978}
4079 4979
4080static unsigned long task_h_load(struct task_struct *p);
4081
4082static const unsigned int sched_nr_migrate_break = 32; 4980static const unsigned int sched_nr_migrate_break = 32;
4083 4981
4084/* 4982/*
@@ -4291,6 +5189,10 @@ struct sg_lb_stats {
4291 unsigned int group_weight; 5189 unsigned int group_weight;
4292 int group_imb; /* Is there an imbalance in the group ? */ 5190 int group_imb; /* Is there an imbalance in the group ? */
4293 int group_has_capacity; /* Is there extra capacity in the group? */ 5191 int group_has_capacity; /* Is there extra capacity in the group? */
5192#ifdef CONFIG_NUMA_BALANCING
5193 unsigned int nr_numa_running;
5194 unsigned int nr_preferred_running;
5195#endif
4294}; 5196};
4295 5197
4296/* 5198/*
@@ -4330,7 +5232,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4330/** 5232/**
4331 * get_sd_load_idx - Obtain the load index for a given sched domain. 5233 * get_sd_load_idx - Obtain the load index for a given sched domain.
4332 * @sd: The sched_domain whose load_idx is to be obtained. 5234 * @sd: The sched_domain whose load_idx is to be obtained.
4333 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 5235 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
4334 * 5236 *
4335 * Return: The load index. 5237 * Return: The load index.
4336 */ 5238 */
@@ -4447,7 +5349,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4447{ 5349{
4448 struct sched_domain *child = sd->child; 5350 struct sched_domain *child = sd->child;
4449 struct sched_group *group, *sdg = sd->groups; 5351 struct sched_group *group, *sdg = sd->groups;
4450 unsigned long power; 5352 unsigned long power, power_orig;
4451 unsigned long interval; 5353 unsigned long interval;
4452 5354
4453 interval = msecs_to_jiffies(sd->balance_interval); 5355 interval = msecs_to_jiffies(sd->balance_interval);
@@ -4459,7 +5361,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4459 return; 5361 return;
4460 } 5362 }
4461 5363
4462 power = 0; 5364 power_orig = power = 0;
4463 5365
4464 if (child->flags & SD_OVERLAP) { 5366 if (child->flags & SD_OVERLAP) {
4465 /* 5367 /*
@@ -4467,8 +5369,33 @@ void update_group_power(struct sched_domain *sd, int cpu)
4467 * span the current group. 5369 * span the current group.
4468 */ 5370 */
4469 5371
4470 for_each_cpu(cpu, sched_group_cpus(sdg)) 5372 for_each_cpu(cpu, sched_group_cpus(sdg)) {
4471 power += power_of(cpu); 5373 struct sched_group_power *sgp;
5374 struct rq *rq = cpu_rq(cpu);
5375
5376 /*
5377 * build_sched_domains() -> init_sched_groups_power()
5378 * gets here before we've attached the domains to the
5379 * runqueues.
5380 *
5381 * Use power_of(), which is set irrespective of domains
5382 * in update_cpu_power().
5383 *
5384 * This avoids power/power_orig from being 0 and
5385 * causing divide-by-zero issues on boot.
5386 *
5387 * Runtime updates will correct power_orig.
5388 */
5389 if (unlikely(!rq->sd)) {
5390 power_orig += power_of(cpu);
5391 power += power_of(cpu);
5392 continue;
5393 }
5394
5395 sgp = rq->sd->groups->sgp;
5396 power_orig += sgp->power_orig;
5397 power += sgp->power;
5398 }
4472 } else { 5399 } else {
4473 /* 5400 /*
4474 * !SD_OVERLAP domains can assume that child groups 5401 * !SD_OVERLAP domains can assume that child groups
@@ -4477,12 +5404,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
4477 5404
4478 group = child->groups; 5405 group = child->groups;
4479 do { 5406 do {
5407 power_orig += group->sgp->power_orig;
4480 power += group->sgp->power; 5408 power += group->sgp->power;
4481 group = group->next; 5409 group = group->next;
4482 } while (group != child->groups); 5410 } while (group != child->groups);
4483 } 5411 }
4484 5412
4485 sdg->sgp->power_orig = sdg->sgp->power = power; 5413 sdg->sgp->power_orig = power_orig;
5414 sdg->sgp->power = power;
4486} 5415}
4487 5416
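The hunk above sums power and power_orig from child groups, and falls back to the per-CPU value while the domain hierarchy is still being attached so neither total can end up zero and later divisions stay safe. A rough standalone sketch of that aggregation pattern, with toy structures standing in for sched_group_power and struct rq:

/*
 * Sketch of the aggregation above: sum each CPU's contribution, falling back
 * to the raw per-CPU value while the topology is still being built so the
 * total can never be zero. Structures and values are illustrative.
 */
#include <stdio.h>

struct toy_group_power { unsigned int power, power_orig; };
struct toy_cpu {
	unsigned int cpu_power;			/* always valid */
	struct toy_group_power *sgp;		/* NULL until domains attach */
};

int main(void)
{
	struct toy_group_power g1 = { .power = 900, .power_orig = 1024 };
	struct toy_cpu cpus[] = {
		{ .cpu_power = 1024, .sgp = &g1 },
		{ .cpu_power = 1024, .sgp = NULL },	/* not attached yet */
	};
	unsigned int power = 0, power_orig = 0;

	for (unsigned i = 0; i < 2; i++) {
		if (!cpus[i].sgp) {			/* boot-time fallback */
			power += cpus[i].cpu_power;
			power_orig += cpus[i].cpu_power;
			continue;
		}
		power += cpus[i].sgp->power;
		power_orig += cpus[i].sgp->power_orig;
	}

	/* power_orig can now safely be used as a divisor */
	printf("power=%u power_orig=%u\n", power, power_orig);	/* 1924 2048 */
	return 0;
}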
4488/* 5417/*
@@ -4526,13 +5455,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4526 * cpu 3 and leave one of the cpus in the second group unused. 5455 * cpu 3 and leave one of the cpus in the second group unused.
4527 * 5456 *
4528 * The current solution to this issue is detecting the skew in the first group 5457 * The current solution to this issue is detecting the skew in the first group
4529 * by noticing it has a cpu that is overloaded while the remaining cpus are 5458 * by noticing the lower domain failed to reach balance and had difficulty
4530 * idle -- or rather, there's a distinct imbalance in the cpus; see 5459 * moving tasks due to affinity constraints.
4531 * sg_imbalanced().
4532 * 5460 *
4533 * When this is so detected; this group becomes a candidate for busiest; see 5461 * When this is so detected; this group becomes a candidate for busiest; see
4534 * update_sd_pick_busiest(). And calculcate_imbalance() and 5462 * update_sd_pick_busiest(). And calculate_imbalance() and
4535 * find_busiest_group() avoid some of the usual balance conditional to allow it 5463 * find_busiest_group() avoid some of the usual balance conditions to allow it
4536 * to create an effective group imbalance. 5464 * to create an effective group imbalance.
4537 * 5465 *
4538 * This is a somewhat tricky proposition since the next run might not find the 5466 * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +5468,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4540 * subtle and fragile situation. 5468 * subtle and fragile situation.
4541 */ 5469 */
4542 5470
4543struct sg_imb_stats { 5471static inline int sg_imbalanced(struct sched_group *group)
4544 unsigned long max_nr_running, min_nr_running;
4545 unsigned long max_cpu_load, min_cpu_load;
4546};
4547
4548static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549{ 5472{
4550 sgi->max_cpu_load = sgi->max_nr_running = 0UL; 5473 return group->sgp->imbalance;
4551 sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552} 5474}
4553 5475
4554static inline void 5476/*
4555update_sg_imb_stats(struct sg_imb_stats *sgi, 5477 * Compute the group capacity.
4556 unsigned long load, unsigned long nr_running) 5478 *
5479 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5480 * first dividing out the smt factor and computing the actual number of cores
5481 * and limit power unit capacity with that.
5482 */
5483static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
4557{ 5484{
4558 if (load > sgi->max_cpu_load) 5485 unsigned int capacity, smt, cpus;
4559 sgi->max_cpu_load = load; 5486 unsigned int power, power_orig;
4560 if (sgi->min_cpu_load > load)
4561 sgi->min_cpu_load = load;
4562 5487
4563 if (nr_running > sgi->max_nr_running) 5488 power = group->sgp->power;
4564 sgi->max_nr_running = nr_running; 5489 power_orig = group->sgp->power_orig;
4565 if (sgi->min_nr_running > nr_running) 5490 cpus = group->group_weight;
4566 sgi->min_nr_running = nr_running;
4567}
4568 5491
4569static inline int 5492 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
4570sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) 5493 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
4571{ 5494 capacity = cpus / smt; /* cores */
4572 /*
4573 * Consider the group unbalanced when the imbalance is larger
4574 * than the average weight of a task.
4575 *
4576 * APZ: with cgroup the avg task weight can vary wildly and
4577 * might not be a suitable number - should we keep a
4578 * normalized nr_running number somewhere that negates
4579 * the hierarchy?
4580 */
4581 if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 return 1;
4584 5495
4585 return 0; 5496 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5497 if (!capacity)
5498 capacity = fix_small_capacity(env->sd, group);
5499
5500 return capacity;
4586} 5501}
4587 5502
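sg_capacity() above first divides out the SMT factor so that N threads whose fractional per-thread power adds up past a core boundary cannot produce a "phantom" core. A worked example, assuming SCHED_POWER_SCALE of 1024 and the stock smt_gain of 1178 split across two siblings (~589 each); the numbers are illustrative, not measurements:

/*
 * Worked example of the sg_capacity() computation above for an SMT2 package
 * with 4 cores / 8 threads, each thread contributing 589 units of power_orig.
 */
#include <stdio.h>

#define SCHED_POWER_SCALE 1024U
#define DIV_ROUND_UP(n, d)       (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)  (((n) + (d) / 2) / (d))

int main(void)
{
	unsigned int cpus = 8;			/* 4 cores x 2 SMT threads */
	unsigned int power_orig = cpus * 589;	/* 4712 */
	unsigned int power = power_orig;

	/* naive rounding invents a 5th "phantom" core: 4712/1024 -> 5 */
	printf("naive capacity: %u\n", DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));

	/* divide out the SMT factor first, as sg_capacity() does */
	unsigned int smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);	/* 2 */
	unsigned int capacity = cpus / smt;					/* 4 cores */

	if (capacity > DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE))
		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

	printf("sg_capacity-style: %u\n", capacity);	/* 4 */
	return 0;
}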
4588/** 5503/**
@@ -4597,12 +5512,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4597 struct sched_group *group, int load_idx, 5512 struct sched_group *group, int load_idx,
4598 int local_group, struct sg_lb_stats *sgs) 5513 int local_group, struct sg_lb_stats *sgs)
4599{ 5514{
4600 struct sg_imb_stats sgi;
4601 unsigned long nr_running; 5515 unsigned long nr_running;
4602 unsigned long load; 5516 unsigned long load;
4603 int i; 5517 int i;
4604 5518
4605 init_sg_imb_stats(&sgi); 5519 memset(sgs, 0, sizeof(*sgs));
4606 5520
4607 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5521 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4608 struct rq *rq = cpu_rq(i); 5522 struct rq *rq = cpu_rq(i);
@@ -4610,24 +5524,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4610 nr_running = rq->nr_running; 5524 nr_running = rq->nr_running;
4611 5525
4612 /* Bias balancing toward cpus of our domain */ 5526 /* Bias balancing toward cpus of our domain */
4613 if (local_group) { 5527 if (local_group)
4614 load = target_load(i, load_idx); 5528 load = target_load(i, load_idx);
4615 } else { 5529 else
4616 load = source_load(i, load_idx); 5530 load = source_load(i, load_idx);
4617 update_sg_imb_stats(&sgi, load, nr_running);
4618 }
4619 5531
4620 sgs->group_load += load; 5532 sgs->group_load += load;
4621 sgs->sum_nr_running += nr_running; 5533 sgs->sum_nr_running += nr_running;
5534#ifdef CONFIG_NUMA_BALANCING
5535 sgs->nr_numa_running += rq->nr_numa_running;
5536 sgs->nr_preferred_running += rq->nr_preferred_running;
5537#endif
4622 sgs->sum_weighted_load += weighted_cpuload(i); 5538 sgs->sum_weighted_load += weighted_cpuload(i);
4623 if (idle_cpu(i)) 5539 if (idle_cpu(i))
4624 sgs->idle_cpus++; 5540 sgs->idle_cpus++;
4625 } 5541 }
4626 5542
4627 if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4628 time_after_eq(jiffies, group->sgp->next_update)))
4629 update_group_power(env->sd, env->dst_cpu);
4630
4631 /* Adjust by relative CPU power of the group */ 5543 /* Adjust by relative CPU power of the group */
4632 sgs->group_power = group->sgp->power; 5544 sgs->group_power = group->sgp->power;
4633 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; 5545 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -4635,16 +5547,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4635 if (sgs->sum_nr_running) 5547 if (sgs->sum_nr_running)
4636 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5548 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4637 5549
4638 sgs->group_imb = sg_imbalanced(sgs, &sgi);
4639
4640 sgs->group_capacity =
4641 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4642
4643 if (!sgs->group_capacity)
4644 sgs->group_capacity = fix_small_capacity(env->sd, group);
4645
4646 sgs->group_weight = group->group_weight; 5550 sgs->group_weight = group->group_weight;
4647 5551
5552 sgs->group_imb = sg_imbalanced(group);
5553 sgs->group_capacity = sg_capacity(env, group);
5554
4648 if (sgs->group_capacity > sgs->sum_nr_running) 5555 if (sgs->group_capacity > sgs->sum_nr_running)
4649 sgs->group_has_capacity = 1; 5556 sgs->group_has_capacity = 1;
4650} 5557}
@@ -4693,14 +5600,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4693 return false; 5600 return false;
4694} 5601}
4695 5602
5603#ifdef CONFIG_NUMA_BALANCING
5604static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5605{
5606 if (sgs->sum_nr_running > sgs->nr_numa_running)
5607 return regular;
5608 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5609 return remote;
5610 return all;
5611}
5612
5613static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5614{
5615 if (rq->nr_running > rq->nr_numa_running)
5616 return regular;
5617 if (rq->nr_running > rq->nr_preferred_running)
5618 return remote;
5619 return all;
5620}
5621#else
5622static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5623{
5624 return all;
5625}
5626
5627static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5628{
5629 return regular;
5630}
5631#endif /* CONFIG_NUMA_BALANCING */
5632
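fbq_classify_group()/fbq_classify_rq() above order groups and runqueues as regular < remote < all so that find_busiest_queue() can skip queues that are already better placed than the group it is pulling from. A small sketch of that classification, with a toy runqueue structure in place of struct rq:

/*
 * Sketch of the fbq_type classification. In the kernel the counts come from
 * rq->nr_numa_running and rq->nr_preferred_running; these values are made up.
 */
#include <stdio.h>

enum fbq_type { regular, remote, all };	/* ordered: regular < remote < all */

struct toy_rq {
	unsigned int nr_running;
	unsigned int nr_numa_running;		/* tasks with a preferred node */
	unsigned int nr_preferred_running;	/* of those, already on it */
};

static enum fbq_type classify(const struct toy_rq *rq)
{
	if (rq->nr_running > rq->nr_numa_running)
		return regular;		/* some tasks don't care about NUMA */
	if (rq->nr_running > rq->nr_preferred_running)
		return remote;		/* only NUMA tasks, some misplaced */
	return all;			/* everything already well placed */
}

int main(void)
{
	struct toy_rq rqs[] = {
		{ .nr_running = 3, .nr_numa_running = 1, .nr_preferred_running = 1 },
		{ .nr_running = 2, .nr_numa_running = 2, .nr_preferred_running = 1 },
		{ .nr_running = 2, .nr_numa_running = 2, .nr_preferred_running = 2 },
	};
	enum fbq_type busiest_type = remote;	/* say the group classified as remote */

	for (int i = 0; i < 3; i++) {
		enum fbq_type t = classify(&rqs[i]);
		/* find_busiest_queue() skips queues "better placed" than the group */
		printf("rq%d type=%d %s\n", i, t,
		       t > busiest_type ? "skipped" : "considered");
	}
	return 0;
}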
4696/** 5633/**
4697 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 5634 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4698 * @env: The load balancing environment. 5635 * @env: The load balancing environment.
4699 * @balance: Should we balance.
4700 * @sds: variable to hold the statistics for this sched_domain. 5636 * @sds: variable to hold the statistics for this sched_domain.
4701 */ 5637 */
4702static inline void update_sd_lb_stats(struct lb_env *env, 5638static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
4703 struct sd_lb_stats *sds)
4704{ 5639{
4705 struct sched_domain *child = env->sd->child; 5640 struct sched_domain *child = env->sd->child;
4706 struct sched_group *sg = env->sd->groups; 5641 struct sched_group *sg = env->sd->groups;
@@ -4720,11 +5655,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4720 if (local_group) { 5655 if (local_group) {
4721 sds->local = sg; 5656 sds->local = sg;
4722 sgs = &sds->local_stat; 5657 sgs = &sds->local_stat;
5658
5659 if (env->idle != CPU_NEWLY_IDLE ||
5660 time_after_eq(jiffies, sg->sgp->next_update))
5661 update_group_power(env->sd, env->dst_cpu);
4723 } 5662 }
4724 5663
4725 memset(sgs, 0, sizeof(*sgs));
4726 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 5664 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4727 5665
5666 if (local_group)
5667 goto next_group;
5668
4728 /* 5669 /*
4729 * In case the child domain prefers tasks go to siblings 5670 * In case the child domain prefers tasks go to siblings
4730 * first, lower the sg capacity to one so that we'll try 5671 * first, lower the sg capacity to one so that we'll try
@@ -4735,21 +5676,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4735 * heaviest group when it is already under-utilized (possible 5676 * heaviest group when it is already under-utilized (possible
4736 * with a large weight task outweighs the tasks on the system). 5677 * with a large weight task outweighs the tasks on the system).
4737 */ 5678 */
4738 if (prefer_sibling && !local_group && 5679 if (prefer_sibling && sds->local &&
4739 sds->local && sds->local_stat.group_has_capacity) 5680 sds->local_stat.group_has_capacity)
4740 sgs->group_capacity = min(sgs->group_capacity, 1U); 5681 sgs->group_capacity = min(sgs->group_capacity, 1U);
4741 5682
4742 /* Now, start updating sd_lb_stats */ 5683 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
4743 sds->total_load += sgs->group_load;
4744 sds->total_pwr += sgs->group_power;
4745
4746 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4747 sds->busiest = sg; 5684 sds->busiest = sg;
4748 sds->busiest_stat = *sgs; 5685 sds->busiest_stat = *sgs;
4749 } 5686 }
4750 5687
5688next_group:
5689 /* Now, start updating sd_lb_stats */
5690 sds->total_load += sgs->group_load;
5691 sds->total_pwr += sgs->group_power;
5692
4751 sg = sg->next; 5693 sg = sg->next;
4752 } while (sg != env->sd->groups); 5694 } while (sg != env->sd->groups);
5695
5696 if (env->sd->flags & SD_NUMA)
5697 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4753} 5698}
4754 5699
4755/** 5700/**
@@ -5053,15 +5998,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
5053 int i; 5998 int i;
5054 5999
5055 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 6000 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5056 unsigned long power = power_of(i); 6001 unsigned long power, capacity, wl;
5057 unsigned long capacity = DIV_ROUND_CLOSEST(power, 6002 enum fbq_type rt;
5058 SCHED_POWER_SCALE); 6003
5059 unsigned long wl; 6004 rq = cpu_rq(i);
6005 rt = fbq_classify_rq(rq);
5060 6006
6007 /*
6008 * We classify groups/runqueues into three groups:
6009 * - regular: there are !numa tasks
6010 * - remote: there are numa tasks that run on the 'wrong' node
6011 * - all: there is no distinction
6012 *
6013 * In order to avoid migrating ideally placed numa tasks,
 6014 * ignore those when there are better options.
6015 *
6016 * If we ignore the actual busiest queue to migrate another
6017 * task, the next balance pass can still reduce the busiest
6018 * queue by moving tasks around inside the node.
6019 *
6020 * If we cannot move enough load due to this classification
6021 * the next pass will adjust the group classification and
6022 * allow migration of more tasks.
6023 *
6024 * Both cases only affect the total convergence complexity.
6025 */
6026 if (rt > env->fbq_type)
6027 continue;
6028
6029 power = power_of(i);
6030 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
5061 if (!capacity) 6031 if (!capacity)
5062 capacity = fix_small_capacity(env->sd, group); 6032 capacity = fix_small_capacity(env->sd, group);
5063 6033
5064 rq = cpu_rq(i);
5065 wl = weighted_cpuload(i); 6034 wl = weighted_cpuload(i);
5066 6035
5067 /* 6036 /*
@@ -5164,6 +6133,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5164 int *continue_balancing) 6133 int *continue_balancing)
5165{ 6134{
5166 int ld_moved, cur_ld_moved, active_balance = 0; 6135 int ld_moved, cur_ld_moved, active_balance = 0;
6136 struct sched_domain *sd_parent = sd->parent;
5167 struct sched_group *group; 6137 struct sched_group *group;
5168 struct rq *busiest; 6138 struct rq *busiest;
5169 unsigned long flags; 6139 unsigned long flags;
@@ -5177,6 +6147,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5177 .idle = idle, 6147 .idle = idle,
5178 .loop_break = sched_nr_migrate_break, 6148 .loop_break = sched_nr_migrate_break,
5179 .cpus = cpus, 6149 .cpus = cpus,
6150 .fbq_type = all,
5180 }; 6151 };
5181 6152
5182 /* 6153 /*
@@ -5268,17 +6239,17 @@ more_balance:
5268 * moreover subsequent load balance cycles should correct the 6239 * moreover subsequent load balance cycles should correct the
5269 * excess load moved. 6240 * excess load moved.
5270 */ 6241 */
5271 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6242 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6243
6244 /* Prevent to re-select dst_cpu via env's cpus */
6245 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5272 6246
5273 env.dst_rq = cpu_rq(env.new_dst_cpu); 6247 env.dst_rq = cpu_rq(env.new_dst_cpu);
5274 env.dst_cpu = env.new_dst_cpu; 6248 env.dst_cpu = env.new_dst_cpu;
5275 env.flags &= ~LBF_SOME_PINNED; 6249 env.flags &= ~LBF_DST_PINNED;
5276 env.loop = 0; 6250 env.loop = 0;
5277 env.loop_break = sched_nr_migrate_break; 6251 env.loop_break = sched_nr_migrate_break;
5278 6252
5279 /* Prevent to re-select dst_cpu via env's cpus */
5280 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5281
5282 /* 6253 /*
5283 * Go back to "more_balance" rather than "redo" since we 6254 * Go back to "more_balance" rather than "redo" since we
5284 * need to continue with same src_cpu. 6255 * need to continue with same src_cpu.
@@ -5286,6 +6257,18 @@ more_balance:
5286 goto more_balance; 6257 goto more_balance;
5287 } 6258 }
5288 6259
6260 /*
6261 * We failed to reach balance because of affinity.
6262 */
6263 if (sd_parent) {
6264 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6265
6266 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6267 *group_imbalance = 1;
6268 } else if (*group_imbalance)
6269 *group_imbalance = 0;
6270 }
6271
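The block above feeds affinity failures back to the parent level: if some tasks were pinned and an imbalance remains, the parent group's sgp->imbalance is set so sg_imbalanced() reports it on the next pass, and it is cleared again once balance is reached. A minimal sketch of that feedback, using the same flag values but toy structures:

/*
 * Sketch of the affinity feedback: mark the parent group imbalanced when a
 * pass could not move enough load because of pinned tasks, clear it otherwise.
 */
#include <stdio.h>

#define LBF_ALL_PINNED	0x01
#define LBF_NEED_BREAK	0x02
#define LBF_DST_PINNED	0x04
#define LBF_SOME_PINNED	0x08

struct toy_group { int imbalance; };

static void record_affinity_result(unsigned int flags, long imbalance_left,
				   struct toy_group *parent)
{
	if (!parent)
		return;
	if ((flags & LBF_SOME_PINNED) && imbalance_left > 0)
		parent->imbalance = 1;	/* sg_imbalanced() will see this */
	else if (parent->imbalance)
		parent->imbalance = 0;
}

int main(void)
{
	struct toy_group parent = { 0 };

	record_affinity_result(LBF_SOME_PINNED, 512, &parent);
	printf("after pinned, unbalanced pass: %d\n", parent.imbalance);	/* 1 */

	record_affinity_result(0, 0, &parent);
	printf("after clean pass: %d\n", parent.imbalance);			/* 0 */
	return 0;
}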
5289 /* All tasks on this runqueue were pinned by CPU affinity */ 6272 /* All tasks on this runqueue were pinned by CPU affinity */
5290 if (unlikely(env.flags & LBF_ALL_PINNED)) { 6273 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5291 cpumask_clear_cpu(cpu_of(busiest), cpus); 6274 cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5393,6 +6376,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5393 struct sched_domain *sd; 6376 struct sched_domain *sd;
5394 int pulled_task = 0; 6377 int pulled_task = 0;
5395 unsigned long next_balance = jiffies + HZ; 6378 unsigned long next_balance = jiffies + HZ;
6379 u64 curr_cost = 0;
5396 6380
5397 this_rq->idle_stamp = rq_clock(this_rq); 6381 this_rq->idle_stamp = rq_clock(this_rq);
5398 6382
@@ -5409,15 +6393,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5409 for_each_domain(this_cpu, sd) { 6393 for_each_domain(this_cpu, sd) {
5410 unsigned long interval; 6394 unsigned long interval;
5411 int continue_balancing = 1; 6395 int continue_balancing = 1;
6396 u64 t0, domain_cost;
5412 6397
5413 if (!(sd->flags & SD_LOAD_BALANCE)) 6398 if (!(sd->flags & SD_LOAD_BALANCE))
5414 continue; 6399 continue;
5415 6400
6401 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6402 break;
6403
5416 if (sd->flags & SD_BALANCE_NEWIDLE) { 6404 if (sd->flags & SD_BALANCE_NEWIDLE) {
6405 t0 = sched_clock_cpu(this_cpu);
6406
5417 /* If we've pulled tasks over stop searching: */ 6407 /* If we've pulled tasks over stop searching: */
5418 pulled_task = load_balance(this_cpu, this_rq, 6408 pulled_task = load_balance(this_cpu, this_rq,
5419 sd, CPU_NEWLY_IDLE, 6409 sd, CPU_NEWLY_IDLE,
5420 &continue_balancing); 6410 &continue_balancing);
6411
6412 domain_cost = sched_clock_cpu(this_cpu) - t0;
6413 if (domain_cost > sd->max_newidle_lb_cost)
6414 sd->max_newidle_lb_cost = domain_cost;
6415
6416 curr_cost += domain_cost;
5421 } 6417 }
5422 6418
5423 interval = msecs_to_jiffies(sd->balance_interval); 6419 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5439,6 +6435,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5439 */ 6435 */
5440 this_rq->next_balance = next_balance; 6436 this_rq->next_balance = next_balance;
5441 } 6437 }
6438
6439 if (curr_cost > this_rq->max_idle_balance_cost)
6440 this_rq->max_idle_balance_cost = curr_cost;
5442} 6441}
5443 6442
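The additions above give newly-idle balancing a time budget: each domain's balance cost is measured, accumulated, and compared against the runqueue's expected idle time, so deep (expensive) domains are skipped when the CPU is unlikely to stay idle long enough to pay for them. A standalone sketch of that budgeting, with made-up nanosecond costs:

/*
 * Sketch of the cost budgeting added to idle_balance(): stop walking the
 * domain hierarchy once the expected balancing cost exceeds the time we
 * expect to stay idle. All costs here are simulated values.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_domain {
	const char *name;
	uint64_t max_newidle_lb_cost;	/* worst cost seen so far */
	uint64_t this_cost;		/* what this pass will cost (simulated) */
};

int main(void)
{
	struct toy_domain domains[] = {
		{ "SMT",  20000,  15000 },
		{ "MC",   80000,  90000 },
		{ "NUMA", 900000, 850000 },
	};
	uint64_t avg_idle = 500000;	/* expected idle time, ns */
	uint64_t curr_cost = 0;

	for (unsigned i = 0; i < 3; i++) {
		struct toy_domain *sd = &domains[i];

		if (avg_idle < curr_cost + sd->max_newidle_lb_cost) {
			printf("%s: skipped, not worth it\n", sd->name);
			break;
		}

		/* "balance" and account for what it actually cost */
		curr_cost += sd->this_cost;
		if (sd->this_cost > sd->max_newidle_lb_cost)
			sd->max_newidle_lb_cost = sd->this_cost;

		printf("%s: balanced, curr_cost=%llu\n", sd->name,
		       (unsigned long long)curr_cost);
	}
	return 0;
}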
5444/* 6443/*
@@ -5572,16 +6571,16 @@ static inline void nohz_balance_exit_idle(int cpu)
5572static inline void set_cpu_sd_state_busy(void) 6571static inline void set_cpu_sd_state_busy(void)
5573{ 6572{
5574 struct sched_domain *sd; 6573 struct sched_domain *sd;
6574 int cpu = smp_processor_id();
5575 6575
5576 rcu_read_lock(); 6576 rcu_read_lock();
5577 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6577 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5578 6578
5579 if (!sd || !sd->nohz_idle) 6579 if (!sd || !sd->nohz_idle)
5580 goto unlock; 6580 goto unlock;
5581 sd->nohz_idle = 0; 6581 sd->nohz_idle = 0;
5582 6582
5583 for (; sd; sd = sd->parent) 6583 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5584 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5585unlock: 6584unlock:
5586 rcu_read_unlock(); 6585 rcu_read_unlock();
5587} 6586}
@@ -5589,16 +6588,16 @@ unlock:
5589void set_cpu_sd_state_idle(void) 6588void set_cpu_sd_state_idle(void)
5590{ 6589{
5591 struct sched_domain *sd; 6590 struct sched_domain *sd;
6591 int cpu = smp_processor_id();
5592 6592
5593 rcu_read_lock(); 6593 rcu_read_lock();
5594 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6594 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5595 6595
5596 if (!sd || sd->nohz_idle) 6596 if (!sd || sd->nohz_idle)
5597 goto unlock; 6597 goto unlock;
5598 sd->nohz_idle = 1; 6598 sd->nohz_idle = 1;
5599 6599
5600 for (; sd; sd = sd->parent) 6600 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5601 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5602unlock: 6601unlock:
5603 rcu_read_unlock(); 6602 rcu_read_unlock();
5604} 6603}
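These hunks stop walking the whole domain hierarchy on idle/busy transitions and instead update nr_busy_cpus once, on the cached sd_busy level, guarded by the per-domain nohz_idle flag so repeated notifications don't double count. A rough user-space sketch of that single-level counting; the per-package grouping and structures are illustrative only:

/*
 * Sketch of the busy-CPU bookkeeping: each CPU flips one shared counter on
 * idle<->busy transitions, with a flag to make the update idempotent.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdbool.h>

struct toy_pkg { atomic_int nr_busy_cpus; };
struct toy_cpu { bool nohz_idle; struct toy_pkg *pkg; };

static void cpu_goes_idle(struct toy_cpu *c)
{
	if (c->nohz_idle)			/* already accounted */
		return;
	c->nohz_idle = true;
	atomic_fetch_sub(&c->pkg->nr_busy_cpus, 1);
}

static void cpu_goes_busy(struct toy_cpu *c)
{
	if (!c->nohz_idle)
		return;
	c->nohz_idle = false;
	atomic_fetch_add(&c->pkg->nr_busy_cpus, 1);
}

int main(void)
{
	struct toy_pkg pkg = { .nr_busy_cpus = 4 };
	struct toy_cpu cpu1 = { .nohz_idle = false, .pkg = &pkg };

	cpu_goes_idle(&cpu1);
	cpu_goes_idle(&cpu1);	/* duplicate notification, ignored */
	printf("busy cpus: %d\n", atomic_load(&pkg.nr_busy_cpus));	/* 3 */

	cpu_goes_busy(&cpu1);
	printf("busy cpus: %d\n", atomic_load(&pkg.nr_busy_cpus));	/* 4 */
	return 0;
}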
@@ -5662,15 +6661,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5662 /* Earliest time when we have to do rebalance again */ 6661 /* Earliest time when we have to do rebalance again */
5663 unsigned long next_balance = jiffies + 60*HZ; 6662 unsigned long next_balance = jiffies + 60*HZ;
5664 int update_next_balance = 0; 6663 int update_next_balance = 0;
5665 int need_serialize; 6664 int need_serialize, need_decay = 0;
6665 u64 max_cost = 0;
5666 6666
5667 update_blocked_averages(cpu); 6667 update_blocked_averages(cpu);
5668 6668
5669 rcu_read_lock(); 6669 rcu_read_lock();
5670 for_each_domain(cpu, sd) { 6670 for_each_domain(cpu, sd) {
6671 /*
6672 * Decay the newidle max times here because this is a regular
6673 * visit to all the domains. Decay ~1% per second.
6674 */
6675 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6676 sd->max_newidle_lb_cost =
6677 (sd->max_newidle_lb_cost * 253) / 256;
6678 sd->next_decay_max_lb_cost = jiffies + HZ;
6679 need_decay = 1;
6680 }
6681 max_cost += sd->max_newidle_lb_cost;
6682
5671 if (!(sd->flags & SD_LOAD_BALANCE)) 6683 if (!(sd->flags & SD_LOAD_BALANCE))
5672 continue; 6684 continue;
5673 6685
6686 /*
6687 * Stop the load balance at this level. There is another
6688 * CPU in our sched group which is doing load balancing more
6689 * actively.
6690 */
6691 if (!continue_balancing) {
6692 if (need_decay)
6693 continue;
6694 break;
6695 }
6696
5674 interval = sd->balance_interval; 6697 interval = sd->balance_interval;
5675 if (idle != CPU_IDLE) 6698 if (idle != CPU_IDLE)
5676 interval *= sd->busy_factor; 6699 interval *= sd->busy_factor;
@@ -5689,7 +6712,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5689 if (time_after_eq(jiffies, sd->last_balance + interval)) { 6712 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5690 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { 6713 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5691 /* 6714 /*
5692 * The LBF_SOME_PINNED logic could have changed 6715 * The LBF_DST_PINNED logic could have changed
5693 * env->dst_cpu, so we can't know our idle 6716 * env->dst_cpu, so we can't know our idle
5694 * state even if we migrated tasks. Update it. 6717 * state even if we migrated tasks. Update it.
5695 */ 6718 */
@@ -5704,14 +6727,14 @@ out:
5704 next_balance = sd->last_balance + interval; 6727 next_balance = sd->last_balance + interval;
5705 update_next_balance = 1; 6728 update_next_balance = 1;
5706 } 6729 }
5707 6730 }
6731 if (need_decay) {
5708 /* 6732 /*
5709 * Stop the load balance at this level. There is another 6733 * Ensure the rq-wide value also decays but keep it at a
5710 * CPU in our sched group which is doing load balancing more 6734 * reasonable floor to avoid funnies with rq->avg_idle.
5711 * actively.
5712 */ 6735 */
5713 if (!continue_balancing) 6736 rq->max_idle_balance_cost =
5714 break; 6737 max((u64)sysctl_sched_migration_cost, max_cost);
5715 } 6738 }
5716 rcu_read_unlock(); 6739 rcu_read_unlock();
5717 6740
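The decay added above multiplies max_newidle_lb_cost by 253/256 once per second, i.e. roughly a 1% decay per pass, and the runqueue-wide copy is floored at sysctl_sched_migration_cost (500000 ns by default). A quick standalone check of how fast a stale value fades under that rule:

/*
 * Apply the 253/256 per-second decay to an arbitrary starting cost and show
 * the migration-cost floor kicking in. The starting value is illustrative.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cost = 1000000;	/* ns, illustrative */
	uint64_t floor = 500000;	/* sysctl_sched_migration_cost default */

	for (int sec = 1; sec <= 60; sec++) {
		cost = (cost * 253) / 256;
		if (sec % 20 == 0)
			printf("after %2ds: %llu ns\n", sec,
			       (unsigned long long)cost);
	}

	/* the rq-wide copy never drops below the migration-cost floor */
	uint64_t rq_max = cost > floor ? cost : floor;
	printf("rq->max_idle_balance_cost = %llu ns\n",
	       (unsigned long long)rq_max);
	return 0;
}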
@@ -5781,6 +6804,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5781{ 6804{
5782 unsigned long now = jiffies; 6805 unsigned long now = jiffies;
5783 struct sched_domain *sd; 6806 struct sched_domain *sd;
6807 struct sched_group_power *sgp;
6808 int nr_busy;
5784 6809
5785 if (unlikely(idle_cpu(cpu))) 6810 if (unlikely(idle_cpu(cpu)))
5786 return 0; 6811 return 0;
@@ -5806,22 +6831,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5806 goto need_kick; 6831 goto need_kick;
5807 6832
5808 rcu_read_lock(); 6833 rcu_read_lock();
5809 for_each_domain(cpu, sd) { 6834 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5810 struct sched_group *sg = sd->groups;
5811 struct sched_group_power *sgp = sg->sgp;
5812 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5813 6835
5814 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) 6836 if (sd) {
5815 goto need_kick_unlock; 6837 sgp = sd->groups->sgp;
6838 nr_busy = atomic_read(&sgp->nr_busy_cpus);
5816 6839
5817 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight 6840 if (nr_busy > 1)
5818 && (cpumask_first_and(nohz.idle_cpus_mask,
5819 sched_domain_span(sd)) < cpu))
5820 goto need_kick_unlock; 6841 goto need_kick_unlock;
5821
5822 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5823 break;
5824 } 6842 }
6843
6844 sd = rcu_dereference(per_cpu(sd_asym, cpu));
6845
6846 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
6847 sched_domain_span(sd)) < cpu))
6848 goto need_kick_unlock;
6849
5825 rcu_read_unlock(); 6850 rcu_read_unlock();
5826 return 0; 6851 return 0;
5827 6852
@@ -6214,7 +7239,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6214 se->cfs_rq = parent->my_q; 7239 se->cfs_rq = parent->my_q;
6215 7240
6216 se->my_q = cfs_rq; 7241 se->my_q = cfs_rq;
6217 update_load_set(&se->load, 0); 7242 /* guarantee group entities always have weight */
7243 update_load_set(&se->load, NICE_0_LOAD);
6218 se->parent = parent; 7244 se->parent = parent;
6219} 7245}
6220 7246
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 99399f8e4799..5716929a2e3a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
63/* 63/*
64 * Apply the automatic NUMA scheduling policy. Enabled automatically 64 * Apply the automatic NUMA scheduling policy. Enabled automatically
65 * at runtime if running on a NUMA machine. Can be controlled via 65 * at runtime if running on a NUMA machine. Can be controlled via
66 * numa_balancing=. Allow PTE scanning to be forced on UMA machines 66 * numa_balancing=
67 * for debugging the core machinery.
68 */ 67 */
69#ifdef CONFIG_NUMA_BALANCING 68#ifdef CONFIG_NUMA_BALANCING
70SCHED_FEAT(NUMA, false) 69SCHED_FEAT(NUMA, false)
71SCHED_FEAT(NUMA_FORCE, false) 70
71/*
72 * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
73 * higher number of hinting faults are recorded during active load
74 * balancing.
75 */
76SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
77
78/*
79 * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
80 * lower number of hinting faults have been recorded. As this has
81 * the potential to prevent a task ever migrating to a new node
82 * due to CPU overload it is disabled by default.
83 */
84SCHED_FEAT(NUMA_RESIST_LOWER, false)
72#endif 85#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..516c3d9ceea1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 12select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
 14 return task_cpu(p); /* IDLE tasks are never migrated */ 14 return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 01970c8e64df..1c4065575fa2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
246 * if we should look at the mask. It would be a shame 246 * if we should look at the mask. It would be a shame
247 * if we looked at the mask, but the mask was not 247 * if we looked at the mask, but the mask was not
248 * updated yet. 248 * updated yet.
249 *
250 * Matched by the barrier in pull_rt_task().
249 */ 251 */
250 wmb(); 252 smp_wmb();
251 atomic_inc(&rq->rd->rto_count); 253 atomic_inc(&rq->rd->rto_count);
252} 254}
253 255
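The smp_wmb() here pairs with the smp_rmb() added in pull_rt_task() further down in this diff: the overload bit is published before rto_count is bumped, so any CPU that sees the count must also see the bit. A user-space analogue of that pairing, using C11 fences in place of the kernel barriers (variable names mirror the kernel's but the program is only a sketch):

/*
 * Release/acquire fence pairing: the publisher sets the mask bit before
 * bumping the counter, so a reader that observes the counter also observes
 * the bit. Build with -lpthread.
 */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_int rto_mask;	/* stand-in for the overload cpumask bit */
static atomic_int rto_count;

static void *publisher(void *arg)
{
	(void)arg;
	atomic_store_explicit(&rto_mask, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */
	atomic_fetch_add_explicit(&rto_count, 1, memory_order_relaxed);
	return NULL;
}

static void *consumer(void *arg)
{
	(void)arg;
	if (atomic_load_explicit(&rto_count, memory_order_relaxed) == 0)
		return NULL;				/* not overloaded (timing dependent) */
	atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
	/* seeing the count implies seeing the mask bit */
	printf("mask=%d\n", atomic_load_explicit(&rto_mask, memory_order_relaxed));
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, publisher, NULL);
	pthread_create(&b, NULL, consumer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}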
@@ -899,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
899{ 901{
900 struct rq *rq = rq_of_rt_rq(rt_rq); 902 struct rq *rq = rq_of_rt_rq(rt_rq);
901 903
904#ifdef CONFIG_RT_GROUP_SCHED
905 /*
906 * Change rq's cpupri only if rt_rq is the top queue.
907 */
908 if (&rq->rt != rt_rq)
909 return;
910#endif
902 if (rq->online && prio < prev_prio) 911 if (rq->online && prio < prev_prio)
903 cpupri_set(&rq->rd->cpupri, rq->cpu, prio); 912 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
904} 913}
@@ -908,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
908{ 917{
909 struct rq *rq = rq_of_rt_rq(rt_rq); 918 struct rq *rq = rq_of_rt_rq(rt_rq);
910 919
920#ifdef CONFIG_RT_GROUP_SCHED
921 /*
922 * Change rq's cpupri only if rt_rq is the top queue.
923 */
924 if (&rq->rt != rt_rq)
925 return;
926#endif
911 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 927 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
912 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 928 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
913} 929}
@@ -1169,13 +1185,10 @@ static void yield_task_rt(struct rq *rq)
1169static int find_lowest_rq(struct task_struct *task); 1185static int find_lowest_rq(struct task_struct *task);
1170 1186
1171static int 1187static int
1172select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 1188select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1173{ 1189{
1174 struct task_struct *curr; 1190 struct task_struct *curr;
1175 struct rq *rq; 1191 struct rq *rq;
1176 int cpu;
1177
1178 cpu = task_cpu(p);
1179 1192
1180 if (p->nr_cpus_allowed == 1) 1193 if (p->nr_cpus_allowed == 1)
1181 goto out; 1194 goto out;
@@ -1213,8 +1226,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1213 */ 1226 */
1214 if (curr && unlikely(rt_task(curr)) && 1227 if (curr && unlikely(rt_task(curr)) &&
1215 (curr->nr_cpus_allowed < 2 || 1228 (curr->nr_cpus_allowed < 2 ||
1216 curr->prio <= p->prio) && 1229 curr->prio <= p->prio)) {
1217 (p->nr_cpus_allowed > 1)) {
1218 int target = find_lowest_rq(p); 1230 int target = find_lowest_rq(p);
1219 1231
1220 if (target != -1) 1232 if (target != -1)
@@ -1630,6 +1642,12 @@ static int pull_rt_task(struct rq *this_rq)
1630 if (likely(!rt_overloaded(this_rq))) 1642 if (likely(!rt_overloaded(this_rq)))
1631 return 0; 1643 return 0;
1632 1644
1645 /*
1646 * Match the barrier from rt_set_overloaded; this guarantees that if we
1647 * see overloaded we must also see the rto_mask bit.
1648 */
1649 smp_rmb();
1650
1633 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1651 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1634 if (this_cpu == cpu) 1652 if (this_cpu == cpu)
1635 continue; 1653 continue;
@@ -1931,8 +1949,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1931 p->rt.time_slice = sched_rr_timeslice; 1949 p->rt.time_slice = sched_rr_timeslice;
1932 1950
1933 /* 1951 /*
1934 * Requeue to the end of queue if we (and all of our ancestors) are the 1952 * Requeue to the end of queue if we (and all of our ancestors) are not
1935 * only element on the queue 1953 * the only element on the queue
1936 */ 1954 */
1937 for_each_sched_rt_entity(rt_se) { 1955 for_each_sched_rt_entity(rt_se) {
1938 if (rt_se->run_list.prev != rt_se->run_list.next) { 1956 if (rt_se->run_list.prev != rt_se->run_list.next) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3c5653e1dca..88c85b21d633 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h> 8#include <linux/tick.h>
9#include <linux/slab.h>
9 10
10#include "cpupri.h" 11#include "cpupri.h"
11#include "cpuacct.h" 12#include "cpuacct.h"
@@ -408,6 +409,10 @@ struct rq {
408 * remote CPUs use both these fields when doing load calculation. 409 * remote CPUs use both these fields when doing load calculation.
409 */ 410 */
410 unsigned int nr_running; 411 unsigned int nr_running;
412#ifdef CONFIG_NUMA_BALANCING
413 unsigned int nr_numa_running;
414 unsigned int nr_preferred_running;
415#endif
411 #define CPU_LOAD_IDX_MAX 5 416 #define CPU_LOAD_IDX_MAX 5
412 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 417 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
413 unsigned long last_load_update_tick; 418 unsigned long last_load_update_tick;
@@ -476,6 +481,9 @@ struct rq {
476 u64 age_stamp; 481 u64 age_stamp;
477 u64 idle_stamp; 482 u64 idle_stamp;
478 u64 avg_idle; 483 u64 avg_idle;
484
485 /* This is used to determine avg_idle's max value */
486 u64 max_idle_balance_cost;
479#endif 487#endif
480 488
481#ifdef CONFIG_IRQ_TIME_ACCOUNTING 489#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
552 return rq->clock_task; 560 return rq->clock_task;
553} 561}
554 562
563#ifdef CONFIG_NUMA_BALANCING
564extern void sched_setnuma(struct task_struct *p, int node);
565extern int migrate_task_to(struct task_struct *p, int cpu);
566extern int migrate_swap(struct task_struct *, struct task_struct *);
567#endif /* CONFIG_NUMA_BALANCING */
568
555#ifdef CONFIG_SMP 569#ifdef CONFIG_SMP
556 570
557#define rcu_dereference_check_sched_domain(p) \ 571#define rcu_dereference_check_sched_domain(p) \
@@ -593,9 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
593 return hsd; 607 return hsd;
594} 608}
595 609
610static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
611{
612 struct sched_domain *sd;
613
614 for_each_domain(cpu, sd) {
615 if (sd->flags & flag)
616 break;
617 }
618
619 return sd;
620}
621
596DECLARE_PER_CPU(struct sched_domain *, sd_llc); 622DECLARE_PER_CPU(struct sched_domain *, sd_llc);
597DECLARE_PER_CPU(int, sd_llc_size); 623DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 624DECLARE_PER_CPU(int, sd_llc_id);
625DECLARE_PER_CPU(struct sched_domain *, sd_numa);
626DECLARE_PER_CPU(struct sched_domain *, sd_busy);
627DECLARE_PER_CPU(struct sched_domain *, sd_asym);
599 628
600struct sched_group_power { 629struct sched_group_power {
601 atomic_t ref; 630 atomic_t ref;
@@ -605,6 +634,7 @@ struct sched_group_power {
605 */ 634 */
606 unsigned int power, power_orig; 635 unsigned int power, power_orig;
607 unsigned long next_update; 636 unsigned long next_update;
637 int imbalance; /* XXX unrelated to power but shared group state */
608 /* 638 /*
609 * Number of busy cpus in this group. 639 * Number of busy cpus in this group.
610 */ 640 */
@@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
719 */ 749 */
720 smp_wmb(); 750 smp_wmb();
721 task_thread_info(p)->cpu = cpu; 751 task_thread_info(p)->cpu = cpu;
752 p->wake_cpu = cpu;
722#endif 753#endif
723} 754}
724 755
@@ -974,7 +1005,7 @@ struct sched_class {
974 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1005 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
975 1006
976#ifdef CONFIG_SMP 1007#ifdef CONFIG_SMP
977 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1008 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
978 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1009 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
979 1010
980 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1011 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1220 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1251 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1221} 1252}
1222 1253
1254static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1255{
1256 if (l1 > l2)
1257 swap(l1, l2);
1258
1259 spin_lock(l1);
1260 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1261}
1262
1263static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1264{
1265 if (l1 > l2)
1266 swap(l1, l2);
1267
1268 raw_spin_lock(l1);
1269 raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1270}
1271
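double_lock()/double_raw_lock() above avoid ABBA deadlock by always acquiring the lock with the lower address first, regardless of the order the caller passes them in. The same idea as a standalone pthread sketch; the accounts and balances are just illustrative payload:

/*
 * Lock-ordering-by-address sketch: two callers locking the same pair of
 * locks in opposite argument order still acquire them in the same order.
 * Build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

struct account { pthread_mutex_t lock; long balance; };

static void double_account_lock(struct account *a, struct account *b)
{
	if (a > b) {			/* order by address, like double_lock() */
		struct account *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	pthread_mutex_lock(&b->lock);
}

static void double_account_unlock(struct account *a, struct account *b)
{
	pthread_mutex_unlock(&a->lock);
	pthread_mutex_unlock(&b->lock);
}

static void transfer(struct account *from, struct account *to, long amount)
{
	/* callers may pass (x, y) or (y, x); the locking order stays consistent */
	double_account_lock(from, to);
	from->balance -= amount;
	to->balance += amount;
	double_account_unlock(from, to);
}

int main(void)
{
	struct account x = { PTHREAD_MUTEX_INITIALIZER, 100 };
	struct account y = { PTHREAD_MUTEX_INITIALIZER, 100 };

	transfer(&x, &y, 30);
	transfer(&y, &x, 10);
	printf("x=%ld y=%ld\n", x.balance, y.balance);	/* x=80 y=120 */
	return 0;
}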
1223/* 1272/*
1224 * double_rq_lock - safely lock two runqueues 1273 * double_rq_lock - safely lock two runqueues
1225 * 1274 *
@@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1305extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1354extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1306extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1307 1356
1308extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1357extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void);
1309 1359
1310#ifdef CONFIG_NO_HZ_COMMON 1360#ifdef CONFIG_NO_HZ_COMMON
1311enum rq_nohz_flag_bits { 1361enum rq_nohz_flag_bits {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c7edee71bce8..4ab704339656 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The 59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew. 60 * delta taken on each cpu would annul the skew.
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
63{ 63{
64 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 64 unsigned long long now = rq_clock(rq), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
69 sched_info_reset_dequeued(t); 69 sched_info_reset_dequeued(t);
70 t->sched_info.run_delay += delta; 70 t->sched_info.run_delay += delta;
71 71
72 rq_sched_info_dequeued(task_rq(t), delta); 72 rq_sched_info_dequeued(rq, delta);
73} 73}
74 74
75/* 75/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
77 * long it was waiting to run. We also note when it began so that we 77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is. 78 * can keep stats on how long its timeslice is.
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct rq *rq, struct task_struct *t)
81{ 81{
82 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 82 unsigned long long now = rq_clock(rq), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
88 t->sched_info.last_arrival = now; 88 t->sched_info.last_arrival = now;
89 t->sched_info.pcount++; 89 t->sched_info.pcount++;
90 90
91 rq_sched_info_arrive(task_rq(t), delta); 91 rq_sched_info_arrive(rq, delta);
92} 92}
93 93
94/* 94/*
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t)
96 * the timestamp if it is already not set. It's assumed that 96 * the timestamp if it is already not set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate. 97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */ 98 */
99static inline void sched_info_queued(struct task_struct *t) 99static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = rq_clock(task_rq(t)); 103 t->sched_info.last_queued = rq_clock(rq);
104} 104}
105 105
106/* 106/*
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t)
111 * sched_info_queued() to mark that it has now again started waiting on 111 * sched_info_queued() to mark that it has now again started waiting on
112 * the runqueue. 112 * the runqueue.
113 */ 113 */
114static inline void sched_info_depart(struct task_struct *t) 114static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
115{ 115{
116 unsigned long long delta = rq_clock(task_rq(t)) - 116 unsigned long long delta = rq_clock(rq) -
117 t->sched_info.last_arrival; 117 t->sched_info.last_arrival;
118 118
119 rq_sched_info_depart(task_rq(t), delta); 119 rq_sched_info_depart(rq, delta);
120 120
121 if (t->state == TASK_RUNNING) 121 if (t->state == TASK_RUNNING)
122 sched_info_queued(t); 122 sched_info_queued(rq, t);
123} 123}
124 124
125/* 125/*
@@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
128 * the idle task.) We are only called when prev != next. 128 * the idle task.) We are only called when prev != next.
129 */ 129 */
130static inline void 130static inline void
131__sched_info_switch(struct task_struct *prev, struct task_struct *next) 131__sched_info_switch(struct rq *rq,
132 struct task_struct *prev, struct task_struct *next)
132{ 133{
133 struct rq *rq = task_rq(prev);
134
135 /* 134 /*
136 * prev now departs the cpu. It's not interesting to record 135 * prev now departs the cpu. It's not interesting to record
137 * stats about how efficient we were at scheduling the idle 136 * stats about how efficient we were at scheduling the idle
138 * process, however. 137 * process, however.
139 */ 138 */
140 if (prev != rq->idle) 139 if (prev != rq->idle)
141 sched_info_depart(prev); 140 sched_info_depart(rq, prev);
142 141
143 if (next != rq->idle) 142 if (next != rq->idle)
144 sched_info_arrive(next); 143 sched_info_arrive(rq, next);
145} 144}
146static inline void 145static inline void
147sched_info_switch(struct task_struct *prev, struct task_struct *next) 146sched_info_switch(struct rq *rq,
147 struct task_struct *prev, struct task_struct *next)
148{ 148{
149 if (unlikely(sched_info_on())) 149 if (unlikely(sched_info_on()))
150 __sched_info_switch(prev, next); 150 __sched_info_switch(rq, prev, next);
151} 151}
152#else 152#else
153#define sched_info_queued(t) do { } while (0) 153#define sched_info_queued(rq, t) do { } while (0)
154#define sched_info_reset_dequeued(t) do { } while (0) 154#define sched_info_reset_dequeued(t) do { } while (0)
155#define sched_info_dequeued(t) do { } while (0) 155#define sched_info_dequeued(rq, t) do { } while (0)
156#define sched_info_switch(t, next) do { } while (0) 156#define sched_info_depart(rq, t) do { } while (0)
157#define sched_info_arrive(rq, next) do { } while (0)
158#define sched_info_switch(rq, t, next) do { } while (0)
157#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 159#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
158 160
159/* 161/*
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index e08fbeeb54b9..47197de8abd9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
11 11
12#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
13static int 13static int
14select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) 14select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
15{ 15{
16 return task_cpu(p); /* stop tasks as never migrate */ 16 return task_cpu(p); /* stop tasks as never migrate */
17} 17}
diff --git a/kernel/wait.c b/kernel/sched/wait.c
index d550920e040c..7d50f794e248 100644
--- a/kernel/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue);
53 53
54 54
55/* 55/*
56 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
57 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
58 * number) then we wake all the non-exclusive tasks and one exclusive task.
59 *
60 * There are circumstances in which we can try to wake a task which has already
61 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
62 * zero in this (rare) case, and we handle it by continuing to scan the queue.
63 */
64static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
65 int nr_exclusive, int wake_flags, void *key)
66{
67 wait_queue_t *curr, *next;
68
69 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
70 unsigned flags = curr->flags;
71
72 if (curr->func(curr, mode, wake_flags, key) &&
73 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
74 break;
75 }
76}
77
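The comment above spells out the wakeup policy: non-exclusive waiters are always woken, and at most nr_exclusive exclusive waiters are. A simplified standalone sketch of that loop, with an array of toy waiters standing in for the wait_queue list:

/*
 * Sketch of the exclusive/non-exclusive wakeup loop. The waiter callback
 * returns true when the waiter was actually woken, as in the kernel.
 */
#include <stdbool.h>
#include <stdio.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct toy_waiter {
	const char *name;
	unsigned int flags;
	bool (*func)(struct toy_waiter *w);	/* returns true if actually woken */
};

static bool default_wake(struct toy_waiter *w)
{
	printf("waking %s\n", w->name);
	return true;
}

static void wake_up_common(struct toy_waiter *q, int n, int nr_exclusive)
{
	for (int i = 0; i < n; i++) {
		struct toy_waiter *curr = &q[i];

		if (curr->func(curr) &&
		    (curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;	/* woke enough exclusive waiters */
	}
}

int main(void)
{
	struct toy_waiter q[] = {
		{ "reader-1", 0, default_wake },
		{ "writer-1", WQ_FLAG_EXCLUSIVE, default_wake },
		{ "writer-2", WQ_FLAG_EXCLUSIVE, default_wake },	/* stays asleep */
	};

	wake_up_common(q, 3, 1);	/* wake all non-exclusive + one exclusive */
	return 0;
}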
78/**
79 * __wake_up - wake up threads blocked on a waitqueue.
80 * @q: the waitqueue
81 * @mode: which threads
82 * @nr_exclusive: how many wake-one or wake-many threads to wake up
83 * @key: is directly passed to the wakeup function
84 *
85 * It may be assumed that this function implies a write memory barrier before
86 * changing the task state if and only if any tasks are woken up.
87 */
88void __wake_up(wait_queue_head_t *q, unsigned int mode,
89 int nr_exclusive, void *key)
90{
91 unsigned long flags;
92
93 spin_lock_irqsave(&q->lock, flags);
94 __wake_up_common(q, mode, nr_exclusive, 0, key);
95 spin_unlock_irqrestore(&q->lock, flags);
96}
97EXPORT_SYMBOL(__wake_up);
98
99/*
100 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
101 */
102void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
103{
104 __wake_up_common(q, mode, nr, 0, NULL);
105}
106EXPORT_SYMBOL_GPL(__wake_up_locked);
107
108void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
109{
110 __wake_up_common(q, mode, 1, 0, key);
111}
112EXPORT_SYMBOL_GPL(__wake_up_locked_key);
113
114/**
115 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
116 * @q: the waitqueue
117 * @mode: which threads
118 * @nr_exclusive: how many wake-one or wake-many threads to wake up
119 * @key: opaque value to be passed to wakeup targets
120 *
121 * The sync wakeup differs that the waker knows that it will schedule
122 * away soon, so while the target thread will be woken up, it will not
123 * be migrated to another CPU - ie. the two threads are 'synchronized'
124 * with each other. This can prevent needless bouncing between CPUs.
125 *
126 * On UP it can prevent extra preemption.
127 *
128 * It may be assumed that this function implies a write memory barrier before
129 * changing the task state if and only if any tasks are woken up.
130 */
131void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
132 int nr_exclusive, void *key)
133{
134 unsigned long flags;
135 int wake_flags = 1; /* XXX WF_SYNC */
136
137 if (unlikely(!q))
138 return;
139
140 if (unlikely(nr_exclusive != 1))
141 wake_flags = 0;
142
143 spin_lock_irqsave(&q->lock, flags);
144 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
145 spin_unlock_irqrestore(&q->lock, flags);
146}
147EXPORT_SYMBOL_GPL(__wake_up_sync_key);
148
149/*
150 * __wake_up_sync - see __wake_up_sync_key()
151 */
152void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
153{
154 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
155}
156EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
157
158/*
56 * Note: we use "set_current_state()" _after_ the wait-queue add, 159 * Note: we use "set_current_state()" _after_ the wait-queue add,
57 * because we need a memory barrier there on SMP, so that any 160 * because we need a memory barrier there on SMP, so that any
58 * wake-function that tests for the wait-queue being active 161 * wake-function that tests for the wait-queue being active
@@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 195}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 196EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 197
198long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
199{
200 unsigned long flags;
201
202 if (signal_pending_state(state, current))
203 return -ERESTARTSYS;
204
205 wait->private = current;
206 wait->func = autoremove_wake_function;
207
208 spin_lock_irqsave(&q->lock, flags);
209 if (list_empty(&wait->task_list)) {
210 if (wait->flags & WQ_FLAG_EXCLUSIVE)
211 __add_wait_queue_tail(q, wait);
212 else
213 __add_wait_queue(q, wait);
214 }
215 set_current_state(state);
216 spin_unlock_irqrestore(&q->lock, flags);
217
218 return 0;
219}
220EXPORT_SYMBOL(prepare_to_wait_event);
221
95/** 222/**
96 * finish_wait - clean up after waiting in a queue 223 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 224 * @q: waitqueue waited on
diff --git a/kernel/signal.c b/kernel/signal.c
index ded28b91fa53..940b30ee9a30 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2723,7 +2723,7 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
2723 2723
2724#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER 2724#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
2725 2725
2726int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) 2726int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2727{ 2727{
2728 int err; 2728 int err;
2729 2729
diff --git a/kernel/smp.c b/kernel/smp.c
index 0564571dcdf7..bd9f94028838 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -15,9 +15,9 @@
15 15
16#include "smpboot.h" 16#include "smpboot.h"
17 17
18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19enum { 18enum {
20 CSD_FLAG_LOCK = 0x01, 19 CSD_FLAG_LOCK = 0x01,
20 CSD_FLAG_WAIT = 0x02,
21}; 21};
22 22
23struct call_function_data { 23struct call_function_data {
@@ -124,7 +124,7 @@ static void csd_lock(struct call_single_data *csd)
124 124
125static void csd_unlock(struct call_single_data *csd) 125static void csd_unlock(struct call_single_data *csd)
126{ 126{
127 WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); 127 WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK));
128 128
129 /* 129 /*
130 * ensure we're all done before releasing data: 130 * ensure we're all done before releasing data:
@@ -139,13 +139,15 @@ static void csd_unlock(struct call_single_data *csd)
139 * for execution on the given CPU. data must already have 139 * for execution on the given CPU. data must already have
140 * ->func, ->info, and ->flags set. 140 * ->func, ->info, and ->flags set.
141 */ 141 */
142static 142static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
143void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
144{ 143{
145 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 144 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
146 unsigned long flags; 145 unsigned long flags;
147 int ipi; 146 int ipi;
148 147
148 if (wait)
149 csd->flags |= CSD_FLAG_WAIT;
150
149 raw_spin_lock_irqsave(&dst->lock, flags); 151 raw_spin_lock_irqsave(&dst->lock, flags);
150 ipi = list_empty(&dst->list); 152 ipi = list_empty(&dst->list);
151 list_add_tail(&csd->list, &dst->list); 153 list_add_tail(&csd->list, &dst->list);
@@ -340,6 +342,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd,
340 } 342 }
341 put_cpu(); 343 put_cpu();
342} 344}
345EXPORT_SYMBOL_GPL(__smp_call_function_single);
343 346
344/** 347/**
345 * smp_call_function_many(): Run a function on a set of other CPUs. 348 * smp_call_function_many(): Run a function on a set of other CPUs.
@@ -459,7 +462,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
459 return 0; 462 return 0;
460} 463}
461EXPORT_SYMBOL(smp_call_function); 464EXPORT_SYMBOL(smp_call_function);
462#endif /* USE_GENERIC_SMP_HELPERS */
463 465
464/* Setup configured maximum number of CPUs to activate */ 466/* Setup configured maximum number of CPUs to activate */
465unsigned int setup_max_cpus = NR_CPUS; 467unsigned int setup_max_cpus = NR_CPUS;
@@ -524,6 +526,11 @@ void __init setup_nr_cpu_ids(void)
524 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; 526 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
525} 527}
526 528
529void __weak smp_announce(void)
530{
531 printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
532}
533
527/* Called by boot processor to activate the rest. */ 534/* Called by boot processor to activate the rest. */
528void __init smp_init(void) 535void __init smp_init(void)
529{ 536{
@@ -540,7 +547,7 @@ void __init smp_init(void)
540 } 547 }
541 548
542 /* Any cleanup work */ 549 /* Any cleanup work */
543 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); 550 smp_announce();
544 smp_cpus_done(setup_max_cpus); 551 smp_cpus_done(setup_max_cpus);
545} 552}
546 553
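
A hedged, self-contained illustration of the cross-CPU call path that the CSD_FLAG_WAIT bookkeeping above annotates; the function and variable names here are made up for the example.

#include <linux/smp.h>
#include <linux/printk.h>

/* Runs in interrupt context on the target CPU. */
static void demo_remote_fn(void *info)
{
        int *val = info;

        *val += 1;
}

static void demo_call(int target_cpu)
{
        int val = 0;

        /* wait=1: only return once demo_remote_fn() has finished, which
         * is roughly the case the new CSD_FLAG_WAIT bit marks. */
        smp_call_function_single(target_cpu, demo_remote_fn, &val, 1);
        pr_info("val is now %d\n", val);
}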
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d7d498d8cc4f..11025ccc06dd 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -6,8 +6,6 @@
6 * Distribute under GPLv2. 6 * Distribute under GPLv2.
7 * 7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 *
10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 9 */
12 10
13#include <linux/export.h> 11#include <linux/export.h>
@@ -29,7 +27,6 @@
29#define CREATE_TRACE_POINTS 27#define CREATE_TRACE_POINTS
30#include <trace/events/irq.h> 28#include <trace/events/irq.h>
31 29
32#include <asm/irq.h>
33/* 30/*
34 - No shared variables, all the data are CPU local. 31 - No shared variables, all the data are CPU local.
35 - If a softirq needs serialization, let it serialize itself 32 - If a softirq needs serialization, let it serialize itself
@@ -100,13 +97,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
100 97
101 raw_local_irq_save(flags); 98 raw_local_irq_save(flags);
102 /* 99 /*
103 * The preempt tracer hooks into add_preempt_count and will break 100 * The preempt tracer hooks into preempt_count_add and will break
104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 101 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
105 * is set and before current->softirq_enabled is cleared. 102 * is set and before current->softirq_enabled is cleared.
106 * We must manually increment preempt_count here and manually 103 * We must manually increment preempt_count here and manually
107 * call the trace_preempt_off later. 104 * call the trace_preempt_off later.
108 */ 105 */
109 preempt_count() += cnt; 106 __preempt_count_add(cnt);
110 /* 107 /*
111 * Were softirqs turned off above: 108 * Were softirqs turned off above:
112 */ 109 */
@@ -120,7 +117,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
120#else /* !CONFIG_TRACE_IRQFLAGS */ 117#else /* !CONFIG_TRACE_IRQFLAGS */
121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 118static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
122{ 119{
123 add_preempt_count(cnt); 120 preempt_count_add(cnt);
124 barrier(); 121 barrier();
125} 122}
126#endif /* CONFIG_TRACE_IRQFLAGS */ 123#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -134,12 +131,11 @@ EXPORT_SYMBOL(local_bh_disable);
134 131
135static void __local_bh_enable(unsigned int cnt) 132static void __local_bh_enable(unsigned int cnt)
136{ 133{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled()); 134 WARN_ON_ONCE(!irqs_disabled());
139 135
140 if (softirq_count() == cnt) 136 if (softirq_count() == cnt)
141 trace_softirqs_on(_RET_IP_); 137 trace_softirqs_on(_RET_IP_);
142 sub_preempt_count(cnt); 138 preempt_count_sub(cnt);
143} 139}
144 140
145/* 141/*
@@ -149,6 +145,7 @@ static void __local_bh_enable(unsigned int cnt)
149 */ 145 */
150void _local_bh_enable(void) 146void _local_bh_enable(void)
151{ 147{
148 WARN_ON_ONCE(in_irq());
152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); 149 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
153} 150}
154 151
@@ -169,12 +166,17 @@ static inline void _local_bh_enable_ip(unsigned long ip)
169 * Keep preemption disabled until we are done with 166 * Keep preemption disabled until we are done with
170 * softirq processing: 167 * softirq processing:
171 */ 168 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 169 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
173 170
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 171 if (unlikely(!in_interrupt() && local_softirq_pending())) {
172 /*
 172 * Run softirqs if any are pending, and do it on a separate stack,
 174 * as we may already be deep in a task call stack here.
175 */
175 do_softirq(); 176 do_softirq();
177 }
176 178
177 dec_preempt_count(); 179 preempt_count_dec();
178#ifdef CONFIG_TRACE_IRQFLAGS 180#ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 181 local_irq_enable();
180#endif 182#endif
@@ -256,7 +258,7 @@ restart:
256 " exited with %08x?\n", vec_nr, 258 " exited with %08x?\n", vec_nr,
257 softirq_to_name[vec_nr], h->action, 259 softirq_to_name[vec_nr], h->action,
258 prev_count, preempt_count()); 260 prev_count, preempt_count());
259 preempt_count() = prev_count; 261 preempt_count_set(prev_count);
260 } 262 }
261 263
262 rcu_bh_qs(cpu); 264 rcu_bh_qs(cpu);
@@ -280,10 +282,11 @@ restart:
280 282
281 account_irq_exit_time(current); 283 account_irq_exit_time(current);
282 __local_bh_enable(SOFTIRQ_OFFSET); 284 __local_bh_enable(SOFTIRQ_OFFSET);
285 WARN_ON_ONCE(in_interrupt());
283 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 286 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
284} 287}
285 288
286#ifndef __ARCH_HAS_DO_SOFTIRQ 289
287 290
288asmlinkage void do_softirq(void) 291asmlinkage void do_softirq(void)
289{ 292{
@@ -298,13 +301,11 @@ asmlinkage void do_softirq(void)
298 pending = local_softirq_pending(); 301 pending = local_softirq_pending();
299 302
300 if (pending) 303 if (pending)
301 __do_softirq(); 304 do_softirq_own_stack();
302 305
303 local_irq_restore(flags); 306 local_irq_restore(flags);
304} 307}
305 308
306#endif
307
308/* 309/*
309 * Enter an interrupt context. 310 * Enter an interrupt context.
310 */ 311 */
@@ -329,15 +330,21 @@ void irq_enter(void)
329static inline void invoke_softirq(void) 330static inline void invoke_softirq(void)
330{ 331{
331 if (!force_irqthreads) { 332 if (!force_irqthreads) {
333#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
332 /* 334 /*
333 * We can safely execute softirq on the current stack if 335 * We can safely execute softirq on the current stack if
334 * it is the irq stack, because it should be near empty 336 * it is the irq stack, because it should be near empty
335 * at this stage. But we have no way to know if the arch 337 * at this stage.
336 * calls irq_exit() on the irq stack. So call softirq
337 * in its own stack to prevent from any overrun on top
338 * of a potentially deep task stack.
339 */ 338 */
340 do_softirq(); 339 __do_softirq();
340#else
341 /*
 342 * Otherwise, irq_exit() is called on the task stack, which can
 343 * already be quite deep. So run the softirq on its own stack
 344 * to prevent any overrun.
345 */
346 do_softirq_own_stack();
347#endif
341 } else { 348 } else {
342 wakeup_softirqd(); 349 wakeup_softirqd();
343 } 350 }
@@ -369,7 +376,7 @@ void irq_exit(void)
369 376
370 account_irq_exit_time(current); 377 account_irq_exit_time(current);
371 trace_hardirq_exit(); 378 trace_hardirq_exit();
372 sub_preempt_count(HARDIRQ_OFFSET); 379 preempt_count_sub(HARDIRQ_OFFSET);
373 if (!in_interrupt() && local_softirq_pending()) 380 if (!in_interrupt() && local_softirq_pending())
374 invoke_softirq(); 381 invoke_softirq();
375 382
@@ -618,146 +625,17 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
618} 625}
619EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); 626EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
620 627
621/*
622 * Remote softirq bits
623 */
624
625DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
626EXPORT_PER_CPU_SYMBOL(softirq_work_list);
627
628static void __local_trigger(struct call_single_data *cp, int softirq)
629{
630 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
631
632 list_add_tail(&cp->list, head);
633
634 /* Trigger the softirq only if the list was previously empty. */
635 if (head->next == &cp->list)
636 raise_softirq_irqoff(softirq);
637}
638
639#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
640static void remote_softirq_receive(void *data)
641{
642 struct call_single_data *cp = data;
643 unsigned long flags;
644 int softirq;
645
646 softirq = *(int *)cp->info;
647 local_irq_save(flags);
648 __local_trigger(cp, softirq);
649 local_irq_restore(flags);
650}
651
652static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
653{
654 if (cpu_online(cpu)) {
655 cp->func = remote_softirq_receive;
656 cp->info = &softirq;
657 cp->flags = 0;
658
659 __smp_call_function_single(cpu, cp, 0);
660 return 0;
661 }
662 return 1;
663}
664#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
665static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
666{
667 return 1;
668}
669#endif
670
671/**
672 * __send_remote_softirq - try to schedule softirq work on a remote cpu
673 * @cp: private SMP call function data area
674 * @cpu: the remote cpu
675 * @this_cpu: the currently executing cpu
676 * @softirq: the softirq for the work
677 *
678 * Attempt to schedule softirq work on a remote cpu. If this cannot be
679 * done, the work is instead queued up on the local cpu.
680 *
681 * Interrupts must be disabled.
682 */
683void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
684{
685 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
686 __local_trigger(cp, softirq);
687}
688EXPORT_SYMBOL(__send_remote_softirq);
689
690/**
691 * send_remote_softirq - try to schedule softirq work on a remote cpu
692 * @cp: private SMP call function data area
693 * @cpu: the remote cpu
694 * @softirq: the softirq for the work
695 *
696 * Like __send_remote_softirq except that disabling interrupts and
697 * computing the current cpu is done for the caller.
698 */
699void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
700{
701 unsigned long flags;
702 int this_cpu;
703
704 local_irq_save(flags);
705 this_cpu = smp_processor_id();
706 __send_remote_softirq(cp, cpu, this_cpu, softirq);
707 local_irq_restore(flags);
708}
709EXPORT_SYMBOL(send_remote_softirq);
710
711static int remote_softirq_cpu_notify(struct notifier_block *self,
712 unsigned long action, void *hcpu)
713{
714 /*
715 * If a CPU goes away, splice its entries to the current CPU
716 * and trigger a run of the softirq
717 */
718 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
719 int cpu = (unsigned long) hcpu;
720 int i;
721
722 local_irq_disable();
723 for (i = 0; i < NR_SOFTIRQS; i++) {
724 struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
725 struct list_head *local_head;
726
727 if (list_empty(head))
728 continue;
729
730 local_head = &__get_cpu_var(softirq_work_list[i]);
731 list_splice_init(head, local_head);
732 raise_softirq_irqoff(i);
733 }
734 local_irq_enable();
735 }
736
737 return NOTIFY_OK;
738}
739
740static struct notifier_block remote_softirq_cpu_notifier = {
741 .notifier_call = remote_softirq_cpu_notify,
742};
743
744void __init softirq_init(void) 628void __init softirq_init(void)
745{ 629{
746 int cpu; 630 int cpu;
747 631
748 for_each_possible_cpu(cpu) { 632 for_each_possible_cpu(cpu) {
749 int i;
750
751 per_cpu(tasklet_vec, cpu).tail = 633 per_cpu(tasklet_vec, cpu).tail =
752 &per_cpu(tasklet_vec, cpu).head; 634 &per_cpu(tasklet_vec, cpu).head;
753 per_cpu(tasklet_hi_vec, cpu).tail = 635 per_cpu(tasklet_hi_vec, cpu).tail =
754 &per_cpu(tasklet_hi_vec, cpu).head; 636 &per_cpu(tasklet_hi_vec, cpu).head;
755 for (i = 0; i < NR_SOFTIRQS; i++)
756 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
757 } 637 }
758 638
759 register_hotcpu_notifier(&remote_softirq_cpu_notifier);
760
761 open_softirq(TASKLET_SOFTIRQ, tasklet_action); 639 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
762 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 640 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
763} 641}
@@ -771,6 +649,10 @@ static void run_ksoftirqd(unsigned int cpu)
771{ 649{
772 local_irq_disable(); 650 local_irq_disable();
773 if (local_softirq_pending()) { 651 if (local_softirq_pending()) {
652 /*
 653 * We can safely run softirqs inline on the current stack, as we
 654 * are not deep in the task stack here.
655 */
774 __do_softirq(); 656 __do_softirq();
775 rcu_note_context_switch(cpu); 657 rcu_note_context_switch(cpu);
776 local_irq_enable(); 658 local_irq_enable();
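
Since several hunks above rework the preempt-count bookkeeping behind local_bh_disable()/local_bh_enable(), here is a small, hypothetical example of the pattern those helpers exist for; it is illustration only, not code touched by this merge.

#include <linux/interrupt.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_stat);

static void demo_bump_stat(void)
{
        /* Keep softirqs (e.g. a tasklet that also updates demo_stat)
         * off this CPU while we touch the per-CPU counter. */
        local_bh_disable();
        __this_cpu_inc(demo_stat);
        local_bh_enable();
}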
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index c09f2955ae30..84571e09c907 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,6 +20,7 @@
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21#include <linux/smpboot.h> 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23#include <linux/lglock.h>
23 24
24/* 25/*
25 * Structure to determine completion condition and record errors. May 26 * Structure to determine completion condition and record errors. May
@@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); 44static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44static bool stop_machine_initialized = false; 45static bool stop_machine_initialized = false;
45 46
47/*
48 * Avoids a race between stop_two_cpus and global stop_cpus, where
49 * the stoppers could get queued up in reverse order, leading to
50 * system deadlock. Using an lglock means stop_two_cpus remains
51 * relatively cheap.
52 */
53DEFINE_STATIC_LGLOCK(stop_cpus_lock);
54
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 55static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
47{ 56{
48 memset(done, 0, sizeof(*done)); 57 memset(done, 0, sizeof(*done));
@@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
115 return done.executed ? done.ret : -ENOENT; 124 return done.executed ? done.ret : -ENOENT;
116} 125}
117 126
127/* This controls the threads on each CPU. */
128enum multi_stop_state {
129 /* Dummy starting state for thread. */
130 MULTI_STOP_NONE,
131 /* Awaiting everyone to be scheduled. */
132 MULTI_STOP_PREPARE,
133 /* Disable interrupts. */
134 MULTI_STOP_DISABLE_IRQ,
135 /* Run the function */
136 MULTI_STOP_RUN,
137 /* Exit */
138 MULTI_STOP_EXIT,
139};
140
141struct multi_stop_data {
142 int (*fn)(void *);
143 void *data;
144 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
145 unsigned int num_threads;
146 const struct cpumask *active_cpus;
147
148 enum multi_stop_state state;
149 atomic_t thread_ack;
150};
151
152static void set_state(struct multi_stop_data *msdata,
153 enum multi_stop_state newstate)
154{
155 /* Reset ack counter. */
156 atomic_set(&msdata->thread_ack, msdata->num_threads);
157 smp_wmb();
158 msdata->state = newstate;
159}
160
161/* Last one to ack a state moves to the next state. */
162static void ack_state(struct multi_stop_data *msdata)
163{
164 if (atomic_dec_and_test(&msdata->thread_ack))
165 set_state(msdata, msdata->state + 1);
166}
167
168/* This is the cpu_stop function which stops the CPU. */
169static int multi_cpu_stop(void *data)
170{
171 struct multi_stop_data *msdata = data;
172 enum multi_stop_state curstate = MULTI_STOP_NONE;
173 int cpu = smp_processor_id(), err = 0;
174 unsigned long flags;
175 bool is_active;
176
177 /*
178 * When called from stop_machine_from_inactive_cpu(), irq might
179 * already be disabled. Save the state and restore it on exit.
180 */
181 local_save_flags(flags);
182
183 if (!msdata->active_cpus)
184 is_active = cpu == cpumask_first(cpu_online_mask);
185 else
186 is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
187
188 /* Simple state machine */
189 do {
190 /* Chill out and ensure we re-read multi_stop_state. */
191 cpu_relax();
192 if (msdata->state != curstate) {
193 curstate = msdata->state;
194 switch (curstate) {
195 case MULTI_STOP_DISABLE_IRQ:
196 local_irq_disable();
197 hard_irq_disable();
198 break;
199 case MULTI_STOP_RUN:
200 if (is_active)
201 err = msdata->fn(msdata->data);
202 break;
203 default:
204 break;
205 }
206 ack_state(msdata);
207 }
208 } while (curstate != MULTI_STOP_EXIT);
209
210 local_irq_restore(flags);
211 return err;
212}
213
214struct irq_cpu_stop_queue_work_info {
215 int cpu1;
216 int cpu2;
217 struct cpu_stop_work *work1;
218 struct cpu_stop_work *work2;
219};
220
221/*
222 * This function is always run with irqs and preemption disabled.
223 * This guarantees that both work1 and work2 get queued, before
224 * our local migrate thread gets the chance to preempt us.
225 */
226static void irq_cpu_stop_queue_work(void *arg)
227{
228 struct irq_cpu_stop_queue_work_info *info = arg;
229 cpu_stop_queue_work(info->cpu1, info->work1);
230 cpu_stop_queue_work(info->cpu2, info->work2);
231}
232
233/**
234 * stop_two_cpus - stops two cpus
235 * @cpu1: the cpu to stop
236 * @cpu2: the other cpu to stop
237 * @fn: function to execute
238 * @arg: argument to @fn
239 *
240 * Stops both the current and specified CPU and runs @fn on one of them.
241 *
242 * returns when both are completed.
243 */
244int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
245{
246 struct cpu_stop_done done;
247 struct cpu_stop_work work1, work2;
248 struct irq_cpu_stop_queue_work_info call_args;
249 struct multi_stop_data msdata;
250
251 preempt_disable();
252 msdata = (struct multi_stop_data){
253 .fn = fn,
254 .data = arg,
255 .num_threads = 2,
256 .active_cpus = cpumask_of(cpu1),
257 };
258
259 work1 = work2 = (struct cpu_stop_work){
260 .fn = multi_cpu_stop,
261 .arg = &msdata,
262 .done = &done
263 };
264
265 call_args = (struct irq_cpu_stop_queue_work_info){
266 .cpu1 = cpu1,
267 .cpu2 = cpu2,
268 .work1 = &work1,
269 .work2 = &work2,
270 };
271
272 cpu_stop_init_done(&done, 2);
273 set_state(&msdata, MULTI_STOP_PREPARE);
274
275 /*
276 * If we observe both CPUs active we know _cpu_down() cannot yet have
277 * queued its stop_machine works and therefore ours will get executed
 278 * first. Or it's not either one of our CPUs that's getting unplugged,
279 * in which case we don't care.
280 *
281 * This relies on the stopper workqueues to be FIFO.
282 */
283 if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
284 preempt_enable();
285 return -ENOENT;
286 }
287
288 lg_local_lock(&stop_cpus_lock);
289 /*
290 * Queuing needs to be done by the lowest numbered CPU, to ensure
291 * that works are always queued in the same order on every CPU.
292 * This prevents deadlocks.
293 */
294 smp_call_function_single(min(cpu1, cpu2),
295 &irq_cpu_stop_queue_work,
296 &call_args, 0);
297 lg_local_unlock(&stop_cpus_lock);
298 preempt_enable();
299
300 wait_for_completion(&done.completion);
301
302 return done.executed ? done.ret : -ENOENT;
303}
304
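
The multi_cpu_stop() state machine above backs both stop_two_cpus() and the existing stop_machine() path. As a hedged illustration (names invented), a typical stop_machine() caller looks like the sketch below: the callback runs in the MULTI_STOP_RUN phase on one CPU while every other online CPU spins with interrupts disabled.

#include <linux/stop_machine.h>

static int demo_patch_step(void *data)
{
        unsigned long *value = data;

        /* All other online CPUs are parked in multi_cpu_stop() with
         * IRQs off, so this update cannot race with them. */
        *value ^= 1UL;
        return 0;
}

static int demo_apply(void)
{
        unsigned long value = 0;

        /* cpus == NULL: the callback runs on the first online CPU. */
        return stop_machine(demo_patch_step, &value, NULL);
}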
118/** 305/**
119 * stop_one_cpu_nowait - stop a cpu but don't wait for completion 306 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
120 * @cpu: cpu to stop 307 * @cpu: cpu to stop
@@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
159 * preempted by a stopper which might wait for other stoppers 346 * preempted by a stopper which might wait for other stoppers
160 * to enter @fn which can lead to deadlock. 347 * to enter @fn which can lead to deadlock.
161 */ 348 */
162 preempt_disable(); 349 lg_global_lock(&stop_cpus_lock);
163 for_each_cpu(cpu, cpumask) 350 for_each_cpu(cpu, cpumask)
164 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); 351 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
165 preempt_enable(); 352 lg_global_unlock(&stop_cpus_lock);
166} 353}
167 354
168static int __stop_cpus(const struct cpumask *cpumask, 355static int __stop_cpus(const struct cpumask *cpumask,
@@ -359,98 +546,14 @@ early_initcall(cpu_stop_init);
359 546
360#ifdef CONFIG_STOP_MACHINE 547#ifdef CONFIG_STOP_MACHINE
361 548
362/* This controls the threads on each CPU. */
363enum stopmachine_state {
364 /* Dummy starting state for thread. */
365 STOPMACHINE_NONE,
366 /* Awaiting everyone to be scheduled. */
367 STOPMACHINE_PREPARE,
368 /* Disable interrupts. */
369 STOPMACHINE_DISABLE_IRQ,
370 /* Run the function */
371 STOPMACHINE_RUN,
372 /* Exit */
373 STOPMACHINE_EXIT,
374};
375
376struct stop_machine_data {
377 int (*fn)(void *);
378 void *data;
379 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
380 unsigned int num_threads;
381 const struct cpumask *active_cpus;
382
383 enum stopmachine_state state;
384 atomic_t thread_ack;
385};
386
387static void set_state(struct stop_machine_data *smdata,
388 enum stopmachine_state newstate)
389{
390 /* Reset ack counter. */
391 atomic_set(&smdata->thread_ack, smdata->num_threads);
392 smp_wmb();
393 smdata->state = newstate;
394}
395
396/* Last one to ack a state moves to the next state. */
397static void ack_state(struct stop_machine_data *smdata)
398{
399 if (atomic_dec_and_test(&smdata->thread_ack))
400 set_state(smdata, smdata->state + 1);
401}
402
403/* This is the cpu_stop function which stops the CPU. */
404static int stop_machine_cpu_stop(void *data)
405{
406 struct stop_machine_data *smdata = data;
407 enum stopmachine_state curstate = STOPMACHINE_NONE;
408 int cpu = smp_processor_id(), err = 0;
409 unsigned long flags;
410 bool is_active;
411
412 /*
413 * When called from stop_machine_from_inactive_cpu(), irq might
414 * already be disabled. Save the state and restore it on exit.
415 */
416 local_save_flags(flags);
417
418 if (!smdata->active_cpus)
419 is_active = cpu == cpumask_first(cpu_online_mask);
420 else
421 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
422
423 /* Simple state machine */
424 do {
425 /* Chill out and ensure we re-read stopmachine_state. */
426 cpu_relax();
427 if (smdata->state != curstate) {
428 curstate = smdata->state;
429 switch (curstate) {
430 case STOPMACHINE_DISABLE_IRQ:
431 local_irq_disable();
432 hard_irq_disable();
433 break;
434 case STOPMACHINE_RUN:
435 if (is_active)
436 err = smdata->fn(smdata->data);
437 break;
438 default:
439 break;
440 }
441 ack_state(smdata);
442 }
443 } while (curstate != STOPMACHINE_EXIT);
444
445 local_irq_restore(flags);
446 return err;
447}
448
449int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 549int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
450{ 550{
451 struct stop_machine_data smdata = { .fn = fn, .data = data, 551 struct multi_stop_data msdata = {
452 .num_threads = num_online_cpus(), 552 .fn = fn,
453 .active_cpus = cpus }; 553 .data = data,
554 .num_threads = num_online_cpus(),
555 .active_cpus = cpus,
556 };
454 557
455 if (!stop_machine_initialized) { 558 if (!stop_machine_initialized) {
456 /* 559 /*
@@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
461 unsigned long flags; 564 unsigned long flags;
462 int ret; 565 int ret;
463 566
464 WARN_ON_ONCE(smdata.num_threads != 1); 567 WARN_ON_ONCE(msdata.num_threads != 1);
465 568
466 local_irq_save(flags); 569 local_irq_save(flags);
467 hard_irq_disable(); 570 hard_irq_disable();
@@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
472 } 575 }
473 576
474 /* Set the initial state and stop all online cpus. */ 577 /* Set the initial state and stop all online cpus. */
475 set_state(&smdata, STOPMACHINE_PREPARE); 578 set_state(&msdata, MULTI_STOP_PREPARE);
476 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 579 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
477} 580}
478 581
479int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 582int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
513int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, 616int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
514 const struct cpumask *cpus) 617 const struct cpumask *cpus)
515{ 618{
516 struct stop_machine_data smdata = { .fn = fn, .data = data, 619 struct multi_stop_data msdata = { .fn = fn, .data = data,
517 .active_cpus = cpus }; 620 .active_cpus = cpus };
518 struct cpu_stop_done done; 621 struct cpu_stop_done done;
519 int ret; 622 int ret;
520 623
521 /* Local CPU must be inactive and CPU hotplug in progress. */ 624 /* Local CPU must be inactive and CPU hotplug in progress. */
522 BUG_ON(cpu_active(raw_smp_processor_id())); 625 BUG_ON(cpu_active(raw_smp_processor_id()));
523 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ 626 msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
524 627
525 /* No proper task established and can't sleep - busy wait for lock. */ 628 /* No proper task established and can't sleep - busy wait for lock. */
526 while (!mutex_trylock(&stop_cpus_mutex)) 629 while (!mutex_trylock(&stop_cpus_mutex))
527 cpu_relax(); 630 cpu_relax();
528 631
529 /* Schedule work on other CPUs and execute directly for local CPU */ 632 /* Schedule work on other CPUs and execute directly for local CPU */
530 set_state(&smdata, STOPMACHINE_PREPARE); 633 set_state(&msdata, MULTI_STOP_PREPARE);
531 cpu_stop_init_done(&done, num_active_cpus()); 634 cpu_stop_init_done(&done, num_active_cpus());
532 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, 635 queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
533 &done); 636 &done);
534 ret = stop_machine_cpu_stop(&smdata); 637 ret = multi_cpu_stop(&msdata);
535 638
536 /* Busy wait for completion. */ 639 /* Busy wait for completion. */
537 while (!completion_done(&done.completion)) 640 while (!completion_done(&done.completion))
diff --git a/kernel/sys.c b/kernel/sys.c
index c18ecca575b4..c72311324ea7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,7 +16,6 @@
16#include <linux/perf_event.h> 16#include <linux/perf_event.h>
17#include <linux/resource.h> 17#include <linux/resource.h>
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/kexec.h>
20#include <linux/workqueue.h> 19#include <linux/workqueue.h>
21#include <linux/capability.h> 20#include <linux/capability.h>
22#include <linux/device.h> 21#include <linux/device.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2f06f3c6a3f..34a604726d0b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
190 190
191#ifdef CONFIG_MAGIC_SYSRQ 191#ifdef CONFIG_MAGIC_SYSRQ
 192/* Note: sysrq code uses its own private copy */ 192/* Note: sysrq code uses its own private copy */
193static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 193static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
194 194
195static int sysrq_sysctl_handler(ctl_table *table, int write, 195static int sysrq_sysctl_handler(ctl_table *table, int write,
196 void __user *buffer, size_t *lenp, 196 void __user *buffer, size_t *lenp,
@@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = {
371 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
372 }, 372 },
373 { 373 {
374 .procname = "numa_balancing_scan_period_reset",
375 .data = &sysctl_numa_balancing_scan_period_reset,
376 .maxlen = sizeof(unsigned int),
377 .mode = 0644,
378 .proc_handler = proc_dointvec,
379 },
380 {
381 .procname = "numa_balancing_scan_period_max_ms", 374 .procname = "numa_balancing_scan_period_max_ms",
382 .data = &sysctl_numa_balancing_scan_period_max, 375 .data = &sysctl_numa_balancing_scan_period_max,
383 .maxlen = sizeof(unsigned int), 376 .maxlen = sizeof(unsigned int),
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {
391 .mode = 0644, 384 .mode = 0644,
392 .proc_handler = proc_dointvec, 385 .proc_handler = proc_dointvec,
393 }, 386 },
387 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int),
398 .mode = 0644,
399 .proc_handler = proc_dointvec,
400 },
394#endif /* CONFIG_NUMA_BALANCING */ 401#endif /* CONFIG_NUMA_BALANCING */
395#endif /* CONFIG_SCHED_DEBUG */ 402#endif /* CONFIG_SCHED_DEBUG */
396 { 403 {
@@ -962,9 +969,10 @@ static struct ctl_table kern_table[] = {
962 { 969 {
963 .procname = "hung_task_check_count", 970 .procname = "hung_task_check_count",
964 .data = &sysctl_hung_task_check_count, 971 .data = &sysctl_hung_task_check_count,
965 .maxlen = sizeof(unsigned long), 972 .maxlen = sizeof(int),
966 .mode = 0644, 973 .mode = 0644,
967 .proc_handler = proc_doulongvec_minmax, 974 .proc_handler = proc_dointvec_minmax,
975 .extra1 = &zero,
968 }, 976 },
969 { 977 {
970 .procname = "hung_task_timeout_secs", 978 .procname = "hung_task_timeout_secs",
@@ -1049,6 +1057,7 @@ static struct ctl_table kern_table[] = {
1049 .maxlen = sizeof(sysctl_perf_event_sample_rate), 1057 .maxlen = sizeof(sysctl_perf_event_sample_rate),
1050 .mode = 0644, 1058 .mode = 0644,
1051 .proc_handler = perf_proc_update_handler, 1059 .proc_handler = perf_proc_update_handler,
1060 .extra1 = &one,
1052 }, 1061 },
1053 { 1062 {
1054 .procname = "perf_cpu_time_max_percent", 1063 .procname = "perf_cpu_time_max_percent",
@@ -2214,8 +2223,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2214 *i = val; 2223 *i = val;
2215 } else { 2224 } else {
2216 val = convdiv * (*i) / convmul; 2225 val = convdiv * (*i) / convmul;
2217 if (!first) 2226 if (!first) {
2218 err = proc_put_char(&buffer, &left, '\t'); 2227 err = proc_put_char(&buffer, &left, '\t');
2228 if (err)
2229 break;
2230 }
2219 err = proc_put_long(&buffer, &left, val, false); 2231 err = proc_put_long(&buffer, &left, val, false);
2220 if (err) 2232 if (err)
2221 break; 2233 break;
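
The hung_task_check_count conversion above (an unsigned long handler replaced by proc_dointvec_minmax with a zero floor) follows the usual pattern for a clamped integer sysctl. A minimal, hypothetical version of that pattern, using interfaces available in this kernel, is sketched below.

#include <linux/sysctl.h>

static int demo_count;
static int demo_zero;                   /* lower bound for .extra1 */

static struct ctl_table demo_table[] = {
        {
                .procname       = "demo_count",
                .data           = &demo_count,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &demo_zero,   /* reject negative writes */
        },
        { }
};

static struct ctl_table demo_dir[] = {
        {
                .procname       = "demo",
                .mode           = 0555,
                .child          = demo_table,
        },
        { }
};

static struct ctl_table_header *demo_header;

static int __init demo_sysctl_init(void)
{
        demo_header = register_sysctl_table(demo_dir);
        return demo_header ? 0 : -ENOMEM;
}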
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b609213ca9a2..653cbbd9e7ad 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1024,7 +1024,7 @@ static ssize_t bin_intvec(struct file *file,
1024 if (get_user(value, vec + i)) 1024 if (get_user(value, vec + i))
1025 goto out_kfree; 1025 goto out_kfree;
1026 1026
1027 str += snprintf(str, end - str, "%lu\t", value); 1027 str += scnprintf(str, end - str, "%lu\t", value);
1028 } 1028 }
1029 1029
1030 result = kernel_write(file, buffer, str - buffer, 0); 1030 result = kernel_write(file, buffer, str - buffer, 0);
@@ -1095,7 +1095,7 @@ static ssize_t bin_ulongvec(struct file *file,
1095 if (get_user(value, vec + i)) 1095 if (get_user(value, vec + i))
1096 goto out_kfree; 1096 goto out_kfree;
1097 1097
1098 str += snprintf(str, end - str, "%lu\t", value); 1098 str += scnprintf(str, end - str, "%lu\t", value);
1099 } 1099 }
1100 1100
1101 result = kernel_write(file, buffer, str - buffer, 0); 1101 result = kernel_write(file, buffer, str - buffer, 0);
@@ -1205,7 +1205,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1205 if (get_user(dnaddr, (__le16 __user *)newval)) 1205 if (get_user(dnaddr, (__le16 __user *)newval))
1206 goto out; 1206 goto out;
1207 1207
1208 len = snprintf(buf, sizeof(buf), "%hu.%hu", 1208 len = scnprintf(buf, sizeof(buf), "%hu.%hu",
1209 le16_to_cpu(dnaddr) >> 10, 1209 le16_to_cpu(dnaddr) >> 10,
1210 le16_to_cpu(dnaddr) & 0x3ff); 1210 le16_to_cpu(dnaddr) & 0x3ff);
1211 1211
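
The snprintf() to scnprintf() switches above matter because the two return different things: snprintf() reports the length the output would have had, so using its return value to advance a buffer cursor can walk past the end on truncation, while scnprintf() returns the number of characters actually stored. A hedged, stand-alone sketch of the safe pattern (invented names):

#include <linux/kernel.h>

static size_t demo_format_vec(char *buf, size_t size,
                              const unsigned long *vec, int n)
{
        char *str = buf;
        char *end = buf + size;
        int i;

        for (i = 0; i < n && str < end; i++) {
                /* scnprintf() never reports more than it wrote, so
                 * 'str' can never step past 'end'. */
                str += scnprintf(str, end - str, "%lu\t", vec[i]);
        }
        return str - buf;
}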
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
new file mode 100644
index 000000000000..3e9868d47535
--- /dev/null
+++ b/kernel/system_certificates.S
@@ -0,0 +1,20 @@
1#include <linux/export.h>
2#include <linux/init.h>
3
4 __INITRODATA
5
6 .align 8
7 .globl VMLINUX_SYMBOL(system_certificate_list)
8VMLINUX_SYMBOL(system_certificate_list):
9__cert_list_start:
10 .incbin "kernel/x509_certificate_list"
11__cert_list_end:
12
13 .align 8
14 .globl VMLINUX_SYMBOL(system_certificate_list_size)
15VMLINUX_SYMBOL(system_certificate_list_size):
16#ifdef CONFIG_64BIT
17 .quad __cert_list_end - __cert_list_start
18#else
19 .long __cert_list_end - __cert_list_start
20#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
new file mode 100644
index 000000000000..52ebc70263f4
--- /dev/null
+++ b/kernel/system_keyring.c
@@ -0,0 +1,105 @@
1/* System trusted keyring for trusted public keys
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/export.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/cred.h>
16#include <linux/err.h>
17#include <keys/asymmetric-type.h>
18#include <keys/system_keyring.h>
19#include "module-internal.h"
20
21struct key *system_trusted_keyring;
22EXPORT_SYMBOL_GPL(system_trusted_keyring);
23
24extern __initconst const u8 system_certificate_list[];
25extern __initconst const unsigned long system_certificate_list_size;
26
27/*
28 * Load the compiled-in keys
29 */
30static __init int system_trusted_keyring_init(void)
31{
32 pr_notice("Initialise system trusted keyring\n");
33
34 system_trusted_keyring =
35 keyring_alloc(".system_keyring",
36 KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
37 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
38 KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
39 KEY_ALLOC_NOT_IN_QUOTA, NULL);
40 if (IS_ERR(system_trusted_keyring))
41 panic("Can't allocate system trusted keyring\n");
42
43 set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
44 return 0;
45}
46
47/*
 48 * Must be initialised before we try to load the keys into the keyring.
49 */
50device_initcall(system_trusted_keyring_init);
51
52/*
53 * Load the compiled-in list of X.509 certificates.
54 */
55static __init int load_system_certificate_list(void)
56{
57 key_ref_t key;
58 const u8 *p, *end;
59 size_t plen;
60
61 pr_notice("Loading compiled-in X.509 certificates\n");
62
63 p = system_certificate_list;
64 end = p + system_certificate_list_size;
65 while (p < end) {
66 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
67 * than 256 bytes in size.
68 */
69 if (end - p < 4)
70 goto dodgy_cert;
71 if (p[0] != 0x30 &&
72 p[1] != 0x82)
73 goto dodgy_cert;
74 plen = (p[2] << 8) | p[3];
75 plen += 4;
76 if (plen > end - p)
77 goto dodgy_cert;
78
79 key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
80 "asymmetric",
81 NULL,
82 p,
83 plen,
84 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
85 KEY_USR_VIEW | KEY_USR_READ),
86 KEY_ALLOC_NOT_IN_QUOTA |
87 KEY_ALLOC_TRUSTED);
88 if (IS_ERR(key)) {
89 pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
90 PTR_ERR(key));
91 } else {
92 pr_notice("Loaded X.509 cert '%s'\n",
93 key_ref_to_ptr(key)->description);
94 key_ref_put(key);
95 }
96 p += plen;
97 }
98
99 return 0;
100
101dodgy_cert:
102 pr_err("Problem parsing in-kernel X.509 certificate list\n");
103 return 0;
104}
105late_initcall(load_system_certificate_list);
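
To make the length arithmetic in load_system_certificate_list() concrete, here is a stand-alone userspace check with a made-up certificate size: a DER SEQUENCE carrying 0x04a6 payload bytes starts with the header bytes 30 82 04 a6, and plen ends up covering the whole blob (payload plus the four header bytes) that is handed to key_create_or_update().

#include <stdio.h>

int main(void)
{
        /* Hypothetical first four bytes of a DER certificate. */
        const unsigned char p[4] = { 0x30, 0x82, 0x04, 0xa6 };
        unsigned int plen = (p[2] << 8) | p[3]; /* 0x04a6 = 1190 payload bytes */

        plen += 4;                              /* tag + length bytes */
        printf("blob handed to the keyring: %u bytes\n", plen); /* 1194 */
        return 0;
}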
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 145bb4d3bd4d..13d2f7cd65db 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,6 +290,7 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
290 struct listener_list *listeners; 290 struct listener_list *listeners;
291 struct listener *s, *tmp, *s2; 291 struct listener *s, *tmp, *s2;
292 unsigned int cpu; 292 unsigned int cpu;
293 int ret = 0;
293 294
294 if (!cpumask_subset(mask, cpu_possible_mask)) 295 if (!cpumask_subset(mask, cpu_possible_mask))
295 return -EINVAL; 296 return -EINVAL;
@@ -304,9 +305,10 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
304 for_each_cpu(cpu, mask) { 305 for_each_cpu(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), 306 s = kmalloc_node(sizeof(struct listener),
306 GFP_KERNEL, cpu_to_node(cpu)); 307 GFP_KERNEL, cpu_to_node(cpu));
307 if (!s) 308 if (!s) {
309 ret = -ENOMEM;
308 goto cleanup; 310 goto cleanup;
309 311 }
310 s->pid = pid; 312 s->pid = pid;
311 s->valid = 1; 313 s->valid = 1;
312 314
@@ -339,7 +341,7 @@ cleanup:
339 } 341 }
340 up_write(&listeners->sem); 342 up_write(&listeners->sem);
341 } 343 }
342 return 0; 344 return ret;
343} 345}
344 346
345static int parse(struct nlattr *na, struct cpumask *mask) 347static int parse(struct nlattr *na, struct cpumask *mask)
@@ -404,11 +406,15 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
404 if (!na) 406 if (!na)
405 goto err; 407 goto err;
406 408
407 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 409 if (nla_put(skb, type, sizeof(pid), &pid) < 0) {
410 nla_nest_cancel(skb, na);
408 goto err; 411 goto err;
412 }
409 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 413 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
410 if (!ret) 414 if (!ret) {
415 nla_nest_cancel(skb, na);
411 goto err; 416 goto err;
417 }
412 nla_nest_end(skb, na); 418 nla_nest_end(skb, na);
413 419
414 return nla_data(ret); 420 return nla_data(ret);
@@ -667,17 +673,18 @@ err:
667 nlmsg_free(rep_skb); 673 nlmsg_free(rep_skb);
668} 674}
669 675
670static struct genl_ops taskstats_ops = { 676static const struct genl_ops taskstats_ops[] = {
671 .cmd = TASKSTATS_CMD_GET, 677 {
672 .doit = taskstats_user_cmd, 678 .cmd = TASKSTATS_CMD_GET,
673 .policy = taskstats_cmd_get_policy, 679 .doit = taskstats_user_cmd,
674 .flags = GENL_ADMIN_PERM, 680 .policy = taskstats_cmd_get_policy,
675}; 681 .flags = GENL_ADMIN_PERM,
676 682 },
677static struct genl_ops cgroupstats_ops = { 683 {
678 .cmd = CGROUPSTATS_CMD_GET, 684 .cmd = CGROUPSTATS_CMD_GET,
679 .doit = cgroupstats_user_cmd, 685 .doit = cgroupstats_user_cmd,
680 .policy = cgroupstats_cmd_get_policy, 686 .policy = cgroupstats_cmd_get_policy,
687 },
681}; 688};
682 689
683/* Needed early in initialization */ 690/* Needed early in initialization */
@@ -696,26 +703,13 @@ static int __init taskstats_init(void)
696{ 703{
697 int rc; 704 int rc;
698 705
699 rc = genl_register_family(&family); 706 rc = genl_register_family_with_ops(&family, taskstats_ops);
700 if (rc) 707 if (rc)
701 return rc; 708 return rc;
702 709
703 rc = genl_register_ops(&family, &taskstats_ops);
704 if (rc < 0)
705 goto err;
706
707 rc = genl_register_ops(&family, &cgroupstats_ops);
708 if (rc < 0)
709 goto err_cgroup_ops;
710
711 family_registered = 1; 710 family_registered = 1;
712 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 711 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
713 return 0; 712 return 0;
714err_cgroup_ops:
715 genl_unregister_ops(&family, &taskstats_ops);
716err:
717 genl_unregister_family(&family);
718 return rc;
719} 713}
720 714
721/* 715/*
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 2b62fe86f9ec..3ce6e8c5f3fc 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -100,7 +100,7 @@ config NO_HZ_FULL
100 # RCU_USER_QS dependency 100 # RCU_USER_QS dependency
101 depends on HAVE_CONTEXT_TRACKING 101 depends on HAVE_CONTEXT_TRACKING
102 # VIRT_CPU_ACCOUNTING_GEN dependency 102 # VIRT_CPU_ACCOUNTING_GEN dependency
103 depends on 64BIT 103 depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
104 select NO_HZ_COMMON 104 select NO_HZ_COMMON
105 select RCU_USER_QS 105 select RCU_USER_QS
106 select RCU_NOCB_CPU 106 select RCU_NOCB_CPU
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index eec50fcef9e4..88c9c65a430d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; 490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
491 491
492 if (!alarmtimer_get_rtcdev()) 492 if (!alarmtimer_get_rtcdev())
493 return -ENOTSUPP; 493 return -EINVAL;
494 494
495 return hrtimer_get_res(baseid, tp); 495 return hrtimer_get_res(baseid, tp);
496} 496}
@@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; 507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
508 508
509 if (!alarmtimer_get_rtcdev()) 509 if (!alarmtimer_get_rtcdev())
510 return -ENOTSUPP; 510 return -EINVAL;
511 511
512 *tp = ktime_to_timespec(base->gettime()); 512 *tp = ktime_to_timespec(base->gettime());
513 return 0; 513 return 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 662c5798a685..086ad6043bcb 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -619,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
619 const char *buf, size_t count) 619 const char *buf, size_t count)
620{ 620{
621 char name[CS_NAME_LEN]; 621 char name[CS_NAME_LEN];
622 size_t ret = sysfs_get_uname(buf, name, count); 622 ssize_t ret = sysfs_get_uname(buf, name, count);
623 struct clock_event_device *ce; 623 struct clock_event_device *ce;
624 624
625 if (ret < 0) 625 if (ret < 0)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 50a8736757f3..ba3e502c955a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
479static inline void clocksource_resume_watchdog(void) { } 479static inline void clocksource_resume_watchdog(void) { }
480static inline int __clocksource_watchdog_kthread(void) { return 0; } 480static inline int __clocksource_watchdog_kthread(void) { return 0; }
481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } 481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
482void clocksource_mark_unstable(struct clocksource *cs) { }
482 483
483#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 484#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
484 485
@@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
537} 538}
538 539
539/** 540/**
540 * clocksource_max_deferment - Returns max time the clocksource can be deferred 541 * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
541 * @cs: Pointer to clocksource 542 * @mult: cycle to nanosecond multiplier
542 * 543 * @shift: cycle to nanosecond divisor (power of two)
544 * @maxadj: maximum adjustment value to mult (~11%)
545 * @mask: bitmask for two's complement subtraction of non 64 bit counters
543 */ 546 */
544static u64 clocksource_max_deferment(struct clocksource *cs) 547u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
545{ 548{
546 u64 max_nsecs, max_cycles; 549 u64 max_nsecs, max_cycles;
547 550
548 /* 551 /*
549 * Calculate the maximum number of cycles that we can pass to the 552 * Calculate the maximum number of cycles that we can pass to the
550 * cyc2ns function without overflowing a 64-bit signed result. The 553 * cyc2ns function without overflowing a 64-bit signed result. The
551 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) 554 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
552 * which is equivalent to the below. 555 * which is equivalent to the below.
553 * max_cycles < (2^63)/(cs->mult + cs->maxadj) 556 * max_cycles < (2^63)/(mult + maxadj)
554 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) 557 * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
555 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) 558 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
556 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) 559 * max_cycles < 2^(63 - log2(mult + maxadj))
557 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) 560 * max_cycles < 1 << (63 - log2(mult + maxadj))
558 * Please note that we add 1 to the result of the log2 to account for 561 * Please note that we add 1 to the result of the log2 to account for
559 * any rounding errors, ensure the above inequality is satisfied and 562 * any rounding errors, ensure the above inequality is satisfied and
560 * no overflow will occur. 563 * no overflow will occur.
561 */ 564 */
562 max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); 565 max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
563 566
564 /* 567 /*
565 * The actual maximum number of cycles we can defer the clocksource is 568 * The actual maximum number of cycles we can defer the clocksource is
566 * determined by the minimum of max_cycles and cs->mask. 569 * determined by the minimum of max_cycles and mask.
567 * Note: Here we subtract the maxadj to make sure we don't sleep for 570 * Note: Here we subtract the maxadj to make sure we don't sleep for
568 * too long if there's a large negative adjustment. 571 * too long if there's a large negative adjustment.
569 */ 572 */
570 max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 573 max_cycles = min(max_cycles, mask);
571 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, 574 max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
572 cs->shift); 575
576 return max_nsecs;
577}
578
579/**
580 * clocksource_max_deferment - Returns max time the clocksource can be deferred
581 * @cs: Pointer to clocksource
582 *
583 */
584static u64 clocksource_max_deferment(struct clocksource *cs)
585{
586 u64 max_nsecs;
573 587
588 max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
589 cs->mask);
574 /* 590 /*
575 * To ensure that the clocksource does not wrap whilst we are idle, 591 * To ensure that the clocksource does not wrap whilst we are idle,
576 * limit the time the clocksource can be deferred by 12.5%. Please 592 * limit the time the clocksource can be deferred by 12.5%. Please
@@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev,
893 return count; 909 return count;
894} 910}
895 911
896size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) 912ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
897{ 913{
898 size_t ret = cnt; 914 size_t ret = cnt;
899 915
@@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
924 struct device_attribute *attr, 940 struct device_attribute *attr,
925 const char *buf, size_t count) 941 const char *buf, size_t count)
926{ 942{
927 size_t ret; 943 ssize_t ret;
928 944
929 mutex_lock(&clocksource_mutex); 945 mutex_lock(&clocksource_mutex);
930 946
@@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
952{ 968{
953 struct clocksource *cs; 969 struct clocksource *cs;
954 char name[CS_NAME_LEN]; 970 char name[CS_NAME_LEN];
955 size_t ret; 971 ssize_t ret;
956 972
957 ret = sysfs_get_uname(buf, name, count); 973 ret = sysfs_get_uname(buf, name, count);
958 if (ret < 0) 974 if (ret < 0)
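
The clocks_calc_max_nsecs() derivation above can be sanity-checked outside the kernel. The stand-alone program below redoes the same arithmetic for an invented clocksource: a 1 MHz, 32-bit counter with mult = 1000 << 20, shift = 20, and an ~11% maxadj (mirroring clocksource_max_adjustment()). It lands at a deferment bound of roughly an hour, which matches the ~71-minute wrap time of such a counter minus the adjustment margin.

#include <stdint.h>
#include <stdio.h>

static int ilog2_u64(uint64_t v)
{
        int l = -1;

        while (v) {
                v >>= 1;
                l++;
        }
        return l;
}

int main(void)
{
        uint32_t shift = 20;
        uint32_t mult = 1000u << shift;                 /* 1 cycle == 1000 ns */
        uint32_t maxadj = (uint32_t)(mult * 11ULL / 100);
        uint64_t mask = 0xffffffffULL;                  /* 32-bit counter */
        uint64_t max_cycles, max_nsecs;

        /* max_cycles < 2^(63 - log2(mult + maxadj)), then clip to the mask. */
        max_cycles = 1ULL << (63 - (ilog2_u64(mult + maxadj) + 1));
        if (max_cycles > mask)
                max_cycles = mask;

        /* Convert with (mult - maxadj) to stay safe under adjustment. */
        max_nsecs = (max_cycles * (mult - maxadj)) >> shift;

        printf("max deferment: ~%llu s\n",
               (unsigned long long)(max_nsecs / 1000000000ULL));
        return 0;
}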
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index bb2215174f05..af8d1d4f3d55 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)
475 * called as close as possible to 500 ms before the new second starts. 475 * called as close as possible to 500 ms before the new second starts.
476 * This code is run on a timer. If the clock is set, that timer 476 * This code is run on a timer. If the clock is set, that timer
477 * may not expire at the correct time. Thus, we adjust... 477 * may not expire at the correct time. Thus, we adjust...
 478 * We want the clock to be within a couple of ticks of the target.
478 */ 479 */
479 if (!ntp_synced()) { 480 if (!ntp_synced()) {
480 /* 481 /*
@@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work)
485 } 486 }
486 487
487 getnstimeofday(&now); 488 getnstimeofday(&now);
488 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { 489 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
489 struct timespec adjust = now; 490 struct timespec adjust = now;
490 491
491 fail = -ENODEV; 492 fail = -ENODEV;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0b479a6a22bb..0abb36464281 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,25 +8,28 @@
8#include <linux/clocksource.h> 8#include <linux/clocksource.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/jiffies.h> 10#include <linux/jiffies.h>
11#include <linux/ktime.h>
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/syscore_ops.h> 15#include <linux/syscore_ops.h>
15#include <linux/timer.h> 16#include <linux/hrtimer.h>
16#include <linux/sched_clock.h> 17#include <linux/sched_clock.h>
18#include <linux/seqlock.h>
19#include <linux/bitops.h>
17 20
18struct clock_data { 21struct clock_data {
22 ktime_t wrap_kt;
19 u64 epoch_ns; 23 u64 epoch_ns;
20 u32 epoch_cyc; 24 u64 epoch_cyc;
21 u32 epoch_cyc_copy; 25 seqcount_t seq;
22 unsigned long rate; 26 unsigned long rate;
23 u32 mult; 27 u32 mult;
24 u32 shift; 28 u32 shift;
25 bool suspended; 29 bool suspended;
26}; 30};
27 31
28static void sched_clock_poll(unsigned long wrap_ticks); 32static struct hrtimer sched_clock_timer;
29static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
30static int irqtime = -1; 33static int irqtime = -1;
31 34
32core_param(irqtime, irqtime, int, 0400); 35core_param(irqtime, irqtime, int, 0400);
@@ -35,42 +38,46 @@ static struct clock_data cd = {
35 .mult = NSEC_PER_SEC / HZ, 38 .mult = NSEC_PER_SEC / HZ,
36}; 39};
37 40
38static u32 __read_mostly sched_clock_mask = 0xffffffff; 41static u64 __read_mostly sched_clock_mask;
39 42
40static u32 notrace jiffy_sched_clock_read(void) 43static u64 notrace jiffy_sched_clock_read(void)
41{ 44{
42 return (u32)(jiffies - INITIAL_JIFFIES); 45 /*
46 * We don't need to use get_jiffies_64 on 32-bit arches here
47 * because we register with BITS_PER_LONG
48 */
49 return (u64)(jiffies - INITIAL_JIFFIES);
43} 50}
44 51
45static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 52static u32 __read_mostly (*read_sched_clock_32)(void);
53
54static u64 notrace read_sched_clock_32_wrapper(void)
55{
56 return read_sched_clock_32();
57}
58
59static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
46 60
47static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 61static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
48{ 62{
49 return (cyc * mult) >> shift; 63 return (cyc * mult) >> shift;
50} 64}
51 65
52static unsigned long long notrace sched_clock_32(void) 66unsigned long long notrace sched_clock(void)
53{ 67{
54 u64 epoch_ns; 68 u64 epoch_ns;
55 u32 epoch_cyc; 69 u64 epoch_cyc;
56 u32 cyc; 70 u64 cyc;
71 unsigned long seq;
57 72
58 if (cd.suspended) 73 if (cd.suspended)
59 return cd.epoch_ns; 74 return cd.epoch_ns;
60 75
61 /*
62 * Load the epoch_cyc and epoch_ns atomically. We do this by
63 * ensuring that we always write epoch_cyc, epoch_ns and
64 * epoch_cyc_copy in strict order, and read them in strict order.
65 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
66 * the middle of an update, and we should repeat the load.
67 */
68 do { 76 do {
77 seq = raw_read_seqcount_begin(&cd.seq);
69 epoch_cyc = cd.epoch_cyc; 78 epoch_cyc = cd.epoch_cyc;
70 smp_rmb();
71 epoch_ns = cd.epoch_ns; 79 epoch_ns = cd.epoch_ns;
72 smp_rmb(); 80 } while (read_seqcount_retry(&cd.seq, seq));
73 } while (epoch_cyc != cd.epoch_cyc_copy);
74 81
75 cyc = read_sched_clock(); 82 cyc = read_sched_clock();
76 cyc = (cyc - epoch_cyc) & sched_clock_mask; 83 cyc = (cyc - epoch_cyc) & sched_clock_mask;
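
The hunk above replaces the hand-rolled epoch_cyc/epoch_cyc_copy double read with a seqcount. Stripped of the sched_clock specifics, the reader/writer pattern it now relies on looks like the hedged sketch below (all names invented).

#include <linux/seqlock.h>
#include <linux/types.h>

struct demo_epoch {
        seqcount_t seq;
        u64 cyc;
        u64 ns;
};

static struct demo_epoch demo;

/* Writer: callers already serialize against each other with IRQs off. */
static void demo_publish(u64 cyc, u64 ns)
{
        raw_write_seqcount_begin(&demo.seq);
        demo.cyc = cyc;
        demo.ns = ns;
        raw_write_seqcount_end(&demo.seq);
}

/* Reader: retry until a consistent (cyc, ns) pair is observed. */
static void demo_snapshot(u64 *cyc, u64 *ns)
{
        unsigned long seq;

        do {
                seq = raw_read_seqcount_begin(&demo.seq);
                *cyc = demo.cyc;
                *ns = demo.ns;
        } while (read_seqcount_retry(&demo.seq, seq));
}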
@@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void)
83static void notrace update_sched_clock(void) 90static void notrace update_sched_clock(void)
84{ 91{
85 unsigned long flags; 92 unsigned long flags;
86 u32 cyc; 93 u64 cyc;
87 u64 ns; 94 u64 ns;
88 95
89 cyc = read_sched_clock(); 96 cyc = read_sched_clock();
90 ns = cd.epoch_ns + 97 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, 98 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift); 99 cd.mult, cd.shift);
93 /* 100
94 * Write epoch_cyc and epoch_ns in a way that the update is
95 * detectable in cyc_to_fixed_sched_clock().
96 */
97 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
98 cd.epoch_cyc_copy = cyc; 102 raw_write_seqcount_begin(&cd.seq);
99 smp_wmb();
100 cd.epoch_ns = ns; 103 cd.epoch_ns = ns;
101 smp_wmb();
102 cd.epoch_cyc = cyc; 104 cd.epoch_cyc = cyc;
105 raw_write_seqcount_end(&cd.seq);
103 raw_local_irq_restore(flags); 106 raw_local_irq_restore(flags);
104} 107}
105 108
106static void sched_clock_poll(unsigned long wrap_ticks) 109static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
107{ 110{
108 mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
109 update_sched_clock(); 111 update_sched_clock();
112 hrtimer_forward_now(hrt, cd.wrap_kt);
113 return HRTIMER_RESTART;
110} 114}
111 115
112void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) 116void __init sched_clock_register(u64 (*read)(void), int bits,
117 unsigned long rate)
113{ 118{
114 unsigned long r, w; 119 unsigned long r;
115 u64 res, wrap; 120 u64 res, wrap;
116 char r_unit; 121 char r_unit;
117 122
118 if (cd.rate > rate) 123 if (cd.rate > rate)
119 return; 124 return;
120 125
121 BUG_ON(bits > 32);
122 WARN_ON(!irqs_disabled()); 126 WARN_ON(!irqs_disabled());
123 read_sched_clock = read; 127 read_sched_clock = read;
124 sched_clock_mask = (1ULL << bits) - 1; 128 sched_clock_mask = CLOCKSOURCE_MASK(bits);
125 cd.rate = rate; 129 cd.rate = rate;
126 130
127 /* calculate the mult/shift to convert counter ticks to ns. */ 131 /* calculate the mult/shift to convert counter ticks to ns. */
128 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); 132 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
129 133
130 r = rate; 134 r = rate;
131 if (r >= 4000000) { 135 if (r >= 4000000) {
@@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
138 r_unit = ' '; 142 r_unit = ' ';
139 143
140 /* calculate how many ns until we wrap */ 144 /* calculate how many ns until we wrap */
141 wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); 145 wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
142 do_div(wrap, NSEC_PER_MSEC); 146 cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
143 w = wrap;
144 147
145 /* calculate the ns resolution of this counter */ 148 /* calculate the ns resolution of this counter */
146 res = cyc_to_ns(1ULL, cd.mult, cd.shift); 149 res = cyc_to_ns(1ULL, cd.mult, cd.shift);
147 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", 150 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
148 bits, r, r_unit, res, w); 151 bits, r, r_unit, res, wrap);
149 152
150 /*
151 * Start the timer to keep sched_clock() properly updated and
152 * sets the initial epoch.
153 */
154 sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
155 update_sched_clock(); 153 update_sched_clock();
156 154
157 /* 155 /*
@@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
166 pr_debug("Registered %pF as sched_clock source\n", read); 164 pr_debug("Registered %pF as sched_clock source\n", read);
167} 165}
168 166
169unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; 167void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
170
171unsigned long long notrace sched_clock(void)
172{ 168{
173 return sched_clock_func(); 169 read_sched_clock_32 = read;
170 sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
174} 171}
175 172
176void __init sched_clock_postinit(void) 173void __init sched_clock_postinit(void)
@@ -180,14 +177,22 @@ void __init sched_clock_postinit(void)
 180 * make it the final one one. 177 * make it the final one one.
 181 */ 178 */
 182 if (read_sched_clock == jiffy_sched_clock_read) 179 if (read_sched_clock == jiffy_sched_clock_read)
183 setup_sched_clock(jiffy_sched_clock_read, 32, HZ); 180 sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
184 181
185 sched_clock_poll(sched_clock_timer.data); 182 update_sched_clock();
183
184 /*
185 * Start the timer to keep sched_clock() properly updated and
186 * sets the initial epoch.
187 */
188 hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
189 sched_clock_timer.function = sched_clock_poll;
190 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
186} 191}
187 192
188static int sched_clock_suspend(void) 193static int sched_clock_suspend(void)
189{ 194{
190 sched_clock_poll(sched_clock_timer.data); 195 sched_clock_poll(&sched_clock_timer);
191 cd.suspended = true; 196 cd.suspended = true;
192 return 0; 197 return 0;
193} 198}
@@ -195,7 +200,6 @@ static int sched_clock_suspend(void)
195static void sched_clock_resume(void) 200static void sched_clock_resume(void)
196{ 201{
197 cd.epoch_cyc = read_sched_clock(); 202 cd.epoch_cyc = read_sched_clock();
198 cd.epoch_cyc_copy = cd.epoch_cyc;
199 cd.suspended = false; 203 cd.suspended = false;
200} 204}
201 205
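Taken together, these hunks turn the old setup_sched_clock() into a thin 32-bit wrapper around the new sched_clock_register(): the read callback is now u64, the mask comes from CLOCKSOURCE_MASK(bits), and the refresh period is derived from clocks_calc_max_nsecs() and armed on an hrtimer at 7/8 of the wrap time instead of a jiffies timer. The underlying conversion is unchanged: mask the cycle delta to the counter width, then scale by a precomputed mult/shift pair. A small standalone C example of that arithmetic, with an assumed 56-bit counter at 24 MHz; the mult/shift values are hand-picked for the example, not produced by clocks_calc_mult_shift():

#include <stdint.h>
#include <stdio.h>

/* cyc_to_ns(): cycles -> nanoseconds via fixed-point mult/shift. */
static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
    return (cyc * mult) >> shift;
}

int main(void)
{
    const uint64_t mask  = (1ULL << 56) - 1;   /* CLOCKSOURCE_MASK(56)        */
    const uint32_t mult  = 2796202667u;        /* ~ (1e9 / 24e6) << 26        */
    const uint32_t shift = 26;

    uint64_t epoch_cyc = mask - 10;            /* epoch taken just before wrap */
    uint64_t now_cyc   = 5;                    /* counter has wrapped          */

    /* Masking keeps a wrapped counter looking like a small forward delta. */
    uint64_t delta = (now_cyc - epoch_cyc) & mask;

    printf("delta cycles = %llu\n", (unsigned long long)delta);           /* 16   */
    printf("delta ns     = %llu\n",
           (unsigned long long)cyc_to_ns(delta, mult, shift));            /* ~666 */
    return 0;
}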
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 218bcb565fed..9532690daaa9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
 70 struct clock_event_device *newdev) 70 struct clock_event_device *newdev)
 71{ 71{
72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || 72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
73 (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
73 (newdev->features & CLOCK_EVT_FEAT_C3STOP)) 74 (newdev->features & CLOCK_EVT_FEAT_C3STOP))
74 return false; 75 return false;
75 76
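The one-line change above widens the broadcast-device filter: a clock_event_device flagged CLOCK_EVT_FEAT_PERCPU can only service its own CPU, so tick_check_broadcast_device() now rejects it alongside dummy and C3STOP devices. A reduced sketch of that feature-bit screening; the struct and flag values below are stand-ins, not the kernel's definitions:

#include <stdbool.h>
#include <stdio.h>

#define FEAT_DUMMY   0x1   /* placeholder flag values */
#define FEAT_PERCPU  0x2
#define FEAT_C3STOP  0x4

struct clock_event_dev {
    const char *name;
    unsigned int features;
};

static bool suitable_for_broadcast(const struct clock_event_dev *dev)
{
    /* Any of these features disqualifies the device as a broadcast source. */
    return !(dev->features & (FEAT_DUMMY | FEAT_PERCPU | FEAT_C3STOP));
}

int main(void)
{
    struct clock_event_dev percpu_timer = { "per-cpu timer", FEAT_PERCPU };
    struct clock_event_dev global_timer = { "global timer", 0 };

    printf("%s: %s\n", percpu_timer.name,
           suitable_for_broadcast(&percpu_timer) ? "usable" : "rejected");
    printf("%s: %s\n", global_timer.name,
           suitable_for_broadcast(&global_timer) ? "usable" : "rejected");
    return 0;
}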
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 64522ecdfe0e..162b03ab0ad2 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33 */ 33 */
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36
37/*
38 * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
39 * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
40 * variable has two functions:
41 *
42 * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the
43 * timekeeping lock all at once. Only the CPU which is assigned to do the
44 * update is handling it.
45 *
46 * 2) Hand off the duty in the NOHZ idle case by setting the value to
47 * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks
48 * at it will take over and keep the time keeping alive. The handover
49 * procedure also covers cpu hotplug.
50 */
 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 51int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
 37 52
 38/* 53/*
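The new comment documents the two jobs of tick_do_timer_cpu: it pins the do_timer() duty to a single CPU so there is no stampede on the timekeeping lock, and under NOHZ the owner can write TICK_DO_TIMER_NONE so the next ticking CPU takes over. A compressed illustration of that handover; the real code just assigns and compares inside the tick path, the atomics and names here only keep the toy example self-contained:

#include <stdatomic.h>
#include <stdio.h>

#define DO_TIMER_NONE (-1)              /* stand-in for TICK_DO_TIMER_NONE */

static _Atomic int do_timer_cpu = 0;    /* boot CPU owns the duty initially */

/* Called from each CPU's tick: claim the duty if nobody owns it. */
static void tick_on_cpu(int cpu)
{
    int none = DO_TIMER_NONE;

    if (atomic_compare_exchange_strong(&do_timer_cpu, &none, cpu))
        printf("cpu%d takes over timekeeping\n", cpu);

    if (atomic_load(&do_timer_cpu) == cpu)
        printf("cpu%d runs do_timer()\n", cpu);
}

/* Called when the owning CPU goes idle under NOHZ: hand the duty off. */
static void enter_idle(int cpu)
{
    int self = cpu;

    if (atomic_compare_exchange_strong(&do_timer_cpu, &self, DO_TIMER_NONE))
        printf("cpu%d hands the duty off\n", cpu);
}

int main(void)
{
    tick_on_cpu(0);    /* cpu0 owns it from boot           */
    enter_idle(0);     /* cpu0 idles, duty becomes NONE    */
    tick_on_cpu(2);    /* next ticking CPU picks it up     */
    return 0;
}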
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bc906cad709b..18e71f7fbc2a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev);
31 31
32extern void clockevents_shutdown(struct clock_event_device *dev); 32extern void clockevents_shutdown(struct clock_event_device *dev);
33 33
34extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); 34extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
35 35
36/* 36/*
37 * NO_HZ / high resolution timer shared code 37 * NO_HZ / high resolution timer shared code
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3612fc77f834..ea20f7d1ac2c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -361,8 +361,8 @@ void __init tick_nohz_init(void)
361/* 361/*
362 * NO HZ enabled ? 362 * NO HZ enabled ?
363 */ 363 */
364int tick_nohz_enabled __read_mostly = 1; 364static int tick_nohz_enabled __read_mostly = 1;
365 365int tick_nohz_active __read_mostly;
366/* 366/*
367 * Enable / Disable tickless mode 367 * Enable / Disable tickless mode
368 */ 368 */
@@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
465 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 465 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
466 ktime_t now, idle; 466 ktime_t now, idle;
467 467
468 if (!tick_nohz_enabled) 468 if (!tick_nohz_active)
469 return -1; 469 return -1;
470 470
471 now = ktime_get(); 471 now = ktime_get();
@@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
506 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 506 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
507 ktime_t now, iowait; 507 ktime_t now, iowait;
508 508
509 if (!tick_nohz_enabled) 509 if (!tick_nohz_active)
510 return -1; 510 return -1;
511 511
512 now = ktime_get(); 512 now = ktime_get();
@@ -711,8 +711,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
711 return false; 711 return false;
712 } 712 }
713 713
714 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 714 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
715 ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
715 return false; 716 return false;
717 }
716 718
717 if (need_resched()) 719 if (need_resched())
718 return false; 720 return false;
@@ -799,11 +801,6 @@ void tick_nohz_idle_enter(void)
799 local_irq_disable(); 801 local_irq_disable();
800 802
801 ts = &__get_cpu_var(tick_cpu_sched); 803 ts = &__get_cpu_var(tick_cpu_sched);
802 /*
803 * set ts->inidle unconditionally. even if the system did not
804 * switch to nohz mode the cpu frequency governers rely on the
805 * update of the idle time accounting in tick_nohz_start_idle().
806 */
807 ts->inidle = 1; 804 ts->inidle = 1;
808 __tick_nohz_idle_enter(ts); 805 __tick_nohz_idle_enter(ts);
809 806
@@ -973,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void)
973 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 970 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
974 ktime_t next; 971 ktime_t next;
975 972
976 if (!tick_nohz_enabled) 973 if (!tick_nohz_active)
977 return; 974 return;
978 975
979 local_irq_disable(); 976 local_irq_disable();
@@ -981,7 +978,7 @@ static void tick_nohz_switch_to_nohz(void)
981 local_irq_enable(); 978 local_irq_enable();
982 return; 979 return;
983 } 980 }
984 981 tick_nohz_active = 1;
985 ts->nohz_mode = NOHZ_MODE_LOWRES; 982 ts->nohz_mode = NOHZ_MODE_LOWRES;
986 983
987 /* 984 /*
@@ -1139,8 +1136,10 @@ void tick_setup_sched_timer(void)
1139 } 1136 }
1140 1137
1141#ifdef CONFIG_NO_HZ_COMMON 1138#ifdef CONFIG_NO_HZ_COMMON
1142 if (tick_nohz_enabled) 1139 if (tick_nohz_enabled) {
1143 ts->nohz_mode = NOHZ_MODE_HIGHRES; 1140 ts->nohz_mode = NOHZ_MODE_HIGHRES;
1141 tick_nohz_active = 1;
1142 }
1144#endif 1143#endif
1145} 1144}
1146#endif /* HIGH_RES_TIMERS */ 1145#endif /* HIGH_RES_TIMERS */
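The tick-sched hunks split one flag into two: tick_nohz_enabled (now static) only records that NOHZ was configured or requested, while the new tick_nohz_active is set once a CPU actually switches into low-res or high-res NOHZ mode. The idle/iowait accounting helpers and the switch-over path now key off the latter, so they only report NOHZ-based accounting once it is really in use. A small sketch of that two-flag gating; names shortened and values made up:

#include <stdbool.h>
#include <stdio.h>

static bool nohz_enabled = true;   /* "nohz=on": the feature is requested  */
static bool nohz_active;           /* set only once a CPU really runs NOHZ */

static void switch_to_nohz(void)
{
    if (!nohz_enabled)
        return;
    /* ... mode setup would happen here ... */
    nohz_active = true;
}

static long get_cpu_idle_time_us(void)
{
    /* Accounting is only meaningful after the switch actually happened. */
    if (!nohz_active)
        return -1;
    return 123456;                 /* placeholder idle time */
}

int main(void)
{
    printf("before switch: %ld\n", get_cpu_idle_time_us());   /* -1     */
    switch_to_nohz();
    printf("after switch:  %ld\n", get_cpu_idle_time_us());   /* 123456 */
    return 0;
}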
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 947ba25a95a0..87b4f00284c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
1347 tk->xtime_nsec -= remainder; 1347 tk->xtime_nsec -= remainder;
1348 tk->xtime_nsec += 1ULL << tk->shift; 1348 tk->xtime_nsec += 1ULL << tk->shift;
1349 tk->ntp_error += remainder << tk->ntp_error_shift; 1349 tk->ntp_error += remainder << tk->ntp_error_shift;
1350 1350 tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
1351} 1351}
1352#else 1352#else
1353#define old_vsyscall_fixup(tk) 1353#define old_vsyscall_fixup(tk)
@@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1613 * ktime_get_update_offsets - hrtimer helper 1613 * ktime_get_update_offsets - hrtimer helper
1614 * @offs_real: pointer to storage for monotonic -> realtime offset 1614 * @offs_real: pointer to storage for monotonic -> realtime offset
1615 * @offs_boot: pointer to storage for monotonic -> boottime offset 1615 * @offs_boot: pointer to storage for monotonic -> boottime offset
1616 * @offs_tai: pointer to storage for monotonic -> clock tai offset
1616 * 1617 *
1617 * Returns current monotonic time and updates the offsets 1618 * Returns current monotonic time and updates the offsets
1618 * Called from hrtimer_interupt() or retrigger_next_event() 1619 * Called from hrtimer_interrupt() or retrigger_next_event()
1619 */ 1620 */
1620ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, 1621ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
1621 ktime_t *offs_tai) 1622 ktime_t *offs_tai)
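The old_vsyscall_fixup() change is a correctness fix: the helper rounds xtime_nsec up to the next whole (1 << shift) unit for the legacy vsyscall path, and the added line now also charges that injected unit back to ntp_error, so the round-up is paid back by the NTP adjustment machinery instead of accumulating as drift. The arithmetic, reduced to plain integers with invented values (field names kept from struct timekeeper):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t shift = 8;                 /* assumed clocksource shift          */
    uint32_t ntp_error_shift = 4;       /* assumed extra ntp_error precision  */
    uint64_t xtime_nsec = 1000;         /* shifted ns, deliberately unaligned */
    int64_t  ntp_error = 0;

    /* Round xtime_nsec up to a whole 1 << shift boundary ... */
    uint64_t remainder = xtime_nsec & ((1ULL << shift) - 1);
    xtime_nsec -= remainder;
    xtime_nsec += 1ULL << shift;

    /* ... and record both the removed remainder and the injected unit in
     * ntp_error, so the adjustment nets out instead of drifting. */
    ntp_error += remainder << ntp_error_shift;
    ntp_error -= (1ULL << shift) << ntp_error_shift;    /* the added line */

    printf("xtime_nsec=%llu ntp_error=%lld\n",
           (unsigned long long)xtime_nsec, (long long)ntp_error);
    /* xtime_nsec=1024 (4 * 256), ntp_error=-384 = (232 - 256) << 4 */
    return 0;
}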
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 0b537f27b559..1fb08f21302e 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
298 period = ktime_to_timespec(time); 298 period = ktime_to_timespec(time);
299 ms = period.tv_nsec / 1000000; 299 ms = period.tv_nsec / 1000000;
300 300
301 seq_puts(m, "Timer Stats Version: v0.2\n"); 301 seq_puts(m, "Timer Stats Version: v0.3\n");
302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); 302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
303 if (atomic_read(&overflow_count)) 303 if (atomic_read(&overflow_count))
304 seq_printf(m, "Overflow: %d entries\n", 304 seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
305 atomic_read(&overflow_count)); 305 seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
306 306
307 for (i = 0; i < nr_entries; i++) { 307 for (i = 0; i < nr_entries; i++) {
308 entry = entries + i; 308 entry = entries + i;
309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { 309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
310 seq_printf(m, "%4luD, %5d %-16s ", 310 seq_printf(m, "%4luD, %5d %-16s ",
311 entry->count, entry->pid, entry->comm); 311 entry->count, entry->pid, entry->comm);
312 } else { 312 } else {
diff --git a/kernel/timer.c b/kernel/timer.c
index 4296d13db3d1..accfd241b9e5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), 1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1093 unsigned long data) 1093 unsigned long data)
1094{ 1094{
1095 int preempt_count = preempt_count(); 1095 int count = preempt_count();
1096 1096
1097#ifdef CONFIG_LOCKDEP 1097#ifdef CONFIG_LOCKDEP
1098 /* 1098 /*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1119 1119
1120 lock_map_release(&lockdep_map); 1120 lock_map_release(&lockdep_map);
1121 1121
1122 if (preempt_count != preempt_count()) { 1122 if (count != preempt_count()) {
1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", 1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1124 fn, preempt_count, preempt_count()); 1124 fn, count, preempt_count());
1125 /* 1125 /*
1126 * Restore the preempt count. That gives us a decent 1126 * Restore the preempt count. That gives us a decent
1127 * chance to survive and extract information. If the 1127 * chance to survive and extract information. If the
1128 * callback kept a lock held, bad luck, but not worse 1128 * callback kept a lock held, bad luck, but not worse
1129 * than the BUG() we had. 1129 * than the BUG() we had.
1130 */ 1130 */
1131 preempt_count() = preempt_count; 1131 preempt_count_set(count);
1132 } 1132 }
1133} 1133}
1134 1134
@@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu)
1518 /* 1518 /*
1519 * The APs use this path later in boot 1519 * The APs use this path later in boot
1520 */ 1520 */
1521 base = kmalloc_node(sizeof(*base), 1521 base = kzalloc_node(sizeof(*base), GFP_KERNEL,
1522 GFP_KERNEL | __GFP_ZERO, 1522 cpu_to_node(cpu));
1523 cpu_to_node(cpu));
1524 if (!base) 1523 if (!base)
1525 return -ENOMEM; 1524 return -ENOMEM;
1526 1525
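call_timer_fn() keeps its snapshot-and-compare check for callbacks that leak preempt count, but the local is renamed to count (it used to shadow the preempt_count() macro) and the recovery now goes through preempt_count_set(), since preempt_count() is no longer usable as an lvalue after the preempt-count rework. A stripped-down version of the pattern; the callback and counter are stand-ins, not kernel APIs:

#include <stdio.h>

static int preempt_depth;                      /* stand-in for preempt_count() */

static void buggy_callback(void)
{
    preempt_depth++;                           /* "forgets" to re-enable       */
}

static void call_timer_fn(void (*fn)(void), const char *name)
{
    int count = preempt_depth;                 /* snapshot before the call     */

    fn();

    if (count != preempt_depth) {
        fprintf(stderr, "timer: %s preempt leak: %d -> %d\n",
                name, count, preempt_depth);
        preempt_depth = count;                 /* restore so we can carry on   */
    }
}

int main(void)
{
    call_timer_fn(buggy_callback, "buggy_callback");
    printf("preempt depth after cleanup: %d\n", preempt_depth);   /* 0 */
    return 0;
}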
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b8b8560bfb95..f785aef65799 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -26,6 +26,7 @@
26#include <linux/export.h> 26#include <linux/export.h>
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/list.h>
29 30
30#include <trace/events/block.h> 31#include <trace/events/block.h>
31 32
@@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1;
38static struct trace_array *blk_tr; 39static struct trace_array *blk_tr;
39static bool blk_tracer_enabled __read_mostly; 40static bool blk_tracer_enabled __read_mostly;
40 41
42static LIST_HEAD(running_trace_list);
43static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
44
41/* Select an alternative, minimalistic output than the original one */ 45/* Select an alternative, minimalistic output than the original one */
42#define TRACE_BLK_OPT_CLASSIC 0x1 46#define TRACE_BLK_OPT_CLASSIC 0x1
43 47
@@ -107,10 +111,18 @@ record_it:
107 * Send out a notify for this process, if we haven't done so since a trace 111 * Send out a notify for this process, if we haven't done so since a trace
108 * started 112 * started
109 */ 113 */
110static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) 114static void trace_note_tsk(struct task_struct *tsk)
111{ 115{
116 unsigned long flags;
117 struct blk_trace *bt;
118
112 tsk->btrace_seq = blktrace_seq; 119 tsk->btrace_seq = blktrace_seq;
113 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); 120 spin_lock_irqsave(&running_trace_lock, flags);
121 list_for_each_entry(bt, &running_trace_list, running_list) {
122 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
123 sizeof(tsk->comm));
124 }
125 spin_unlock_irqrestore(&running_trace_lock, flags);
114} 126}
115 127
116static void trace_note_time(struct blk_trace *bt) 128static void trace_note_time(struct blk_trace *bt)
@@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
229 goto record_it; 241 goto record_it;
230 } 242 }
231 243
244 if (unlikely(tsk->btrace_seq != blktrace_seq))
245 trace_note_tsk(tsk);
246
232 /* 247 /*
233 * A word about the locking here - we disable interrupts to reserve 248 * A word about the locking here - we disable interrupts to reserve
234 * some space in the relay per-cpu buffer, to prevent an irq 249 * some space in the relay per-cpu buffer, to prevent an irq
235 * from coming in and stepping on our toes. 250 * from coming in and stepping on our toes.
236 */ 251 */
237 local_irq_save(flags); 252 local_irq_save(flags);
238
239 if (unlikely(tsk->btrace_seq != blktrace_seq))
240 trace_note_tsk(bt, tsk);
241
242 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); 253 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
243 if (t) { 254 if (t) {
244 sequence = per_cpu_ptr(bt->sequence, cpu); 255 sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
477 bt->dir = dir; 488 bt->dir = dir;
478 bt->dev = dev; 489 bt->dev = dev;
479 atomic_set(&bt->dropped, 0); 490 atomic_set(&bt->dropped, 0);
491 INIT_LIST_HEAD(&bt->running_list);
480 492
481 ret = -EIO; 493 ret = -EIO;
482 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, 494 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
@@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
567 .end_lba = cbuts.end_lba, 579 .end_lba = cbuts.end_lba,
568 .pid = cbuts.pid, 580 .pid = cbuts.pid,
569 }; 581 };
570 memcpy(&buts.name, &cbuts.name, 32);
571 582
572 ret = do_blk_trace_setup(q, name, dev, bdev, &buts); 583 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
573 if (ret) 584 if (ret)
574 return ret; 585 return ret;
575 586
576 if (copy_to_user(arg, &buts.name, 32)) { 587 if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
577 blk_trace_remove(q); 588 blk_trace_remove(q);
578 return -EFAULT; 589 return -EFAULT;
579 } 590 }
@@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
601 blktrace_seq++; 612 blktrace_seq++;
602 smp_mb(); 613 smp_mb();
603 bt->trace_state = Blktrace_running; 614 bt->trace_state = Blktrace_running;
615 spin_lock_irq(&running_trace_lock);
616 list_add(&bt->running_list, &running_trace_list);
617 spin_unlock_irq(&running_trace_lock);
604 618
605 trace_note_time(bt); 619 trace_note_time(bt);
606 ret = 0; 620 ret = 0;
@@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
608 } else { 622 } else {
609 if (bt->trace_state == Blktrace_running) { 623 if (bt->trace_state == Blktrace_running) {
610 bt->trace_state = Blktrace_stopped; 624 bt->trace_state = Blktrace_stopped;
625 spin_lock_irq(&running_trace_lock);
626 list_del_init(&bt->running_list);
627 spin_unlock_irq(&running_trace_lock);
611 relay_flush(bt->rchan); 628 relay_flush(bt->rchan);
612 ret = 0; 629 ret = 0;
613 } 630 }
@@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q)
1472 if (atomic_dec_and_test(&blk_probes_ref)) 1489 if (atomic_dec_and_test(&blk_probes_ref))
1473 blk_unregister_tracepoints(); 1490 blk_unregister_tracepoints();
1474 1491
1492 spin_lock_irq(&running_trace_lock);
1493 list_del(&bt->running_list);
1494 spin_unlock_irq(&running_trace_lock);
1475 blk_trace_free(bt); 1495 blk_trace_free(bt);
1476 return 0; 1496 return 0;
1477} 1497}
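The blktrace change replaces the single-trace trace_note_tsk() with a global running_trace_list under a spinlock: when a task is first seen after blktrace_seq bumps, a process-name note is emitted to every trace currently running, and traces add or remove themselves in blk_trace_startstop() and blk_trace_remove_queue(). A userspace sketch of that broadcast-to-all-running-traces idea using pthreads and a simple singly linked list (types invented for the sketch; build with -pthread):

#include <pthread.h>
#include <stdio.h>

struct blk_trace_demo {
    const char *dev;
    struct blk_trace_demo *next;
};

static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;
static struct blk_trace_demo *running_list;

static void trace_start(struct blk_trace_demo *bt)
{
    pthread_mutex_lock(&running_lock);
    bt->next = running_list;
    running_list = bt;
    pthread_mutex_unlock(&running_lock);
}

/* Emit a "process name" note to every trace that is currently running. */
static void trace_note_tsk(int pid, const char *comm)
{
    struct blk_trace_demo *bt;

    pthread_mutex_lock(&running_lock);
    for (bt = running_list; bt; bt = bt->next)
        printf("%s: note pid=%d comm=%s\n", bt->dev, pid, comm);
    pthread_mutex_unlock(&running_lock);
}

int main(void)
{
    struct blk_trace_demo sda = { "sda", NULL };
    struct blk_trace_demo sdb = { "sdb", NULL };

    trace_start(&sda);
    trace_start(&sdb);
    trace_note_tsk(1234, "fio");
    return 0;
}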
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 03cf44ac54d3..72a0f81dc5a8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -367,9 +367,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
367 367
368static int __register_ftrace_function(struct ftrace_ops *ops) 368static int __register_ftrace_function(struct ftrace_ops *ops)
369{ 369{
370 if (unlikely(ftrace_disabled))
371 return -ENODEV;
372
373 if (FTRACE_WARN_ON(ops == &global_ops)) 370 if (FTRACE_WARN_ON(ops == &global_ops))
374 return -EINVAL; 371 return -EINVAL;
375 372
@@ -428,9 +425,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
428{ 425{
429 int ret; 426 int ret;
430 427
431 if (ftrace_disabled)
432 return -ENODEV;
433
434 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) 428 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
435 return -EBUSY; 429 return -EBUSY;
436 430
@@ -781,7 +775,7 @@ static int ftrace_profile_init(void)
781 int cpu; 775 int cpu;
782 int ret = 0; 776 int ret = 0;
783 777
784 for_each_online_cpu(cpu) { 778 for_each_possible_cpu(cpu) {
785 ret = ftrace_profile_init_cpu(cpu); 779 ret = ftrace_profile_init_cpu(cpu);
786 if (ret) 780 if (ret)
787 break; 781 break;
@@ -2088,10 +2082,15 @@ static void ftrace_startup_enable(int command)
2088static int ftrace_startup(struct ftrace_ops *ops, int command) 2082static int ftrace_startup(struct ftrace_ops *ops, int command)
2089{ 2083{
2090 bool hash_enable = true; 2084 bool hash_enable = true;
2085 int ret;
2091 2086
2092 if (unlikely(ftrace_disabled)) 2087 if (unlikely(ftrace_disabled))
2093 return -ENODEV; 2088 return -ENODEV;
2094 2089
2090 ret = __register_ftrace_function(ops);
2091 if (ret)
2092 return ret;
2093
2095 ftrace_start_up++; 2094 ftrace_start_up++;
2096 command |= FTRACE_UPDATE_CALLS; 2095 command |= FTRACE_UPDATE_CALLS;
2097 2096
@@ -2113,12 +2112,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2113 return 0; 2112 return 0;
2114} 2113}
2115 2114
2116static void ftrace_shutdown(struct ftrace_ops *ops, int command) 2115static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2117{ 2116{
2118 bool hash_disable = true; 2117 bool hash_disable = true;
2118 int ret;
2119 2119
2120 if (unlikely(ftrace_disabled)) 2120 if (unlikely(ftrace_disabled))
2121 return; 2121 return -ENODEV;
2122
2123 ret = __unregister_ftrace_function(ops);
2124 if (ret)
2125 return ret;
2122 2126
2123 ftrace_start_up--; 2127 ftrace_start_up--;
2124 /* 2128 /*
@@ -2153,9 +2157,10 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
2153 } 2157 }
2154 2158
2155 if (!command || !ftrace_enabled) 2159 if (!command || !ftrace_enabled)
2156 return; 2160 return 0;
2157 2161
2158 ftrace_run_update_code(command); 2162 ftrace_run_update_code(command);
2163 return 0;
2159} 2164}
2160 2165
2161static void ftrace_startup_sysctl(void) 2166static void ftrace_startup_sysctl(void)
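A recurring theme in the ftrace.c hunks: __register_ftrace_function() and __unregister_ftrace_function() are no longer called separately by each user but are folded into ftrace_startup() and ftrace_shutdown(), which now return errors; callers such as register_ftrace_function(), the probe code and the function-graph code shrink to a single call. A schematic of the consolidation, with placeholder types and error handling rather than the real ftrace_ops machinery:

#include <errno.h>
#include <stdio.h>

struct ops { int registered; int enabled; };

static int register_ops(struct ops *o)
{
    if (o->registered)
        return -EBUSY;
    o->registered = 1;
    return 0;
}

/* After the change: startup owns registration, so every caller gets both
 * steps (and the error handling) from one call. */
static int ops_startup(struct ops *o)
{
    int ret = register_ops(o);

    if (ret)
        return ret;
    o->enabled = 1;                 /* arm the tracing machinery */
    return 0;
}

static int ops_shutdown(struct ops *o)
{
    if (!o->registered)
        return -ENODEV;
    o->registered = 0;
    o->enabled = 0;
    return 0;
}

int main(void)
{
    struct ops o = { 0, 0 };

    printf("startup:  %d\n", ops_startup(&o));     /* 0      */
    printf("again:    %d\n", ops_startup(&o));     /* -EBUSY */
    printf("shutdown: %d\n", ops_shutdown(&o));    /* 0      */
    return 0;
}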
@@ -3060,16 +3065,13 @@ static void __enable_ftrace_function_probe(void)
3060 if (i == FTRACE_FUNC_HASHSIZE) 3065 if (i == FTRACE_FUNC_HASHSIZE)
3061 return; 3066 return;
3062 3067
3063 ret = __register_ftrace_function(&trace_probe_ops); 3068 ret = ftrace_startup(&trace_probe_ops, 0);
3064 if (!ret)
3065 ret = ftrace_startup(&trace_probe_ops, 0);
3066 3069
3067 ftrace_probe_registered = 1; 3070 ftrace_probe_registered = 1;
3068} 3071}
3069 3072
3070static void __disable_ftrace_function_probe(void) 3073static void __disable_ftrace_function_probe(void)
3071{ 3074{
3072 int ret;
3073 int i; 3075 int i;
3074 3076
3075 if (!ftrace_probe_registered) 3077 if (!ftrace_probe_registered)
@@ -3082,9 +3084,7 @@ static void __disable_ftrace_function_probe(void)
3082 } 3084 }
3083 3085
3084 /* no more funcs left */ 3086 /* no more funcs left */
3085 ret = __unregister_ftrace_function(&trace_probe_ops); 3087 ftrace_shutdown(&trace_probe_ops, 0);
3086 if (!ret)
3087 ftrace_shutdown(&trace_probe_ops, 0);
3088 3088
3089 ftrace_probe_registered = 0; 3089 ftrace_probe_registered = 0;
3090} 3090}
@@ -3307,7 +3307,11 @@ void unregister_ftrace_function_probe_all(char *glob)
3307static LIST_HEAD(ftrace_commands); 3307static LIST_HEAD(ftrace_commands);
3308static DEFINE_MUTEX(ftrace_cmd_mutex); 3308static DEFINE_MUTEX(ftrace_cmd_mutex);
3309 3309
3310int register_ftrace_command(struct ftrace_func_command *cmd) 3310/*
3311 * Currently we only register ftrace commands from __init, so mark this
3312 * __init too.
3313 */
3314__init int register_ftrace_command(struct ftrace_func_command *cmd)
3311{ 3315{
3312 struct ftrace_func_command *p; 3316 struct ftrace_func_command *p;
3313 int ret = 0; 3317 int ret = 0;
@@ -3326,7 +3330,11 @@ int register_ftrace_command(struct ftrace_func_command *cmd)
3326 return ret; 3330 return ret;
3327} 3331}
3328 3332
3329int unregister_ftrace_command(struct ftrace_func_command *cmd) 3333/*
3334 * Currently we only unregister ftrace commands from __init, so mark
3335 * this __init too.
3336 */
3337__init int unregister_ftrace_command(struct ftrace_func_command *cmd)
3330{ 3338{
3331 struct ftrace_func_command *p, *n; 3339 struct ftrace_func_command *p, *n;
3332 int ret = -ENODEV; 3340 int ret = -ENODEV;
@@ -3641,7 +3649,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
3641 3649
3642#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3650#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3643static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 3651static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3644static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 3652static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
3645 3653
3646static int __init set_graph_function(char *str) 3654static int __init set_graph_function(char *str)
3647{ 3655{
@@ -3659,7 +3667,7 @@ static void __init set_ftrace_early_graph(char *buf)
3659 func = strsep(&buf, ","); 3667 func = strsep(&buf, ",");
3660 /* we allow only one expression at a time */ 3668 /* we allow only one expression at a time */
3661 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 3669 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
3662 func); 3670 FTRACE_GRAPH_MAX_FUNCS, func);
3663 if (ret) 3671 if (ret)
3664 printk(KERN_DEBUG "ftrace: function %s not " 3672 printk(KERN_DEBUG "ftrace: function %s not "
3665 "traceable\n", func); 3673 "traceable\n", func);
@@ -3776,15 +3784,25 @@ static const struct file_operations ftrace_notrace_fops = {
3776static DEFINE_MUTEX(graph_lock); 3784static DEFINE_MUTEX(graph_lock);
3777 3785
3778int ftrace_graph_count; 3786int ftrace_graph_count;
3779int ftrace_graph_filter_enabled; 3787int ftrace_graph_notrace_count;
3780unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 3788unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
3789unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
3790
3791struct ftrace_graph_data {
3792 unsigned long *table;
3793 size_t size;
3794 int *count;
3795 const struct seq_operations *seq_ops;
3796};
3781 3797
3782static void * 3798static void *
3783__g_next(struct seq_file *m, loff_t *pos) 3799__g_next(struct seq_file *m, loff_t *pos)
3784{ 3800{
3785 if (*pos >= ftrace_graph_count) 3801 struct ftrace_graph_data *fgd = m->private;
3802
3803 if (*pos >= *fgd->count)
3786 return NULL; 3804 return NULL;
3787 return &ftrace_graph_funcs[*pos]; 3805 return &fgd->table[*pos];
3788} 3806}
3789 3807
3790static void * 3808static void *
@@ -3796,10 +3814,12 @@ g_next(struct seq_file *m, void *v, loff_t *pos)
3796 3814
3797static void *g_start(struct seq_file *m, loff_t *pos) 3815static void *g_start(struct seq_file *m, loff_t *pos)
3798{ 3816{
3817 struct ftrace_graph_data *fgd = m->private;
3818
3799 mutex_lock(&graph_lock); 3819 mutex_lock(&graph_lock);
3800 3820
3801 /* Nothing, tell g_show to print all functions are enabled */ 3821 /* Nothing, tell g_show to print all functions are enabled */
3802 if (!ftrace_graph_filter_enabled && !*pos) 3822 if (!*fgd->count && !*pos)
3803 return (void *)1; 3823 return (void *)1;
3804 3824
3805 return __g_next(m, pos); 3825 return __g_next(m, pos);
@@ -3835,38 +3855,88 @@ static const struct seq_operations ftrace_graph_seq_ops = {
3835}; 3855};
3836 3856
3837static int 3857static int
3838ftrace_graph_open(struct inode *inode, struct file *file) 3858__ftrace_graph_open(struct inode *inode, struct file *file,
3859 struct ftrace_graph_data *fgd)
3839{ 3860{
3840 int ret = 0; 3861 int ret = 0;
3841 3862
3842 if (unlikely(ftrace_disabled))
3843 return -ENODEV;
3844
3845 mutex_lock(&graph_lock); 3863 mutex_lock(&graph_lock);
3846 if ((file->f_mode & FMODE_WRITE) && 3864 if ((file->f_mode & FMODE_WRITE) &&
3847 (file->f_flags & O_TRUNC)) { 3865 (file->f_flags & O_TRUNC)) {
3848 ftrace_graph_filter_enabled = 0; 3866 *fgd->count = 0;
3849 ftrace_graph_count = 0; 3867 memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));
3850 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
3851 } 3868 }
3852 mutex_unlock(&graph_lock); 3869 mutex_unlock(&graph_lock);
3853 3870
3854 if (file->f_mode & FMODE_READ) 3871 if (file->f_mode & FMODE_READ) {
3855 ret = seq_open(file, &ftrace_graph_seq_ops); 3872 ret = seq_open(file, fgd->seq_ops);
3873 if (!ret) {
3874 struct seq_file *m = file->private_data;
3875 m->private = fgd;
3876 }
3877 } else
3878 file->private_data = fgd;
3856 3879
3857 return ret; 3880 return ret;
3858} 3881}
3859 3882
3860static int 3883static int
3884ftrace_graph_open(struct inode *inode, struct file *file)
3885{
3886 struct ftrace_graph_data *fgd;
3887
3888 if (unlikely(ftrace_disabled))
3889 return -ENODEV;
3890
3891 fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
3892 if (fgd == NULL)
3893 return -ENOMEM;
3894
3895 fgd->table = ftrace_graph_funcs;
3896 fgd->size = FTRACE_GRAPH_MAX_FUNCS;
3897 fgd->count = &ftrace_graph_count;
3898 fgd->seq_ops = &ftrace_graph_seq_ops;
3899
3900 return __ftrace_graph_open(inode, file, fgd);
3901}
3902
3903static int
3904ftrace_graph_notrace_open(struct inode *inode, struct file *file)
3905{
3906 struct ftrace_graph_data *fgd;
3907
3908 if (unlikely(ftrace_disabled))
3909 return -ENODEV;
3910
3911 fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
3912 if (fgd == NULL)
3913 return -ENOMEM;
3914
3915 fgd->table = ftrace_graph_notrace_funcs;
3916 fgd->size = FTRACE_GRAPH_MAX_FUNCS;
3917 fgd->count = &ftrace_graph_notrace_count;
3918 fgd->seq_ops = &ftrace_graph_seq_ops;
3919
3920 return __ftrace_graph_open(inode, file, fgd);
3921}
3922
3923static int
3861ftrace_graph_release(struct inode *inode, struct file *file) 3924ftrace_graph_release(struct inode *inode, struct file *file)
3862{ 3925{
3863 if (file->f_mode & FMODE_READ) 3926 if (file->f_mode & FMODE_READ) {
3927 struct seq_file *m = file->private_data;
3928
3929 kfree(m->private);
3864 seq_release(inode, file); 3930 seq_release(inode, file);
3931 } else {
3932 kfree(file->private_data);
3933 }
3934
3865 return 0; 3935 return 0;
3866} 3936}
3867 3937
3868static int 3938static int
3869ftrace_set_func(unsigned long *array, int *idx, char *buffer) 3939ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
3870{ 3940{
3871 struct dyn_ftrace *rec; 3941 struct dyn_ftrace *rec;
3872 struct ftrace_page *pg; 3942 struct ftrace_page *pg;
@@ -3879,7 +3949,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3879 3949
3880 /* decode regex */ 3950 /* decode regex */
3881 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3951 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
3882 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3952 if (!not && *idx >= size)
3883 return -EBUSY; 3953 return -EBUSY;
3884 3954
3885 search_len = strlen(search); 3955 search_len = strlen(search);
@@ -3907,7 +3977,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3907 fail = 0; 3977 fail = 0;
3908 if (!exists) { 3978 if (!exists) {
3909 array[(*idx)++] = rec->ip; 3979 array[(*idx)++] = rec->ip;
3910 if (*idx >= FTRACE_GRAPH_MAX_FUNCS) 3980 if (*idx >= size)
3911 goto out; 3981 goto out;
3912 } 3982 }
3913 } else { 3983 } else {
@@ -3925,8 +3995,6 @@ out:
3925 if (fail) 3995 if (fail)
3926 return -EINVAL; 3996 return -EINVAL;
3927 3997
3928 ftrace_graph_filter_enabled = !!(*idx);
3929
3930 return 0; 3998 return 0;
3931} 3999}
3932 4000
@@ -3935,36 +4003,33 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
3935 size_t cnt, loff_t *ppos) 4003 size_t cnt, loff_t *ppos)
3936{ 4004{
3937 struct trace_parser parser; 4005 struct trace_parser parser;
3938 ssize_t read, ret; 4006 ssize_t read, ret = 0;
4007 struct ftrace_graph_data *fgd = file->private_data;
3939 4008
3940 if (!cnt) 4009 if (!cnt)
3941 return 0; 4010 return 0;
3942 4011
3943 mutex_lock(&graph_lock); 4012 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
3944 4013 return -ENOMEM;
3945 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
3946 ret = -ENOMEM;
3947 goto out_unlock;
3948 }
3949 4014
3950 read = trace_get_user(&parser, ubuf, cnt, ppos); 4015 read = trace_get_user(&parser, ubuf, cnt, ppos);
3951 4016
3952 if (read >= 0 && trace_parser_loaded((&parser))) { 4017 if (read >= 0 && trace_parser_loaded((&parser))) {
3953 parser.buffer[parser.idx] = 0; 4018 parser.buffer[parser.idx] = 0;
3954 4019
4020 mutex_lock(&graph_lock);
4021
3955 /* we allow only one expression at a time */ 4022 /* we allow only one expression at a time */
3956 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 4023 ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
3957 parser.buffer); 4024 parser.buffer);
3958 if (ret) 4025
3959 goto out_free; 4026 mutex_unlock(&graph_lock);
3960 } 4027 }
3961 4028
3962 ret = read; 4029 if (!ret)
4030 ret = read;
3963 4031
3964out_free:
3965 trace_parser_put(&parser); 4032 trace_parser_put(&parser);
3966out_unlock:
3967 mutex_unlock(&graph_lock);
3968 4033
3969 return ret; 4034 return ret;
3970} 4035}
@@ -3976,6 +4041,14 @@ static const struct file_operations ftrace_graph_fops = {
3976 .llseek = ftrace_filter_lseek, 4041 .llseek = ftrace_filter_lseek,
3977 .release = ftrace_graph_release, 4042 .release = ftrace_graph_release,
3978}; 4043};
4044
4045static const struct file_operations ftrace_graph_notrace_fops = {
4046 .open = ftrace_graph_notrace_open,
4047 .read = seq_read,
4048 .write = ftrace_graph_write,
4049 .llseek = ftrace_filter_lseek,
4050 .release = ftrace_graph_release,
4051};
3979#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4052#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3980 4053
3981static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 4054static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
@@ -3997,6 +4070,9 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3997 trace_create_file("set_graph_function", 0444, d_tracer, 4070 trace_create_file("set_graph_function", 0444, d_tracer,
3998 NULL, 4071 NULL,
3999 &ftrace_graph_fops); 4072 &ftrace_graph_fops);
4073 trace_create_file("set_graph_notrace", 0444, d_tracer,
4074 NULL,
4075 &ftrace_graph_notrace_fops);
4000#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4076#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
4001 4077
4002 return 0; 4078 return 0;
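set_graph_function and the new set_graph_notrace debugfs file share one implementation: each open() allocates a small ftrace_graph_data that points at the right table, size and count, and the common __ftrace_graph_open() / ftrace_graph_write() paths work purely through that descriptor. A reduced model of that one-engine-two-tables layout; struct and table names are illustrative:

#include <stdio.h>
#include <stddef.h>

#define MAX_FUNCS 32

struct graph_data {                 /* analogue of ftrace_graph_data */
    unsigned long *table;
    size_t size;
    int *count;
};

static unsigned long filter_funcs[MAX_FUNCS];
static unsigned long notrace_funcs[MAX_FUNCS];
static int filter_count, notrace_count;

static int graph_add(struct graph_data *d, unsigned long ip)
{
    if (*d->count >= (int)d->size)
        return -1;                  /* table full */
    d->table[(*d->count)++] = ip;
    return 0;
}

int main(void)
{
    struct graph_data filter  = { filter_funcs, MAX_FUNCS, &filter_count };
    struct graph_data notrace = { notrace_funcs, MAX_FUNCS, &notrace_count };

    graph_add(&filter, 0xc0100000UL);   /* "graph-trace only this one"   */
    graph_add(&notrace, 0xc0200000UL);  /* "never graph-trace this one"  */

    printf("filter entries: %d, notrace entries: %d\n",
           filter_count, notrace_count);
    return 0;
}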
@@ -4290,12 +4366,15 @@ core_initcall(ftrace_nodyn_init);
4290static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4366static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4291static inline void ftrace_startup_enable(int command) { } 4367static inline void ftrace_startup_enable(int command) { }
4292/* Keep as macros so we do not need to define the commands */ 4368/* Keep as macros so we do not need to define the commands */
4293# define ftrace_startup(ops, command) \ 4369# define ftrace_startup(ops, command) \
4294 ({ \ 4370 ({ \
4295 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ 4371 int ___ret = __register_ftrace_function(ops); \
4296 0; \ 4372 if (!___ret) \
4373 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
4374 ___ret; \
4297 }) 4375 })
4298# define ftrace_shutdown(ops, command) do { } while (0) 4376# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops)
4377
4299# define ftrace_startup_sysctl() do { } while (0) 4378# define ftrace_startup_sysctl() do { } while (0)
4300# define ftrace_shutdown_sysctl() do { } while (0) 4379# define ftrace_shutdown_sysctl() do { } while (0)
4301 4380
@@ -4320,12 +4399,21 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4320 */ 4399 */
4321 preempt_disable_notrace(); 4400 preempt_disable_notrace();
4322 trace_recursion_set(TRACE_CONTROL_BIT); 4401 trace_recursion_set(TRACE_CONTROL_BIT);
4402
4403 /*
4404 * Control funcs (perf) uses RCU. Only trace if
4405 * RCU is currently active.
4406 */
4407 if (!rcu_is_watching())
4408 goto out;
4409
4323 do_for_each_ftrace_op(op, ftrace_control_list) { 4410 do_for_each_ftrace_op(op, ftrace_control_list) {
4324 if (!(op->flags & FTRACE_OPS_FL_STUB) && 4411 if (!(op->flags & FTRACE_OPS_FL_STUB) &&
4325 !ftrace_function_local_disabled(op) && 4412 !ftrace_function_local_disabled(op) &&
4326 ftrace_ops_test(op, ip, regs)) 4413 ftrace_ops_test(op, ip, regs))
4327 op->func(ip, parent_ip, op, regs); 4414 op->func(ip, parent_ip, op, regs);
4328 } while_for_each_ftrace_op(op); 4415 } while_for_each_ftrace_op(op);
4416 out:
4329 trace_recursion_clear(TRACE_CONTROL_BIT); 4417 trace_recursion_clear(TRACE_CONTROL_BIT);
4330 preempt_enable_notrace(); 4418 preempt_enable_notrace();
4331} 4419}
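The control-ops hunk adds an early bail-out: the control callbacks (perf) depend on RCU, so if rcu_is_watching() reports that the CPU is in a context where RCU is not active, the whole callback walk is skipped rather than running RCU-dependent code without protection. The guard pattern in miniature; the boolean below only simulates rcu_is_watching():

#include <stdbool.h>
#include <stdio.h>

static bool rcu_watching = true;     /* pretend per-CPU RCU state */

static void perf_callback(unsigned long ip)
{
    printf("traced function at %#lx\n", ip);
}

static void control_func(unsigned long ip)
{
    /* Callbacks below depend on RCU; only run them when it is active. */
    if (!rcu_watching)
        return;
    perf_callback(ip);
}

int main(void)
{
    control_func(0xc0100000UL);      /* runs the callback   */
    rcu_watching = false;            /* e.g. CPU went idle  */
    control_func(0xc0100000UL);      /* silently skipped    */
    return 0;
}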
@@ -4695,9 +4783,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
4695 4783
4696 mutex_lock(&ftrace_lock); 4784 mutex_lock(&ftrace_lock);
4697 4785
4698 ret = __register_ftrace_function(ops); 4786 ret = ftrace_startup(ops, 0);
4699 if (!ret)
4700 ret = ftrace_startup(ops, 0);
4701 4787
4702 mutex_unlock(&ftrace_lock); 4788 mutex_unlock(&ftrace_lock);
4703 4789
@@ -4716,9 +4802,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
4716 int ret; 4802 int ret;
4717 4803
4718 mutex_lock(&ftrace_lock); 4804 mutex_lock(&ftrace_lock);
4719 ret = __unregister_ftrace_function(ops); 4805 ret = ftrace_shutdown(ops, 0);
4720 if (!ret)
4721 ftrace_shutdown(ops, 0);
4722 mutex_unlock(&ftrace_lock); 4806 mutex_unlock(&ftrace_lock);
4723 4807
4724 return ret; 4808 return ret;
@@ -4912,6 +4996,13 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
4912 return NOTIFY_DONE; 4996 return NOTIFY_DONE;
4913} 4997}
4914 4998
4999/* Just a place holder for function graph */
5000static struct ftrace_ops fgraph_ops __read_mostly = {
5001 .func = ftrace_stub,
5002 .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
5003 FTRACE_OPS_FL_RECURSION_SAFE,
5004};
5005
4915int register_ftrace_graph(trace_func_graph_ret_t retfunc, 5006int register_ftrace_graph(trace_func_graph_ret_t retfunc,
4916 trace_func_graph_ent_t entryfunc) 5007 trace_func_graph_ent_t entryfunc)
4917{ 5008{
@@ -4938,7 +5029,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
4938 ftrace_graph_return = retfunc; 5029 ftrace_graph_return = retfunc;
4939 ftrace_graph_entry = entryfunc; 5030 ftrace_graph_entry = entryfunc;
4940 5031
4941 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 5032 ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
4942 5033
4943out: 5034out:
4944 mutex_unlock(&ftrace_lock); 5035 mutex_unlock(&ftrace_lock);
@@ -4955,7 +5046,7 @@ void unregister_ftrace_graph(void)
4955 ftrace_graph_active--; 5046 ftrace_graph_active--;
4956 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5047 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
4957 ftrace_graph_entry = ftrace_graph_entry_stub; 5048 ftrace_graph_entry = ftrace_graph_entry_stub;
4958 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 5049 ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
4959 unregister_pm_notifier(&ftrace_suspend_notifier); 5050 unregister_pm_notifier(&ftrace_suspend_notifier);
4960 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5051 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
4961 5052
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7974ba20557d..9d20cd9743ef 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -235,13 +235,33 @@ void trace_array_put(struct trace_array *this_tr)
235 mutex_unlock(&trace_types_lock); 235 mutex_unlock(&trace_types_lock);
236} 236}
237 237
238int filter_current_check_discard(struct ring_buffer *buffer, 238int filter_check_discard(struct ftrace_event_file *file, void *rec,
239 struct ftrace_event_call *call, void *rec, 239 struct ring_buffer *buffer,
240 struct ring_buffer_event *event) 240 struct ring_buffer_event *event)
241{ 241{
242 return filter_check_discard(call, rec, buffer, event); 242 if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) &&
243 !filter_match_preds(file->filter, rec)) {
244 ring_buffer_discard_commit(buffer, event);
245 return 1;
246 }
247
248 return 0;
249}
250EXPORT_SYMBOL_GPL(filter_check_discard);
251
252int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
253 struct ring_buffer *buffer,
254 struct ring_buffer_event *event)
255{
256 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
257 !filter_match_preds(call->filter, rec)) {
258 ring_buffer_discard_commit(buffer, event);
259 return 1;
260 }
261
262 return 0;
243} 263}
244EXPORT_SYMBOL_GPL(filter_current_check_discard); 264EXPORT_SYMBOL_GPL(call_filter_check_discard);
245 265
246cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) 266cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
247{ 267{
@@ -843,9 +863,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
843 if (isspace(ch)) { 863 if (isspace(ch)) {
844 parser->buffer[parser->idx] = 0; 864 parser->buffer[parser->idx] = 0;
845 parser->cont = false; 865 parser->cont = false;
846 } else { 866 } else if (parser->idx < parser->size - 1) {
847 parser->cont = true; 867 parser->cont = true;
848 parser->buffer[parser->idx++] = ch; 868 parser->buffer[parser->idx++] = ch;
869 } else {
870 ret = -EINVAL;
871 goto out;
849 } 872 }
850 873
851 *ppos += read; 874 *ppos += read;
@@ -1261,21 +1284,6 @@ int is_tracing_stopped(void)
1261} 1284}
1262 1285
1263/** 1286/**
1264 * ftrace_off_permanent - disable all ftrace code permanently
1265 *
1266 * This should only be called when a serious anomally has
1267 * been detected. This will turn off the function tracing,
1268 * ring buffers, and other tracing utilites. It takes no
1269 * locks and can be called from any context.
1270 */
1271void ftrace_off_permanent(void)
1272{
1273 tracing_disabled = 1;
1274 ftrace_stop();
1275 tracing_off_permanent();
1276}
1277
1278/**
1279 * tracing_start - quick start of the tracer 1287 * tracing_start - quick start of the tracer
1280 * 1288 *
1281 * If tracing is enabled but was stopped by tracing_stop, 1289 * If tracing is enabled but was stopped by tracing_stop,
@@ -1509,7 +1517,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1509#endif 1517#endif
1510 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | 1518 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
1511 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 1519 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
1512 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 1520 (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
1521 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
1513} 1522}
1514EXPORT_SYMBOL_GPL(tracing_generic_entry_update); 1523EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
1515 1524
@@ -1630,7 +1639,7 @@ trace_function(struct trace_array *tr,
1630 entry->ip = ip; 1639 entry->ip = ip;
1631 entry->parent_ip = parent_ip; 1640 entry->parent_ip = parent_ip;
1632 1641
1633 if (!filter_check_discard(call, entry, buffer, event)) 1642 if (!call_filter_check_discard(call, entry, buffer, event))
1634 __buffer_unlock_commit(buffer, event); 1643 __buffer_unlock_commit(buffer, event);
1635} 1644}
1636 1645
@@ -1714,7 +1723,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1714 1723
1715 entry->size = trace.nr_entries; 1724 entry->size = trace.nr_entries;
1716 1725
1717 if (!filter_check_discard(call, entry, buffer, event)) 1726 if (!call_filter_check_discard(call, entry, buffer, event))
1718 __buffer_unlock_commit(buffer, event); 1727 __buffer_unlock_commit(buffer, event);
1719 1728
1720 out: 1729 out:
@@ -1816,7 +1825,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1816 trace.entries = entry->caller; 1825 trace.entries = entry->caller;
1817 1826
1818 save_stack_trace_user(&trace); 1827 save_stack_trace_user(&trace);
1819 if (!filter_check_discard(call, entry, buffer, event)) 1828 if (!call_filter_check_discard(call, entry, buffer, event))
1820 __buffer_unlock_commit(buffer, event); 1829 __buffer_unlock_commit(buffer, event);
1821 1830
1822 out_drop_count: 1831 out_drop_count:
@@ -2008,7 +2017,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
2008 entry->fmt = fmt; 2017 entry->fmt = fmt;
2009 2018
2010 memcpy(entry->buf, tbuffer, sizeof(u32) * len); 2019 memcpy(entry->buf, tbuffer, sizeof(u32) * len);
2011 if (!filter_check_discard(call, entry, buffer, event)) { 2020 if (!call_filter_check_discard(call, entry, buffer, event)) {
2012 __buffer_unlock_commit(buffer, event); 2021 __buffer_unlock_commit(buffer, event);
2013 ftrace_trace_stack(buffer, flags, 6, pc); 2022 ftrace_trace_stack(buffer, flags, 6, pc);
2014 } 2023 }
@@ -2063,7 +2072,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2063 2072
2064 memcpy(&entry->buf, tbuffer, len); 2073 memcpy(&entry->buf, tbuffer, len);
2065 entry->buf[len] = '\0'; 2074 entry->buf[len] = '\0';
2066 if (!filter_check_discard(call, entry, buffer, event)) { 2075 if (!call_filter_check_discard(call, entry, buffer, event)) {
2067 __buffer_unlock_commit(buffer, event); 2076 __buffer_unlock_commit(buffer, event);
2068 ftrace_trace_stack(buffer, flags, 6, pc); 2077 ftrace_trace_stack(buffer, flags, 6, pc);
2069 } 2078 }
@@ -2760,7 +2769,7 @@ static void show_snapshot_main_help(struct seq_file *m)
2760 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); 2769 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
2761 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2770 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2762 seq_printf(m, "# Takes a snapshot of the main buffer.\n"); 2771 seq_printf(m, "# Takes a snapshot of the main buffer.\n");
2763 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); 2772 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n");
2764 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2773 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
2765 seq_printf(m, "# is not a '0' or '1')\n"); 2774 seq_printf(m, "# is not a '0' or '1')\n");
2766} 2775}
@@ -2964,6 +2973,11 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2964 return 0; 2973 return 0;
2965} 2974}
2966 2975
2976bool tracing_is_disabled(void)
2977{
2978 return (tracing_disabled) ? true: false;
2979}
2980
2967/* 2981/*
2968 * Open and update trace_array ref count. 2982 * Open and update trace_array ref count.
2969 * Must have the current trace_array passed to it. 2983 * Must have the current trace_array passed to it.
@@ -5454,12 +5468,12 @@ static struct ftrace_func_command ftrace_snapshot_cmd = {
5454 .func = ftrace_trace_snapshot_callback, 5468 .func = ftrace_trace_snapshot_callback,
5455}; 5469};
5456 5470
5457static int register_snapshot_cmd(void) 5471static __init int register_snapshot_cmd(void)
5458{ 5472{
5459 return register_ftrace_command(&ftrace_snapshot_cmd); 5473 return register_ftrace_command(&ftrace_snapshot_cmd);
5460} 5474}
5461#else 5475#else
5462static inline int register_snapshot_cmd(void) { return 0; } 5476static inline __init int register_snapshot_cmd(void) { return 0; }
5463#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ 5477#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
5464 5478
5465struct dentry *tracing_init_dentry_tr(struct trace_array *tr) 5479struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
@@ -6253,6 +6267,17 @@ void trace_init_global_iter(struct trace_iterator *iter)
6253 iter->trace = iter->tr->current_trace; 6267 iter->trace = iter->tr->current_trace;
6254 iter->cpu_file = RING_BUFFER_ALL_CPUS; 6268 iter->cpu_file = RING_BUFFER_ALL_CPUS;
6255 iter->trace_buffer = &global_trace.trace_buffer; 6269 iter->trace_buffer = &global_trace.trace_buffer;
6270
6271 if (iter->trace && iter->trace->open)
6272 iter->trace->open(iter);
6273
6274 /* Annotate start of buffers if we had overruns */
6275 if (ring_buffer_overruns(iter->trace_buffer->buffer))
6276 iter->iter_flags |= TRACE_FILE_ANNOTATE;
6277
6278 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
6279 if (trace_clocks[iter->tr->clock_id].in_ns)
6280 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
6256} 6281}
6257 6282
6258void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) 6283void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
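Two of the trace.c changes are easy to miss. First, filter_current_check_discard() is split into filter_check_discard(), which works on an ftrace_event_file and its FTRACE_EVENT_FL_FILTERED flag, and call_filter_check_discard() for ftrace_event_call, and every tracer call site moves to the latter. Second, trace_get_user() now refuses to write past parser->size - 1 and returns -EINVAL instead of overflowing the parser buffer. The bounds check on its own, as a plain C routine with the user-space copy stripped out:

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct parser { char buffer[8]; size_t idx; size_t size; };

static int parser_feed(struct parser *p, const char *in)
{
    for (; *in; in++) {
        if (*in == ' ') {                    /* token finished          */
            p->buffer[p->idx] = '\0';
            return 0;
        }
        if (p->idx < p->size - 1) {          /* leave room for the NUL  */
            p->buffer[p->idx++] = *in;
        } else {
            return -EINVAL;                  /* token too long: reject  */
        }
    }
    p->buffer[p->idx] = '\0';
    return 0;
}

int main(void)
{
    struct parser p = { "", 0, sizeof(p.buffer) };

    printf("short token: %d (%s)\n", parser_feed(&p, "foo "), p.buffer);
    p.idx = 0;
    printf("long token:  %d\n", parser_feed(&p, "way_too_long_token "));
    return 0;
}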
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 10c86fb7a2b4..ea189e027b80 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -124,6 +124,7 @@ enum trace_flag_type {
124 TRACE_FLAG_NEED_RESCHED = 0x04, 124 TRACE_FLAG_NEED_RESCHED = 0x04,
125 TRACE_FLAG_HARDIRQ = 0x08, 125 TRACE_FLAG_HARDIRQ = 0x08,
126 TRACE_FLAG_SOFTIRQ = 0x10, 126 TRACE_FLAG_SOFTIRQ = 0x10,
127 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
127}; 128};
128 129
129#define TRACE_BUF_SIZE 1024 130#define TRACE_BUF_SIZE 1024
@@ -192,8 +193,8 @@ struct trace_array {
192#ifdef CONFIG_FTRACE_SYSCALLS 193#ifdef CONFIG_FTRACE_SYSCALLS
193 int sys_refcount_enter; 194 int sys_refcount_enter;
194 int sys_refcount_exit; 195 int sys_refcount_exit;
195 DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 196 struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls];
196 DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 197 struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls];
197#endif 198#endif
198 int stop_count; 199 int stop_count;
199 int clock_id; 200 int clock_id;
@@ -514,6 +515,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf);
514void tracing_reset_current(int cpu); 515void tracing_reset_current(int cpu);
515void tracing_reset_all_online_cpus(void); 516void tracing_reset_all_online_cpus(void);
516int tracing_open_generic(struct inode *inode, struct file *filp); 517int tracing_open_generic(struct inode *inode, struct file *filp);
518bool tracing_is_disabled(void);
517struct dentry *trace_create_file(const char *name, 519struct dentry *trace_create_file(const char *name,
518 umode_t mode, 520 umode_t mode,
519 struct dentry *parent, 521 struct dentry *parent,
@@ -711,6 +713,8 @@ extern unsigned long trace_flags;
711#define TRACE_GRAPH_PRINT_PROC 0x8 713#define TRACE_GRAPH_PRINT_PROC 0x8
712#define TRACE_GRAPH_PRINT_DURATION 0x10 714#define TRACE_GRAPH_PRINT_DURATION 0x10
713#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 715#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
716#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
717#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
714 718
715extern enum print_line_t 719extern enum print_line_t
716print_graph_function_flags(struct trace_iterator *iter, u32 flags); 720print_graph_function_flags(struct trace_iterator *iter, u32 flags);
@@ -730,15 +734,16 @@ extern void __trace_graph_return(struct trace_array *tr,
730#ifdef CONFIG_DYNAMIC_FTRACE 734#ifdef CONFIG_DYNAMIC_FTRACE
731/* TODO: make this variable */ 735/* TODO: make this variable */
732#define FTRACE_GRAPH_MAX_FUNCS 32 736#define FTRACE_GRAPH_MAX_FUNCS 32
733extern int ftrace_graph_filter_enabled;
734extern int ftrace_graph_count; 737extern int ftrace_graph_count;
735extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 738extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
739extern int ftrace_graph_notrace_count;
740extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];
736 741
737static inline int ftrace_graph_addr(unsigned long addr) 742static inline int ftrace_graph_addr(unsigned long addr)
738{ 743{
739 int i; 744 int i;
740 745
741 if (!ftrace_graph_filter_enabled) 746 if (!ftrace_graph_count)
742 return 1; 747 return 1;
743 748
744 for (i = 0; i < ftrace_graph_count; i++) { 749 for (i = 0; i < ftrace_graph_count; i++) {
@@ -758,11 +763,31 @@ static inline int ftrace_graph_addr(unsigned long addr)
758 763
759 return 0; 764 return 0;
760} 765}
766
767static inline int ftrace_graph_notrace_addr(unsigned long addr)
768{
769 int i;
770
771 if (!ftrace_graph_notrace_count)
772 return 0;
773
774 for (i = 0; i < ftrace_graph_notrace_count; i++) {
775 if (addr == ftrace_graph_notrace_funcs[i])
776 return 1;
777 }
778
779 return 0;
780}
761#else 781#else
762static inline int ftrace_graph_addr(unsigned long addr) 782static inline int ftrace_graph_addr(unsigned long addr)
763{ 783{
764 return 1; 784 return 1;
765} 785}
786
787static inline int ftrace_graph_notrace_addr(unsigned long addr)
788{
789 return 0;
790}
766#endif /* CONFIG_DYNAMIC_FTRACE */ 791#endif /* CONFIG_DYNAMIC_FTRACE */
767#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 792#else /* CONFIG_FUNCTION_GRAPH_TRACER */
768static inline enum print_line_t 793static inline enum print_line_t
@@ -986,9 +1011,9 @@ struct filter_pred {
986 1011
987extern enum regex_type 1012extern enum regex_type
988filter_parse_regex(char *buff, int len, char **search, int *not); 1013filter_parse_regex(char *buff, int len, char **search, int *not);
989extern void print_event_filter(struct ftrace_event_call *call, 1014extern void print_event_filter(struct ftrace_event_file *file,
990 struct trace_seq *s); 1015 struct trace_seq *s);
991extern int apply_event_filter(struct ftrace_event_call *call, 1016extern int apply_event_filter(struct ftrace_event_file *file,
992 char *filter_string); 1017 char *filter_string);
993extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, 1018extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
994 char *filter_string); 1019 char *filter_string);
@@ -999,20 +1024,6 @@ extern int filter_assign_type(const char *type);
999struct ftrace_event_field * 1024struct ftrace_event_field *
1000trace_find_event_field(struct ftrace_event_call *call, char *name); 1025trace_find_event_field(struct ftrace_event_call *call, char *name);
1001 1026
1002static inline int
1003filter_check_discard(struct ftrace_event_call *call, void *rec,
1004 struct ring_buffer *buffer,
1005 struct ring_buffer_event *event)
1006{
1007 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
1008 !filter_match_preds(call->filter, rec)) {
1009 ring_buffer_discard_commit(buffer, event);
1010 return 1;
1011 }
1012
1013 return 0;
1014}
1015
1016extern void trace_event_enable_cmd_record(bool enable); 1027extern void trace_event_enable_cmd_record(bool enable);
1017extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); 1028extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1018extern int event_trace_del_tracer(struct trace_array *tr); 1029extern int event_trace_del_tracer(struct trace_array *tr);
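trace.h drops ftrace_graph_filter_enabled (an empty filter is now simply ftrace_graph_count == 0) and gains ftrace_graph_notrace_addr(), the mirror of ftrace_graph_addr(): the filter table answers "trace it" when empty, while the notrace table only blocks on an explicit hit. A standalone copy of the two lookups; tables and addresses are made up:

#include <stdio.h>

static unsigned long graph_funcs[] = { 0xc0101000UL };
static int graph_count = 1;

static unsigned long notrace_funcs[] = { 0xc0202000UL };
static int notrace_count = 1;

/* Empty filter means "trace everything"; otherwise require a match. */
static int graph_addr(unsigned long addr)
{
    int i;

    if (!graph_count)
        return 1;
    for (i = 0; i < graph_count; i++)
        if (addr == graph_funcs[i])
            return 1;
    return 0;
}

/* Empty notrace list blocks nothing; a hit means "skip this function". */
static int graph_notrace_addr(unsigned long addr)
{
    int i;

    for (i = 0; i < notrace_count; i++)
        if (addr == notrace_funcs[i])
            return 1;
    return 0;
}

int main(void)
{
    printf("0xc0101000: trace=%d notrace=%d\n",
           graph_addr(0xc0101000UL), graph_notrace_addr(0xc0101000UL));
    printf("0xc0202000: trace=%d notrace=%d\n",
           graph_addr(0xc0202000UL), graph_notrace_addr(0xc0202000UL));
    return 0;
}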
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index d594da0dc03c..697fb9bac8f0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -78,7 +78,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
78 entry->line = f->line; 78 entry->line = f->line;
79 entry->correct = val == expect; 79 entry->correct = val == expect;
80 80
81 if (!filter_check_discard(call, entry, buffer, event)) 81 if (!call_filter_check_discard(call, entry, buffer, event))
82 __buffer_unlock_commit(buffer, event); 82 __buffer_unlock_commit(buffer, event);
83 83
84 out: 84 out:
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 80c36bcf66e8..e854f420e033 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,9 +24,15 @@ static int total_ref_count;
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event, 24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 if (tp_event->perf_perm) {
28 int ret = tp_event->perf_perm(tp_event, p_event);
29 if (ret)
30 return ret;
31 }
32
27 /* The ftrace function trace is allowed only for root. */ 33 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) && 34 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 35 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
30 return -EPERM; 36 return -EPERM;
31 37
32 /* No tracing, just counting, so no obvious leak */ 38 /* No tracing, just counting, so no obvious leak */
@@ -173,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
173int perf_trace_init(struct perf_event *p_event) 179int perf_trace_init(struct perf_event *p_event)
174{ 180{
175 struct ftrace_event_call *tp_event; 181 struct ftrace_event_call *tp_event;
176 int event_id = p_event->attr.config; 182 u64 event_id = p_event->attr.config;
177 int ret = -EINVAL; 183 int ret = -EINVAL;
178 184
179 mutex_lock(&event_mutex); 185 mutex_lock(&event_mutex);
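
The new perf_perm hook lets an individual event veto perf attachment before the generic permission checks run: a non-zero return from tp_event->perf_perm() is propagated as the error. A minimal sketch of such a callback; the callback name and the way it gets wired into a struct ftrace_event_call are assumptions for illustration:

/* Hypothetical permission callback: restrict this event to CAP_SYS_ADMIN. */
static int my_event_perf_perm(struct ftrace_event_call *call,
			      struct perf_event *p_event)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;	/* no objection; the generic checks still apply */
}
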
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 368a4d50cc30..a11800ae96de 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -989,7 +989,7 @@ static ssize_t
989event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 989event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
990 loff_t *ppos) 990 loff_t *ppos)
991{ 991{
992 struct ftrace_event_call *call; 992 struct ftrace_event_file *file;
993 struct trace_seq *s; 993 struct trace_seq *s;
994 int r = -ENODEV; 994 int r = -ENODEV;
995 995
@@ -1004,12 +1004,12 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1004 trace_seq_init(s); 1004 trace_seq_init(s);
1005 1005
1006 mutex_lock(&event_mutex); 1006 mutex_lock(&event_mutex);
1007 call = event_file_data(filp); 1007 file = event_file_data(filp);
1008 if (call) 1008 if (file)
1009 print_event_filter(call, s); 1009 print_event_filter(file, s);
1010 mutex_unlock(&event_mutex); 1010 mutex_unlock(&event_mutex);
1011 1011
1012 if (call) 1012 if (file)
1013 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1013 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
1014 1014
1015 kfree(s); 1015 kfree(s);
@@ -1021,7 +1021,7 @@ static ssize_t
1021event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, 1021event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
1022 loff_t *ppos) 1022 loff_t *ppos)
1023{ 1023{
1024 struct ftrace_event_call *call; 1024 struct ftrace_event_file *file;
1025 char *buf; 1025 char *buf;
1026 int err = -ENODEV; 1026 int err = -ENODEV;
1027 1027
@@ -1039,9 +1039,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
1039 buf[cnt] = '\0'; 1039 buf[cnt] = '\0';
1040 1040
1041 mutex_lock(&event_mutex); 1041 mutex_lock(&event_mutex);
1042 call = event_file_data(filp); 1042 file = event_file_data(filp);
1043 if (call) 1043 if (file)
1044 err = apply_event_filter(call, buf); 1044 err = apply_event_filter(file, buf);
1045 mutex_unlock(&event_mutex); 1045 mutex_unlock(&event_mutex);
1046 1046
1047 free_page((unsigned long) buf); 1047 free_page((unsigned long) buf);
@@ -1062,6 +1062,9 @@ static int subsystem_open(struct inode *inode, struct file *filp)
1062 struct trace_array *tr; 1062 struct trace_array *tr;
1063 int ret; 1063 int ret;
1064 1064
1065 if (tracing_is_disabled())
1066 return -ENODEV;
1067
1065 /* Make sure the system still exists */ 1068 /* Make sure the system still exists */
1066 mutex_lock(&trace_types_lock); 1069 mutex_lock(&trace_types_lock);
1067 mutex_lock(&event_mutex); 1070 mutex_lock(&event_mutex);
@@ -1108,6 +1111,9 @@ static int system_tr_open(struct inode *inode, struct file *filp)
1108 struct trace_array *tr = inode->i_private; 1111 struct trace_array *tr = inode->i_private;
1109 int ret; 1112 int ret;
1110 1113
1114 if (tracing_is_disabled())
1115 return -ENODEV;
1116
1111 if (trace_array_get(tr) < 0) 1117 if (trace_array_get(tr) < 0)
1112 return -ENODEV; 1118 return -ENODEV;
1113 1119
@@ -1124,11 +1130,12 @@ static int system_tr_open(struct inode *inode, struct file *filp)
1124 if (ret < 0) { 1130 if (ret < 0) {
1125 trace_array_put(tr); 1131 trace_array_put(tr);
1126 kfree(dir); 1132 kfree(dir);
1133 return ret;
1127 } 1134 }
1128 1135
1129 filp->private_data = dir; 1136 filp->private_data = dir;
1130 1137
1131 return ret; 1138 return 0;
1132} 1139}
1133 1140
1134static int subsystem_release(struct inode *inode, struct file *file) 1141static int subsystem_release(struct inode *inode, struct file *file)
@@ -1539,7 +1546,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1539 return -1; 1546 return -1;
1540 } 1547 }
1541 } 1548 }
1542 trace_create_file("filter", 0644, file->dir, call, 1549 trace_create_file("filter", 0644, file->dir, file,
1543 &ftrace_event_filter_fops); 1550 &ftrace_event_filter_fops);
1544 1551
1545 trace_create_file("format", 0444, file->dir, call, 1552 trace_create_file("format", 0444, file->dir, call,
@@ -1577,6 +1584,7 @@ static void event_remove(struct ftrace_event_call *call)
1577 if (file->event_call != call) 1584 if (file->event_call != call)
1578 continue; 1585 continue;
1579 ftrace_event_enable_disable(file, 0); 1586 ftrace_event_enable_disable(file, 0);
1587 destroy_preds(file);
1580 /* 1588 /*
1581 * The do_for_each_event_file() is 1589 * The do_for_each_event_file() is
1582 * a double loop. After finding the call for this 1590 * a double loop. After finding the call for this
@@ -1700,7 +1708,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
1700{ 1708{
1701 event_remove(call); 1709 event_remove(call);
1702 trace_destroy_fields(call); 1710 trace_destroy_fields(call);
1703 destroy_preds(call); 1711 destroy_call_preds(call);
1704} 1712}
1705 1713
1706static int probe_remove_event_call(struct ftrace_event_call *call) 1714static int probe_remove_event_call(struct ftrace_event_call *call)
@@ -2306,6 +2314,9 @@ int event_trace_del_tracer(struct trace_array *tr)
2306 /* Disable any running events */ 2314 /* Disable any running events */
2307 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); 2315 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
2308 2316
2317 /* Access to events are within rcu_read_lock_sched() */
2318 synchronize_sched();
2319
2309 down_write(&trace_event_sem); 2320 down_write(&trace_event_sem);
2310 __trace_remove_event_dirs(tr); 2321 __trace_remove_event_dirs(tr);
2311 debugfs_remove_recursive(tr->event_dir); 2322 debugfs_remove_recursive(tr->event_dir);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 97daa8cf958d..2468f56dc5db 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -637,10 +637,18 @@ static void append_filter_err(struct filter_parse_state *ps,
637 free_page((unsigned long) buf); 637 free_page((unsigned long) buf);
638} 638}
639 639
640static inline struct event_filter *event_filter(struct ftrace_event_file *file)
641{
642 if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
643 return file->event_call->filter;
644 else
645 return file->filter;
646}
647
640/* caller must hold event_mutex */ 648/* caller must hold event_mutex */
641void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 649void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s)
642{ 650{
643 struct event_filter *filter = call->filter; 651 struct event_filter *filter = event_filter(file);
644 652
645 if (filter && filter->filter_string) 653 if (filter && filter->filter_string)
646 trace_seq_printf(s, "%s\n", filter->filter_string); 654 trace_seq_printf(s, "%s\n", filter->filter_string);
@@ -766,11 +774,21 @@ static void __free_preds(struct event_filter *filter)
766 filter->n_preds = 0; 774 filter->n_preds = 0;
767} 775}
768 776
769static void filter_disable(struct ftrace_event_call *call) 777static void call_filter_disable(struct ftrace_event_call *call)
770{ 778{
771 call->flags &= ~TRACE_EVENT_FL_FILTERED; 779 call->flags &= ~TRACE_EVENT_FL_FILTERED;
772} 780}
773 781
782static void filter_disable(struct ftrace_event_file *file)
783{
784 struct ftrace_event_call *call = file->event_call;
785
786 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
787 call_filter_disable(call);
788 else
789 file->flags &= ~FTRACE_EVENT_FL_FILTERED;
790}
791
774static void __free_filter(struct event_filter *filter) 792static void __free_filter(struct event_filter *filter)
775{ 793{
776 if (!filter) 794 if (!filter)
@@ -781,16 +799,30 @@ static void __free_filter(struct event_filter *filter)
781 kfree(filter); 799 kfree(filter);
782} 800}
783 801
802void destroy_call_preds(struct ftrace_event_call *call)
803{
804 __free_filter(call->filter);
805 call->filter = NULL;
806}
807
808static void destroy_file_preds(struct ftrace_event_file *file)
809{
810 __free_filter(file->filter);
811 file->filter = NULL;
812}
813
784/* 814/*
785 * Called when destroying the ftrace_event_call. 815 * Called when destroying the ftrace_event_file.
786 * The call is being freed, so we do not need to worry about 816 * The file is being freed, so we do not need to worry about
787 * the call being currently used. This is for module code removing 817 * the file being currently used. This is for module code removing
788 * the tracepoints from within it. 818 * the tracepoints from within it.
789 */ 819 */
790void destroy_preds(struct ftrace_event_call *call) 820void destroy_preds(struct ftrace_event_file *file)
791{ 821{
792 __free_filter(call->filter); 822 if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
793 call->filter = NULL; 823 destroy_call_preds(file->event_call);
824 else
825 destroy_file_preds(file);
794} 826}
795 827
796static struct event_filter *__alloc_filter(void) 828static struct event_filter *__alloc_filter(void)
@@ -825,28 +857,56 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
825 return 0; 857 return 0;
826} 858}
827 859
828static void filter_free_subsystem_preds(struct event_subsystem *system) 860static inline void __remove_filter(struct ftrace_event_file *file)
829{ 861{
862 struct ftrace_event_call *call = file->event_call;
863
864 filter_disable(file);
865 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
866 remove_filter_string(call->filter);
867 else
868 remove_filter_string(file->filter);
869}
870
871static void filter_free_subsystem_preds(struct event_subsystem *system,
872 struct trace_array *tr)
873{
874 struct ftrace_event_file *file;
830 struct ftrace_event_call *call; 875 struct ftrace_event_call *call;
831 876
832 list_for_each_entry(call, &ftrace_events, list) { 877 list_for_each_entry(file, &tr->events, list) {
878 call = file->event_call;
833 if (strcmp(call->class->system, system->name) != 0) 879 if (strcmp(call->class->system, system->name) != 0)
834 continue; 880 continue;
835 881
836 filter_disable(call); 882 __remove_filter(file);
837 remove_filter_string(call->filter);
838 } 883 }
839} 884}
840 885
841static void filter_free_subsystem_filters(struct event_subsystem *system) 886static inline void __free_subsystem_filter(struct ftrace_event_file *file)
842{ 887{
888 struct ftrace_event_call *call = file->event_call;
889
890 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
891 __free_filter(call->filter);
892 call->filter = NULL;
893 } else {
894 __free_filter(file->filter);
895 file->filter = NULL;
896 }
897}
898
899static void filter_free_subsystem_filters(struct event_subsystem *system,
900 struct trace_array *tr)
901{
902 struct ftrace_event_file *file;
843 struct ftrace_event_call *call; 903 struct ftrace_event_call *call;
844 904
845 list_for_each_entry(call, &ftrace_events, list) { 905 list_for_each_entry(file, &tr->events, list) {
906 call = file->event_call;
846 if (strcmp(call->class->system, system->name) != 0) 907 if (strcmp(call->class->system, system->name) != 0)
847 continue; 908 continue;
848 __free_filter(call->filter); 909 __free_subsystem_filter(file);
849 call->filter = NULL;
850 } 910 }
851} 911}
852 912
@@ -1617,15 +1677,85 @@ fail:
1617 return err; 1677 return err;
1618} 1678}
1619 1679
1680static inline void event_set_filtered_flag(struct ftrace_event_file *file)
1681{
1682 struct ftrace_event_call *call = file->event_call;
1683
1684 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1685 call->flags |= TRACE_EVENT_FL_FILTERED;
1686 else
1687 file->flags |= FTRACE_EVENT_FL_FILTERED;
1688}
1689
1690static inline void event_set_filter(struct ftrace_event_file *file,
1691 struct event_filter *filter)
1692{
1693 struct ftrace_event_call *call = file->event_call;
1694
1695 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1696 rcu_assign_pointer(call->filter, filter);
1697 else
1698 rcu_assign_pointer(file->filter, filter);
1699}
1700
1701static inline void event_clear_filter(struct ftrace_event_file *file)
1702{
1703 struct ftrace_event_call *call = file->event_call;
1704
1705 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1706 RCU_INIT_POINTER(call->filter, NULL);
1707 else
1708 RCU_INIT_POINTER(file->filter, NULL);
1709}
1710
1711static inline void
1712event_set_no_set_filter_flag(struct ftrace_event_file *file)
1713{
1714 struct ftrace_event_call *call = file->event_call;
1715
1716 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1717 call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
1718 else
1719 file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER;
1720}
1721
1722static inline void
1723event_clear_no_set_filter_flag(struct ftrace_event_file *file)
1724{
1725 struct ftrace_event_call *call = file->event_call;
1726
1727 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1728 call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
1729 else
1730 file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER;
1731}
1732
1733static inline bool
1734event_no_set_filter_flag(struct ftrace_event_file *file)
1735{
1736 struct ftrace_event_call *call = file->event_call;
1737
1738 if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER)
1739 return true;
1740
1741 if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
1742 (call->flags & TRACE_EVENT_FL_NO_SET_FILTER))
1743 return true;
1744
1745 return false;
1746}
1747
1620struct filter_list { 1748struct filter_list {
1621 struct list_head list; 1749 struct list_head list;
1622 struct event_filter *filter; 1750 struct event_filter *filter;
1623}; 1751};
1624 1752
1625static int replace_system_preds(struct event_subsystem *system, 1753static int replace_system_preds(struct event_subsystem *system,
1754 struct trace_array *tr,
1626 struct filter_parse_state *ps, 1755 struct filter_parse_state *ps,
1627 char *filter_string) 1756 char *filter_string)
1628{ 1757{
1758 struct ftrace_event_file *file;
1629 struct ftrace_event_call *call; 1759 struct ftrace_event_call *call;
1630 struct filter_list *filter_item; 1760 struct filter_list *filter_item;
1631 struct filter_list *tmp; 1761 struct filter_list *tmp;
@@ -1633,8 +1763,8 @@ static int replace_system_preds(struct event_subsystem *system,
1633 bool fail = true; 1763 bool fail = true;
1634 int err; 1764 int err;
1635 1765
1636 list_for_each_entry(call, &ftrace_events, list) { 1766 list_for_each_entry(file, &tr->events, list) {
1637 1767 call = file->event_call;
1638 if (strcmp(call->class->system, system->name) != 0) 1768 if (strcmp(call->class->system, system->name) != 0)
1639 continue; 1769 continue;
1640 1770
@@ -1644,18 +1774,20 @@ static int replace_system_preds(struct event_subsystem *system,
1644 */ 1774 */
1645 err = replace_preds(call, NULL, ps, filter_string, true); 1775 err = replace_preds(call, NULL, ps, filter_string, true);
1646 if (err) 1776 if (err)
1647 call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; 1777 event_set_no_set_filter_flag(file);
1648 else 1778 else
1649 call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; 1779 event_clear_no_set_filter_flag(file);
1650 } 1780 }
1651 1781
1652 list_for_each_entry(call, &ftrace_events, list) { 1782 list_for_each_entry(file, &tr->events, list) {
1653 struct event_filter *filter; 1783 struct event_filter *filter;
1654 1784
1785 call = file->event_call;
1786
1655 if (strcmp(call->class->system, system->name) != 0) 1787 if (strcmp(call->class->system, system->name) != 0)
1656 continue; 1788 continue;
1657 1789
1658 if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) 1790 if (event_no_set_filter_flag(file))
1659 continue; 1791 continue;
1660 1792
1661 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); 1793 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
@@ -1676,17 +1808,17 @@ static int replace_system_preds(struct event_subsystem *system,
1676 1808
1677 err = replace_preds(call, filter, ps, filter_string, false); 1809 err = replace_preds(call, filter, ps, filter_string, false);
1678 if (err) { 1810 if (err) {
1679 filter_disable(call); 1811 filter_disable(file);
1680 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1812 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1681 append_filter_err(ps, filter); 1813 append_filter_err(ps, filter);
1682 } else 1814 } else
1683 call->flags |= TRACE_EVENT_FL_FILTERED; 1815 event_set_filtered_flag(file);
1684 /* 1816 /*
1685 * Regardless of if this returned an error, we still 1817 * Regardless of if this returned an error, we still
1686 * replace the filter for the call. 1818 * replace the filter for the call.
1687 */ 1819 */
1688 filter = call->filter; 1820 filter = event_filter(file);
1689 rcu_assign_pointer(call->filter, filter_item->filter); 1821 event_set_filter(file, filter_item->filter);
1690 filter_item->filter = filter; 1822 filter_item->filter = filter;
1691 1823
1692 fail = false; 1824 fail = false;
@@ -1816,6 +1948,7 @@ static int create_filter(struct ftrace_event_call *call,
1816 * and always remembers @filter_str. 1948 * and always remembers @filter_str.
1817 */ 1949 */
1818static int create_system_filter(struct event_subsystem *system, 1950static int create_system_filter(struct event_subsystem *system,
1951 struct trace_array *tr,
1819 char *filter_str, struct event_filter **filterp) 1952 char *filter_str, struct event_filter **filterp)
1820{ 1953{
1821 struct event_filter *filter = NULL; 1954 struct event_filter *filter = NULL;
@@ -1824,7 +1957,7 @@ static int create_system_filter(struct event_subsystem *system,
1824 1957
1825 err = create_filter_start(filter_str, true, &ps, &filter); 1958 err = create_filter_start(filter_str, true, &ps, &filter);
1826 if (!err) { 1959 if (!err) {
1827 err = replace_system_preds(system, ps, filter_str); 1960 err = replace_system_preds(system, tr, ps, filter_str);
1828 if (!err) { 1961 if (!err) {
1829 /* System filters just show a default message */ 1962 /* System filters just show a default message */
1830 kfree(filter->filter_string); 1963 kfree(filter->filter_string);
@@ -1840,20 +1973,25 @@ static int create_system_filter(struct event_subsystem *system,
1840} 1973}
1841 1974
1842/* caller must hold event_mutex */ 1975/* caller must hold event_mutex */
1843int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1976int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
1844{ 1977{
1978 struct ftrace_event_call *call = file->event_call;
1845 struct event_filter *filter; 1979 struct event_filter *filter;
1846 int err; 1980 int err;
1847 1981
1848 if (!strcmp(strstrip(filter_string), "0")) { 1982 if (!strcmp(strstrip(filter_string), "0")) {
1849 filter_disable(call); 1983 filter_disable(file);
1850 filter = call->filter; 1984 filter = event_filter(file);
1985
1851 if (!filter) 1986 if (!filter)
1852 return 0; 1987 return 0;
1853 RCU_INIT_POINTER(call->filter, NULL); 1988
1989 event_clear_filter(file);
1990
1854 /* Make sure the filter is not being used */ 1991 /* Make sure the filter is not being used */
1855 synchronize_sched(); 1992 synchronize_sched();
1856 __free_filter(filter); 1993 __free_filter(filter);
1994
1857 return 0; 1995 return 0;
1858 } 1996 }
1859 1997
@@ -1866,14 +2004,15 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1866 * string 2004 * string
1867 */ 2005 */
1868 if (filter) { 2006 if (filter) {
1869 struct event_filter *tmp = call->filter; 2007 struct event_filter *tmp;
1870 2008
2009 tmp = event_filter(file);
1871 if (!err) 2010 if (!err)
1872 call->flags |= TRACE_EVENT_FL_FILTERED; 2011 event_set_filtered_flag(file);
1873 else 2012 else
1874 filter_disable(call); 2013 filter_disable(file);
1875 2014
1876 rcu_assign_pointer(call->filter, filter); 2015 event_set_filter(file, filter);
1877 2016
1878 if (tmp) { 2017 if (tmp) {
1879 /* Make sure the call is done with the filter */ 2018 /* Make sure the call is done with the filter */
@@ -1889,6 +2028,7 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1889 char *filter_string) 2028 char *filter_string)
1890{ 2029{
1891 struct event_subsystem *system = dir->subsystem; 2030 struct event_subsystem *system = dir->subsystem;
2031 struct trace_array *tr = dir->tr;
1892 struct event_filter *filter; 2032 struct event_filter *filter;
1893 int err = 0; 2033 int err = 0;
1894 2034
@@ -1901,18 +2041,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1901 } 2041 }
1902 2042
1903 if (!strcmp(strstrip(filter_string), "0")) { 2043 if (!strcmp(strstrip(filter_string), "0")) {
1904 filter_free_subsystem_preds(system); 2044 filter_free_subsystem_preds(system, tr);
1905 remove_filter_string(system->filter); 2045 remove_filter_string(system->filter);
1906 filter = system->filter; 2046 filter = system->filter;
1907 system->filter = NULL; 2047 system->filter = NULL;
1908 /* Ensure all filters are no longer used */ 2048 /* Ensure all filters are no longer used */
1909 synchronize_sched(); 2049 synchronize_sched();
1910 filter_free_subsystem_filters(system); 2050 filter_free_subsystem_filters(system, tr);
1911 __free_filter(filter); 2051 __free_filter(filter);
1912 goto out_unlock; 2052 goto out_unlock;
1913 } 2053 }
1914 2054
1915 err = create_system_filter(system, filter_string, &filter); 2055 err = create_system_filter(system, tr, filter_string, &filter);
1916 if (filter) { 2056 if (filter) {
1917 /* 2057 /*
1918 * No event actually uses the system filter 2058 * No event actually uses the system filter
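
The filter swap in apply_event_filter() above follows the usual sched-RCU publish/retire pattern: readers pick the filter up inside rcu_read_lock_sched() (the tracepoint handlers run there), the writer publishes the replacement with rcu_assign_pointer() and only frees the old filter after synchronize_sched(). A generic sketch of that pattern, independent of the tracing structures; struct my_cfg and cfg_mutex are made up for illustration:

struct my_cfg { int threshold; };
static struct my_cfg __rcu *active_cfg;
static DEFINE_MUTEX(cfg_mutex);

/* Reader: assumed to run under rcu_read_lock_sched(), like a tracepoint probe. */
static bool over_threshold(int val)
{
	struct my_cfg *cfg = rcu_dereference_sched(active_cfg);

	return cfg && val >= cfg->threshold;
}

/* Writer: publish the new config, wait out readers, then free the old one. */
static void replace_cfg(struct my_cfg *new_cfg)
{
	struct my_cfg *old;

	mutex_lock(&cfg_mutex);
	old = rcu_dereference_protected(active_cfg, lockdep_is_held(&cfg_mutex));
	rcu_assign_pointer(active_cfg, new_cfg);
	mutex_unlock(&cfg_mutex);

	synchronize_sched();		/* all sched-RCU readers are done with 'old' */
	kfree(old);
}
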
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d21a74670088..7c3e3e72e2b6 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -180,7 +180,7 @@ struct ftrace_event_call __used event_##call = { \
180 .event.type = etype, \ 180 .event.type = etype, \
181 .class = &event_class_ftrace_##call, \ 181 .class = &event_class_ftrace_##call, \
182 .print_fmt = print, \ 182 .print_fmt = print, \
183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ 183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \
184}; \ 184}; \
185struct ftrace_event_call __used \ 185struct ftrace_event_call __used \
186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b5c09242683d..0b99120d395c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -82,9 +82,9 @@ static struct trace_array *graph_array;
82 * to fill in space into DURATION column. 82 * to fill in space into DURATION column.
83 */ 83 */
84enum { 84enum {
85 DURATION_FILL_FULL = -1, 85 FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT,
86 DURATION_FILL_START = -2, 86 FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT,
87 DURATION_FILL_END = -3, 87 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
88}; 88};
89 89
90static enum print_line_t 90static enum print_line_t
@@ -114,16 +114,37 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
114 return -EBUSY; 114 return -EBUSY;
115 } 115 }
116 116
117 /*
118 * The curr_ret_stack is an index to ftrace return stack of
119 * current task. Its value should be in [0, FTRACE_RETFUNC_
120 * DEPTH) when the function graph tracer is used. To support
121 * filtering out specific functions, it makes the index
122 * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH)
123 * so when it sees a negative index the ftrace will ignore
124 * the record. And the index gets recovered when returning
125 * from the filtered function by adding the FTRACE_NOTRACE_
126 * DEPTH and then it'll continue to record functions normally.
127 *
128 * The curr_ret_stack is initialized to -1 and get increased
129 * in this function. So it can be less than -1 only if it was
130 * filtered out via ftrace_graph_notrace_addr() which can be
131 * set from set_graph_notrace file in debugfs by user.
132 */
133 if (current->curr_ret_stack < -1)
134 return -EBUSY;
135
117 calltime = trace_clock_local(); 136 calltime = trace_clock_local();
118 137
119 index = ++current->curr_ret_stack; 138 index = ++current->curr_ret_stack;
139 if (ftrace_graph_notrace_addr(func))
140 current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;
120 barrier(); 141 barrier();
121 current->ret_stack[index].ret = ret; 142 current->ret_stack[index].ret = ret;
122 current->ret_stack[index].func = func; 143 current->ret_stack[index].func = func;
123 current->ret_stack[index].calltime = calltime; 144 current->ret_stack[index].calltime = calltime;
124 current->ret_stack[index].subtime = 0; 145 current->ret_stack[index].subtime = 0;
125 current->ret_stack[index].fp = frame_pointer; 146 current->ret_stack[index].fp = frame_pointer;
126 *depth = index; 147 *depth = current->curr_ret_stack;
127 148
128 return 0; 149 return 0;
129} 150}
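
The comment above describes the trick: a valid return-stack index is biased by a large constant when a set_graph_notrace'd function is entered, so every record pushed while it stays negative is ignored, and the original index is recovered on return. A small illustration of the arithmetic with made-up values; the real constants are FTRACE_RETFUNC_DEPTH and FTRACE_NOTRACE_DEPTH from the ftrace headers and are not shown in this hunk:

/* Illustration only; constants are stand-ins for the real ftrace values. */
#define RETFUNC_DEPTH	50
#define NOTRACE_BIAS	65536

static inline int bias_index(int idx)		/* entering a notrace'd function */
{
	return idx - NOTRACE_BIAS;		/* e.g. 3 -> -65533: negative, ignored */
}

static inline int recover_index(int idx)	/* returning from that function */
{
	return idx + NOTRACE_BIAS;		/* -65533 -> 3: tracing resumes */
}
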
@@ -137,7 +158,17 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
137 158
138 index = current->curr_ret_stack; 159 index = current->curr_ret_stack;
139 160
140 if (unlikely(index < 0)) { 161 /*
162 * A negative index here means that it's just returned from a
163 * notrace'd function. Recover index to get an original
164 * return address. See ftrace_push_return_trace().
165 *
166 * TODO: Need to check whether the stack gets corrupted.
167 */
168 if (index < 0)
169 index += FTRACE_NOTRACE_DEPTH;
170
171 if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
141 ftrace_graph_stop(); 172 ftrace_graph_stop();
142 WARN_ON(1); 173 WARN_ON(1);
143 /* Might as well panic, otherwise we have no where to go */ 174 /* Might as well panic, otherwise we have no where to go */
@@ -193,6 +224,15 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
193 trace.rettime = trace_clock_local(); 224 trace.rettime = trace_clock_local();
194 barrier(); 225 barrier();
195 current->curr_ret_stack--; 226 current->curr_ret_stack--;
227 /*
228 * The curr_ret_stack can be less than -1 only if it was
229 * filtered out and it's about to return from the function.
230 * Recover the index and continue to trace normal functions.
231 */
232 if (current->curr_ret_stack < -1) {
233 current->curr_ret_stack += FTRACE_NOTRACE_DEPTH;
234 return ret;
235 }
196 236
197 /* 237 /*
198 * The trace should run after decrementing the ret counter 238 * The trace should run after decrementing the ret counter
@@ -230,7 +270,7 @@ int __trace_graph_entry(struct trace_array *tr,
230 return 0; 270 return 0;
231 entry = ring_buffer_event_data(event); 271 entry = ring_buffer_event_data(event);
232 entry->graph_ent = *trace; 272 entry->graph_ent = *trace;
233 if (!filter_current_check_discard(buffer, call, entry, event)) 273 if (!call_filter_check_discard(call, entry, buffer, event))
234 __buffer_unlock_commit(buffer, event); 274 __buffer_unlock_commit(buffer, event);
235 275
236 return 1; 276 return 1;
@@ -259,10 +299,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
259 299
260 /* trace it when it is-nested-in or is a function enabled. */ 300 /* trace it when it is-nested-in or is a function enabled. */
261 if ((!(trace->depth || ftrace_graph_addr(trace->func)) || 301 if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
262 ftrace_graph_ignore_irqs()) || 302 ftrace_graph_ignore_irqs()) || (trace->depth < 0) ||
263 (max_depth && trace->depth >= max_depth)) 303 (max_depth && trace->depth >= max_depth))
264 return 0; 304 return 0;
265 305
306 /*
307 * Do not trace a function if it's filtered by set_graph_notrace.
308 * Make the index of ret stack negative to indicate that it should
309 * ignore further functions. But it needs its own ret stack entry
310 * to recover the original index in order to continue tracing after
311 * returning from the function.
312 */
313 if (ftrace_graph_notrace_addr(trace->func))
314 return 1;
315
266 local_irq_save(flags); 316 local_irq_save(flags);
267 cpu = raw_smp_processor_id(); 317 cpu = raw_smp_processor_id();
268 data = per_cpu_ptr(tr->trace_buffer.data, cpu); 318 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -335,7 +385,7 @@ void __trace_graph_return(struct trace_array *tr,
335 return; 385 return;
336 entry = ring_buffer_event_data(event); 386 entry = ring_buffer_event_data(event);
337 entry->ret = *trace; 387 entry->ret = *trace;
338 if (!filter_current_check_discard(buffer, call, entry, event)) 388 if (!call_filter_check_discard(call, entry, buffer, event))
339 __buffer_unlock_commit(buffer, event); 389 __buffer_unlock_commit(buffer, event);
340} 390}
341 391
@@ -652,7 +702,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
652 } 702 }
653 703
654 /* No overhead */ 704 /* No overhead */
655 ret = print_graph_duration(DURATION_FILL_START, s, flags); 705 ret = print_graph_duration(0, s, flags | FLAGS_FILL_START);
656 if (ret != TRACE_TYPE_HANDLED) 706 if (ret != TRACE_TYPE_HANDLED)
657 return ret; 707 return ret;
658 708
@@ -664,7 +714,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
664 if (!ret) 714 if (!ret)
665 return TRACE_TYPE_PARTIAL_LINE; 715 return TRACE_TYPE_PARTIAL_LINE;
666 716
667 ret = print_graph_duration(DURATION_FILL_END, s, flags); 717 ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
668 if (ret != TRACE_TYPE_HANDLED) 718 if (ret != TRACE_TYPE_HANDLED)
669 return ret; 719 return ret;
670 720
@@ -729,14 +779,14 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
729 return TRACE_TYPE_HANDLED; 779 return TRACE_TYPE_HANDLED;
730 780
731 /* No real data, just filling the column with spaces */ 781 /* No real data, just filling the column with spaces */
732 switch (duration) { 782 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
733 case DURATION_FILL_FULL: 783 case FLAGS_FILL_FULL:
734 ret = trace_seq_puts(s, " | "); 784 ret = trace_seq_puts(s, " | ");
735 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 785 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
736 case DURATION_FILL_START: 786 case FLAGS_FILL_START:
737 ret = trace_seq_puts(s, " "); 787 ret = trace_seq_puts(s, " ");
738 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 788 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
739 case DURATION_FILL_END: 789 case FLAGS_FILL_END:
740 ret = trace_seq_puts(s, " |"); 790 ret = trace_seq_puts(s, " |");
741 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 791 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
742 } 792 }
@@ -852,7 +902,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
852 } 902 }
853 903
854 /* No time */ 904 /* No time */
855 ret = print_graph_duration(DURATION_FILL_FULL, s, flags); 905 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
856 if (ret != TRACE_TYPE_HANDLED) 906 if (ret != TRACE_TYPE_HANDLED)
857 return ret; 907 return ret;
858 908
@@ -1172,7 +1222,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1172 return TRACE_TYPE_PARTIAL_LINE; 1222 return TRACE_TYPE_PARTIAL_LINE;
1173 1223
1174 /* No time */ 1224 /* No time */
1175 ret = print_graph_duration(DURATION_FILL_FULL, s, flags); 1225 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
1176 if (ret != TRACE_TYPE_HANDLED) 1226 if (ret != TRACE_TYPE_HANDLED)
1177 return ret; 1227 return ret;
1178 1228
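
The DURATION_FILL_* sentinels passed in place of a duration are replaced by a small fill code carried in the high bits of the flags word, so print_graph_duration() always receives a real duration argument. A sketch of that packing scheme with an assumed shift value; the real TRACE_GRAPH_PRINT_FILL_SHIFT and _MASK live in the graph tracer header and are not shown here:

/* Assumed shift/mask values; illustrates the packing scheme only. */
#define FILL_SHIFT	28
#define FILL_MASK	(0x3UL << FILL_SHIFT)
#define FILL_FULL	(1UL << FILL_SHIFT)
#define FILL_START	(2UL << FILL_SHIFT)
#define FILL_END	(3UL << FILL_SHIFT)

static int wants_fill_start(unsigned long flags)
{
	/* the caller passed (flags | FILL_START); the printer strips it back out */
	return (flags & FILL_MASK) == FILL_START;
}
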
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 243f6834d026..dae9541ada9e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -835,7 +835,7 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
835 entry->ip = (unsigned long)tp->rp.kp.addr; 835 entry->ip = (unsigned long)tp->rp.kp.addr;
836 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 836 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
837 837
838 if (!filter_current_check_discard(buffer, call, entry, event)) 838 if (!filter_check_discard(ftrace_file, entry, buffer, event))
839 trace_buffer_unlock_commit_regs(buffer, event, 839 trace_buffer_unlock_commit_regs(buffer, event,
840 irq_flags, pc, regs); 840 irq_flags, pc, regs);
841} 841}
@@ -884,7 +884,7 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
884 entry->ret_ip = (unsigned long)ri->ret_addr; 884 entry->ret_ip = (unsigned long)ri->ret_addr;
885 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 885 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
886 886
887 if (!filter_current_check_discard(buffer, call, entry, event)) 887 if (!filter_check_discard(ftrace_file, entry, buffer, event))
888 trace_buffer_unlock_commit_regs(buffer, event, 888 trace_buffer_unlock_commit_regs(buffer, event,
889 irq_flags, pc, regs); 889 irq_flags, pc, regs);
890} 890}
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index b3dcfb2f0fef..0abd9b863474 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -323,7 +323,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
323 entry = ring_buffer_event_data(event); 323 entry = ring_buffer_event_data(event);
324 entry->rw = *rw; 324 entry->rw = *rw;
325 325
326 if (!filter_check_discard(call, entry, buffer, event)) 326 if (!call_filter_check_discard(call, entry, buffer, event))
327 trace_buffer_unlock_commit(buffer, event, 0, pc); 327 trace_buffer_unlock_commit(buffer, event, 0, pc);
328} 328}
329 329
@@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
353 entry = ring_buffer_event_data(event); 353 entry = ring_buffer_event_data(event);
354 entry->map = *map; 354 entry->map = *map;
355 355
356 if (!filter_check_discard(call, entry, buffer, event)) 356 if (!call_filter_check_discard(call, entry, buffer, event))
357 trace_buffer_unlock_commit(buffer, event, 0, pc); 357 trace_buffer_unlock_commit(buffer, event, 0, pc);
358} 358}
359 359
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 34e7cbac0c9c..ed32284fbe32 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : 619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
620 '.'; 620 '.';
621 need_resched = 621
622 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; 622 switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
623 TRACE_FLAG_PREEMPT_RESCHED)) {
624 case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
625 need_resched = 'N';
626 break;
627 case TRACE_FLAG_NEED_RESCHED:
628 need_resched = 'n';
629 break;
630 case TRACE_FLAG_PREEMPT_RESCHED:
631 need_resched = 'p';
632 break;
633 default:
634 need_resched = '.';
635 break;
636 }
637
623 hardsoft_irq = 638 hardsoft_irq =
624 (hardirq && softirq) ? 'H' : 639 (hardirq && softirq) ? 'H' :
625 hardirq ? 'h' : 640 hardirq ? 'h' :
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 4e98e3b257a3..3f34dc9b40f3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
45 entry->next_state = next->state; 45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next); 46 entry->next_cpu = task_cpu(next);
47 47
48 if (!filter_check_discard(call, entry, buffer, event)) 48 if (!call_filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc); 49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50} 50}
51 51
@@ -101,7 +101,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
101 entry->next_state = wakee->state; 101 entry->next_state = wakee->state;
102 entry->next_cpu = task_cpu(wakee); 102 entry->next_cpu = task_cpu(wakee);
103 103
104 if (!filter_check_discard(call, entry, buffer, event)) 104 if (!call_filter_check_discard(call, entry, buffer, event))
105 trace_buffer_unlock_commit(buffer, event, flags, pc); 105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106} 106}
107 107
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 847f88a6194b..7af67360b330 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -43,46 +43,15 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
43/* The root directory for all stat files */ 43/* The root directory for all stat files */
44static struct dentry *stat_dir; 44static struct dentry *stat_dir;
45 45
46/* 46static void __reset_stat_session(struct stat_session *session)
47 * Iterate through the rbtree using a post order traversal path
48 * to release the next node.
49 * It won't necessary release one at each iteration
50 * but it will at least advance closer to the next one
51 * to be released.
52 */
53static struct rb_node *release_next(struct tracer_stat *ts,
54 struct rb_node *node)
55{ 47{
56 struct stat_node *snode; 48 struct stat_node *snode, *n;
57 struct rb_node *parent = rb_parent(node);
58
59 if (node->rb_left)
60 return node->rb_left;
61 else if (node->rb_right)
62 return node->rb_right;
63 else {
64 if (!parent)
65 ;
66 else if (parent->rb_left == node)
67 parent->rb_left = NULL;
68 else
69 parent->rb_right = NULL;
70 49
71 snode = container_of(node, struct stat_node, node); 50 rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) {
72 if (ts->stat_release) 51 if (session->ts->stat_release)
73 ts->stat_release(snode->stat); 52 session->ts->stat_release(snode->stat);
74 kfree(snode); 53 kfree(snode);
75
76 return parent;
77 } 54 }
78}
79
80static void __reset_stat_session(struct stat_session *session)
81{
82 struct rb_node *node = session->stat_root.rb_node;
83
84 while (node)
85 node = release_next(session->ts, node);
86 55
87 session->stat_root = RB_ROOT; 56 session->stat_root = RB_ROOT;
88} 57}
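
rbtree_postorder_for_each_entry_safe() walks the tree in post-order and tolerates freeing the node currently being visited, which is exactly what the open-coded release_next() loop above did by hand. A minimal sketch of the usual "free the whole tree" idiom with a hypothetical node type:

#include <linux/rbtree.h>
#include <linux/slab.h>

struct my_node {
	struct rb_node rb;
	void *payload;
};

static void free_all(struct rb_root *root)
{
	struct my_node *pos, *n;

	rbtree_postorder_for_each_entry_safe(pos, n, root, rb)
		kfree(pos);		/* children are always visited before the parent */

	*root = RB_ROOT;		/* leave an empty, reusable tree behind */
}
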
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 559329d9bd2f..ea90eb5f6f17 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -302,6 +302,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
302static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) 302static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
303{ 303{
304 struct trace_array *tr = data; 304 struct trace_array *tr = data;
305 struct ftrace_event_file *ftrace_file;
305 struct syscall_trace_enter *entry; 306 struct syscall_trace_enter *entry;
306 struct syscall_metadata *sys_data; 307 struct syscall_metadata *sys_data;
307 struct ring_buffer_event *event; 308 struct ring_buffer_event *event;
@@ -314,7 +315,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
314 syscall_nr = trace_get_syscall_nr(current, regs); 315 syscall_nr = trace_get_syscall_nr(current, regs);
315 if (syscall_nr < 0) 316 if (syscall_nr < 0)
316 return; 317 return;
317 if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) 318
319 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
320 ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
321 if (!ftrace_file)
322 return;
323
324 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
318 return; 325 return;
319 326
320 sys_data = syscall_nr_to_meta(syscall_nr); 327 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -336,8 +343,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
336 entry->nr = syscall_nr; 343 entry->nr = syscall_nr;
337 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 344 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
338 345
339 if (!filter_current_check_discard(buffer, sys_data->enter_event, 346 if (!filter_check_discard(ftrace_file, entry, buffer, event))
340 entry, event))
341 trace_current_buffer_unlock_commit(buffer, event, 347 trace_current_buffer_unlock_commit(buffer, event,
342 irq_flags, pc); 348 irq_flags, pc);
343} 349}
@@ -345,6 +351,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
345static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) 351static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
346{ 352{
347 struct trace_array *tr = data; 353 struct trace_array *tr = data;
354 struct ftrace_event_file *ftrace_file;
348 struct syscall_trace_exit *entry; 355 struct syscall_trace_exit *entry;
349 struct syscall_metadata *sys_data; 356 struct syscall_metadata *sys_data;
350 struct ring_buffer_event *event; 357 struct ring_buffer_event *event;
@@ -356,7 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
356 syscall_nr = trace_get_syscall_nr(current, regs); 363 syscall_nr = trace_get_syscall_nr(current, regs);
357 if (syscall_nr < 0) 364 if (syscall_nr < 0)
358 return; 365 return;
359 if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) 366
367 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
368 ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
369 if (!ftrace_file)
370 return;
371
372 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
360 return; 373 return;
361 374
362 sys_data = syscall_nr_to_meta(syscall_nr); 375 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -377,8 +390,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
377 entry->nr = syscall_nr; 390 entry->nr = syscall_nr;
378 entry->ret = syscall_get_return_value(current, regs); 391 entry->ret = syscall_get_return_value(current, regs);
379 392
380 if (!filter_current_check_discard(buffer, sys_data->exit_event, 393 if (!filter_check_discard(ftrace_file, entry, buffer, event))
381 entry, event))
382 trace_current_buffer_unlock_commit(buffer, event, 394 trace_current_buffer_unlock_commit(buffer, event,
383 irq_flags, pc); 395 irq_flags, pc);
384} 396}
@@ -397,7 +409,7 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file,
397 if (!tr->sys_refcount_enter) 409 if (!tr->sys_refcount_enter)
398 ret = register_trace_sys_enter(ftrace_syscall_enter, tr); 410 ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
399 if (!ret) { 411 if (!ret) {
400 set_bit(num, tr->enabled_enter_syscalls); 412 rcu_assign_pointer(tr->enter_syscall_files[num], file);
401 tr->sys_refcount_enter++; 413 tr->sys_refcount_enter++;
402 } 414 }
403 mutex_unlock(&syscall_trace_lock); 415 mutex_unlock(&syscall_trace_lock);
@@ -415,7 +427,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
415 return; 427 return;
416 mutex_lock(&syscall_trace_lock); 428 mutex_lock(&syscall_trace_lock);
417 tr->sys_refcount_enter--; 429 tr->sys_refcount_enter--;
418 clear_bit(num, tr->enabled_enter_syscalls); 430 rcu_assign_pointer(tr->enter_syscall_files[num], NULL);
419 if (!tr->sys_refcount_enter) 431 if (!tr->sys_refcount_enter)
420 unregister_trace_sys_enter(ftrace_syscall_enter, tr); 432 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
421 mutex_unlock(&syscall_trace_lock); 433 mutex_unlock(&syscall_trace_lock);
@@ -435,7 +447,7 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file,
435 if (!tr->sys_refcount_exit) 447 if (!tr->sys_refcount_exit)
436 ret = register_trace_sys_exit(ftrace_syscall_exit, tr); 448 ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
437 if (!ret) { 449 if (!ret) {
438 set_bit(num, tr->enabled_exit_syscalls); 450 rcu_assign_pointer(tr->exit_syscall_files[num], file);
439 tr->sys_refcount_exit++; 451 tr->sys_refcount_exit++;
440 } 452 }
441 mutex_unlock(&syscall_trace_lock); 453 mutex_unlock(&syscall_trace_lock);
@@ -453,7 +465,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
453 return; 465 return;
454 mutex_lock(&syscall_trace_lock); 466 mutex_lock(&syscall_trace_lock);
455 tr->sys_refcount_exit--; 467 tr->sys_refcount_exit--;
456 clear_bit(num, tr->enabled_exit_syscalls); 468 rcu_assign_pointer(tr->exit_syscall_files[num], NULL);
457 if (!tr->sys_refcount_exit) 469 if (!tr->sys_refcount_exit)
458 unregister_trace_sys_exit(ftrace_syscall_exit, tr); 470 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
459 mutex_unlock(&syscall_trace_lock); 471 mutex_unlock(&syscall_trace_lock);
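
As the new comments note, __DO_TRACE() already wraps tracepoint probes in rcu_read_lock_sched(), so the handlers can replace the old enabled-syscall bitmaps with a plain rcu_dereference_sched() of a per-syscall file pointer. A compressed sketch of that reader pattern; the array name and bound are placeholders:

#define NR_SLOTS 512					/* placeholder bound */
static struct ftrace_event_file __rcu *slot_files[NR_SLOTS];

/* Probe body: already running under rcu_read_lock_sched() via __DO_TRACE(). */
static void probe(int nr)
{
	struct ftrace_event_file *file;

	file = rcu_dereference_sched(slot_files[nr]);
	if (!file)					/* concurrently unregistered */
		return;
	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags))
		return;					/* soft-disabled: skip the record */
	/* ... reserve ring-buffer space and commit as in the hunks above ... */
}
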
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 272261b5f94f..b6dcc42ef7f5 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -128,6 +128,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
128 if (is_ret) 128 if (is_ret)
129 tu->consumer.ret_handler = uretprobe_dispatcher; 129 tu->consumer.ret_handler = uretprobe_dispatcher;
130 init_trace_uprobe_filter(&tu->filter); 130 init_trace_uprobe_filter(&tu->filter);
131 tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
131 return tu; 132 return tu;
132 133
133error: 134error:
@@ -561,7 +562,7 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
561 for (i = 0; i < tu->nr_args; i++) 562 for (i = 0; i < tu->nr_args; i++)
562 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 563 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
563 564
564 if (!filter_current_check_discard(buffer, call, entry, event)) 565 if (!call_filter_check_discard(call, entry, buffer, event))
565 trace_buffer_unlock_commit(buffer, event, 0, 0); 566 trace_buffer_unlock_commit(buffer, event, 0, 0);
566} 567}
567 568
diff --git a/kernel/up.c b/kernel/up.c
index 630d72bf7e41..509403e3fbc6 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
22} 22}
23EXPORT_SYMBOL(smp_call_function_single); 23EXPORT_SYMBOL(smp_call_function_single);
24 24
25void __smp_call_function_single(int cpu, struct call_single_data *csd,
26 int wait)
27{
28 unsigned long flags;
29
30 local_irq_save(flags);
31 csd->func(csd->info);
32 local_irq_restore(flags);
33}
34EXPORT_SYMBOL(__smp_call_function_single);
35
25int on_each_cpu(smp_call_func_t func, void *info, int wait) 36int on_each_cpu(smp_call_func_t func, void *info, int wait)
26{ 37{
27 unsigned long flags; 38 unsigned long flags;
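
On UP there is only one CPU, so the cross-call degenerates into running the function locally with interrupts disabled; the wait argument is moot because the call is synchronous by construction. A sketch of a caller exercising the UP path shown above; the callback and payload are made up, and on SMP kernels the same call performs a real cross-CPU IPI:

static void bump(void *info)
{
	int *counter = info;
	(*counter)++;
}

static struct call_single_data my_csd;
static int my_counter;

static void kick_cpu0(void)
{
	my_csd.func = bump;
	my_csd.info = &my_counter;
	__smp_call_function_single(0, &my_csd, 0);	/* runs bump() locally on UP */
}
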
diff --git a/kernel/user.c b/kernel/user.c
index 5bbb91988e69..c006131beb77 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,6 +51,10 @@ struct user_namespace init_user_ns = {
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
54#ifdef CONFIG_PERSISTENT_KEYRINGS
55 .persistent_keyring_register_sem =
56 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
57#endif
54}; 58};
55EXPORT_SYMBOL_GPL(init_user_ns); 59EXPORT_SYMBOL_GPL(init_user_ns);
56 60
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 13fb1134ba58..240fb62cf394 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -101,6 +101,9 @@ int create_user_ns(struct cred *new)
101 101
102 set_cred_user_ns(new, ns); 102 set_cred_user_ns(new, ns);
103 103
104#ifdef CONFIG_PERSISTENT_KEYRINGS
105 init_rwsem(&ns->persistent_keyring_register_sem);
106#endif
104 return 0; 107 return 0;
105} 108}
106 109
@@ -130,6 +133,9 @@ void free_user_ns(struct user_namespace *ns)
130 133
131 do { 134 do {
132 parent = ns->parent; 135 parent = ns->parent;
136#ifdef CONFIG_PERSISTENT_KEYRINGS
137 key_put(ns->persistent_keyring_register);
138#endif
133 proc_free_inum(ns->proc_inum); 139 proc_free_inum(ns->proc_inum);
134 kmem_cache_free(user_ns_cachep, ns); 140 kmem_cache_free(user_ns_cachep, ns);
135 ns = parent; 141 ns = parent;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 987293d03ebc..b010eac595d2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
305/* I: attributes used when instantiating standard unbound pools on demand */ 305/* I: attributes used when instantiating standard unbound pools on demand */
306static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; 306static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
307 307
308/* I: attributes used when instantiating ordered pools on demand */
309static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
310
308struct workqueue_struct *system_wq __read_mostly; 311struct workqueue_struct *system_wq __read_mostly;
309EXPORT_SYMBOL(system_wq); 312EXPORT_SYMBOL(system_wq);
310struct workqueue_struct *system_highpri_wq __read_mostly; 313struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { }
518static inline void debug_work_deactivate(struct work_struct *work) { } 521static inline void debug_work_deactivate(struct work_struct *work) { }
519#endif 522#endif
520 523
521/* allocate ID and assign it to @pool */ 524/**
525 * worker_pool_assign_id - allocate ID and assign it to @pool
526 * @pool: the pool pointer of interest
527 *
528 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
529 * successfully, -errno on failure.
530 */
522static int worker_pool_assign_id(struct worker_pool *pool) 531static int worker_pool_assign_id(struct worker_pool *pool)
523{ 532{
524 int ret; 533 int ret;
525 534
526 lockdep_assert_held(&wq_pool_mutex); 535 lockdep_assert_held(&wq_pool_mutex);
527 536
528 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); 537 ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
538 GFP_KERNEL);
529 if (ret >= 0) { 539 if (ret >= 0) {
530 pool->id = ret; 540 pool->id = ret;
531 return 0; 541 return 0;
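
idr_alloc() takes a [start, end) range, so passing WORK_OFFQ_POOL_NONE as the end keeps allocated pool IDs representable in the off-queue bits and makes the BUILD_BUG_ON removed further down unnecessary. A generic sketch of bounded ID allocation; the idr instance and the bound are placeholders:

#define MY_ID_LIMIT 1024			/* placeholder upper bound */
static DEFINE_IDR(my_idr);

/* Allocate an ID in [0, MY_ID_LIMIT); returns the ID or a negative errno. */
static int my_assign_id(void *obj)
{
	return idr_alloc(&my_idr, obj, 0, MY_ID_LIMIT, GFP_KERNEL);
}
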
@@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
1320 1330
1321 debug_work_activate(work); 1331 debug_work_activate(work);
1322 1332
1323 /* if dying, only works from the same workqueue are allowed */ 1333 /* if draining, only works from the same workqueue are allowed */
1324 if (unlikely(wq->flags & __WQ_DRAINING) && 1334 if (unlikely(wq->flags & __WQ_DRAINING) &&
1325 WARN_ON_ONCE(!is_chained_work(wq))) 1335 WARN_ON_ONCE(!is_chained_work(wq)))
1326 return; 1336 return;
@@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool)
1736 if (IS_ERR(worker->task)) 1746 if (IS_ERR(worker->task))
1737 goto fail; 1747 goto fail;
1738 1748
1749 set_user_nice(worker->task, pool->attrs->nice);
1750
1751 /* prevent userland from meddling with cpumask of workqueue workers */
1752 worker->task->flags |= PF_NO_SETAFFINITY;
1753
1739 /* 1754 /*
1740 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any 1755 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1741 * online CPUs. It'll be re-applied when any of the CPUs come up. 1756 * online CPUs. It'll be re-applied when any of the CPUs come up.
1742 */ 1757 */
1743 set_user_nice(worker->task, pool->attrs->nice);
1744 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); 1758 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1745 1759
1746 /* prevent userland from meddling with cpumask of workqueue workers */
1747 worker->task->flags |= PF_NO_SETAFFINITY;
1748
1749 /* 1760 /*
1750 * The caller is responsible for ensuring %POOL_DISASSOCIATED 1761 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1751 * remains stable across this function. See the comments above the 1762 * remains stable across this function. See the comments above the
@@ -2840,19 +2851,6 @@ already_gone:
2840 return false; 2851 return false;
2841} 2852}
2842 2853
2843static bool __flush_work(struct work_struct *work)
2844{
2845 struct wq_barrier barr;
2846
2847 if (start_flush_work(work, &barr)) {
2848 wait_for_completion(&barr.done);
2849 destroy_work_on_stack(&barr.work);
2850 return true;
2851 } else {
2852 return false;
2853 }
2854}
2855
2856/** 2854/**
2857 * flush_work - wait for a work to finish executing the last queueing instance 2855 * flush_work - wait for a work to finish executing the last queueing instance
2858 * @work: the work to flush 2856 * @work: the work to flush
@@ -2866,10 +2864,18 @@ static bool __flush_work(struct work_struct *work)
2866 */ 2864 */
2867bool flush_work(struct work_struct *work) 2865bool flush_work(struct work_struct *work)
2868{ 2866{
2867 struct wq_barrier barr;
2868
2869 lock_map_acquire(&work->lockdep_map); 2869 lock_map_acquire(&work->lockdep_map);
2870 lock_map_release(&work->lockdep_map); 2870 lock_map_release(&work->lockdep_map);
2871 2871
2872 return __flush_work(work); 2872 if (start_flush_work(work, &barr)) {
2873 wait_for_completion(&barr.done);
2874 destroy_work_on_stack(&barr.work);
2875 return true;
2876 } else {
2877 return false;
2878 }
2873} 2879}
2874EXPORT_SYMBOL_GPL(flush_work); 2880EXPORT_SYMBOL_GPL(flush_work);
2875 2881
@@ -4106,7 +4112,7 @@ out_unlock:
4106static int alloc_and_link_pwqs(struct workqueue_struct *wq) 4112static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4107{ 4113{
4108 bool highpri = wq->flags & WQ_HIGHPRI; 4114 bool highpri = wq->flags & WQ_HIGHPRI;
4109 int cpu; 4115 int cpu, ret;
4110 4116
4111 if (!(wq->flags & WQ_UNBOUND)) { 4117 if (!(wq->flags & WQ_UNBOUND)) {
4112 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); 4118 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
@@ -4126,6 +4132,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4126 mutex_unlock(&wq->mutex); 4132 mutex_unlock(&wq->mutex);
4127 } 4133 }
4128 return 0; 4134 return 0;
4135 } else if (wq->flags & __WQ_ORDERED) {
4136 ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
4137 /* there should only be single pwq for ordering guarantee */
4138 WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
4139 wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
4140 "ordering guarantee broken for workqueue %s\n", wq->name);
4141 return ret;
4129 } else { 4142 } else {
4130 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); 4143 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
4131 } 4144 }
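
Ordered workqueues now get their own attrs with NUMA disabled so a single pool_workqueue serves every node and max_active can enforce strict ordering; the WARN above catches any violation of that single-pwq assumption. Nothing changes for callers; an ordered queue is still requested the usual way (queue name below is hypothetical):

static struct workqueue_struct *my_wq;

static int my_init(void)
{
	/* one work item runs at a time, in submission order */
	my_wq = alloc_ordered_workqueue("my-ordered", 0);
	if (!my_wq)
		return -ENOMEM;
	return 0;
}
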
@@ -4814,14 +4827,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4814 4827
4815 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 4828 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
4816 schedule_work_on(cpu, &wfc.work); 4829 schedule_work_on(cpu, &wfc.work);
4817 4830 flush_work(&wfc.work);
4818 /*
4819 * The work item is on-stack and can't lead to deadlock through
4820 * flushing. Use __flush_work() to avoid spurious lockdep warnings
4821 * when work_on_cpu()s are nested.
4822 */
4823 __flush_work(&wfc.work);
4824
4825 return wfc.ret; 4831 return wfc.ret;
4826} 4832}
4827EXPORT_SYMBOL_GPL(work_on_cpu); 4833EXPORT_SYMBOL_GPL(work_on_cpu);
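
With __flush_work() folded back into flush_work(), work_on_cpu() is again a thin schedule-and-flush wrapper around an on-stack work item. Typical usage, with a made-up callback; the signature is the one shown in the hunk header above:

/* Hypothetical callback: runs on the CPU passed to work_on_cpu(). */
static long add_one(void *arg)
{
	long *val = arg;
	return *val + 1;
}

static long query_cpu(int cpu)
{
	long seed = 41;

	/* Sleeps until the callback has run on 'cpu', then returns its value. */
	return work_on_cpu(cpu, add_one, &seed);
}
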
@@ -5009,10 +5015,6 @@ static int __init init_workqueues(void)
5009 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; 5015 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
5010 int i, cpu; 5016 int i, cpu;
5011 5017
5012 /* make sure we have enough bits for OFFQ pool ID */
5013 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
5014 WORK_CPU_END * NR_STD_WORKER_POOLS);
5015
5016 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 5018 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5017 5019
5018 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5020 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
@@ -5051,13 +5053,23 @@ static int __init init_workqueues(void)
5051 } 5053 }
5052 } 5054 }
5053 5055
5054 /* create default unbound wq attrs */ 5056 /* create default unbound and ordered wq attrs */
5055 for (i = 0; i < NR_STD_WORKER_POOLS; i++) { 5057 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
5056 struct workqueue_attrs *attrs; 5058 struct workqueue_attrs *attrs;
5057 5059
5058 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); 5060 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5059 attrs->nice = std_nice[i]; 5061 attrs->nice = std_nice[i];
5060 unbound_std_wq_attrs[i] = attrs; 5062 unbound_std_wq_attrs[i] = attrs;
5063
5064 /*
5065 * An ordered wq should have only one pwq as ordering is
5066 * guaranteed by max_active which is enforced by pwqs.
5067 * Turn off NUMA so that dfl_pwq is used for all nodes.
5068 */
5069 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5070 attrs->nice = std_nice[i];
5071 attrs->no_numa = true;
5072 ordered_wq_attrs[i] = attrs;
5061 } 5073 }
5062 5074
5063 system_wq = alloc_workqueue("events", 0, 0); 5075 system_wq = alloc_workqueue("events", 0, 0);